From e332f90acb05c7aeeac38c89577d75b42956bd36 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 15:25:06 -0500 Subject: [PATCH 01/33] add agentic trace replay benchmark infrastructure Trace replay benchmarking for agentic coding workloads using real Claude Code traces. Includes: - Trace replay scripts for H200, MI355X, B200 (vLLM-based) - kv-cache-tester submodule (trace replayer + 522 anonymized traces) - AIPerf submodule (alternative synthetic benchmarking) - Pareto frontier plotting and sweep aggregation - Metrics collector (prometheus scraper + visualization) - Workload distribution analysis - GitHub Actions workflow with per-TP sweep configs - MI355X runner SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 31 + .../workflows/benchmark-multiturn-tmpl.yml | 184 +++ .github/workflows/multiturn-sweep.yml | 231 +++ .gitmodules | 6 + .../multiturn_fp4_b200_trace_replay.sh | 210 +++ .../multiturn_fp8_h200_trace_replay.sh | 206 +++ .../multiturn_fp8_mi355x_trace_replay.sh | 207 +++ .../multiturn/vllm_benchmark/.gitignore | 4 + experimental/multiturn/vllm_benchmark/aiperf | 1 + .../vllm_benchmark/analysis/__init__.py | 0 .../vllm_benchmark/analysis/plot_pareto.py | 1247 +++++++++++++++++ .../vllm_benchmark/bench/__init__.py | 0 .../vllm_benchmark/bench/metrics_collector.py | 957 +++++++++++++ .../bench/run_metrics_collector.py | 124 ++ .../multiturn/vllm_benchmark/kv-cache-tester | 1 + .../multiturn/vllm_benchmark/requirements.txt | 9 + .../analyze_benchmark_distributions.py | 395 ++++++ .../scripts/collect_sweep_results.py | 340 +++++ .../scripts/plot_sweep_overview.py | 222 +++ runners/launch_mi355x-amds.sh | 4 +- 20 files changed, 4377 insertions(+), 2 deletions(-) create mode 100644 .github/configs/multiturn-agentic-trace.yaml create mode 100644 .github/workflows/benchmark-multiturn-tmpl.yml create mode 100644 .github/workflows/multiturn-sweep.yml create mode 100644 .gitmodules 
create mode 100755 benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh create mode 100755 benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh create mode 100755 benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh create mode 100644 experimental/multiturn/vllm_benchmark/.gitignore create mode 160000 experimental/multiturn/vllm_benchmark/aiperf create mode 100644 experimental/multiturn/vllm_benchmark/analysis/__init__.py create mode 100644 experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/__init__.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/metrics_collector.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py create mode 160000 experimental/multiturn/vllm_benchmark/kv-cache-tester create mode 100644 experimental/multiturn/vllm_benchmark/requirements.txt create mode 100644 experimental/multiturn/vllm_benchmark/scripts/analyze_benchmark_distributions.py create mode 100755 experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py create mode 100644 experimental/multiturn/vllm_benchmark/scripts/plot_sweep_overview.py diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml new file mode 100644 index 000000000..5ec98b902 --- /dev/null +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -0,0 +1,31 @@ +h200-fp8-llama70b: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + offload: ["on", "off"] + tp4: + users: [2, 4, 6, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp8: + users: [2, 4, 6, 8, 16, 32, 48, 64, 80, 128, 256] + offload: ["on", "off"] + +mi355x-fp8-llama70b: + tp2: + users: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp4: + users: [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 256] + offload: ["on", "off"] + tp8: + users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] + offload: ["on", "off"] + +b200-fp4-dsr1: + tp4: + 
ep: 4 + users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128] + offload: ["on", "off"] + tp8: + ep: 8 + users: [1, 2, 4, 8, 12, 16, 32, 64, 128] + offload: ["on", "off"] diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml new file mode 100644 index 000000000..a72034b14 --- /dev/null +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -0,0 +1,184 @@ +name: Template - Multi-Turn Benchmark +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + precision: + required: false + type: string + default: 'fp4' + exp-name: + required: true + type: string + tp: + required: true + type: string + users: + required: true + type: string + offload-mode: + description: "on = prefix+offload, off = prefix only, noprefix = no prefix caching" + required: true + type: string + duration: + required: false + type: string + default: '' + request-rate: + description: "Request rate per client (Poisson, req/s). 0 = no delay." + required: false + type: string + default: '0' + total-cpu-dram-gb: + required: false + type: string + default: '300' + script-suffix: + description: "Suffix appended to benchmark script name (e.g. 
'_lmcache')" + required: false + type: string + default: '' + ep: + description: "Expert parallelism size (for MoE models)" + required: false + type: string + default: '0' + ref: + description: "Git ref (branch/sha) to checkout" + required: false + type: string + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + IMAGE: ${{ inputs.image }} + PRECISION: ${{ inputs.precision }} + FRAMEWORK: 'vllm' + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + USERS: ${{ inputs.users }} + OFFLOAD_MODE: ${{ inputs.offload-mode }} + DURATION: ${{ inputs.duration }} + REQUEST_RATE: ${{ inputs.request-rate }} + TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} + SCRIPT_SUFFIX: ${{ inputs.script-suffix }} + SPEC_DECODING: 'off' + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ${{ inputs.runner }} + timeout-minutes: 180 + name: "${{ inputs.exp-name }} tp=${{ inputs.tp }} users=${{ inputs.users }} offload=${{ inputs.offload-mode }}" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + # Cleanup Docker resources + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." 
+ docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + # Cleanup SLURM resources + if command -v squeue >/dev/null 2>&1; then + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." + scancel -u "$USER" || true + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do + squeue -u "$USER" + sleep 5 + done + fi + fi + + - name: Clean stale git locks + run: find . -name 'index.lock' -delete 2>/dev/null || true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + submodules: true + + + - name: Launch job script + env: + RUNNER_NAME: ${{ runner.name }} + RESULT_DIR: /workspace/results + run: | + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + # The runner script doesn't propagate exit codes (scancel masks them). + # Check status.txt to determine if the benchmark actually succeeded. + if [ ! -f results/status.txt ]; then + echo "Run failed: results/status.txt not found." 
>&2 + exit 1 + fi + STATUS=$(cat results/status.txt) + if [ "$STATUS" != "SUCCESS" ]; then + echo "Run failed: status=$STATUS" >&2 + cat results/benchmark.log 2>/dev/null || true + exit 1 + fi + + - name: Upload results + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: "multiturn_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}" + path: | + results/metrics_client_metrics.csv + results/metrics_server_metrics.csv + results/metrics_plots.png + results/benchmark.log + results/server.log + results/config.yaml + results/vllm_command.txt + results/benchmark_command.txt + results/benchmark_metadata.json + results/metrics_workload.png + results/responses.json + results/aiperf_artifacts/ + results/conversations.jsonl + results/workload_distribution_summary.txt + results/workload_distribution_plots.png + results/trace_replay/ + results/status.txt + if-no-files-found: ignore + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: "server_logs_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}" + path: results/server.log + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml new file mode 100644 index 000000000..5ed7bf59e --- /dev/null +++ b/.github/workflows/multiturn-sweep.yml @@ -0,0 +1,231 @@ +name: Multi-Turn Benchmark Sweep +run-name: "${{ inputs.run_name || format('Multi-Turn Sweep - tp={0} users={1} offload={2}', inputs.tp_values, inputs.user_values, inputs.offload_values) }}" + +on: + # push: + # branches: + # - experimental/multi-turn-benchmark + # paths: + # - .github/workflows/multiturn-sweep.yml + workflow_dispatch: + inputs: + run_name: + description: 'Custom run name (optional)' + required: false + default: '' + type: 
string + tp_values: + description: 'TP sizes (JSON array)' + required: true + default: '[1, 2, 4, 8]' + type: string + user_values: + description: 'Concurrent user counts (JSON array). Ignored if config_file is set.' + required: false + default: '[8, 16, 32, 64, 128, 256, 512, 1024, 2048]' + type: string + offload_values: + description: 'Offload modes (JSON array: on/off/noprefix). Ignored if config_file is set.' + required: false + default: '["on", "off", "noprefix"]' + type: string + config_file: + description: 'YAML config with per-TP sweep settings (e.g. .github/configs/multiturn-agentic-trace.yaml). Overrides tp/user/offload values.' + required: false + default: '' + type: string + config_key: + description: 'Top-level key in config_file to use (e.g. h200-fp8-llama70b, b200-fp4-dsr1). Required if config_file has multiple entries.' + required: false + default: '' + type: string + duration: + description: 'Benchmark duration in seconds (optional, runs to completion if omitted)' + required: false + default: '' + type: string + request_rate: + description: 'Request rate per client (Poisson, req/s). 0 = no delay.' + required: false + default: '0' + type: string + total_cpu_dram_gb: + description: 'Total CPU DRAM for KV offload (GB)' + required: true + default: '100' + type: string + image: + description: 'Container image' + required: true + default: 'vllm/vllm-openai:v0.18.0' + type: string + model: + description: 'Model name' + required: true + default: 'nvidia/Llama-3.3-70B-Instruct-FP4' + type: string + precision: + description: 'Model precision (fp4, fp8, etc.) — used to select benchmark script' + required: false + default: 'fp4' + type: string + script_suffix: + description: 'Suffix for benchmark script (e.g. "_lmcache" → multiturn_fp4_b200_lmcache.sh)' + required: false + default: '' + type: string + runner: + description: 'Runner label (e.g. 
b200, h200-dgxc-slurm)' + required: false + default: 'b200' + type: string + ep: + description: 'Expert parallelism size (for MoE models, default 0 = disabled)' + required: false + default: '0' + type: string + ref: + description: 'Git ref (branch/sha) to checkout' + required: false + type: string + +jobs: + # --------------------------------------------------------------------------- + # Generate matrix from config file or CLI inputs + # --------------------------------------------------------------------------- + generate-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.gen.outputs.matrix }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + if: ${{ inputs.config_file != '' }} + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 1 + ref: ${{ inputs.ref || github.ref }} + sparse-checkout: ${{ inputs.config_file }} + + - id: gen + run: | + pip install -q pyyaml + python3 << 'PYEOF' + import json, os, sys + + config_file = "${{ inputs.config_file }}".strip() + + if config_file: + import yaml + with open(config_file) as f: + full_config = yaml.safe_load(f) + + config_key = "${{ inputs.config_key }}".strip() + + # If config_key specified, use that section; otherwise auto-detect + if config_key and config_key in full_config: + config = full_config[config_key] + elif config_key: + print(f"ERROR: config_key '{config_key}' not found. Available: {list(full_config.keys())}") + sys.exit(1) + elif len(full_config) == 1: + config = next(iter(full_config.values())) + else: + # Check if top-level keys look like tp entries (tp2, tp4, etc.) + if all(k.startswith("tp") for k in full_config): + config = full_config + else: + print(f"ERROR: Multiple entries in config, specify --config_key. 
Available: {list(full_config.keys())}") + sys.exit(1) + + includes = [] + for key, settings in config.items(): + tp = int(key.replace("tp", "")) + users = settings.get("users", []) + offloads = settings.get("offload", ["on", "off"]) + ep = settings.get("ep", 0) + for u in users: + for o in offloads: + entry = {"tp": tp, "users": u, "offload": o} + if ep > 0: + entry["ep"] = ep + includes.append(entry) + else: + tp_values = json.loads('${{ inputs.tp_values }}') + user_values = json.loads('${{ inputs.user_values }}') + offload_values = json.loads('${{ inputs.offload_values }}') + includes = [] + for tp in tp_values: + for u in user_values: + for o in offload_values: + includes.append({"tp": tp, "users": u, "offload": o}) + + matrix = {"include": includes} + print(f"Generated {len(includes)} matrix entries") + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"matrix={json.dumps(matrix)}\n") + PYEOF + + # --------------------------------------------------------------------------- + # Matrix benchmark jobs — each cell calls the multiturn template + # --------------------------------------------------------------------------- + sweep: + needs: generate-matrix + uses: ./.github/workflows/benchmark-multiturn-tmpl.yml + name: sweep / + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }} + secrets: inherit + with: + runner: ${{ inputs.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + precision: ${{ inputs.precision }} + exp-name: "multiturn_tp${{ matrix.tp }}_users${{ matrix.users }}_offload${{ matrix.offload }}" + tp: "${{ matrix.tp }}" + users: "${{ matrix.users }}" + offload-mode: ${{ matrix.offload }} + duration: ${{ inputs.duration }} + request-rate: ${{ inputs.request_rate }} + total-cpu-dram-gb: ${{ inputs.total_cpu_dram_gb }} + script-suffix: ${{ inputs.script_suffix }} + ep: "${{ matrix.ep || inputs.ep }}" + ref: ${{ inputs.ref }} + + # 
--------------------------------------------------------------------------- + # Collect & aggregate results + # --------------------------------------------------------------------------- + collect: + runs-on: ubuntu-latest + needs: sweep + if: always() + name: Collect results + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 1 + ref: ${{ inputs.ref || github.ref }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install pandas matplotlib numpy + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + pattern: 'multiturn_*' + path: results/ + + - name: Run aggregation + run: | + python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/ + + - name: Upload aggregated results + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: multiturn_aggregated + path: aggregated/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..c45593c07 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "experimental/multiturn/vllm_benchmark/aiperf"] + path = experimental/multiturn/vllm_benchmark/aiperf + url = https://github.com/cquil11/aiperf.git +[submodule "experimental/multiturn/vllm_benchmark/kv-cache-tester"] + path = experimental/multiturn/vllm_benchmark/kv-cache-tester + url = https://github.com/cquil11/kv-cache-tester.git diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh new file mode 100755 index 000000000..d22448892 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP4 models on B200. +# Replays real agentic coding traces at a fixed number of concurrent users. 
+# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n 
)', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +compilation-config: 
'{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh new file mode 100755 index 000000000..f3f967a82 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -0,0 +1,206 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP8 models on H200. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# 
(causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n 
continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." 
+python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh new file mode 100755 index 000000000..4cf20c453 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP8 models on MI355X. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi 2>/dev/null || rocm-smi 2>/dev/null || true + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: 
local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring 
finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +# MI355X is ROCm — no CUDA arch needed +# export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." 
+python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore new file mode 100644 index 000000000..a0c3ca327 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/.gitignore @@ -0,0 +1,4 @@ +*.png +*.json +*.parquet +results/ \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf b/experimental/multiturn/vllm_benchmark/aiperf new file mode 160000 index 000000000..373218fb3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf @@ -0,0 +1 @@ +Subproject commit 373218fb3c3d15fada9c4be6465daf8fb5a70ef6 diff --git a/experimental/multiturn/vllm_benchmark/analysis/__init__.py b/experimental/multiturn/vllm_benchmark/analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py new file mode 100644 index 000000000..277bfca7f --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -0,0 +1,1247 @@ +#!/usr/bin/env python3 +""" +Plot Pareto frontiers for prefix caching modes. +Modes: on (prefix + offload), off (prefix only), noprefix (no prefix caching) +Pareto frontier: throughput vs latency trade-off. 
+ +Usage: + python plot_pareto.py + python plot_pareto.py ~/sweep_results_20260204_062339 +""" + +import json +import sys +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + + +def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from aiperf profile_export JSONL.""" + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + meta = entry.get("metadata", {}) + metrics = entry.get("metrics", {}) + + if meta.get("benchmark_phase") != "profiling": + continue + if meta.get("was_cancelled", False): + continue + + def val(key, default=0): + m = metrics.get(key) + if m is None: + return default + return m.get("value", default) if isinstance(m, dict) else m + + itl = metrics.get("inter_token_latency") + if itl and isinstance(itl, dict): + tpot_ms = itl.get("value", 0) + else: + osl = val("output_sequence_length", 1) + ttft = val("time_to_first_token", 0) + latency = val("request_latency", 0) + tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 + + start_ns = meta.get("request_start_ns", 0) + start_ms = start_ns / 1e6 + + records.append({ + "start_time_ms": start_ms, + "ttft_ms": val("time_to_first_token"), + "tpot_ms": tpot_ms, + "latency_ms": val("request_latency"), + "input_num_tokens": val("input_sequence_length"), + "output_num_tokens": val("output_sequence_length"), + }) + + if not records: + return None + return pd.DataFrame(records) + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + records = 
pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + return records + + +def load_experiment_data(exp_dir: Path) -> dict | None: + """Load and aggregate metrics from an experiment directory.""" + client_metrics_file = exp_dir / "metrics_client_metrics.csv" + server_metrics_file = exp_dir / "metrics_server_metrics.csv" + status_file = exp_dir / "status.txt" + + # Check if experiment completed successfully + if not status_file.exists(): + return None + status = status_file.read_text().strip() + if status != "SUCCESS": + return None + + # Also check for aiperf output + aiperf_jsonl = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) + if not candidates: + candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) + if candidates: + aiperf_jsonl = candidates[0] + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_metrics_file.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + return None + + try: + if client_metrics_file.exists(): + df = pd.read_csv(client_metrics_file) + elif aiperf_jsonl is not None: + df = _load_aiperf_jsonl(aiperf_jsonl) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return None + + # Load server metrics for cache hit rates + gpu_hit_rate = None + cpu_hit_rate = None + if server_metrics_file.exists(): + server_df = pd.read_csv(server_metrics_file) + # Get final cumulative values + final_row = server_df.iloc[-1] + if final_row["prefix_cache_queries"] > 0: + gpu_hit_rate = 100 * final_row["prefix_cache_hits"] / final_row["prefix_cache_queries"] + if final_row["cpu_prefix_cache_queries"] > 0: + 
cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] + if len(df) == 0: + return None + + # Parse experiment name: tp{N}_bs{M}_offload{on|off} + exp_name = exp_dir.name + parts = exp_name.split("_") + tp = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + offload = parts[2].replace("offload", "") + + # Calculate metrics + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + # Fallback: derive from per-request data (first start to last finish) + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 # fallback + + num_requests = len(df) + throughput_rps = num_requests / total_time_sec if total_time_sec > 0 else 0 + + # Input token throughput (prefill) + total_input_tokens = df["input_num_tokens"].sum() + input_throughput_tps = total_input_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Output token throughput (decode only) + total_output_tokens = df["output_num_tokens"].sum() + output_throughput_tps = total_output_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Total token throughput (input + output) + total_tokens = total_input_tokens + total_output_tokens + total_throughput_tps = total_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Normalized throughput (per GPU) + input_tps_per_gpu = input_throughput_tps / tp + output_tps_per_gpu = output_throughput_tps / tp + total_tps_per_gpu = total_throughput_tps / tp + + return { + "exp_name": exp_name, + "tp": tp, + "bs": 
bs, + "offload": offload, + "num_requests": num_requests, + "throughput_rps": throughput_rps, + "input_throughput_tps": input_throughput_tps, + "total_throughput_tps": total_throughput_tps, + "input_tps_per_gpu": input_tps_per_gpu, + "output_tps_per_gpu": output_tps_per_gpu, + "total_tps_per_gpu": total_tps_per_gpu, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "p999_tpot_ms": df["tpot_ms"].quantile(0.999), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + "p999_latency_ms": df["latency_ms"].quantile(0.999), + "p999_ttft_ms": df["ttft_ms"].quantile(0.999), + # Cache hit rates + "gpu_hit_rate": gpu_hit_rate, + "cpu_hit_rate": cpu_hit_rate, + } + except Exception as e: + print(f"Error loading {exp_dir}: {e}") + return None + + +def compute_pareto_frontier(points: list[tuple[float, float]], maximize_x: bool = False) -> list[tuple[float, float]]: + """ + Compute Pareto frontier for (x, y) points. + Y is always maximized. X is minimized by default, or maximized if maximize_x=True. + + For minimize X, maximize Y (e.g., latency vs throughput): + - Frontier goes bottom-left to top-right + - Low latency = low throughput, high latency = high throughput + + For maximize X, maximize Y (e.g., interactivity vs throughput): + - Frontier goes top-left to bottom-right + - Trade-off between the two "goods" + + Returns points sorted by X ascending for plotting. 
+ """ + if not points: + return [] + + # Remove invalid points + points = [(x, y) for x, y in points if x > 0 and y > 0] + if not points: + return [] + + frontier = [] + sorted_points = sorted(points, key=lambda p: p[0]) + + if maximize_x: + # Maximize both X and Y: frontier goes top-left to bottom-right + # Traverse from high X to low X, keep points with increasing Y + max_y = float('-inf') + for x, y in reversed(sorted_points): + if y > max_y: + frontier.append((x, y)) + max_y = y + return sorted(frontier, key=lambda p: p[0]) + else: + # Minimize X, maximize Y: frontier goes bottom-left to top-right + # Traverse from low X to high X, keep points with increasing Y + max_y = float('-inf') + for x, y in sorted_points: + if y > max_y: + frontier.append((x, y)) + max_y = y + return frontier + + +def compute_pareto_frontier_with_metadata(df_subset: pd.DataFrame, x_col: str, y_col: str, maximize_x: bool = False) -> pd.DataFrame: + """ + Compute Pareto frontier and return the rows from the dataframe that are on the frontier. 
+ """ + if len(df_subset) == 0: + return pd.DataFrame() + + # Get valid points + valid_mask = (df_subset[x_col] > 0) & (df_subset[y_col] > 0) + df_valid = df_subset[valid_mask].copy() + + if len(df_valid) == 0: + return pd.DataFrame() + + # Sort by x + df_sorted = df_valid.sort_values(x_col).reset_index(drop=True) + + frontier_indices = [] + max_y = float('-inf') + + if maximize_x: + # Traverse from high X to low X + for i in range(len(df_sorted) - 1, -1, -1): + y = df_sorted.iloc[i][y_col] + if y > max_y: + frontier_indices.append(i) + max_y = y + frontier_indices = frontier_indices[::-1] # Reverse to get ascending X order + else: + # Traverse from low X to high X + for i in range(len(df_sorted)): + y = df_sorted.iloc[i][y_col] + if y > max_y: + frontier_indices.append(i) + max_y = y + + return df_sorted.iloc[frontier_indices] + + +def generate_pareto_only_figure(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with concurrency labels.""" + + # Compute interactivity + df = df.copy() + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers Only (with Concurrency Labels)", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input 
Throughput/GPU (tok/s)", False),
+        (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True),
+        (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
+        (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True),
+    ]
+
+    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs:
+        for col, mode in enumerate(available_modes):
+            ax = axes[row, col]
+            df_subset = df_subsets[mode]
+            title = f"{metric_name} ({mode_titles.get(mode, mode)})"
+
+            # Get Pareto frontier points with metadata
+            frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x)
+
+            if len(frontier_df) > 0:
+                # Plot frontier line
+                ax.plot(frontier_df[x_col], frontier_df[y_col],
+                        linestyle='-', linewidth=2, alpha=0.5, color="black")
+
+                # Plot points colored by TP
+                for tp in sorted(frontier_df["tp"].unique()):
+                    tp_data = frontier_df[frontier_df["tp"] == tp]
+                    ax.scatter(tp_data[x_col], tp_data[y_col],
+                               c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"),
+                               s=150, alpha=0.9, edgecolors="black", linewidths=1,
+                               label=f"TP={tp}", zorder=5)
+
+                # Add concurrency labels
+                for _, point in frontier_df.iterrows():
+                    ax.annotate(f"conc={point['bs']}",
+                                (point[x_col], point[y_col]),
+                                textcoords="offset points",
+                                xytext=(5, 5),
+                                fontsize=8,
+                                alpha=0.8)
+
+            ax.set_xlabel(x_label)
+            ax.set_ylabel(y_label)
+            ax.set_title(title)
+            ax.grid(True, alpha=0.3)
+            if len(frontier_df) > 0:
+                ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right")
+
+    plt.tight_layout()
+
+    output_file = results_dir / "pareto_frontiers_clean.png"
+    plt.savefig(output_file, dpi=150, bbox_inches='tight')
+    
print(f"Saved clean Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_only_figure_p90(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p90 latencies.""" + + df = df.copy() + df["interactivity_p90"] = 1000.0 / df["p90_tpot_ms"] + + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P90 Latencies) with Concurrency Labels", fontsize=14) + + if num_cols == 1: + axes = axes.reshape(-1, 1) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + metrics_configs = [ + (0, "p90_ttft_ms", "input_tps_per_gpu", "TTFT", "P90 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p90", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P90 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p90_latency_ms", "total_tps_per_gpu", "E2E Latency", "P90 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p90", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P90 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + 
ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean_p90.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean P90 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure_p90(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid using p90 latencies.""" + + df = df.copy() + df["interactivity_p90"] = 1000.0 / df["p90_tpot_ms"] + + available_modes = df["offload"].unique() + + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), + "off": ("--", "none", "gray", (5, -12), "italic"), + "noprefix": (":", "red", "red", (5, -25), "oblique"), + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers (P90 Latencies): Mode Comparison", fontsize=14) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + plot_configs = [ + (0, "p90_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P90 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p90", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P90 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p90_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total 
Throughput/GPU", "P90 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p90", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P90 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + label = f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay_p90.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved overlay P90 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_only_figure_p99(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p99 latencies.""" + + # Compute interactivity using p99 + df = 
df.copy() + df["interactivity_p99"] = 1000.0 / df["p99_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P99 Latencies) with Concurrency Labels", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p99_ttft_ms", "input_tps_per_gpu", "TTFT", "P99 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p99", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P99 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p99_latency_ms", "total_tps_per_gpu", "E2E Latency", "P99 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p99", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P99 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + # Get Pareto frontier points with metadata + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + # Plot frontier line + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + # Plot points colored by TP + for tp in 
def generate_pareto_overlay_figure_p99(df: pd.DataFrame, results_dir: Path):
    """Overlay the per-mode Pareto frontiers using P99 latencies.

    Draws a 4x1 figure (TTFT, interactivity, E2E latency, output
    throughput), one frontier line per prefix-cache mode, points colored
    by TP degree and annotated with concurrency. Writes
    ``pareto_frontiers_overlay_p99.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the P99 TPOT.
    data = df.copy()
    data["interactivity_p99"] = 1000.0 / data["p99_tpot_ms"]

    present_modes = data["offload"].unique()

    # Per-mode style: (linestyle, marker edge, line color, label offset, font style).
    style_by_mode = {
        "on": ("-", "black", "black", (5, 8), "normal"),
        "off": ("--", "none", "gray", (5, -12), "italic"),
        "noprefix": (":", "red", "red", (5, -25), "oblique"),
    }
    label_by_mode = {
        "on": "Prefix+Offload",
        "off": "Prefix Only",
        "noprefix": "No Prefix",
    }

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle("Pareto Frontiers (P99 Latencies): Mode Comparison", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, title, x label, y label, maximize_x)
    panel_specs = [
        (0, "p99_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P99 TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity_p99", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P99 TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p99_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P99 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity_p99", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P99 TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, title, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        for mode in ["on", "off", "noprefix"]:
            if mode not in present_modes:
                continue

            ls, marker_edge, line_color, offset, font_style = style_by_mode[mode]
            frontier = compute_pareto_frontier_with_metadata(
                data[data["offload"] == mode], x_col, y_col, maximize_x)
            if len(frontier) == 0:
                continue

            # Frontier line for this mode.
            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle=ls, linewidth=2, alpha=0.6, color=line_color,
                    label=f"Pareto ({label_by_mode[mode]})")

            # Frontier points, colored by TP; legend TP entries only once
            # (for the "on" mode) to avoid triplicated labels.
            for tp in sorted(frontier["tp"].unique()):
                pts = frontier[frontier["tp"] == tp]
                ax.scatter(pts[x_col], pts[y_col],
                           c=color_for_tp.get(tp, "purple"),
                           marker=marker_for_tp.get(tp, "x"),
                           s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5,
                           label=f"TP={tp}" if mode == "on" else None, zorder=5)

            # Concurrency annotations, offset per mode so they don't collide.
            for _, point in frontier.iterrows():
                ax.annotate(f"conc={point['bs']}",
                            (point[x_col], point[y_col]),
                            textcoords="offset points",
                            xytext=offset,
                            fontsize=7,
                            alpha=0.7,
                            style=font_style)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_overlay_p99.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved overlay P99 Pareto plot to {output_file}")
    plt.close()
def generate_pareto_only_figure_p999(df: pd.DataFrame, results_dir: Path):
    """Plot only the Pareto-frontier points at P99.9 latencies.

    One column per prefix-cache mode, four metric rows; frontier points
    are colored by TP and labeled with their concurrency. Writes
    ``pareto_frontiers_clean_p999.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the P99.9 TPOT.
    data = df.copy()
    data["interactivity_p999"] = 1000.0 / data["p999_tpot_ms"]

    modes = sorted(data["offload"].unique())
    mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"}
    per_mode = {mode: data[data["offload"] == mode] for mode in modes}

    num_cols = len(modes)
    fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18))
    fig.suptitle("Pareto Frontiers (P99.9 Latencies) with Concurrency Labels", fontsize=14)

    # With a single mode, subplots returns a 1-D axes array; normalize to 2-D.
    if num_cols == 1:
        axes = axes.reshape(-1, 1)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, metric name, x label, y label, maximize_x)
    panel_specs = [
        (0, "p999_ttft_ms", "input_tps_per_gpu", "TTFT", "P99.9 TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity_p999", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P99.9 TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p999_latency_ms", "total_tps_per_gpu", "E2E Latency", "P99.9 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity_p999", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P99.9 TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in panel_specs:
        for col, mode in enumerate(modes):
            ax = axes[row, col]
            frontier = compute_pareto_frontier_with_metadata(
                per_mode[mode], x_col, y_col, maximize_x)

            if len(frontier) > 0:
                # Frontier line.
                ax.plot(frontier[x_col], frontier[y_col],
                        linestyle='-', linewidth=2, alpha=0.5, color="black")

                # Frontier points, colored by TP.
                for tp in sorted(frontier["tp"].unique()):
                    pts = frontier[frontier["tp"] == tp]
                    ax.scatter(pts[x_col], pts[y_col],
                               c=color_for_tp.get(tp, "purple"),
                               marker=marker_for_tp.get(tp, "x"),
                               s=150, alpha=0.9, edgecolors="black", linewidths=1,
                               label=f"TP={tp}", zorder=5)

                # Concurrency annotations.
                for _, point in frontier.iterrows():
                    ax.annotate(f"conc={point['bs']}",
                                (point[x_col], point[y_col]),
                                textcoords="offset points",
                                xytext=(5, 5),
                                fontsize=8,
                                alpha=0.8)

            ax.set_xlabel(x_label)
            ax.set_ylabel(y_label)
            ax.set_title(f"{metric_name} ({mode_titles.get(mode, mode)})")
            ax.grid(True, alpha=0.3)
            if len(frontier) > 0:
                ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_clean_p999.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved clean P99.9 Pareto plot to {output_file}")
    plt.close()
def generate_pareto_overlay_figure_p999(df: pd.DataFrame, results_dir: Path):
    """Overlay the per-mode Pareto frontiers using P99.9 latencies.

    Same layout as the P99 overlay figure, but every latency/interactivity
    metric uses the P99.9 percentile. Writes
    ``pareto_frontiers_overlay_p999.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the P99.9 TPOT.
    data = df.copy()
    data["interactivity_p999"] = 1000.0 / data["p999_tpot_ms"]

    present_modes = data["offload"].unique()

    # Per-mode style: (linestyle, marker edge, line color, label offset, font style).
    style_by_mode = {
        "on": ("-", "black", "black", (5, 8), "normal"),
        "off": ("--", "none", "gray", (5, -12), "italic"),
        "noprefix": (":", "red", "red", (5, -25), "oblique"),
    }
    label_by_mode = {
        "on": "Prefix+Offload",
        "off": "Prefix Only",
        "noprefix": "No Prefix",
    }

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle("Pareto Frontiers (P99.9 Latencies): Mode Comparison", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, title, x label, y label, maximize_x)
    panel_specs = [
        (0, "p999_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P99.9 TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity_p999", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P99.9 TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p999_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P99.9 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity_p999", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P99.9 TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, title, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        for mode in ["on", "off", "noprefix"]:
            if mode not in present_modes:
                continue

            ls, marker_edge, line_color, offset, font_style = style_by_mode[mode]
            frontier = compute_pareto_frontier_with_metadata(
                data[data["offload"] == mode], x_col, y_col, maximize_x)
            if len(frontier) == 0:
                continue

            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle=ls, linewidth=2, alpha=0.6, color=line_color,
                    label=f"Pareto ({label_by_mode[mode]})")

            # TP legend entries only once (for the "on" mode).
            for tp in sorted(frontier["tp"].unique()):
                pts = frontier[frontier["tp"] == tp]
                ax.scatter(pts[x_col], pts[y_col],
                           c=color_for_tp.get(tp, "purple"),
                           marker=marker_for_tp.get(tp, "x"),
                           s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5,
                           label=f"TP={tp}" if mode == "on" else None, zorder=5)

            for _, point in frontier.iterrows():
                ax.annotate(f"conc={point['bs']}",
                            (point[x_col], point[y_col]),
                            textcoords="offset points",
                            xytext=offset,
                            fontsize=7,
                            alpha=0.7,
                            style=font_style)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_overlay_p999.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved overlay P99.9 Pareto plot to {output_file}")
    plt.close()
def generate_combined_pareto_figure(df: pd.DataFrame, results_dir: Path,
                                    percentile: str = "p50"):
    """Generate a combined Pareto frontier across ALL offload modes.

    Points are colored by TP and edge-styled by offload mode so the viewer
    can see both the overall optimal frontier and which config each point
    comes from.

    percentile: one of "p50", "p90", "p99", "p999"
    """
    from matplotlib.lines import Line2D

    pct = percentile  # e.g. "p50"
    # Intentionally a direct lookup: an unknown percentile raises KeyError.
    pct_label = {"p50": "Median", "p90": "P90", "p99": "P99", "p999": "P99.9"}[pct]
    suffix = f"_{pct}"

    # Interactivity = decode tokens/sec implied by the chosen TPOT percentile.
    data = df.copy()
    interactivity_col = f"interactivity{suffix}"
    data[interactivity_col] = 1000.0 / data[f"{pct}_tpot_ms"]

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle(f"Combined Pareto Frontier — {pct_label} SLA (All Configs)", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # Edge styling encodes which offload mode a frontier point came from.
    edge_for_mode = {
        "on": {"edgecolors": "black", "linewidths": 1.8},
        "off": {"edgecolors": "gray", "linewidths": 1.2},
        "noprefix": {"edgecolors": "#cc0000", "linewidths": 1.2},
    }
    short_mode = {"on": "P+O", "off": "P", "noprefix": "NP"}

    # (row, x column, y column, metric name, x label, y label, maximize_x)
    panel_specs = [
        (0, f"{pct}_ttft_ms", "input_tps_per_gpu", "TTFT", f"{pct_label} TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, interactivity_col, "total_tps_per_gpu", "Interactivity", f"Interactivity (1000/{pct_label} TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, f"{pct}_latency_ms", "total_tps_per_gpu", "E2E Latency", f"{pct_label} E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, interactivity_col, "output_tps_per_gpu", "Output Throughput", f"Interactivity (1000/{pct_label} TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        # Combined Pareto frontier across every mode/TP/concurrency at once.
        frontier = compute_pareto_frontier_with_metadata(data, x_col, y_col, maximize_x)

        if len(frontier) > 0:
            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle='-', linewidth=2, alpha=0.5, color="black",
                    label="Pareto Frontier", zorder=4)

            # One scatter call per point: edge kwargs differ per mode.
            for _, pt in frontier.iterrows():
                edge_kw = edge_for_mode.get(pt["offload"],
                                            {"edgecolors": "black", "linewidths": 1})
                ax.scatter(pt[x_col], pt[y_col],
                           c=color_for_tp.get(pt["tp"], "purple"),
                           marker=marker_for_tp.get(pt["tp"], "x"),
                           s=160, alpha=0.9, zorder=5,
                           **edge_kw)

            for _, pt in frontier.iterrows():
                ax.annotate(
                    f"conc={int(pt['bs'])} {short_mode.get(pt['offload'], '')}",
                    (pt[x_col], pt[y_col]),
                    textcoords="offset points", xytext=(5, 5),
                    fontsize=7, alpha=0.85)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(f"{metric_name} — All Configs Combined")
        ax.grid(True, alpha=0.3)

        # Hand-built legend: frontier line, TP fill colors, then the three
        # edge-color keys for the offload modes.
        handles = [Line2D([0], [0], color="black", lw=2, label="Pareto Frontier")]
        for tp in sorted(data["tp"].unique()):
            handles.append(Line2D([0], [0], marker=marker_for_tp[tp], color="w",
                                  markerfacecolor=color_for_tp[tp], markersize=8,
                                  markeredgecolor="black", label=f"TP={tp}"))
        handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w",
                              markersize=8, markeredgecolor="black", markeredgewidth=1.8,
                              label="Edge: P+Offload"))
        handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w",
                              markersize=8, markeredgecolor="gray", markeredgewidth=1.2,
                              label="Edge: Prefix Only"))
        handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w",
                              markersize=8, markeredgecolor="#cc0000", markeredgewidth=1.2,
                              label="Edge: No Prefix"))
        ax.legend(handles=handles, fontsize=7,
                  loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()
    output_file = results_dir / f"pareto_frontiers_combined{suffix}.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved combined {pct_label} Pareto plot to {output_file}")
    plt.close()


def generate_pareto_overlay_figure(df: pd.DataFrame, results_dir: Path):
    """Overlay the per-mode Pareto frontiers using median (P50) latencies.

    Same layout as the P99/P99.9 overlay figures. Writes
    ``pareto_frontiers_overlay.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the median TPOT.
    data = df.copy()
    data["interactivity"] = 1000.0 / data["p50_tpot_ms"]

    present_modes = data["offload"].unique()

    # Per-mode style: (linestyle, marker edge, line color, label offset, font style).
    style_by_mode = {
        "on": ("-", "black", "black", (5, 8), "normal"),       # Prefix + Offload
        "off": ("--", "none", "gray", (5, -12), "italic"),     # Prefix only
        "noprefix": (":", "red", "red", (5, -25), "oblique"),  # No prefix caching
    }
    label_by_mode = {
        "on": "Prefix+Offload",
        "off": "Prefix Only",
        "noprefix": "No Prefix",
    }

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle("Pareto Frontiers: Prefix Caching Mode Comparison", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, title, x label, y label, maximize_x)
    panel_specs = [
        (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, title, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        for mode in ["on", "off", "noprefix"]:
            if mode not in present_modes:
                continue

            ls, marker_edge, line_color, offset, font_style = style_by_mode[mode]
            frontier = compute_pareto_frontier_with_metadata(
                data[data["offload"] == mode], x_col, y_col, maximize_x)
            if len(frontier) == 0:
                continue

            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle=ls, linewidth=2, alpha=0.6, color=line_color,
                    label=f"Pareto ({label_by_mode[mode]})")

            # TP legend entries only once (for the "on" mode).
            for tp in sorted(frontier["tp"].unique()):
                pts = frontier[frontier["tp"] == tp]
                ax.scatter(pts[x_col], pts[y_col],
                           c=color_for_tp.get(tp, "purple"),
                           marker=marker_for_tp.get(tp, "x"),
                           s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5,
                           label=f"TP={tp}" if mode == "on" else None, zorder=5)

            for _, point in frontier.iterrows():
                ax.annotate(f"conc={point['bs']}",
                            (point[x_col], point[y_col]),
                            textcoords="offset points",
                            xytext=offset,
                            fontsize=7,
                            alpha=0.7,
                            style=font_style)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_overlay.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved overlay Pareto plot to {output_file}")
    plt.close()
def main(results_dir: Path):
    """Load all experiments under *results_dir* and generate every figure.

    Experiment directories are those whose name starts with "tp". Writes
    ``pareto_frontiers.png``, ``experiment_summary.csv``, and delegates to
    the percentile-specific / combined / cache-hit-rate figure generators.
    """
    # Load all experiments
    experiments = []
    for exp_dir in results_dir.iterdir():
        if exp_dir.is_dir() and exp_dir.name.startswith("tp"):
            data = load_experiment_data(exp_dir)
            if data:
                experiments.append(data)

    if not experiments:
        print("No experiment data found!")
        return

    df = pd.DataFrame(experiments)
    print(f"Loaded {len(df)} experiments")
    print(df[["exp_name", "tp", "bs", "offload", "input_tps_per_gpu", "total_tps_per_gpu", "p50_ttft_ms"]].to_string())

    # Compute interactivity = 1000 / TPOT (tokens per second for decode)
    df["interactivity"] = 1000.0 / df["p50_tpot_ms"]

    # Get available modes and create subsets
    available_modes = sorted(df["offload"].unique())
    mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"}
    df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes}

    # Create figure with columns for each mode
    num_cols = len(available_modes)
    fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18))
    fig.suptitle("Pareto Frontiers: Throughput/GPU vs Latency (All Points)", fontsize=14)

    # Handle single column case
    if num_cols == 1:
        axes = axes.reshape(-1, 1)

    # Color by TP
    tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"}

    # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x)
    # BUG FIX: the row-3 "Output Throughput" entry was listed twice, which
    # plotted the bottom row twice (duplicated legend entries / overdraw).
    metrics_configs = [
        (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs:
        for col, mode in enumerate(available_modes):
            ax = axes[row, col]
            df_subset = df_subsets[mode]
            title = f"{metric_name} ({mode_titles.get(mode, mode)})"

            # Compute and plot Pareto frontier
            points = list(zip(df_subset[x_col], df_subset[y_col]))
            frontier = compute_pareto_frontier(points, maximize_x=maximize_x)

            if frontier:
                fx, fy = zip(*frontier)
                ax.plot(fx, fy, linestyle='-', linewidth=2, alpha=0.8, color="black", label="Pareto frontier")

            # Plot all points (not only frontier), colored by TP
            for tp in sorted(df_subset["tp"].unique()):
                tp_data = df_subset[df_subset["tp"] == tp]
                ax.scatter(tp_data[x_col], tp_data[y_col],
                           c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"),
                           s=100, alpha=0.8, edgecolors="black", linewidths=0.5,
                           label=f"TP={tp}")

            ax.set_xlabel(x_label)
            ax.set_ylabel(y_label)
            ax.set_title(title)
            ax.grid(True, alpha=0.3)
            ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"\nSaved plot to {output_file}")
    plt.close()

    # Also save summary CSV
    summary_file = results_dir / "experiment_summary.csv"
    df.to_csv(summary_file, index=False)
    print(f"Saved summary to {summary_file}")

    # Generate clean Pareto-only figure
    generate_pareto_only_figure(df, results_dir)

    # Generate combined Pareto frontier (all configs pooled) for each SLA percentile
    for pct in ("p50", "p90", "p99", "p999"):
        generate_combined_pareto_figure(df, results_dir, percentile=pct)

    # Generate overlay figure (on vs off comparison)
    generate_pareto_overlay_figure(df, results_dir)

    # Generate P90 versions
    generate_pareto_only_figure_p90(df, results_dir)
    generate_pareto_overlay_figure_p90(df, results_dir)

    # Generate P99 versions
    generate_pareto_only_figure_p99(df, results_dir)
    generate_pareto_overlay_figure_p99(df, results_dir)

    # Generate P99.9 versions
    generate_pareto_only_figure_p999(df, results_dir)
    generate_pareto_overlay_figure_p999(df, results_dir)

    # Generate cache hit rate plot
    generate_cache_hit_rate_figure(df, results_dir)
CPU).""" + + # Get available modes + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + + # Create 2x3 figure (GPU hit rate row, CPU hit rate row, columns for each mode) + num_cols = len(available_modes) + fig, axes = plt.subplots(2, num_cols, figsize=(6 * num_cols, 10)) + fig.suptitle("Cache Hit Rate vs Throughput", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Plot configs: (row, hit_rate_col, title_prefix) + hit_rate_configs = [ + (0, "gpu_hit_rate", "GPU"), + (1, "cpu_hit_rate", "CPU"), + ] + + for row, hit_rate_col, hit_type in hit_rate_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df[df["offload"] == mode].dropna(subset=[hit_rate_col]) + + if len(df_subset) == 0: + ax.text(0.5, 0.5, "No data", ha='center', va='center', transform=ax.transAxes) + ax.set_title(f"{hit_type} Hit Rate ({mode_titles.get(mode, mode)})") + continue + + # Plot points colored by TP + for tp in sorted(df_subset["tp"].unique()): + tp_data = df_subset[df_subset["tp"] == tp] + ax.scatter(tp_data[hit_rate_col], tp_data["total_tps_per_gpu"], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=100, alpha=0.8, edgecolors="black", linewidths=0.5, + label=f"TP={tp}") + + # Add concurrency labels + for _, point in df_subset.iterrows(): + ax.annotate(f"bs={int(point['bs'])}", + (point[hit_rate_col], point["total_tps_per_gpu"]), + textcoords="offset points", + xytext=(5, 5), + fontsize=7, + alpha=0.7) + + ax.set_xlabel(f"{hit_type} Cache Hit Rate (%)") + ax.set_ylabel("Total Throughput/GPU (tok/s)") + ax.set_title(f"{hit_type} Hit Rate ({mode_titles.get(mode, mode)})") + ax.set_xlim(-5, 105) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right") + + 
    plt.tight_layout()

    output_file = results_dir / "cache_hit_rates.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved cache hit rate plot to {output_file}")
    plt.close()


if __name__ == "__main__":
    # CLI entry point: expects a single sweep-results directory argument.
    if len(sys.argv) < 2:
        print("Usage: python plot_pareto.py ")
        print("Example: python plot_pareto.py ~/sweep_results_20260204_062339")
        sys.exit(1)

    # ~ expansion so paths like ~/sweep_results_... work as shown in the example.
    results_dir = Path(sys.argv[1]).expanduser()
    if not results_dir.exists():
        print(f"Error: {results_dir} does not exist")
        sys.exit(1)

    main(results_dir)
diff --git a/experimental/multiturn/vllm_benchmark/bench/__init__.py b/experimental/multiturn/vllm_benchmark/bench/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py
new file mode 100644
index 000000000..c129f38b8
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py
@@ -0,0 +1,957 @@
+"""
+Metrics collector for vLLM server during benchmarks.
+Polls /metrics endpoint and generates visualizations.
+"""
+
+import asyncio
+import csv
+import re
+import subprocess
+import threading
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import aiohttp
+import matplotlib.pyplot as plt
+
+
+@dataclass
+class GpuTransferSnapshot:
+    # One sample of PCIe traffic for a single GPU at a point in time.
+    timestamp: float
+    gpu_id: int = 0
+    tx_pci: float = 0.0  # PCIe TX (MB/s)
+    rx_pci: float = 0.0  # PCIe RX (MB/s)
+
+
+class GpuTransferCollector:
+    """DEPRECATED: Collects GPU transfer stats using nvidia-smi dmon.
+
+    Replaced by vLLM's native kv_offload metrics (vllm:kv_offload_total_bytes_total,
+    vllm:kv_offload_total_time_total) which are more precise and don't require
+    spawning a subprocess.
+ """ + + def __init__(self, gpu_id: int = 0, poll_interval: int = 1): + self.gpu_id = gpu_id + self.poll_interval = poll_interval + self.snapshots: list[GpuTransferSnapshot] = [] + self._process: subprocess.Popen | None = None + self._thread: threading.Thread | None = None + self._running = False + + def _parse_line(self, line: str) -> GpuTransferSnapshot | None: + """Parse a line of nvidia-smi dmon CSV output. + + Format: gpu, rxpci, txpci (values in MB/s) + Example: 0, 406, 32013 + """ + line = line.strip() + if not line or line.startswith('#'): # Skip header/comments + return None + + parts = [p.strip() for p in line.split(',')] + if len(parts) < 3: + return None + + try: + return GpuTransferSnapshot( + timestamp=time.time(), + gpu_id=int(parts[0]), + rx_pci=float(parts[1]) if parts[1] != '-' else 0.0, + tx_pci=float(parts[2]) if parts[2] != '-' else 0.0, + ) + except (ValueError, IndexError): + return None + + def _reader_thread(self) -> None: + """Background thread to read nvidia-smi output.""" + if self._process is None: + return + + for line in iter(self._process.stdout.readline, ''): + if not self._running: + break + snapshot = self._parse_line(line) + if snapshot and snapshot.gpu_id == self.gpu_id: + self.snapshots.append(snapshot) + + def start(self) -> None: + """Start collecting GPU transfer stats.""" + if self._running: + return + + self._running = True + self.snapshots = [] + + try: + self._process = subprocess.Popen( + [ + 'nvidia-smi', 'dmon', + '-i', str(self.gpu_id), + '-s', 't', + '-d', str(self.poll_interval), + '--format', 'csv', + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + self._thread = threading.Thread(target=self._reader_thread, daemon=True) + self._thread.start() + except FileNotFoundError: + print("nvidia-smi not found, GPU transfer monitoring disabled") + self._running = False + + def stop(self) -> None: + """Stop collecting GPU transfer stats.""" + self._running = False + if self._process: + 
    def stop(self) -> None:
        """Stop collecting GPU transfer stats."""
        self._running = False
        if self._process:
            # Terminate first, escalate to kill if dmon doesn't exit promptly.
            self._process.terminate()
            try:
                self._process.wait(timeout=2)
            except subprocess.TimeoutExpired:
                self._process.kill()
            self._process = None

        if self._thread:
            # Bounded join: the reader is a daemon thread, so a stuck
            # readline won't block shutdown.
            self._thread.join(timeout=2)
            self._thread = None


@dataclass
class MetricsSnapshot:
    # One parsed sample of the vLLM /metrics endpoint. Counter-style
    # fields are cumulative since server start; rates are derived later
    # from deltas between consecutive snapshots.
    timestamp: float
    kv_cache_usage: float = 0.0       # GPU KV cache utilization, 0-1 scale
    cpu_kv_cache_usage: float = 0.0   # CPU/offloaded KV cache utilization, 0-1 scale
    num_requests_running: int = 0
    num_requests_waiting: int = 0
    prefix_cache_hits: int = 0        # GPU prefix cache (cumulative)
    prefix_cache_queries: int = 0
    cpu_prefix_cache_hits: int = 0    # external/offloaded prefix cache (cumulative)
    cpu_prefix_cache_queries: int = 0
    prompt_tokens: int = 0
    generation_tokens: int = 0
    num_preemptions: int = 0
    request_success: int = 0          # summed over all finish reasons
    # KV offload transfer metrics (cumulative)
    kv_offload_bytes_gpu_to_cpu: float = 0.0
    kv_offload_bytes_cpu_to_gpu: float = 0.0
    kv_offload_time_gpu_to_cpu: float = 0.0
    kv_offload_time_cpu_to_gpu: float = 0.0
    # Prompt tokens by source (cumulative)
    prompt_tokens_local_compute: int = 0
    prompt_tokens_local_cache_hit: int = 0
    prompt_tokens_external_kv_transfer: int = 0
    # Prefill KV computed tokens (cumulative sum from histogram)
    prefill_kv_computed_tokens_sum: int = 0
    prefill_kv_computed_tokens_count: int = 0


@dataclass
class MetricsCollector:
    # Polls {base_url}/metrics on a fixed interval from an asyncio task
    # and accumulates MetricsSnapshot objects for post-run plotting.
    base_url: str
    poll_interval: float = 1.0
    snapshots: list[MetricsSnapshot] = field(default_factory=list)
    _running: bool = False
    _task: asyncio.Task | None = None
    gpu_transfer_collector: GpuTransferCollector | None = None
    gpu_id: int = 0
    def _parse_metrics(self, text: str) -> MetricsSnapshot:
        """Parse Prometheus metrics text format.

        Extracts the vLLM gauges/counters this benchmark cares about into
        a MetricsSnapshot; any metric absent from *text* keeps its default.

        NOTE(review): every pattern requires a label block ``{...}`` after
        the metric name — assumes vLLM always emits labels (e.g.
        model_name); confirm against the target vLLM version.
        """
        snapshot = MetricsSnapshot(timestamp=time.time())

        # Helper to extract gauge/counter value (first match wins).
        def get_value(pattern: str, default: float = 0.0) -> float:
            match = re.search(pattern, text)
            if match:
                return float(match.group(1))
            return default

        # KV cache usage (0-1 scale)
        snapshot.kv_cache_usage = get_value(
            r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)'
        )
        # Fallback to old metric name if new one not found
        if snapshot.kv_cache_usage == 0.0:
            snapshot.kv_cache_usage = get_value(
                r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)'
            )

        # CPU/offloaded KV cache usage
        snapshot.cpu_kv_cache_usage = get_value(
            r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)'
        )

        # Running/waiting requests
        snapshot.num_requests_running = int(get_value(
            r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.num_requests_waiting = int(get_value(
            r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Prefix cache (cumulative counters) - GPU
        snapshot.prefix_cache_hits = int(get_value(
            r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prefix_cache_queries = int(get_value(
            r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Prefix cache - external/offloaded (KV connector cross-instance cache)
        snapshot.cpu_prefix_cache_hits = int(get_value(
            r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.cpu_prefix_cache_queries = int(get_value(
            r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Token counters
        snapshot.prompt_tokens = int(get_value(
            r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.generation_tokens = int(get_value(
            r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Preemptions
        snapshot.num_preemptions = int(get_value(
            r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Request success (sum all finish reasons)
        for match in re.finditer(
            r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)',
            text
        ):
            snapshot.request_success += int(float(match.group(1)))

        # KV offload bytes transferred (cumulative counters by direction)
        snapshot.kv_offload_bytes_gpu_to_cpu = get_value(
            r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)'
        )
        snapshot.kv_offload_bytes_cpu_to_gpu = get_value(
            r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)'
        )

        # KV offload time (cumulative, seconds)
        snapshot.kv_offload_time_gpu_to_cpu = get_value(
            r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)'
        )
        snapshot.kv_offload_time_cpu_to_gpu = get_value(
            r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)'
        )

        # Prompt tokens by source (cumulative)
        snapshot.prompt_tokens_local_compute = int(get_value(
            r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prompt_tokens_local_cache_hit = int(get_value(
            r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prompt_tokens_external_kv_transfer = int(get_value(
            r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)'
        ))

        # Prefill KV computed tokens (histogram sum and count)
        snapshot.prefill_kv_computed_tokens_sum = int(get_value(
            r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prefill_kv_computed_tokens_count = int(get_value(
            r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)'
        ))

        return snapshot

    async def _poll_loop(self) -> None:
        """Background polling loop.

        Fetches /metrics every poll_interval seconds until stop() clears
        _running; failed polls are logged and skipped (best-effort).
        """
        metrics_url = f"{self.base_url}/metrics"
        async with aiohttp.ClientSession() as session:
            while self._running:
                try:
                    async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
                        if resp.status == 200:
                            text = await resp.text()
                            snapshot = self._parse_metrics(text)
                            self.snapshots.append(snapshot)
                except Exception as e:
                    # Deliberately broad: a flaky scrape must not abort the benchmark.
                    print(f"Metrics poll error: {e}")

                await asyncio.sleep(self.poll_interval)

    def start(self) -> None:
        """Start background metrics collection.

        Must be called from a running asyncio event loop (create_task).
        Clears any snapshots from a previous run.
        """
        if self._running:
            return
        self._running = True
        self.snapshots = []
        self._task = asyncio.create_task(self._poll_loop())
asyncio.create_task(self._poll_loop()) + + async def stop(self) -> None: + """Stop metrics collection.""" + self._running = False + if self._task: + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + + def generate_plots( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Generate visualization plots from collected metrics. + + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + """ + if len(self.snapshots) < 2: + print("Not enough data points for plots") + return + + # Convert to relative time (seconds from start) + start_time = self.snapshots[0].timestamp + times = [(s.timestamp - start_time) for s in self.snapshots] + + # Create figure with subplots + num_rows = 6 if client_metrics else 4 + fig, axes = plt.subplots(num_rows, 2, figsize=(14, 4 * num_rows)) + fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14) + + # 1. KV Cache Usage vs Time + ax = axes[0, 0] + kv_usage = [min(s.kv_cache_usage * 100, 100.0) for s in self.snapshots] + ax.scatter(times, kv_usage, alpha=0.15, s=2, c='blue') + kv_window = min(50, len(kv_usage) // 10) if len(kv_usage) > 10 else 1 + if kv_window > 1: + rolling_kv = [ + sum(kv_usage[max(0, i - kv_window):i + 1]) / len(kv_usage[max(0, i - kv_window):i + 1]) + for i in range(len(kv_usage)) + ] + ax.plot(times, rolling_kv, 'b-', label=f'GPU (avg n={kv_window})', linewidth=2) + else: + ax.plot(times, kv_usage, 'b-', label='GPU', linewidth=2) + # Add external cache if available + cpu_kv_usage = [s.cpu_kv_cache_usage * 100 for s in self.snapshots] + if any(v > 0 for v in cpu_kv_usage): + ax.plot(times, cpu_kv_usage, 'r--', label='External', linewidth=1.5) + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("KV Cache Usage (%)") + ax.set_title("KV Cache Utilization Over Time") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 2. 
def _rolling_mean(values, window):
    """Trailing moving average; element i averages values[max(0, i-window) : i+1].

    Output has the same length as `values`. Extracted because the identical
    comprehension was repeated for every smoothed series in generate_plots.
    """
    return [
        sum(values[max(0, i - window):i + 1]) / len(values[max(0, i - window):i + 1])
        for i in range(len(values))
    ]


def generate_plots(
    self,
    output_prefix: str = "metrics",
    client_metrics: list | None = None,
) -> None:
    """Generate visualization plots from collected metrics.

    Renders server-side panels (KV cache usage, queue depth, prefix-cache hit
    rate, throughput, KV offload traffic, prefill token sources) plus, when
    `client_metrics` is given, per-request client panels (TTFT, latency,
    decode speed, preemptions). Saves `{output_prefix}_plots.png` and prints
    a text summary via _print_summary().

    Args:
        output_prefix: Prefix for output file names
        client_metrics: Optional list of RequestStats from benchmark clients
    """
    if len(self.snapshots) < 2:
        print("Not enough data points for plots")
        return

    # Relative time axis (seconds since the first snapshot).
    start_time = self.snapshots[0].timestamp
    times = [(s.timestamp - start_time) for s in self.snapshots]

    # Two extra rows of client-side plots when per-request stats are supplied.
    num_rows = 6 if client_metrics else 4
    fig, axes = plt.subplots(num_rows, 2, figsize=(14, 4 * num_rows))
    fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14)

    # 1. KV cache usage vs time
    ax = axes[0, 0]
    kv_usage = [min(s.kv_cache_usage * 100, 100.0) for s in self.snapshots]
    ax.scatter(times, kv_usage, alpha=0.15, s=2, c='blue')
    kv_window = min(50, len(kv_usage) // 10) if len(kv_usage) > 10 else 1
    if kv_window > 1:
        ax.plot(times, _rolling_mean(kv_usage, kv_window), 'b-',
                label=f'GPU (avg n={kv_window})', linewidth=2)
    else:
        ax.plot(times, kv_usage, 'b-', label='GPU', linewidth=2)
    # External (CPU-offloaded) cache usage, if the server reports any.
    cpu_kv_usage = [s.cpu_kv_cache_usage * 100 for s in self.snapshots]
    if any(v > 0 for v in cpu_kv_usage):
        ax.plot(times, cpu_kv_usage, 'r--', label='External', linewidth=1.5)
    ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("KV Cache Usage (%)")
    ax.set_title("KV Cache Utilization Over Time")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    # 2. Running & waiting requests vs time (smoothed + total)
    ax = axes[0, 1]
    running = [s.num_requests_running for s in self.snapshots]
    waiting = [s.num_requests_waiting for s in self.snapshots]
    total_queue = [r + w for r, w in zip(running, waiting)]
    q_window = min(30, len(running) // 10) if len(running) > 10 else 1
    if q_window > 1:
        ax.plot(times, _rolling_mean(running, q_window), 'g-',
                label=f'Running (avg n={q_window})', linewidth=1.5)
        ax.plot(times, _rolling_mean(waiting, q_window), 'r-',
                label=f'Waiting (avg n={q_window})', linewidth=1.5)
        ax.plot(times, _rolling_mean(total_queue, q_window), 'b-',
                label=f'Total (avg n={q_window})', linewidth=1.5)
    else:
        ax.plot(times, running, 'g-', label='Running', linewidth=1.5)
        ax.plot(times, waiting, 'r-', label='Waiting', linewidth=1.5)
        ax.plot(times, total_queue, 'b-', label='Total', linewidth=1.5)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Requests")
    ax.set_title("Request Queue Depth")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

    # 3. Cache hit rate vs time (computed from deltas between polling intervals)
    ax = axes[1, 0]
    gpu_hit_rates = []
    ext_hit_rates = []
    combined_hit_rates = []
    has_ext_cache = any(s.cpu_prefix_cache_queries > 0 for s in self.snapshots)
    for i in range(1, len(self.snapshots)):
        cur, prev = self.snapshots[i], self.snapshots[i - 1]
        # GPU (HBM) cache hit rate for this interval.
        gpu_delta_hits = cur.prefix_cache_hits - prev.prefix_cache_hits
        gpu_delta_queries = cur.prefix_cache_queries - prev.prefix_cache_queries
        if gpu_delta_queries > 0:
            gpu_hit_rates.append(100.0 * gpu_delta_hits / gpu_delta_queries)
        else:
            # Carry the previous value forward when nothing was queried.
            gpu_hit_rates.append(gpu_hit_rates[-1] if gpu_hit_rates else 0)

        if has_ext_cache:
            ext_delta_hits = cur.cpu_prefix_cache_hits - prev.cpu_prefix_cache_hits
            ext_delta_queries = cur.cpu_prefix_cache_queries - prev.cpu_prefix_cache_queries
            if ext_delta_queries > 0:
                ext_hit_rates.append(100.0 * ext_delta_hits / ext_delta_queries)
            else:
                ext_hit_rates.append(ext_hit_rates[-1] if ext_hit_rates else 0)

            # Combined: (gpu_hits + ext_hits) / (gpu_queries + ext_queries)
            total_hits = gpu_delta_hits + ext_delta_hits
            total_queries = gpu_delta_queries + ext_delta_queries
            if total_queries > 0:
                combined_hit_rates.append(100.0 * total_hits / total_queries)
            else:
                combined_hit_rates.append(combined_hit_rates[-1] if combined_hit_rates else 0)

    window = min(50, len(gpu_hit_rates) // 10) if len(gpu_hit_rates) > 10 else 1

    ax.scatter(times[1:], gpu_hit_rates, alpha=0.3, s=5, c='purple', label='GPU (HBM)')
    if window > 1:
        ax.plot(times[1:], _rolling_mean(gpu_hit_rates, window), 'purple',
                linewidth=1.5, label=f'GPU avg (n={window})')

    if has_ext_cache and ext_hit_rates:
        ax.scatter(times[1:], ext_hit_rates, alpha=0.3, s=5, c='orange', label='External')
        if window > 1:
            ax.plot(times[1:], _rolling_mean(ext_hit_rates, window), 'orange',
                    linewidth=1.5, label=f'External avg (n={window})')

    # FIX: the combined series is only populated when an external cache
    # exists; plotting it unconditionally would pass an empty series with a
    # non-empty x axis. Guard it, matching the "only if external exists" intent.
    if has_ext_cache and combined_hit_rates:
        ax.scatter(times[1:], combined_hit_rates, alpha=0.2, s=3, c='green', label='Combined')
        if window > 1:
            ax.plot(times[1:], _rolling_mean(combined_hit_rates, window), 'green',
                    linewidth=2, label=f'Combined avg (n={window})')

    ax.legend(loc='best', fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Hit Rate (%)")
    ax.set_title("Prefix Cache Hit Rate Per Interval (tokens hit / tokens queried)")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    # 4. Throughput vs time (tokens/sec) with rolling average — decode + total
    ax = axes[1, 1]
    decode_throughputs = []
    total_throughputs = []
    for i in range(1, len(self.snapshots)):
        cur, prev = self.snapshots[i], self.snapshots[i - 1]
        delta_gen = cur.generation_tokens - prev.generation_tokens
        delta_prompt = cur.prompt_tokens - prev.prompt_tokens
        delta_time = cur.timestamp - prev.timestamp
        if delta_time > 0:
            decode_throughputs.append(delta_gen / delta_time)
            total_throughputs.append((delta_gen + delta_prompt) / delta_time)
        else:
            decode_throughputs.append(0)
            total_throughputs.append(0)
    # Cumulative running average total throughput (total tokens / elapsed time).
    cumulative_total_avg = []
    t0 = self.snapshots[0].timestamp
    tokens0 = self.snapshots[0].generation_tokens + self.snapshots[0].prompt_tokens
    for i in range(1, len(self.snapshots)):
        elapsed = self.snapshots[i].timestamp - t0
        total_tokens = (self.snapshots[i].generation_tokens + self.snapshots[i].prompt_tokens) - tokens0
        cumulative_total_avg.append(total_tokens / elapsed if elapsed > 0 else 0)

    window = min(30, len(decode_throughputs) // 10) if len(decode_throughputs) > 10 else 1
    if window > 1:
        ax.plot(times[1:], _rolling_mean(total_throughputs, window), 'steelblue',
                linewidth=1.5, label=f'Total (avg n={window})')
        ax.plot(times[1:], _rolling_mean(decode_throughputs, window), 'orange',
                linewidth=1.5, label=f'Decode (avg n={window})')
    else:
        ax.plot(times[1:], total_throughputs, 'steelblue', linewidth=1, alpha=0.8, label='Total')
        ax.plot(times[1:], decode_throughputs, 'orange', linewidth=1, alpha=0.8, label='Decode')
    ax.plot(times[1:], cumulative_total_avg, 'red', linewidth=2, label='Total Running Avg')
    # Single legend after all series are drawn (was invoked redundantly per branch).
    ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Tokens/sec")
    ax.set_title("Throughput (Total & Decode)")
    ax.grid(True, alpha=0.3)

    # 5. KV offload transfer rate (from vLLM metrics)
    ax = axes[2, 0]
    gpu_to_cpu_rates = []
    cpu_to_gpu_rates = []
    for i in range(1, len(self.snapshots)):
        cur, prev = self.snapshots[i], self.snapshots[i - 1]
        dt = cur.timestamp - prev.timestamp
        if dt > 0:
            delta_g2c = cur.kv_offload_bytes_gpu_to_cpu - prev.kv_offload_bytes_gpu_to_cpu
            delta_c2g = cur.kv_offload_bytes_cpu_to_gpu - prev.kv_offload_bytes_cpu_to_gpu
            gpu_to_cpu_rates.append(delta_g2c / dt / 1e6)  # MB/s
            cpu_to_gpu_rates.append(delta_c2g / dt / 1e6)  # MB/s
        else:
            gpu_to_cpu_rates.append(0)
            cpu_to_gpu_rates.append(0)
    if any(r > 0 for r in gpu_to_cpu_rates) or any(r > 0 for r in cpu_to_gpu_rates):
        ax.scatter(times[1:], gpu_to_cpu_rates, alpha=0.15, s=3, c='blue')
        ax.scatter(times[1:], cpu_to_gpu_rates, alpha=0.15, s=3, c='red')
        xfer_window = min(30, len(gpu_to_cpu_rates) // 10) if len(gpu_to_cpu_rates) > 10 else 1
        if xfer_window > 1:
            ax.plot(times[1:], _rolling_mean(gpu_to_cpu_rates, xfer_window), 'b-',
                    linewidth=1.5, label=f'GPU→CPU (avg n={xfer_window})')
            ax.plot(times[1:], _rolling_mean(cpu_to_gpu_rates, xfer_window), 'r-',
                    linewidth=1.5, label=f'CPU→GPU (avg n={xfer_window})')
        else:
            ax.plot(times[1:], gpu_to_cpu_rates, 'b-', linewidth=1, alpha=0.8, label='GPU→CPU')
            ax.plot(times[1:], cpu_to_gpu_rates, 'r-', linewidth=1, alpha=0.8, label='CPU→GPU')
        ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Transfer Rate (MB/s)")
    ax.set_title("KV Offload Transfer Rate")
    ax.grid(True, alpha=0.3)

    # 6. Prompt token sources over time (cumulative percentage)
    ax = axes[2, 1]
    initial = self.snapshots[0]
    cum_compute_pct = []
    cum_cache_pct = []
    cum_ext_pct = []
    for s in self.snapshots:
        c = s.prompt_tokens_local_compute - initial.prompt_tokens_local_compute
        h = s.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit
        e = s.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer
        total = c + h + e
        if total > 0:
            cum_compute_pct.append(100.0 * c / total)
            cum_cache_pct.append(100.0 * h / total)
            cum_ext_pct.append(100.0 * e / total)
        else:
            cum_compute_pct.append(0)
            cum_cache_pct.append(0)
            cum_ext_pct.append(0)
    if any(v > 0 for v in cum_compute_pct):
        ax.stackplot(times, cum_compute_pct, cum_cache_pct, cum_ext_pct,
                     labels=['Prefill', 'HBM Cache Hit', 'Offload Cache Hit'],
                     colors=['coral', 'steelblue', 'mediumseagreen'], alpha=0.8)
        ax.legend(fontsize=8, loc='lower left')
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("% of Prefill Tokens")
    ax.set_title("Cumulative Prefill Token Source Breakdown")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    # 7. Cumulative KV offload transfers (GPU→CPU and CPU→GPU)
    ax = axes[3, 0]
    cum_g2c = [(s.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu) / 1e9
               for s in self.snapshots]
    if any(v > 0 for v in cum_g2c):
        ax.plot(times, cum_g2c, 'b-', linewidth=1.5)
        ax.fill_between(times, cum_g2c, alpha=0.2, color='blue')
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Cumulative Transfer (GB)")
    ax.set_title("KV Offload: GPU → CPU (Cumulative)")
    ax.grid(True, alpha=0.3)

    ax = axes[3, 1]
    cum_c2g = [(s.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu) / 1e9
               for s in self.snapshots]
    if any(v > 0 for v in cum_c2g):
        ax.plot(times, cum_c2g, 'r-', linewidth=1.5)
        ax.fill_between(times, cum_c2g, alpha=0.2, color='red')
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Cumulative Transfer (GB)")
    ax.set_title("KV Offload: CPU → GPU (Cumulative)")
    ax.grid(True, alpha=0.3)

    # 8-11. Client-side panels (only when per-request stats were supplied).
    if client_metrics and len(client_metrics) > 0:
        sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms)
        first_start = sorted_metrics[0].start_time_ms
        request_times = [(m.start_time_ms - first_start) / 1000.0 for m in sorted_metrics]
        ttfts = [m.ttft_ms for m in sorted_metrics]
        latencies = [m.latency_ms for m in sorted_metrics]

        # 8. TTFT vs time
        ax = axes[4, 0]
        ax.scatter(request_times, ttfts, alpha=0.3, s=5, c='blue')
        window = min(50, len(ttfts) // 10) if len(ttfts) > 10 else 1
        if window > 1:
            ax.plot(request_times, _rolling_mean(ttfts, window), 'r-',
                    linewidth=1.5, label=f'Rolling avg (n={window})')
            ax.legend()
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("TTFT (ms)")
        ax.set_title("Time to First Token vs Time")
        ax.grid(True, alpha=0.3)

        # 9. Latency vs time (reuses the TTFT window size)
        ax = axes[4, 1]
        ax.scatter(request_times, latencies, alpha=0.3, s=5, c='green')
        if window > 1:
            ax.plot(request_times, _rolling_mean(latencies, window), 'r-',
                    linewidth=1.5, label=f'Rolling avg (n={window})')
            ax.legend()
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Latency (ms)")
        ax.set_title("Request Latency vs Time")
        ax.grid(True, alpha=0.3)

        # 10. Interactivity (1/TPOT = tokens/sec) vs time
        ax = axes[5, 0]
        tpots = [m.tpot_ms for m in sorted_metrics]
        # Guard against zero TPOT to avoid division by zero.
        interactivity = [1000.0 / t if t > 0 else 0 for t in tpots]
        ax.scatter(request_times, interactivity, alpha=0.3, s=5, c='purple')
        if window > 1:
            ax.plot(request_times, _rolling_mean(interactivity, window), 'r-',
                    linewidth=1.5, label=f'Rolling avg (n={window})')
            ax.legend()
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Interactivity (tokens/sec)")
        ax.set_title("Decode Speed (1/TPOT) vs Time")
        ax.grid(True, alpha=0.3)

        # 11. Preemptions over time (rate + cumulative on a twin axis)
        ax = axes[5, 1]
        preemption_rates = []
        for i in range(1, len(self.snapshots)):
            dt = self.snapshots[i].timestamp - self.snapshots[i - 1].timestamp
            delta = self.snapshots[i].num_preemptions - self.snapshots[i - 1].num_preemptions
            preemption_rates.append(delta / dt if dt > 0 else 0)
        if any(r > 0 for r in preemption_rates):
            ax.scatter(times[1:], preemption_rates, alpha=0.15, s=3, c='red')
            preempt_window = min(30, len(preemption_rates) // 10) if len(preemption_rates) > 10 else 1
            if preempt_window > 1:
                ax.plot(times[1:], _rolling_mean(preemption_rates, preempt_window), 'r-',
                        linewidth=1.5, label=f'Rolling avg (n={preempt_window})')
        # Cumulative on secondary axis.
        ax2 = ax.twinx()
        cumulative = [self.snapshots[i].num_preemptions - self.snapshots[0].num_preemptions
                      for i in range(1, len(self.snapshots))]
        ax2.plot(times[1:], cumulative, 'b--', linewidth=1, alpha=0.5, label='Cumulative')
        ax2.set_ylabel("Cumulative Preemptions", color='blue')
        ax2.tick_params(axis='y', labelcolor='blue')
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Preemptions/sec", color='red')
        ax.tick_params(axis='y', labelcolor='red')
        ax.set_title("Preemptions Over Time")
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f"{output_prefix}_plots.png", dpi=150)
    print(f"Saved plots to {output_prefix}_plots.png")
    plt.close()

    # Also print the textual summary for the same window.
    self._print_summary()
self.snapshots[0] + + print("\n" + "="*60) + print("METRICS SUMMARY") + print("="*60) + print(f"Duration: {duration:.1f}s") + print(f"Total prompt tokens: {total_prompt_tokens:,}") + print(f"Total generation tokens: {total_gen_tokens:,}") + print(f"Avg generation throughput: {total_gen_tokens/duration:.1f} tok/s") + print(f"Peak KV cache usage: {max(s.kv_cache_usage for s in self.snapshots)*100:.1f}%") + print(f"Peak running requests: {max(s.num_requests_running for s in self.snapshots)}") + print(f"Peak waiting requests: {max(s.num_requests_waiting for s in self.snapshots)}") + print(f"Total preemptions: {final.num_preemptions - initial.num_preemptions}") + + if final.prefix_cache_queries > initial.prefix_cache_queries: + delta_hits = final.prefix_cache_hits - initial.prefix_cache_hits + delta_queries = final.prefix_cache_queries - initial.prefix_cache_queries + hit_rate = 100.0 * delta_hits / delta_queries + print(f"Overall GPU cache hit rate: {hit_rate:.1f}%") + print(f" - Cache hits: {delta_hits:,} tokens") + print(f" - Cache queries: {delta_queries:,} tokens") + + # External/offloaded cache stats if available + if final.cpu_prefix_cache_queries > initial.cpu_prefix_cache_queries: + cpu_delta_hits = final.cpu_prefix_cache_hits - initial.cpu_prefix_cache_hits + cpu_delta_queries = final.cpu_prefix_cache_queries - initial.cpu_prefix_cache_queries + cpu_hit_rate = 100.0 * cpu_delta_hits / cpu_delta_queries + print(f"Overall external cache hit rate: {cpu_hit_rate:.1f}%") + print(f" - Cache hits: {cpu_delta_hits:,} tokens") + print(f" - Cache queries: {cpu_delta_queries:,} tokens") + + # Prompt tokens by source + total_compute = final.prompt_tokens_local_compute - initial.prompt_tokens_local_compute + total_cache_hit = final.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit + total_ext = final.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer + total_by_source = total_compute + total_cache_hit + total_ext + if 
total_by_source > 0: + print(f"Prompt token sources:") + print(f" - Prefill: {total_compute:>12,} ({100*total_compute/total_by_source:.1f}%)") + print(f" - HBM cache hit: {total_cache_hit:>12,} ({100*total_cache_hit/total_by_source:.1f}%)") + print(f" - Offload cache hit: {total_ext:>12,} ({100*total_ext/total_by_source:.1f}%)") + + # KV offload transfer stats + g2c_bytes = final.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu + c2g_bytes = final.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu + g2c_time = final.kv_offload_time_gpu_to_cpu - initial.kv_offload_time_gpu_to_cpu + c2g_time = final.kv_offload_time_cpu_to_gpu - initial.kv_offload_time_cpu_to_gpu + if g2c_bytes > 0 or c2g_bytes > 0: + print(f"KV offload transfers:") + print(f" GPU→CPU: {g2c_bytes/1e9:.2f} GB in {g2c_time:.2f}s ({g2c_bytes/g2c_time/1e9:.1f} GB/s)" if g2c_time > 0 else f" GPU→CPU: {g2c_bytes/1e9:.2f} GB") + print(f" CPU→GPU: {c2g_bytes/1e9:.2f} GB in {c2g_time:.2f}s ({c2g_bytes/c2g_time/1e9:.1f} GB/s)" if c2g_time > 0 else f" CPU→GPU: {c2g_bytes/1e9:.2f} GB") + + # Prefill KV computed tokens + delta_kv_sum = final.prefill_kv_computed_tokens_sum - initial.prefill_kv_computed_tokens_sum + delta_kv_count = final.prefill_kv_computed_tokens_count - initial.prefill_kv_computed_tokens_count + if delta_kv_count > 0: + print(f"Prefill KV computed tokens (excluding cached):") + print(f" Total: {delta_kv_sum:,} tokens across {delta_kv_count:,} requests") + print(f" Avg per request: {delta_kv_sum/delta_kv_count:.0f} tokens") + + print("="*60 + "\n") + + def export_csv( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Export all time series data to CSV files. 
+ + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + + Generates: + - {output_prefix}_server_metrics.csv: vLLM server metrics over time + - {output_prefix}_gpu_transfer.csv: GPU PCIe transfer stats + - {output_prefix}_client_metrics.csv: Per-request client metrics (if provided) + """ + output_dir = Path(output_prefix).parent + if output_dir and not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # 1. Export server metrics (from /metrics endpoint) + if self.snapshots: + server_csv = f"{output_prefix}_server_metrics.csv" + start_time = self.snapshots[0].timestamp + + with open(server_csv, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'kv_cache_usage_pct', + 'cpu_kv_cache_usage_pct', + 'num_requests_running', + 'num_requests_waiting', + 'prefix_cache_hits', + 'prefix_cache_queries', + 'cpu_prefix_cache_hits', + 'cpu_prefix_cache_queries', + 'prompt_tokens_total', + 'generation_tokens_total', + 'num_preemptions_total', + 'request_success_total', + # KV offload metrics + 'kv_offload_bytes_gpu_to_cpu', + 'kv_offload_bytes_cpu_to_gpu', + 'kv_offload_time_gpu_to_cpu', + 'kv_offload_time_cpu_to_gpu', + # Prompt tokens by source + 'prompt_tokens_local_compute', + 'prompt_tokens_local_cache_hit', + 'prompt_tokens_external_kv_transfer', + # Prefill KV computed + 'prefill_kv_computed_tokens_sum', + 'prefill_kv_computed_tokens_count', + # Computed per-interval metrics + 'interval_cache_hit_rate_pct', + 'interval_throughput_tok_per_sec', + ]) + + for i, s in enumerate(self.snapshots): + relative_time = s.timestamp - start_time + + # Compute per-interval metrics + cache_hit_rate = 0.0 + throughput = 0.0 + if i > 0: + prev = self.snapshots[i - 1] + delta_hits = s.prefix_cache_hits - prev.prefix_cache_hits + delta_queries = s.prefix_cache_queries - prev.prefix_cache_queries + if delta_queries > 0: + 
cache_hit_rate = 100.0 * delta_hits / delta_queries + + delta_gen = s.generation_tokens - prev.generation_tokens + delta_time = s.timestamp - prev.timestamp + if delta_time > 0: + throughput = delta_gen / delta_time + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + f"{s.kv_cache_usage * 100:.2f}", + f"{s.cpu_kv_cache_usage * 100:.2f}", + s.num_requests_running, + s.num_requests_waiting, + s.prefix_cache_hits, + s.prefix_cache_queries, + s.cpu_prefix_cache_hits, + s.cpu_prefix_cache_queries, + s.prompt_tokens, + s.generation_tokens, + s.num_preemptions, + s.request_success, + f"{s.kv_offload_bytes_gpu_to_cpu:.0f}", + f"{s.kv_offload_bytes_cpu_to_gpu:.0f}", + f"{s.kv_offload_time_gpu_to_cpu:.6f}", + f"{s.kv_offload_time_cpu_to_gpu:.6f}", + s.prompt_tokens_local_compute, + s.prompt_tokens_local_cache_hit, + s.prompt_tokens_external_kv_transfer, + s.prefill_kv_computed_tokens_sum, + s.prefill_kv_computed_tokens_count, + f"{cache_hit_rate:.2f}", + f"{throughput:.2f}", + ]) + + print(f"Exported server metrics to {server_csv}") + + # 2. 
Export GPU transfer stats (DEPRECATED - kept for backward compat) + if self.gpu_transfer_collector and self.gpu_transfer_collector.snapshots: + gpu_csv = f"{output_prefix}_gpu_transfer.csv" + gpu_snaps = self.gpu_transfer_collector.snapshots + gpu_start = gpu_snaps[0].timestamp + + with open(gpu_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'gpu_id', + 'tx_pci_mb_per_sec', + 'rx_pci_mb_per_sec', + 'cumulative_tx_gb', + 'cumulative_rx_gb', + ]) + + cumulative_tx = 0.0 + cumulative_rx = 0.0 + for i, s in enumerate(gpu_snaps): + relative_time = s.timestamp - gpu_start + if i > 0: + dt = s.timestamp - gpu_snaps[i - 1].timestamp + cumulative_tx += s.tx_pci * dt / 1024 # MB to GB + cumulative_rx += s.rx_pci * dt / 1024 + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + s.gpu_id, + f"{s.tx_pci:.2f}", + f"{s.rx_pci:.2f}", + f"{cumulative_tx:.4f}", + f"{cumulative_rx:.4f}", + ]) + + print(f"Exported GPU transfer metrics to {gpu_csv}") + + # 3. 
Export client metrics (per-request stats) + if client_metrics and len(client_metrics) > 0: + client_csv = f"{output_prefix}_client_metrics.csv" + sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms) + first_start = sorted_metrics[0].start_time_ms + + with open(client_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'start_time_ms', + 'relative_time_sec', + 'ttft_ms', + 'tpot_ms', + 'latency_ms', + 'input_num_turns', + 'input_num_tokens', + 'output_num_tokens', + 'output_num_chunks', + 'output_num_first_chunk_tokens', + 'approx_cached_percent', + 'conversation_id', + 'client_id', + 'interactivity_tok_per_sec', + ]) + + for m in sorted_metrics: + relative_time = (m.start_time_ms - first_start) / 1000.0 + interactivity = 1000.0 / m.tpot_ms if m.tpot_ms > 0 else 0 + + writer.writerow([ + f"{m.start_time_ms:.3f}", + f"{relative_time:.3f}", + f"{m.ttft_ms:.3f}", + f"{m.tpot_ms:.3f}", + f"{m.latency_ms:.3f}", + m.input_num_turns, + m.input_num_tokens, + m.output_num_tokens, + m.output_num_chunks, + m.output_num_first_chunk_tokens, + f"{m.approx_cached_percent:.2f}", + m.conversation_id, + m.client_id, + f"{interactivity:.2f}", + ]) + + print(f"Exported client metrics to {client_csv}") diff --git a/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py new file mode 100644 index 000000000..ddf605324 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Standalone metrics collector for vLLM server. + +Polls the vLLM /metrics endpoint and generates server-side plots. +Designed to run alongside any benchmark client (aiperf, custom, etc.). 
import argparse
import asyncio
import os
import signal
import sys

from bench.metrics_collector import MetricsCollector


async def run(args):
    """Collect vLLM /metrics until a signal or --duration, then emit plots and CSVs.

    Args:
        args: Parsed argparse namespace (url, poll_interval, output_prefix,
              duration, pid_file).

    Exits the process with status 1 when fewer than two snapshots were collected.
    """
    collector = MetricsCollector(
        base_url=args.url,
        poll_interval=args.poll_interval,
    )

    collector.start()
    print(f"Metrics collector started (polling {args.url}/metrics every {args.poll_interval}s)")

    # Optionally publish our PID so an external process can signal us.
    if args.pid_file:
        with open(args.pid_file, "w") as f:
            f.write(str(os.getpid()))
        print(f"PID written to {args.pid_file}")

    # Graceful shutdown on SIGINT/SIGTERM.
    stop_event = asyncio.Event()

    def handle_signal(*_):
        print("\nStopping metrics collector...")
        stop_event.set()

    # FIX: asyncio.get_event_loop() is deprecated inside a coroutine
    # (Python 3.10+); get_running_loop() is the correct call here and is
    # guaranteed to return the loop this coroutine runs on.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, handle_signal)

    # Wait for the duration limit or an external signal, whichever first.
    if args.duration:
        try:
            await asyncio.wait_for(stop_event.wait(), timeout=args.duration)
        except asyncio.TimeoutError:
            print(f"Duration limit reached ({args.duration}s)")
    else:
        await stop_event.wait()

    await collector.stop()

    if len(collector.snapshots) < 2:
        print("Not enough data points collected")
        sys.exit(1)

    print(f"Collected {len(collector.snapshots)} snapshots")

    # Server-only outputs (no client metrics in standalone mode).
    collector.generate_plots(output_prefix=args.output_prefix)
    collector.export_csv(output_prefix=args.output_prefix)

    # Clean up the PID file on a normal exit.
    if args.pid_file and os.path.exists(args.pid_file):
        os.remove(args.pid_file)

    print("Done")


def main():
    """CLI entry point: parse arguments and run the async collector."""
    parser = argparse.ArgumentParser(
        description="Standalone vLLM metrics collector"
    )
    parser.add_argument(
        "--url", "-u",
        default="http://localhost:8888",
        help="vLLM server base URL (default: http://localhost:8888)",
    )
    parser.add_argument(
        "--output-prefix", "-o",
        default="metrics",
        help="Output file prefix (default: metrics)",
    )
    parser.add_argument(
        "--poll-interval",
        type=float,
        default=1.0,
        help="Polling interval in seconds (default: 1.0)",
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        default=None,
        help="Max collection duration in seconds (default: unlimited, stop with signal)",
    )
    parser.add_argument(
        "--pid-file",
        default=None,
        help="Write PID to this file for external signaling",
    )
    args = parser.parse_args()

    asyncio.run(run(args))


if __name__ == "__main__":
    main()
from __future__ import annotations

import argparse
import json
import math
from collections import Counter, defaultdict
from pathlib import Path


def load_records(artifacts_dir: Path) -> list[dict]:
    """Load per-request records from profile_export.jsonl."""
    jsonl_path = artifacts_dir / "profile_export.jsonl"
    records: list[dict] = []
    with open(jsonl_path) as fh:
        for raw in fh:
            raw = raw.strip()
            if raw:  # skip blank lines
                records.append(json.loads(raw))
    return records


def load_trace_replay_records(trace_replay_dir: Path) -> list[dict]:
    """Load per-request records from trace_replay detailed_results.csv.

    Converts to the same format as AIPerf JSONL records so the analyze()
    function can process both formats identically.
    """
    import csv
    import sys
    # Trace rows can carry very large payload fields; lift the CSV field cap.
    csv.field_size_limit(sys.maxsize)

    csv_path = trace_replay_dir / "detailed_results.csv"
    records: list[dict] = []
    with open(csv_path) as fh:
        for row in csv.DictReader(fh):
            # Only successful requests are comparable to AIPerf records.
            if row.get("success") != "True":
                continue
            records.append({
                "metadata": {
                    "x_correlation_id": row["trace_id"],
                    "conversation_id": row["trace_id"],
                    "turn_index": int(row["request_idx"]),
                    "benchmark_phase": "profiling",
                },
                "metrics": {
                    "input_sequence_length": {"value": int(row["input_tokens"])},
                    "output_sequence_length": {"value": int(row["output_tokens_actual"])},
                },
            })
    return records
not in metrics: + continue + # Use x_correlation_id (unique per session) not conversation_id (template, reused) + cid = r["metadata"].get("x_correlation_id") or r["metadata"]["conversation_id"] + ti = r["metadata"]["turn_index"] + isl = metrics["input_sequence_length"]["value"] + osl = metrics["output_sequence_length"]["value"] + convos[cid].append({"turn": ti, "isl": isl, "osl": osl}) + + # Sort turns within each conversation + for v in convos.values(): + v.sort(key=lambda x: x["turn"]) + + # Turn count distribution + turn_counts = Counter(len(v) for v in convos.values()) + total_convos = len(convos) + total_requests = len(records) + + lines = [] + lines.append("=" * 70) + lines.append("BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS") + lines.append("=" * 70) + lines.append(f"Total conversations: {total_convos:,}") + lines.append(f"Total requests: {total_requests:,}") + lines.append(f"Avg turns/conv: {total_requests / total_convos:.2f}") + lines.append("") + + lines.append("TURN COUNT DISTRIBUTION:") + lines.append(f" {'Turns':>5s} {'Count':>6s} {'Pct':>6s} Target") + target = {1: 59, 2: 20, 3: 10, 4: 5, 5: 3, 6: 2, 7: 1} + for k in sorted(turn_counts.keys()): + pct = 100 * turn_counts[k] / total_convos + tgt = f"{target.get(k, 0):.0f}%" if k in target else "" + lines.append(f" {k:5d} {turn_counts[k]:6,} {pct:5.1f}% {tgt}") + + # ISL/OSL by turn index + lines.append("") + lines.append("ISL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + max_turn = max(t["turn"] for v in convos.values() for t in v) + for ti in range(max_turn + 1): + vals = sorted(t["isl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} 
{p95:8.0f}" + ) + + lines.append("") + lines.append("OSL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + for ti in range(max_turn + 1): + vals = sorted(t["osl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" + ) + + # Overall ISL/OSL stats + all_isl = sorted(t["isl"] for v in convos.values() for t in v) + all_osl = sorted(t["osl"] for v in convos.values() for t in v) + n = len(all_isl) + isl_mean = sum(all_isl) / n + osl_mean = sum(all_osl) / n + lines.append("") + lines.append("ALL REQUESTS ISL:") + lines.append( + f" n={n:,} mean={isl_mean:.0f} median={all_isl[n//2]} " + f"p5={all_isl[int(n*0.05)]} p95={all_isl[int(n*0.95)]}" + ) + lines.append("ALL REQUESTS OSL:") + lines.append( + f" n={n:,} mean={osl_mean:.0f} median={all_osl[n//2]} " + f"p5={all_osl[int(n*0.05)]} p95={all_osl[int(n*0.95)]}" + ) + + # Per-conversation stats + conv_max_isl = sorted(max(t["isl"] for t in v) for v in convos.values()) + conv_total_osl = sorted(sum(t["osl"] for t in v) for v in convos.values()) + nc = len(conv_max_isl) + lines.append("") + lines.append("PER-CONVERSATION MAX ISL (final context size):") + lines.append( + f" n={nc:,} mean={sum(conv_max_isl)/nc:.0f} median={conv_max_isl[nc//2]} " + f"p5={conv_max_isl[int(nc*0.05)]} p95={conv_max_isl[int(nc*0.95)]}" + ) + lines.append("PER-CONVERSATION TOTAL OSL:") + lines.append( + f" n={nc:,} mean={sum(conv_total_osl)/nc:.0f} median={conv_total_osl[nc//2]} " + f"p5={conv_total_osl[int(nc*0.05)]} p95={conv_total_osl[int(nc*0.95)]}" + ) + + # ISL context growth (shows accumulation across turns) + lines.append("") + lines.append("ISL CONTEXT GROWTH (sample 
def _generate_plots(
    convos: dict[str, list[dict]], records: list[dict], output_dir: Path
) -> None:
    """Render a 3x3 grid of workload-distribution plots.

    Saves ``workload_distribution_plots.png`` into *output_dir*.  Raises
    ImportError when matplotlib is unavailable; the caller treats plots as
    optional and catches that.  *records* is accepted for interface parity
    with the text analysis; all plots derive from *convos*.
    """
    import matplotlib

    matplotlib.use("Agg")  # headless backend: benchmark hosts have no display
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(3, 3, figsize=(18, 15))
    fig.suptitle("Benchmark Workload Distribution Analysis", fontsize=14)

    # (0,0) Turn count distribution
    ax = axes[0, 0]
    turn_counts = Counter(len(v) for v in convos.values())
    turns = sorted(turn_counts.keys())
    counts = [turn_counts[t] for t in turns]
    total = sum(counts)
    bars = ax.bar(turns, [100 * c / total for c in counts], edgecolor="black", alpha=0.7)
    for bar in bars:  # percentage label above each bar
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            f"{bar.get_height():.0f}%",
            ha="center",
            va="bottom",
            fontsize=8,
        )
    ax.set_xlabel("Number of Turns")
    ax.set_ylabel("% of Conversations")
    ax.set_title(f"Turn Count Distribution (n={total:,})")
    ax.grid(True, alpha=0.3, axis="y")

    # (0,1) All requests ISL histogram
    ax = axes[0, 1]
    all_isl = [t["isl"] for v in convos.values() for t in v]
    # Clip at ~1.2x the p99 so a few huge contexts don't flatten the histogram.
    clip = int(sorted(all_isl)[int(len(all_isl) * 0.99)] * 1.2)
    ax.hist([v for v in all_isl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="steelblue")
    all_isl_sorted = sorted(all_isl)
    median_isl = all_isl_sorted[len(all_isl) // 2]
    mean_isl = sum(all_isl) / len(all_isl)
    ax.axvline(median_isl, color="red", linestyle="--", label=f"Median: {median_isl:,}")
    ax.axvline(mean_isl, color="orange", linestyle="--", label=f"Mean: {mean_isl:,.0f}")
    ax.set_xlabel("Input Sequence Length")
    ax.set_ylabel("Count")
    ax.set_title(f"All Requests ISL (n={len(all_isl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (0,2) All requests OSL histogram
    ax = axes[0, 2]
    all_osl = [t["osl"] for v in convos.values() for t in v]
    # OSL clips at 3000 max — output lengths are much shorter than contexts.
    clip = min(3000, int(sorted(all_osl)[int(len(all_osl) * 0.99)] * 1.2))
    ax.hist([v for v in all_osl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="coral")
    all_osl_sorted = sorted(all_osl)
    median_osl = all_osl_sorted[len(all_osl) // 2]
    mean_osl = sum(all_osl) / len(all_osl)
    ax.axvline(median_osl, color="red", linestyle="--", label=f"Median: {median_osl:,}")
    ax.axvline(mean_osl, color="orange", linestyle="--", label=f"Mean: {mean_osl:,.0f}")
    ax.set_xlabel("Output Sequence Length")
    ax.set_ylabel("Count")
    ax.set_title(f"All Requests OSL (n={len(all_osl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (1,0) Average new prefill tokens by turn index (ISL delta per turn)
    ax = axes[1, 0]
    # First turn contributes its whole ISL; later turns contribute the growth
    # over the previous turn's ISL (clamped at 0 in case of resets).
    deltas_by_turn: dict[int, list[int]] = defaultdict(list)
    for v in convos.values():
        for i, t in enumerate(v):
            if i == 0:
                deltas_by_turn[t["turn"]].append(t["isl"])
            else:
                deltas_by_turn[t["turn"]].append(max(0, t["isl"] - v[i - 1]["isl"]))
    if deltas_by_turn:
        turn_indices = sorted(deltas_by_turn.keys())
        means = [sum(deltas_by_turn[ti]) / len(deltas_by_turn[ti]) for ti in turn_indices]
        ns = [len(deltas_by_turn[ti]) for ti in turn_indices]
        ax.plot(turn_indices, means, marker="o", markersize=3, linewidth=1, color="mediumseagreen")
        ax.fill_between(turn_indices, 0, means, alpha=0.2, color="mediumseagreen")
        # Label first and last points
        if len(turn_indices) > 0:
            ax.annotate(f"{means[0]:,.0f}", (turn_indices[0], means[0]), fontsize=7, ha="left", va="bottom")
        if len(turn_indices) > 1:
            ax.annotate(f"{means[-1]:,.0f}\n(n={ns[-1]})", (turn_indices[-1], means[-1]), fontsize=7, ha="right", va="bottom")
        # Overall mean/median across all deltas
        all_deltas = [d for dlist in deltas_by_turn.values() for d in dlist]
        if all_deltas:
            overall_mean = sum(all_deltas) / len(all_deltas)
            all_deltas_sorted = sorted(all_deltas)
            overall_median = all_deltas_sorted[len(all_deltas) // 2]
            ax.axhline(overall_mean, color="orange", linestyle="--", linewidth=1, label=f"Mean: {overall_mean:,.0f}")
            ax.axhline(overall_median, color="red", linestyle="--", linewidth=1, label=f"Median: {overall_median:,}")
            ax.legend(fontsize=7)
    ax.set_xlabel("Turn Index")
    ax.set_ylabel("Mean New Prefill Tokens")
    ax.set_title("Avg New Prefill Tokens by Turn")
    ax.grid(True, alpha=0.3)

    # (1,1) ISL vs OSL scatter
    ax = axes[1, 1]
    ax.scatter(all_isl, all_osl, alpha=0.15, s=3, c="purple")
    ax.set_xlabel("ISL (tokens)")
    ax.set_ylabel("OSL (tokens)")
    ax.set_title("ISL vs OSL (all requests)")
    ax.grid(True, alpha=0.3)

    # (1,2) Per-conversation max ISL vs num turns scatter
    ax = axes[1, 2]
    conv_turns = [len(v) for v in convos.values()]
    conv_max_isl_list = [max(t["isl"] for t in v) for v in convos.values()]
    ax.scatter(conv_turns, conv_max_isl_list, alpha=0.3, s=8, c="steelblue")
    ax.set_xlabel("Number of Turns")
    ax.set_ylabel("Max ISL (tokens)")
    ax.set_title("Final Context Size vs Turn Count")
    ax.grid(True, alpha=0.3)

    # (2,0) Per-conversation max ISL (final context size per conversation)
    ax = axes[2, 0]
    conv_max_isl = [max(t["isl"] for t in v) for v in convos.values()]
    clip = int(sorted(conv_max_isl)[int(len(conv_max_isl) * 0.99)] * 1.2)
    ax.hist([v for v in conv_max_isl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="steelblue")
    conv_max_isl_sorted = sorted(conv_max_isl)
    median_max = conv_max_isl_sorted[len(conv_max_isl) // 2]
    mean_max = sum(conv_max_isl) / len(conv_max_isl)
    ax.axvline(median_max, color="red", linestyle="--", label=f"Median: {median_max:,}")
    ax.axvline(mean_max, color="orange", linestyle="--", label=f"Mean: {mean_max:,.0f}")
    ax.set_xlabel("Max ISL per Conversation (tokens)")
    ax.set_ylabel("Count")
    ax.set_title(f"Per-Conversation Final Context Size (n={len(conv_max_isl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (2,1) Per-conversation total OSL (sum of all output tokens across turns)
    ax = axes[2, 1]
    conv_total_osl = [sum(t["osl"] for t in v) for v in convos.values()]
    clip = int(sorted(conv_total_osl)[int(len(conv_total_osl) * 0.99)] * 1.2)
    ax.hist([v for v in conv_total_osl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="coral")
    conv_total_osl_sorted = sorted(conv_total_osl)
    median_tosl = conv_total_osl_sorted[len(conv_total_osl) // 2]
    mean_tosl = sum(conv_total_osl) / len(conv_total_osl)
    ax.axvline(median_tosl, color="red", linestyle="--", label=f"Median: {median_tosl:,}")
    ax.axvline(mean_tosl, color="orange", linestyle="--", label=f"Mean: {mean_tosl:,.0f}")
    ax.set_xlabel("Total OSL per Conversation (tokens)")
    ax.set_ylabel("Count")
    ax.set_title(f"Per-Conversation Total Output Tokens (n={len(conv_total_osl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (2,2) is empty — the turns-vs-context scatter already sits at (1,2)
    axes[2, 2].axis("off")

    plt.tight_layout()
    out = output_dir / "workload_distribution_plots.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved plots to {out}")


def main() -> None:
    """CLI entry point: auto-detect the artifact format and run the analysis."""
    parser = argparse.ArgumentParser(
        description="Analyze benchmark workload distributions"
    )
    parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ or trace_replay/ directory")
    parser.add_argument(
        "-o", "--output", default=None, help="Output directory (default: same as artifacts_dir)"
    )
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    output_dir = Path(args.output) if args.output else artifacts_dir

    # Auto-detect which harness produced the artifacts from its marker file.
    trace_replay_csv = artifacts_dir / "detailed_results.csv"
    aiperf_jsonl = artifacts_dir / "profile_export.jsonl"

    if trace_replay_csv.exists():
        records = load_trace_replay_records(artifacts_dir)
        print(f"Loaded {len(records):,} records from {artifacts_dir} (trace replay)")
    elif aiperf_jsonl.exists():
        records = load_records(artifacts_dir)
        print(f"Loaded {len(records):,} records from {artifacts_dir} (AIPerf)")
    else:
        print(f"No recognized data files in {artifacts_dir}")
        return

    analyze(records, output_dir)


if __name__ == "__main__":
    main()
+ return df + + +def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from aiperf profile_export JSONL. + + Converts aiperf's per-record format into the same column schema + used by the custom benchmark client CSV. + """ + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + meta = entry.get("metadata", {}) + metrics = entry.get("metrics", {}) + + # Skip non-profiling records or cancelled requests + if meta.get("benchmark_phase") != "profiling": + continue + if meta.get("was_cancelled", False): + continue + + # Extract values (aiperf stores metrics as {value, unit} dicts) + def val(key, default=0): + m = metrics.get(key) + if m is None: + return default + return m.get("value", default) if isinstance(m, dict) else m + + # Compute TPOT from ITL if available + itl = metrics.get("inter_token_latency") + if itl and isinstance(itl, dict): + tpot_ms = itl.get("value", 0) + else: + # Fallback: (latency - ttft) / (output_tokens - 1) + osl = val("output_sequence_length", 1) + ttft = val("time_to_first_token", 0) + latency = val("request_latency", 0) + tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 + + # Convert request_start_ns to ms (epoch) + start_ns = meta.get("request_start_ns", 0) + start_ms = start_ns / 1e6 + + records.append({ + "start_time_ms": start_ms, + "ttft_ms": val("time_to_first_token"), + "tpot_ms": tpot_ms, + "latency_ms": val("request_latency"), + "input_num_tokens": val("input_sequence_length"), + "output_num_tokens": val("output_sequence_length"), + }) + + if not records: + return None + + return pd.DataFrame(records) + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 
0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + return pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + + +def load_experiment(exp_dir: Path) -> dict | None: + """Load metrics from a single experiment artifact directory.""" + client_csv = exp_dir / "metrics_client_metrics.csv" + server_csv = exp_dir / "metrics_server_metrics.csv" + status_file = exp_dir / "status.txt" + + if not status_file.exists(): + return None + status = status_file.read_text().strip() + + # Also check for aiperf output + aiperf_jsonl = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) + if not candidates: + candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) + if candidates: + aiperf_jsonl = candidates[0] + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_csv.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + return None + + # Parse experiment name from directory: multiturn_tp{N}_users{M}_offload{mode} + # or just tp{N}_users{M}_offload{mode} + name = exp_dir.name + if name.startswith("multiturn_"): + name = name[len("multiturn_"):] + + try: + parts = name.split("_") + tp = int(parts[0].replace("tp", "")) + users = int(parts[1].replace("users", "").replace("bs", "")) + offload = parts[2].replace("offload", "") + except (IndexError, ValueError): + print(f"Warning: cannot parse experiment name '{exp_dir.name}', skipping") + return None + + result = { + "exp_name": name, + "tp": tp, + "users": users, + "offload": offload, + "status": status, + } + + if status != "SUCCESS": + 
return result + + try: + # Determine data source: custom client CSV, aiperf JSONL, or trace replay CSV + if client_csv.exists(): + df = _load_custom_client_csv(client_csv, exp_dir) + elif aiperf_jsonl is not None: + df = _load_aiperf_jsonl(aiperf_jsonl) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return result + + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + # Fallback: derive from per-request data (first start to last finish) + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), 
+ "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + + # Cache hit rates from server metrics + if server_csv.exists(): + try: + sdf = pd.read_csv(server_csv) + if len(sdf) > 0: + final = sdf.iloc[-1] + if final.get("prefix_cache_queries", 0) > 0: + result["gpu_hit_rate"] = 100 * final["prefix_cache_hits"] / final["prefix_cache_queries"] + if final.get("cpu_prefix_cache_queries", 0) > 0: + result["cpu_hit_rate"] = 100 * final["cpu_prefix_cache_hits"] / final["cpu_prefix_cache_queries"] + except Exception as e: + print(f"Warning: failed to load server metrics for {exp_dir.name}: {e}") + + except Exception as e: + print(f"Warning: failed to load client metrics for {exp_dir.name}: {e}") + + return result + + +def run_pareto_analysis(results_dir: Path, output_dir: Path) -> None: + """Run plot_pareto.py if available, restructuring artifacts to match its + expected layout (subdirs named tp{N}_users{M}_offload{mode}).""" + # plot_pareto.py expects direct subdirectories with experiment names + # The artifact download gives us multiturn_tp{N}_users{M}_offload{mode}/ + # We create symlinks with the canonical names + pareto_input = output_dir / "pareto_input" + pareto_input.mkdir(parents=True, exist_ok=True) + + for subdir in sorted(results_dir.iterdir()): + if not subdir.is_dir(): + continue + name = subdir.name + if name.startswith("multiturn_"): + name = name[len("multiturn_"):] + # plot_pareto.py expects "bs" not "users" in directory names + name = name.replace("_users", "_bs") + link = pareto_input / name + if not link.exists(): + link.symlink_to(subdir.resolve()) + + # Try to import and run plot_pareto + analysis_dir = Path(__file__).resolve().parent.parent / "analysis" + sys.path.insert(0, str(analysis_dir)) + try: + import plot_pareto # type: ignore + plot_pareto.main(pareto_input) + + # Move any generated plots to output dir + for f in pareto_input.glob("*.png"): 
+ f.rename(output_dir / f.name) + for f in pareto_input.glob("*.pdf"): + f.rename(output_dir / f.name) + except Exception as e: + print(f"Warning: plot_pareto analysis failed: {e}") + print("Continuing with summary CSV only.") + + +def main() -> None: + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + artifacts_dir = Path(sys.argv[1]) + output_dir = Path(sys.argv[2]) + output_dir.mkdir(parents=True, exist_ok=True) + + if not artifacts_dir.is_dir(): + print(f"Error: {artifacts_dir} is not a directory") + sys.exit(1) + + # Load all experiments + experiments = [] + for subdir in sorted(artifacts_dir.iterdir()): + if not subdir.is_dir(): + continue + result = load_experiment(subdir) + if result is not None: + experiments.append(result) + + if not experiments: + print("No experiments found.") + sys.exit(0) + + # Write summary CSV + summary_path = output_dir / "summary.csv" + df = pd.DataFrame(experiments) + df.to_csv(summary_path, index=False) + print(f"Summary written to {summary_path} ({len(experiments)} experiments)") + + # Print status summary + success = sum(1 for e in experiments if e.get("status") == "SUCCESS") + failed = sum(1 for e in experiments if e.get("status") == "FAILED") + other = len(experiments) - success - failed + print(f" SUCCESS: {success}, FAILED: {failed}, OTHER: {other}") + + # Run Pareto analysis + run_pareto_analysis(artifacts_dir, output_dir) + + # Run overview plots (throughput vs concurrency, workload consistency) + try: + from plot_sweep_overview import plot_throughput_vs_concurrency, plot_workload_consistency + pareto_input = output_dir / "pareto_input" + summary_csv = pareto_input / "experiment_summary.csv" + if summary_csv.exists(): + overview_df = pd.read_csv(summary_csv) + plot_throughput_vs_concurrency(overview_df, output_dir) + plot_workload_consistency(pareto_input, output_dir) + else: + print("Warning: No experiment_summary.csv found, skipping overview plots") + except Exception as e: + print(f"Warning: 
#!/usr/bin/env python3
"""Generate overview plots for sweep results.

Produces:
- throughput_vs_concurrency.png: Throughput & cache hit rate vs concurrent
  sessions per TP
- workload_consistency.png: ISL distribution box plots per experiment to
  verify a consistent workload across concurrency levels

Usage:
    python plot_sweep_overview.py <pareto_input_dir> [<output_dir>]
"""

import csv
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd


def _plt():
    """Import matplotlib lazily (headless) so CSV-only code paths don't need it."""
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    return plt


def plot_throughput_vs_concurrency(df: pd.DataFrame, output_dir: Path) -> None:
    """Throughput and cache hit rate vs concurrent sessions, one column per TP.

    Expects columns: tp, bs, offload, total_tps_per_gpu, gpu_hit_rate and
    optionally cpu_hit_rate. Saves throughput_vs_concurrency.png.
    """
    plt = _plt()
    tps = sorted(df["tp"].unique())
    n = len(tps)
    if n == 0:
        return

    fig, axes = plt.subplots(2, n, figsize=(7 * n, 10))
    if n == 1:
        # subplots() returns a 1-D array for a single column; normalize shape.
        axes = axes.reshape(2, 1)
    fig.suptitle("Throughput & Cache Hit Rate vs Concurrent Sessions", fontsize=15)

    for idx, tp in enumerate(tps):
        tp_df = df[df["tp"] == tp].sort_values("bs")
        off = tp_df[tp_df["offload"] == "off"].sort_values("bs")
        on = tp_df[tp_df["offload"] == "on"].sort_values("bs")

        # --- Top row: Throughput ---
        ax = axes[0, idx]
        if len(off) > 0:
            ax.plot(off["bs"], off["total_tps_per_gpu"], "o-", color="#d62728",
                    linewidth=2.5, markersize=7, label="Offload OFF")
        if len(on) > 0:
            ax.plot(on["bs"], on["total_tps_per_gpu"], "s-", color="#2ca02c",
                    linewidth=2.5, markersize=7, label="Offload ON")

        # Annotate the largest offload gain when it is substantial (>20%).
        if len(off) > 0 and len(on) > 0:
            merged = pd.merge(off[["bs", "total_tps_per_gpu"]], on[["bs", "total_tps_per_gpu"]],
                              on="bs", suffixes=("_off", "_on"))
            if len(merged) > 0:
                merged["gain_pct"] = ((merged["total_tps_per_gpu_on"] - merged["total_tps_per_gpu_off"])
                                      / merged["total_tps_per_gpu_off"] * 100)
                max_row = merged.loc[merged["gain_pct"].idxmax()]
                if max_row["gain_pct"] > 20:
                    ax.annotate(f"+{max_row['gain_pct']:.0f}%",
                                xy=(max_row["bs"], max_row["total_tps_per_gpu_on"]),
                                xytext=(0, 15), textcoords="offset points",
                                fontsize=11, fontweight="bold", color="green", ha="center")

        ax.set_xlabel("Concurrent Sessions", fontsize=10)
        ax.set_ylabel("Throughput/GPU (tok/s)", fontsize=10)
        ax.set_title(f"TP{tp} — Throughput", fontsize=13, fontweight="bold")
        max_tput = df["total_tps_per_gpu"].max()
        ax.set_ylim(0, max_tput * 1.15 if max_tput > 0 else 15000)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2)

        # --- Bottom row: Cache hit rate ---
        ax = axes[1, idx]
        if len(off) > 0:
            ax.plot(off["bs"], off["gpu_hit_rate"], "o-", color="#d62728",
                    linewidth=2, markersize=6, label="GPU Hit — OFF")
        if len(on) > 0:
            ax.plot(on["bs"], on["gpu_hit_rate"], "s-", color="#2ca02c",
                    linewidth=2, markersize=6, label="GPU Hit — ON")
            # cpu_hit_rate is only emitted when the server reported CPU cache
            # queries, so guard against the column being absent entirely.
            if "cpu_hit_rate" in on.columns:
                cpu_hit = on["cpu_hit_rate"].fillna(0)
                if cpu_hit.max() > 1:
                    ax.plot(on["bs"], cpu_hit, "v--", color="#9467bd",
                            linewidth=2, markersize=6, label="CPU Hit — ON")

        ax.set_xlabel("Concurrent Sessions", fontsize=10)
        ax.set_ylabel("Cache Hit Rate (%)", fontsize=10)
        ax.set_title(f"TP{tp} — Cache Hit Rate", fontsize=13, fontweight="bold")
        ax.set_ylim(0, 105)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2)

    plt.tight_layout()
    out = output_dir / "throughput_vs_concurrency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out}")


def plot_workload_consistency(pareto_input_dir: Path, output_dir: Path) -> None:
    """ISL distribution box plots per experiment to verify consistent workload.

    Reads trace_replay/detailed_results.csv from each offload-off tp*/
    experiment directory and saves workload_consistency.png.
    """
    # Trace rows can carry very large fields (full prompts).
    csv.field_size_limit(sys.maxsize)

    # tp -> list of (concurrency, experiment name, ISL samples in k tokens)
    data_by_tp: dict[int, list[tuple[int, str, list[float]]]] = defaultdict(list)

    for exp_dir in sorted(pareto_input_dir.iterdir()):
        if not exp_dir.is_dir() or not exp_dir.name.startswith("tp"):
            continue
        if "offloadon" in exp_dir.name:
            continue  # Only use offload-off for the consistency check

        parts = exp_dir.name.split("_")
        try:
            tp = int(parts[0].replace("tp", ""))
            bs = int(parts[1].replace("bs", ""))
        except (IndexError, ValueError):
            continue

        csv_path = exp_dir / "trace_replay" / "detailed_results.csv"
        if not csv_path.exists():
            continue

        isls = []
        try:
            with open(csv_path) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    if row.get("success") == "True":
                        isls.append(int(row["input_tokens"]) / 1000)  # k tokens
        except Exception:
            continue  # best-effort: skip unreadable experiments

        if isls:
            data_by_tp[tp].append((bs, exp_dir.name, isls))

    if not data_by_tp:
        print("No workload data found for consistency plot")
        return

    plt = _plt()
    sorted_tps = sorted(data_by_tp.keys())
    n = len(sorted_tps)

    fig, axes = plt.subplots(1, n, figsize=(7 * n, 6))
    if n == 1:
        axes = [axes]
    fig.suptitle("Workload Consistency — ISL Distribution Per Experiment (Offload OFF)", fontsize=14)

    for idx, tp in enumerate(sorted_tps):
        ax = axes[idx]
        entries = sorted(data_by_tp[tp], key=lambda x: x[0])

        box_data = [e[2] for e in entries]
        labels = [str(e[0]) for e in entries]
        means = [np.mean(e[2]) for e in entries]

        bp = ax.boxplot(box_data, tick_labels=labels, patch_artist=True,
                        showfliers=False, widths=0.6,
                        medianprops=dict(color="red", linewidth=2))
        for patch in bp["boxes"]:
            patch.set_facecolor("steelblue")
            patch.set_alpha(0.6)

        ax.plot(range(1, len(means) + 1), means, "o--", color="orange", linewidth=2,
                markersize=6, label=f"Mean ({np.mean(means):.0f}k ± {np.std(means):.0f}k)", zorder=5)

        overall_mean = np.mean(means)
        overall_std = np.std(means)
        ax.axhspan(overall_mean - overall_std, overall_mean + overall_std,
                   alpha=0.1, color="orange", label="±1σ band")
        ax.axhline(overall_mean, color="orange", linestyle=":", alpha=0.5)

        ax.set_xlabel("Concurrent Sessions", fontsize=11)
        ax.set_ylabel("ISL (k tokens)", fontsize=11)
        ax.set_title(f"TP{tp}", fontsize=13, fontweight="bold")
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2, axis="y")
        ax.set_ylim(0, 140)

    plt.tight_layout()
    out = output_dir / "workload_consistency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out}")


def main():
    """CLI entry point: load the experiment summary and render both plots."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <pareto_input_dir> [<output_dir>]")
        sys.exit(1)

    pareto_input_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else pareto_input_dir.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load experiment summary, falling back to the collector's summary.csv.
    summary_csv = pareto_input_dir / "experiment_summary.csv"
    if not summary_csv.exists():
        summary_csv = output_dir / "summary.csv"
        if not summary_csv.exists():
            print(f"No summary CSV found in {pareto_input_dir} or {output_dir}")
            return

    df = pd.read_csv(summary_csv)

    # NOTE(review): collect_sweep_results.py's summary.csv uses a "users"
    # column and has no "total_tps_per_gpu", so the fallback path likely always
    # fails this check — confirm whether summary.csv should be remapped.
    required = ["tp", "bs", "offload", "total_tps_per_gpu", "gpu_hit_rate"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        print(f"Missing columns in summary: {missing}")
        return

    plot_throughput_vs_concurrency(df, output_dir)
    plot_workload_consistency(pareto_input_dir, output_dir)


if __name__ == "__main__":
    main()
--job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)" @@ -188,7 +188,7 @@ else --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID From 28991ebac6d1e51c63ffc136d42f40d9d59e2ae7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 15:27:35 -0500 Subject: [PATCH 02/33] remove deprecated GpuTransferCollector from metrics collector Replaced by vLLM's native kv_offload metrics. Removes subprocess/threading imports and ~100 lines of dead code. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/bench/metrics_collector.py | 105 ------------------ 1 file changed, 105 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index c129f38b8..064795f51 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -6,8 +6,6 @@ import asyncio import csv import re -import subprocess -import threading import time from dataclasses import dataclass, field from pathlib import Path @@ -16,109 +14,6 @@ import matplotlib.pyplot as plt -@dataclass -class GpuTransferSnapshot: - timestamp: float - gpu_id: int = 0 - tx_pci: float = 0.0 # PCIe TX (MB/s) - rx_pci: float = 0.0 # PCIe RX (MB/s) - - -class GpuTransferCollector: - """DEPRECATED: Collects GPU transfer stats using nvidia-smi dmon. - - Replaced by vLLM's native kv_offload metrics (vllm:kv_offload_total_bytes_total, - vllm:kv_offload_total_time_total) which are more precise and don't require - spawning a subprocess. 
- """ - - def __init__(self, gpu_id: int = 0, poll_interval: int = 1): - self.gpu_id = gpu_id - self.poll_interval = poll_interval - self.snapshots: list[GpuTransferSnapshot] = [] - self._process: subprocess.Popen | None = None - self._thread: threading.Thread | None = None - self._running = False - - def _parse_line(self, line: str) -> GpuTransferSnapshot | None: - """Parse a line of nvidia-smi dmon CSV output. - - Format: gpu, rxpci, txpci (values in MB/s) - Example: 0, 406, 32013 - """ - line = line.strip() - if not line or line.startswith('#'): # Skip header/comments - return None - - parts = [p.strip() for p in line.split(',')] - if len(parts) < 3: - return None - - try: - return GpuTransferSnapshot( - timestamp=time.time(), - gpu_id=int(parts[0]), - rx_pci=float(parts[1]) if parts[1] != '-' else 0.0, - tx_pci=float(parts[2]) if parts[2] != '-' else 0.0, - ) - except (ValueError, IndexError): - return None - - def _reader_thread(self) -> None: - """Background thread to read nvidia-smi output.""" - if self._process is None: - return - - for line in iter(self._process.stdout.readline, ''): - if not self._running: - break - snapshot = self._parse_line(line) - if snapshot and snapshot.gpu_id == self.gpu_id: - self.snapshots.append(snapshot) - - def start(self) -> None: - """Start collecting GPU transfer stats.""" - if self._running: - return - - self._running = True - self.snapshots = [] - - try: - self._process = subprocess.Popen( - [ - 'nvidia-smi', 'dmon', - '-i', str(self.gpu_id), - '-s', 't', - '-d', str(self.poll_interval), - '--format', 'csv', - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - self._thread = threading.Thread(target=self._reader_thread, daemon=True) - self._thread.start() - except FileNotFoundError: - print("nvidia-smi not found, GPU transfer monitoring disabled") - self._running = False - - def stop(self) -> None: - """Stop collecting GPU transfer stats.""" - self._running = False - if self._process: - 
self._process.terminate() - try: - self._process.wait(timeout=2) - except subprocess.TimeoutExpired: - self._process.kill() - self._process = None - - if self._thread: - self._thread.join(timeout=2) - self._thread = None - - @dataclass class MetricsSnapshot: timestamp: float From 695ec2e03f62e9d0e523cb084f6c72297d3447a8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 16:50:32 -0500 Subject: [PATCH 03/33] modularize metrics collector with backend auto-detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VLLMMetricsParser and SGLangMetricsParser with shared MetricsSnapshot. Backend is auto-detected from metrics prefix (vllm: vs sglang:) on first poll. sglang metrics mapped: - token_usage / num_used_tokens → kv_cache_usage - num_running_reqs → num_requests_running - num_queue_reqs → num_requests_waiting - cache_hit_rate × prompt_tokens → prefix_cache_hits/queries - num_retracted_reqs → num_preemptions - realtime_tokens_total mode=prefill_compute/prefill_cache → token source Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/bench/metrics_collector.py | 235 ++++++++++-------- 1 file changed, 129 insertions(+), 106 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 064795f51..6091318c0 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -1,6 +1,7 @@ """ -Metrics collector for vLLM server during benchmarks. +Metrics collector for inference servers during benchmarks. Polls /metrics endpoint and generates visualizations. +Supports vLLM and sglang backends (auto-detected from metrics prefix). 
""" import asyncio @@ -9,6 +10,7 @@ import time from dataclasses import dataclass, field from pathlib import Path +from typing import Protocol import aiohttp import matplotlib.pyplot as plt @@ -43,123 +45,144 @@ class MetricsSnapshot: prefill_kv_computed_tokens_count: int = 0 -@dataclass -class MetricsCollector: - base_url: str - poll_interval: float = 1.0 - snapshots: list[MetricsSnapshot] = field(default_factory=list) - _running: bool = False - _task: asyncio.Task | None = None - gpu_transfer_collector: GpuTransferCollector | None = None - gpu_id: int = 0 +# ============================================================================= +# Metrics Parsers — one per backend +# ============================================================================= + +def _get_value(text: str, pattern: str, default: float = 0.0) -> float: + """Extract a gauge/counter value from Prometheus text using a regex.""" + match = re.search(pattern, text) + return float(match.group(1)) if match else default - def _parse_metrics(self, text: str) -> MetricsSnapshot: - """Parse Prometheus metrics text format.""" - snapshot = MetricsSnapshot(timestamp=time.time()) - # Helper to extract gauge/counter value - def get_value(pattern: str, default: float = 0.0) -> float: - match = re.search(pattern, text) - if match: - return float(match.group(1)) - return default +class VLLMMetricsParser: + """Parse vLLM Prometheus metrics (prefix: vllm:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) # KV cache usage (0-1 scale) - snapshot.kv_cache_usage = get_value( - r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - # Fallback to old metric name if new one not found + snapshot.kv_cache_usage = g(r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') if snapshot.kv_cache_usage == 0.0: - snapshot.kv_cache_usage = get_value( - r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - - # CPU/offloaded 
KV cache usage - snapshot.cpu_kv_cache_usage = get_value( - r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - - # Running/waiting requests - snapshot.num_requests_running = int(get_value( - r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.num_requests_waiting = int(get_value( - r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefix cache (cumulative counters) - GPU - snapshot.prefix_cache_hits = int(get_value( - r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prefix_cache_queries = int(get_value( - r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefix cache - external/offloaded (KV connector cross-instance cache) - snapshot.cpu_prefix_cache_hits = int(get_value( - r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.cpu_prefix_cache_queries = int(get_value( - r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Token counters - snapshot.prompt_tokens = int(get_value( - r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.generation_tokens = int(get_value( - r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Preemptions - snapshot.num_preemptions = int(get_value( - r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Request success (sum all finish reasons) + snapshot.kv_cache_usage = g(r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.cpu_kv_cache_usage = g(r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.num_requests_running = int(g(r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefix_cache_hits = int(g(r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefix_cache_queries = int(g(r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.cpu_prefix_cache_hits = 
int(g(r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.cpu_prefix_cache_queries = int(g(r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prompt_tokens = int(g(r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.generation_tokens = int(g(r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.num_preemptions = int(g(r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)')) + for match in re.finditer( - r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', - text + r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', text ): snapshot.request_success += int(float(match.group(1))) - # KV offload bytes transferred (cumulative counters by direction) - snapshot.kv_offload_bytes_gpu_to_cpu = get_value( - r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' - ) - snapshot.kv_offload_bytes_cpu_to_gpu = get_value( - r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' - ) - - # KV offload time (cumulative, seconds) - snapshot.kv_offload_time_gpu_to_cpu = get_value( - r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' - ) - snapshot.kv_offload_time_cpu_to_gpu = get_value( - r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' - ) - - # Prompt tokens by source (cumulative) - snapshot.prompt_tokens_local_compute = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prompt_tokens_local_cache_hit = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prompt_tokens_external_kv_transfer = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefill KV computed tokens (histogram sum and 
count) - snapshot.prefill_kv_computed_tokens_sum = int(get_value( - r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prefill_kv_computed_tokens_count = int(get_value( - r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)' - )) + snapshot.kv_offload_bytes_gpu_to_cpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_bytes_cpu_to_gpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_gpu_to_cpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_cpu_to_gpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + + snapshot.prompt_tokens_local_compute = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_external_kv_transfer = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefill_kv_computed_tokens_sum = int(g(r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefill_kv_computed_tokens_count = int(g(r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)')) return snapshot + +class SGLangMetricsParser: + """Parse sglang Prometheus metrics (prefix: sglang:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) + + # KV cache usage — sglang reports token_usage as a ratio (0-1) + snapshot.kv_cache_usage = g(r'sglang:token_usage\{[^}]*\}\s+([\d.e+-]+)') + # Fallback: compute from num_used_tokens / max_total_num_tokens + if snapshot.kv_cache_usage == 0.0: 
+ used = g(r'sglang:num_used_tokens\{[^}]*\}\s+([\d.e+-]+)') + total = g(r'sglang:max_total_num_tokens\{[^}]*\}\s+([\d.e+-]+)') + if total > 0: + snapshot.kv_cache_usage = used / total + + snapshot.num_requests_running = int(g(r'sglang:num_running_reqs\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'sglang:num_queue_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + # sglang exposes cache_hit_rate as a direct gauge (0-1) + # We convert to cumulative-style by tracking hits/queries from token sources + cache_hit_rate = g(r'sglang:cache_hit_rate\{[^}]*\}\s+([\d.e+-]+)') + prompt_tokens = int(g(r'sglang:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens = prompt_tokens + # Approximate cumulative cache hits from rate × total prompts + if prompt_tokens > 0 and cache_hit_rate > 0: + snapshot.prefix_cache_queries = prompt_tokens + snapshot.prefix_cache_hits = int(prompt_tokens * cache_hit_rate) + + snapshot.generation_tokens = int(g(r'sglang:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Preemptions — sglang calls them "retractions" + snapshot.num_preemptions = int(g(r'sglang:num_retracted_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.request_success = int(g(r'sglang:num_requests_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Token source breakdown from realtime_tokens_total + snapshot.prompt_tokens_local_compute = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_cache"[^}]*\}\s+([\d.e+-]+)')) + + return snapshot + + +def detect_backend(text: str) -> str: + """Auto-detect backend from metrics text.""" + if 'vllm:' in text: + return 'vllm' + elif 'sglang:' in text: + return 'sglang' + return 'unknown' + + +def get_parser(backend: str): + """Get the appropriate parser for the backend.""" + if backend == 'sglang': + return SGLangMetricsParser() + return VLLMMetricsParser() # default + + +@dataclass +class 
MetricsCollector: + base_url: str + poll_interval: float = 1.0 + snapshots: list[MetricsSnapshot] = field(default_factory=list) + _running: bool = False + _task: asyncio.Task | None = None + _parser: VLLMMetricsParser | SGLangMetricsParser | None = None + _backend: str = "" + + def _parse_metrics(self, text: str) -> MetricsSnapshot: + """Parse Prometheus metrics text, auto-detecting backend on first call.""" + if self._parser is None: + self._backend = detect_backend(text) + self._parser = get_parser(self._backend) + if self._backend != 'unknown': + print(f"Auto-detected metrics backend: {self._backend}") + return self._parser.parse(text) + async def _poll_loop(self) -> None: """Background polling loop.""" metrics_url = f"{self.base_url}/metrics" From 6a41d49a2345207899e5f8c30e48078abccb25b2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 16:51:19 -0500 Subject: [PATCH 04/33] remove unused Protocol import --- experimental/multiturn/vllm_benchmark/bench/metrics_collector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 6091318c0..7bcdf31a4 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -10,7 +10,6 @@ import time from dataclasses import dataclass, field from pathlib import Path -from typing import Protocol import aiohttp import matplotlib.pyplot as plt From c137677e1f0d5b90617d3578ae99f404ceb2a55c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:09:47 -0500 Subject: [PATCH 05/33] add LMCache agentic trace benchmark for H100 Replays SWE-bench/GAIA/WildClaw traces from sammshen/lmcache-agentic-traces via AIPerf with mooncake_trace format. Downloads and converts traces at runtime. Supports concurrency sweep with offload on/off. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp8_h100_lmcache_aiperf.sh | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh new file mode 100755 index 000000000..fb02a79a1 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP8 models on H100 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. +# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) +# REQUEST_RATE (default: 0 = no rate limit, concurrency-burst mode) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +REQUEST_RATE=${REQUEST_RATE:-0} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by 
non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with 
open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +# ---- Convert LMCache traces to mooncake format ----------------------------- +echo "Downloading and converting LMCache traces..." +python3 -c " +import json, os +try: + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + out_path = '$TRACE_FILE' + sessions = set() + with open(out_path, 'w') as f: + for row in ds: + entry = { + 'session_id': row['session_id'], + 'messages': row['input'], + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) + print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') +except Exception as e: + print(f'ERROR converting traces: {e}') + exit(1) +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ "$REQUEST_RATE" != "0" ]; then + AIPERF_CMD+=" --request-rate $REQUEST_RATE" +fi +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --extra-inputs ignore_eos:true" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > 
"$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From ee767671f52da38c31d355ab359b9a0d8000d532 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:15:20 -0500 Subject: [PATCH 06/33] add H100 LMCache trace sweep config Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 5ec98b902..e19780a21 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -20,6 +20,17 @@ mi355x-fp8-llama70b: users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] offload: ["on", "off"] +h100-fp8-llama70b-lmcache: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20] + offload: ["on", "off"] + tp4: + users: [2, 4, 8, 12, 16, 20, 24, 32, 40] + offload: ["on", "off"] + tp8: + users: [2, 4, 8, 16, 24, 32, 48, 64] + offload: ["on", "off"] + 
b200-fp4-dsr1: tp4: ep: 4 From 839ba0f8de99ef541ca8c652a6bfe087479e5a02 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:30:54 -0500 Subject: [PATCH 07/33] fix LMCache benchmark: use fixed-schedule replay, remove ignore_eos - Add --fixed-schedule to replay at exact trace timestamps - Remove --extra-inputs ignore_eos:true (let model stop naturally) - Remove unused REQUEST_RATE logic Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index fb02a79a1..53d2c03b1 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -11,7 +11,6 @@ set -x # Optional: # PORT (default 8888), REQUEST_TIMEOUT (default 3600) # DURATION (if set, runs for this many seconds; otherwise runs to completion) -# REQUEST_RATE (default: 0 = no rate limit, concurrency-burst mode) source "$(dirname "$0")/../benchmark_lib.sh" @@ -25,7 +24,6 @@ check_env_vars \ PORT=${PORT:-8888} REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} -REQUEST_RATE=${REQUEST_RATE:-0} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -183,17 +181,14 @@ AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --fixed-schedule" AIPERF_CMD+=" --concurrency $USERS" -if [ "$REQUEST_RATE" != "0" ]; then - AIPERF_CMD+=" --request-rate $REQUEST_RATE" -fi if [ -n "${DURATION:-}" ]; then AIPERF_CMD+=" --benchmark-duration $DURATION" AIPERF_CMD+=" --benchmark-grace-period 0" fi AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" -AIPERF_CMD+=" --extra-inputs 
ignore_eos:true" AIPERF_CMD+=" --export-level records" AIPERF_CMD+=" --ui-type simple" AIPERF_CMD+=" --random-seed 42" From fc8e3cf02d7975931233bcd43589030ab036d829 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:36:00 -0500 Subject: [PATCH 08/33] remove --fixed-schedule: use concurrency mode per Samuel's recommendation --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 53d2c03b1..ff10f0252 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -181,7 +181,6 @@ AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" -AIPERF_CMD+=" --fixed-schedule" AIPERF_CMD+=" --concurrency $USERS" if [ -n "${DURATION:-}" ]; then AIPERF_CMD+=" --benchmark-duration $DURATION" From 6bbbfa989d23789385897fb015b2271a89390293 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 21:23:45 -0500 Subject: [PATCH 09/33] update yaml --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index e19780a21..500a6705e 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h100-fp8-llama70b-lmcache: tp2: - users: [2, 4, 6, 8, 10, 12, 16, 20] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] offload: ["on", "off"] tp4: - users: [2, 4, 8, 12, 16, 20, 24, 32, 40] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 24, 32, 48, 64] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", 
"off"] b200-fp4-dsr1: From a2e4fe64351a31f378eb535e903555995b9f9341 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 21:51:24 -0500 Subject: [PATCH 10/33] fix H100 runner: add SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h100-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..28e89e0cb 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -31,7 +31,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID From fee02780917b7755aec804fcea39dc940160ddaf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 22:45:33 -0500 Subject: [PATCH 11/33] fix: mkdir RESULT_DIR before trace conversion --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index ff10f0252..1bec35577 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -81,6 +81,8 @@ with open(sys.argv[1], 'w') as f: PYEOF fi +mkdir -p "$RESULT_DIR" + # ---- Convert LMCache traces to mooncake format ----------------------------- echo "Downloading and converting LMCache traces..." 
python3 -c " From 769532c3985bd24714d65dfdf3ad6e3651c9b60c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:15:28 -0500 Subject: [PATCH 12/33] add H200 LMCache trace benchmark and config Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 11 + .../multiturn_fp8_h200_lmcache_aiperf.sh | 226 ++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 500a6705e..bb0e568d3 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -20,6 +20,17 @@ mi355x-fp8-llama70b: users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] offload: ["on", "off"] +h200-fp8-llama70b-lmcache: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + offload: ["on", "off"] + tp4: + users: [4, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp8: + users: [2, 4, 8, 16, 32, 48, 64, 80] + offload: ["on", "off"] + h100-fp8-llama70b-lmcache: tp2: users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh new file mode 100755 index 000000000..9a0c89e5a --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -0,0 +1,226 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP8 models on H200 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. 
+# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes 
"AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +mkdir -p "$RESULT_DIR" + +# ---- Convert LMCache traces to mooncake format ----------------------------- +echo "Downloading and converting LMCache traces..." 
+python3 -c " +import json, os +try: + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + out_path = '$TRACE_FILE' + sessions = set() + with open(out_path, 'w') as f: + for row in ds: + entry = { + 'session_id': row['session_id'], + 'messages': row['input'], + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) + print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') +except Exception as e: + print(f'ERROR converting traces: {e}') + exit(1) +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > 
"$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From 02876afda83786ab96df394708356f99076d9fe0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:15:47 -0500 Subject: [PATCH 13/33] update yaml --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index bb0e568d3..63892d202 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h200-fp8-llama70b-lmcache: tp2: - users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] offload: ["on", "off"] tp4: - users: [4, 8, 16, 24, 32, 40, 48, 56] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 32, 48, 64, 80] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] h100-fp8-llama70b-lmcache: From 2134fd8664effdb5066834c2e81a5c53a50ce3fd Mon Sep 17 00:00:00 2001 From: Cam Quilici 
Date: Wed, 1 Apr 2026 23:19:08 -0500 Subject: [PATCH 14/33] fix H200-nb runner: add SCRIPT_SUFFIX support --- runners/launch_h200-nb.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..8c75700df 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -19,4 +19,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh From ab2812a8eaea1d52c1d08e37383d7649308ca613 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:23:08 -0500 Subject: [PATCH 15/33] fix all H200 runners: add SCRIPT_SUFFIX support --- runners/launch_h200-cw.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..c4bdad736 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -44,7 +44,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..e09eaeeed 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -258,7 +258,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash 
benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp')${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID From 5aa993f5eef7ecf3625bb861c04530e976d2a1a0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 08:20:01 -0500 Subject: [PATCH 16/33] fix all runners: add SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_b200-dgxc-slurm.sh | 2 +- runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nb.sh | 2 +- runners/launch_gb200-nv.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_mi300x-amds.sh | 2 +- runners/launch_mi325x-amd.sh | 2 +- runners/launch_mi355x-amds.sh | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0d1bd40cc..d2ad4bc5d 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -234,5 +234,5 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f8c614936..8243fd6d0 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -36,7 +36,7 @@ docker run --rm --init --network host --name $server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ 
-benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..eda4b17ba 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -17,4 +17,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh \ No newline at end of file diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f8f0ef26e..8d20ea162 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -63,7 +63,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" && -z "$CONFIG_FILE" ]]; then else BENCHMARK_SUBDIR="single_node" fi - bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" + bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME%.sh}${SCRIPT_SUFFIX:-}.sh" # Wait for all jobs to complete echo "Waiting for all jobs to complete..." 
while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..7539d99db 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -15,4 +15,4 @@ docker run --rm --network=host --name=$server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..98af3caf2 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -247,7 +247,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b654c515a..8b9896e00 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID \ No newline at end of file diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 67f93a309..e6c3ca4e4 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash 
benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index ac91177ca..2069774ba 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -57,7 +57,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then else BENCHMARK_SUBDIR="single_node" fi - JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME%.sh}${SCRIPT_SUFFIX:-}.sh") # Wait for job to complete LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" From d5dd15103276a358988792f7d8d41c37b5ff07d0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 08:35:56 -0500 Subject: [PATCH 17/33] reduce multiturn artifact size: upload only files needed for post-processing Drops ~18GB per artifact by excluding inputs.json, conversations.jsonl, responses.json, GPU telemetry, raw records, and full aiperf_artifacts/. Only uploads the specific files used by collect_sweep_results.py and plot_pareto.py. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index a72034b14..20777d0eb 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -156,18 +156,17 @@ jobs: results/metrics_server_metrics.csv results/metrics_plots.png results/benchmark.log - results/server.log results/config.yaml results/vllm_command.txt results/benchmark_command.txt results/benchmark_metadata.json results/metrics_workload.png - results/responses.json - results/aiperf_artifacts/ - results/conversations.jsonl + results/aiperf_artifacts/profile_export_aiperf.csv + results/aiperf_artifacts/profile_export_aiperf.json + results/aiperf_artifacts/profile_export.jsonl results/workload_distribution_summary.txt results/workload_distribution_plots.png - results/trace_replay/ + results/trace_replay/detailed_results.csv results/status.txt if-no-files-found: ignore From bd4ec30ec4d83fefc403828b61db0fe599c00aab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 10:04:42 -0500 Subject: [PATCH 18/33] add exclusive --- runners/launch_h100-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 98af3caf2..b3190577a 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -242,6 +242,7 @@ else fi srun --jobid=$JOB_ID \ + --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From a12cc9d2498c2571b98e9bb4239a3c2c047901f4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 10:04:57 -0500 Subject: [PATCH 19/33] add exclusive --- runners/launch_b200-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff 
--git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index d2ad4bc5d..3ff289e61 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -229,6 +229,7 @@ else fi srun --jobid=$JOB_ID \ + --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From af49d11635ee979f36b9550bfcf56199671a8ce3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:04:40 -0500 Subject: [PATCH 20/33] add exclusive --- runners/launch_h100-dgxc-slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index b3190577a..124c8de6e 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -3,7 +3,7 @@ # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" -SLURM_EXCLUDED_NODELIST="hpc-gpu-1-7" +SLURM_EXCLUDED_NODELIST="hpc-gpu-1-1,hpc-gpu-1-7,hpc-gpu-1-18" set -x From 48ef44d54c63823ff127c214e0785dc4f70cafb2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:14:32 -0500 Subject: [PATCH 21/33] use aiperf summary CSV instead of per-record JSONL for post-processing The profile_export.jsonl with 233K records was ~10GB per artifact. Switch collect_sweep_results.py and plot_pareto.py to read from the pre-computed profile_export_aiperf.csv (~4KB) instead. Remove the JSONL from the artifact upload. Existing client CSV and trace_replay paths are unchanged. Also exclude low-FreeMem H100 nodes (1, 7, 18) to avoid cudaMallocHost/mlock failures during vLLM CPU KV cache allocation. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workflows/benchmark-multiturn-tmpl.yml | 1 - .../vllm_benchmark/analysis/plot_pareto.py | 172 +++++++------ .../scripts/collect_sweep_results.py | 242 ++++++++++-------- 3 files changed, 223 insertions(+), 192 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 20777d0eb..7c1d5ce0d 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -163,7 +163,6 @@ jobs: results/metrics_workload.png results/aiperf_artifacts/profile_export_aiperf.csv results/aiperf_artifacts/profile_export_aiperf.json - results/aiperf_artifacts/profile_export.jsonl results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 277bfca7f..7da67c8a4 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -17,53 +17,69 @@ from pathlib import Path -def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from aiperf profile_export JSONL.""" - records = [] - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if not line: - continue - entry = json.loads(line) - meta = entry.get("metadata", {}) - metrics = entry.get("metrics", {}) - - if meta.get("benchmark_phase") != "profiling": - continue - if meta.get("was_cancelled", False): - continue - - def val(key, default=0): - m = metrics.get(key) - if m is None: - return default - return m.get("value", default) if isinstance(m, dict) else m - - itl = metrics.get("inter_token_latency") - if itl and isinstance(itl, dict): - tpot_ms = itl.get("value", 0) - else: - osl = val("output_sequence_length", 1) - ttft = 
val("time_to_first_token", 0) - latency = val("request_latency", 0) - tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 - - start_ns = meta.get("request_start_ns", 0) - start_ms = start_ns / 1e6 - - records.append({ - "start_time_ms": start_ms, - "ttft_ms": val("time_to_first_token"), - "tpot_ms": tpot_ms, - "latency_ms": val("request_latency"), - "input_num_tokens": val("input_sequence_length"), - "output_num_tokens": val("output_sequence_length"), - }) - - if not records: +def _load_aiperf_summary_csv(csv_path: Path, exp_dir: Path, tp: int, + gpu_hit_rate: float | None, + cpu_hit_rate: float | None) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: return None - return pd.DataFrame(records) + + per_metric = df[df["avg"].notna()].set_index("Metric") + scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + + def metric_stat(metric_name, stat): + if metric_name in per_metric.index: + return float(per_metric.loc[metric_name, stat]) + return 0 + + def scalar_val(metric_name): + if metric_name in scalars.index: + return float(scalars.loc[metric_name, "min"]) + return 0 + + exp_name = exp_dir.name + parts = exp_name.split("_") + tp_parsed = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + offload = parts[2].replace("offload", "") + + num_requests = int(scalar_val("Request Count")) + throughput_rps = scalar_val("Request Throughput (requests/sec)") + output_throughput_tps = scalar_val("Output Token Throughput (tokens/sec)") + total_throughput_tps = scalar_val("Total Token Throughput (tokens/sec)") + input_throughput_tps = total_throughput_tps - output_throughput_tps + + return { + "exp_name": exp_name, + "tp": tp_parsed, + "bs": bs, + "offload": offload, + "num_requests": num_requests, + "throughput_rps": throughput_rps, + "input_throughput_tps": input_throughput_tps, + "total_throughput_tps": total_throughput_tps, + 
"input_tps_per_gpu": input_throughput_tps / tp_parsed, + "output_tps_per_gpu": output_throughput_tps / tp_parsed, + "total_tps_per_gpu": total_throughput_tps / tp_parsed, + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "p999_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), # p999 not available, use p99 + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + "p999_latency_ms": metric_stat("Request Latency (ms)", "p99"), # p999 not available, use p99 + "p999_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), # p999 not available, use p99 + "gpu_hit_rate": gpu_hit_rate, + "cpu_hit_rate": cpu_hit_rate, + } def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: @@ -103,43 +119,46 @@ def load_experiment_data(exp_dir: Path) -> dict | None: if status != "SUCCESS": return None - # Also check for aiperf output - aiperf_jsonl = None + # Check for aiperf summary CSV (preferred) + aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): - candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) - if not candidates: - candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) - if candidates: - aiperf_jsonl = candidates[0] + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + 
aiperf_summary_csv = candidate # Check for trace replay output trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - if not client_metrics_file.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + if not client_metrics_file.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): return None try: - if client_metrics_file.exists(): - df = pd.read_csv(client_metrics_file) - elif aiperf_jsonl is not None: - df = _load_aiperf_jsonl(aiperf_jsonl) - elif trace_replay_csv.exists(): - df = _load_trace_replay_csv(trace_replay_csv) - else: - return None - # Load server metrics for cache hit rates gpu_hit_rate = None cpu_hit_rate = None if server_metrics_file.exists(): server_df = pd.read_csv(server_metrics_file) - # Get final cumulative values final_row = server_df.iloc[-1] if final_row["prefix_cache_queries"] > 0: gpu_hit_rate = 100 * final_row["prefix_cache_hits"] / final_row["prefix_cache_queries"] if final_row["cpu_prefix_cache_queries"] > 0: cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] + + # Use aiperf summary CSV directly if available + if aiperf_summary_csv is not None and not client_metrics_file.exists(): + exp_name = exp_dir.name + parts = exp_name.split("_") + tp = int(parts[0].replace("tp", "")) + return _load_aiperf_summary_csv(aiperf_summary_csv, exp_dir, tp, gpu_hit_rate, cpu_hit_rate) + + if client_metrics_file.exists(): + df = pd.read_csv(client_metrics_file) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return None + if len(df) == 0: return None @@ -151,7 +170,6 @@ def load_experiment_data(exp_dir: Path) -> dict | None: offload = parts[2].replace("offload", "") # Calculate metrics - # Prefer benchmark_metadata.json for precise wall-clock duration metadata_file = exp_dir / "benchmark_metadata.json" total_time_sec = None if metadata_file.exists(): @@ -162,33 +180,20 @@ def load_experiment_data(exp_dir: Path) -> 
dict | None: except Exception: pass - # Fallback: derive from per-request data (first start to last finish) if not total_time_sec or total_time_sec <= 0: first_start_ms = df["start_time_ms"].min() last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 # fallback + total_time_sec = df["latency_ms"].sum() / 1000 num_requests = len(df) throughput_rps = num_requests / total_time_sec if total_time_sec > 0 else 0 - - # Input token throughput (prefill) total_input_tokens = df["input_num_tokens"].sum() input_throughput_tps = total_input_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Output token throughput (decode only) total_output_tokens = df["output_num_tokens"].sum() output_throughput_tps = total_output_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Total token throughput (input + output) - total_tokens = total_input_tokens + total_output_tokens - total_throughput_tps = total_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Normalized throughput (per GPU) - input_tps_per_gpu = input_throughput_tps / tp - output_tps_per_gpu = output_throughput_tps / tp - total_tps_per_gpu = total_throughput_tps / tp + total_throughput_tps = (total_input_tokens + total_output_tokens) / total_time_sec if total_time_sec > 0 else 0 return { "exp_name": exp_name, @@ -199,9 +204,9 @@ def load_experiment_data(exp_dir: Path) -> dict | None: "throughput_rps": throughput_rps, "input_throughput_tps": input_throughput_tps, "total_throughput_tps": total_throughput_tps, - "input_tps_per_gpu": input_tps_per_gpu, - "output_tps_per_gpu": output_tps_per_gpu, - "total_tps_per_gpu": total_tps_per_gpu, + "input_tps_per_gpu": input_throughput_tps / tp, + "output_tps_per_gpu": output_throughput_tps / tp, + "total_tps_per_gpu": total_throughput_tps / tp, "mean_ttft_ms": df["ttft_ms"].mean(), "p50_ttft_ms": df["ttft_ms"].median(), 
"p90_ttft_ms": df["ttft_ms"].quantile(0.9), @@ -217,7 +222,6 @@ def load_experiment_data(exp_dir: Path) -> dict | None: "p99_latency_ms": df["latency_ms"].quantile(0.99), "p999_latency_ms": df["latency_ms"].quantile(0.999), "p999_ttft_ms": df["ttft_ms"].quantile(0.999), - # Cache hit rates "gpu_hit_rate": gpu_hit_rate, "cpu_hit_rate": cpu_hit_rate, } diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index fc02b1865..9910fb8ff 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -33,63 +33,52 @@ def _load_custom_client_csv(client_csv: Path, exp_dir: Path) -> pd.DataFrame | N return df -def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from aiperf profile_export JSONL. +def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv. - Converts aiperf's per-record format into the same column schema - used by the custom benchmark client CSV. + Returns a dict with pre-computed metrics matching the result schema, + or None if the file can't be parsed. 
""" - records = [] - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if not line: - continue - entry = json.loads(line) - meta = entry.get("metadata", {}) - metrics = entry.get("metrics", {}) - - # Skip non-profiling records or cancelled requests - if meta.get("benchmark_phase") != "profiling": - continue - if meta.get("was_cancelled", False): - continue - - # Extract values (aiperf stores metrics as {value, unit} dicts) - def val(key, default=0): - m = metrics.get(key) - if m is None: - return default - return m.get("value", default) if isinstance(m, dict) else m - - # Compute TPOT from ITL if available - itl = metrics.get("inter_token_latency") - if itl and isinstance(itl, dict): - tpot_ms = itl.get("value", 0) - else: - # Fallback: (latency - ttft) / (output_tokens - 1) - osl = val("output_sequence_length", 1) - ttft = val("time_to_first_token", 0) - latency = val("request_latency", 0) - tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 - - # Convert request_start_ns to ms (epoch) - start_ns = meta.get("request_start_ns", 0) - start_ms = start_ns / 1e6 - - records.append({ - "start_time_ms": start_ms, - "ttft_ms": val("time_to_first_token"), - "tpot_ms": tpot_ms, - "latency_ms": val("request_latency"), - "input_num_tokens": val("input_sequence_length"), - "output_num_tokens": val("output_sequence_length"), - }) - - if not records: + df = pd.read_csv(csv_path) + if len(df) == 0: return None - return pd.DataFrame(records) + # The CSV has two sections: + # 1. Per-metric rows with columns: Metric, avg, min, max, sum, p1..p99, std + # 2. 
Scalar rows with columns: Metric, Value + # Split by finding rows where only Metric and Value are populated + per_metric = df[df["avg"].notna()].set_index("Metric") + scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + + def metric_stat(metric_name, stat): + if metric_name in per_metric.index: + return float(per_metric.loc[metric_name, stat]) + return 0 + + def scalar_val(metric_name): + if metric_name in scalars.index: + return float(scalars.loc[metric_name, "min"]) # "min" column holds Value + return 0 + + return { + "num_requests": int(scalar_val("Request Count")), + "throughput_rps": scalar_val("Request Throughput (requests/sec)"), + "output_throughput_tps": scalar_val("Output Token Throughput (tokens/sec)"), + "total_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)"), + "input_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)") - scalar_val("Output Token Throughput (tokens/sec)"), + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + } def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: @@ -125,20 +114,18 @@ def load_experiment(exp_dir: Path) -> dict | None: return None status = status_file.read_text().strip() - # Also check for aiperf output - aiperf_jsonl = None + # Check for 
aiperf summary CSV (preferred) or per-record JSONL (fallback) + aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): - candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) - if not candidates: - candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) - if candidates: - aiperf_jsonl = candidates[0] + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + aiperf_summary_csv = candidate # Check for trace replay output trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - if not client_csv.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + if not client_csv.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): return None # Parse experiment name from directory: multiturn_tp{N}_users{M}_offload{mode} @@ -168,59 +155,100 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: custom client CSV, aiperf JSONL, or trace replay CSV + # Determine data source: custom client CSV, aiperf summary CSV, or trace replay CSV if client_csv.exists(): df = _load_custom_client_csv(client_csv, exp_dir) - elif aiperf_jsonl is not None: - df = _load_aiperf_jsonl(aiperf_jsonl) + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + 
"num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + elif aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) elif trace_replay_csv.exists(): df = _load_trace_replay_csv(trace_replay_csv) + if df is None or len(df) == 0: + return result + + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if 
total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) else: return result - if df is None or len(df) == 0: - return result - - # Prefer benchmark_metadata.json for precise wall-clock duration - metadata_file = exp_dir / "benchmark_metadata.json" - total_time_sec = None - if metadata_file.exists(): - try: - with open(metadata_file) as f: - metadata = json.load(f) - total_time_sec = metadata.get("benchmark_runtime_sec") - except Exception: - pass - - # Fallback: derive from per-request data (first start to last finish) - if not total_time_sec or total_time_sec <= 0: - first_start_ms = df["start_time_ms"].min() - last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() - total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 - if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 - - num_requests = len(df) - result.update({ - "num_requests": num_requests, - "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, - "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "output_throughput_tps": 
df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, - "mean_ttft_ms": df["ttft_ms"].mean(), - "p50_ttft_ms": df["ttft_ms"].median(), - "p90_ttft_ms": df["ttft_ms"].quantile(0.9), - "p99_ttft_ms": df["ttft_ms"].quantile(0.99), - "mean_tpot_ms": df["tpot_ms"].mean(), - "p50_tpot_ms": df["tpot_ms"].median(), - "p90_tpot_ms": df["tpot_ms"].quantile(0.9), - "p99_tpot_ms": df["tpot_ms"].quantile(0.99), - "mean_latency_ms": df["latency_ms"].mean(), - "p50_latency_ms": df["latency_ms"].median(), - "p90_latency_ms": df["latency_ms"].quantile(0.9), - "p99_latency_ms": df["latency_ms"].quantile(0.99), - }) - # Cache hit rates from server metrics if server_csv.exists(): try: From 4f106b8fdc9e27b30ca843eaf699510204e28216 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:26:03 -0500 Subject: [PATCH 22/33] debug --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 1bec35577..926bda021 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -14,6 +14,10 @@ set -x source "$(dirname "$0")/../benchmark_lib.sh" +export CUDA_LAUNCH_BLOCKING=1 + +ulimit -a + check_env_vars \ MODEL \ TP \ From cfb25fb509e7a87b0d8f8dadb4b60821f06eb072 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:31:49 -0500 Subject: [PATCH 23/33] fix LMCache traces: convert system role to developer for vLLM v0.18+ vLLM v0.18.0 follows the newer OpenAI API spec where the 'system' message role was renamed to 'developer'. The LMCache traces use 'system', causing 100% 400 Bad Request errors. 
Also drop the 15GB profile_export_aiperf.json from artifact uploads. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 1 - .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 9 ++++++++- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 9 ++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 7c1d5ce0d..f366564d3 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -162,7 +162,6 @@ jobs: results/benchmark_metadata.json results/metrics_workload.png results/aiperf_artifacts/profile_export_aiperf.csv - results/aiperf_artifacts/profile_export_aiperf.json results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 926bda021..1d1c3154d 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,9 +98,16 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role + # was renamed to 'developer'. Convert to avoid 400 validation errors. 
+ messages = [] + for msg in row['input']: + if msg.get('role') == 'system': + msg = {**msg, 'role': 'developer'} + messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 9a0c89e5a..03fd4402e 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,9 +94,16 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role + # was renamed to 'developer'. Convert to avoid 400 validation errors. + messages = [] + for msg in row['input']: + if msg.get('role') == 'system': + msg = {**msg, 'role': 'developer'} + messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From ede9bde6e081eff22aad6683a9472d8babb2be86 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:33:59 -0500 Subject: [PATCH 24/33] revert system->developer role conversion in LMCache traces Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 9 +-------- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 9 +-------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 1d1c3154d..926bda021 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,16 +98,9 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: - # vLLM v0.18+ follows 
the newer OpenAI API spec where 'system' role - # was renamed to 'developer'. Convert to avoid 400 validation errors. - messages = [] - for msg in row['input']: - if msg.get('role') == 'system': - msg = {**msg, 'role': 'developer'} - messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': messages, + 'messages': row['input'], 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 03fd4402e..9a0c89e5a 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,16 +94,9 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: - # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role - # was renamed to 'developer'. Convert to avoid 400 validation errors. - messages = [] - for msg in row['input']: - if msg.get('role') == 'system': - msg = {**msg, 'role': 'developer'} - messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': messages, + 'messages': row['input'], 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From a7ac440570908ca5f64e71b06b83fec3ea2da444 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:34:47 -0500 Subject: [PATCH 25/33] fix MetricsCollector missing gpu_transfer_collector attribute Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/bench/metrics_collector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 7bcdf31a4..b38653ea8 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -172,6 +172,7 @@ class MetricsCollector: _task: 
asyncio.Task | None = None _parser: VLLMMetricsParser | SGLangMetricsParser | None = None _backend: str = "" + gpu_transfer_collector: object = None def _parse_metrics(self, text: str) -> MetricsSnapshot: """Parse Prometheus metrics text, auto-detecting backend on first call.""" From db87b95fc7eb55e19ec318569e1771369fa6ac28 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:44:27 -0500 Subject: [PATCH 26/33] fix LMCache traces: strip null fields to pass vLLM Pydantic validation The LMCache traces include explicit null values for optional fields (tool_calls, tool_call_id, name) on every message. vLLM's strict Pydantic validation rejects these, causing 100% HTTP 400 errors. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 5 ++++- benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 926bda021..034df4d89 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,9 +98,12 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + # for optional fields like tool_calls, tool_call_id, name + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 9a0c89e5a..c4d26dd7e 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ 
b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,9 +94,12 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + # for optional fields like tool_calls, tool_call_id, name + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From 07ce85de133bf608d48bd635b3816458a2e5db53 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 13:13:29 -0500 Subject: [PATCH 27/33] use hf download for LMCache traces instead of datasets.load_dataset Avoids flaky streaming downloads that fail mid-transfer. The dataset is now cached via hf download (same as model weights) and read from the local parquet files. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp8_h100_lmcache_aiperf.sh | 58 ++++++++++++------- .../multiturn_fp8_h200_lmcache_aiperf.sh | 58 ++++++++++++------- 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 034df4d89..ae666c37b 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -87,31 +87,45 @@ fi mkdir -p "$RESULT_DIR" -# ---- Convert LMCache traces to mooncake format ----------------------------- -echo "Downloading and converting LMCache traces..." +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
python3 -c " -import json, os -try: +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache from datasets import load_dataset ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') - out_path = '$TRACE_FILE' - sessions = set() - with open(out_path, 'w') as f: - for row in ds: - # Strip None fields — vLLM's Pydantic validation rejects explicit nulls - # for optional fields like tool_calls, tool_call_id, name - messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] - entry = { - 'session_id': row['session_id'], - 'messages': messages, - 'output_length': row['output_length'], - } - f.write(json.dumps(entry) + '\n') - sessions.add(row['session_id']) - print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') -except Exception as e: - print(f'ERROR converting traces: {e}') - exit(1) + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + 
sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index c4d26dd7e..56232cf58 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -83,31 +83,45 @@ fi mkdir -p "$RESULT_DIR" -# ---- Convert LMCache traces to mooncake format ----------------------------- -echo "Downloading and converting LMCache traces..." +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." python3 -c " -import json, os -try: +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache from datasets import load_dataset ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') - out_path = '$TRACE_FILE' - sessions = set() - with open(out_path, 'w') as f: - for row in ds: - # Strip None fields — vLLM's Pydantic validation rejects explicit nulls - # for optional fields like tool_calls, tool_call_id, name - messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] - entry = { - 'session_id': row['session_id'], - 'messages': messages, - 'output_length': row['output_length'], - } - 
f.write(json.dumps(entry) + '\n') - sessions.add(row['session_id']) - print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') -except Exception as e: - print(f'ERROR converting traces: {e}') - exit(1) + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" From 195ca66d90e2dd14412bbeee38fa1ee612949832 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:00:04 -0500 Subject: [PATCH 28/33] add B200 FP4 multiturn benchmark script using aiperf Based on H100 aiperf script with B200-specific changes: - TORCH_CUDA_ARCH_LIST=10.0 (Blackwell) - B200 compilation config (FULL_DECODE_ONLY cudagraphs, custom ops) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_b200_lmcache_aiperf.sh | 248 ++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh new file mode 100755 index 000000000..2e8164f3f --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -0,0 +1,248 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP4 
models on B200 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. +# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) + +source "$(dirname "$0")/../benchmark_lib.sh" + +export CUDA_LAUNCH_BLOCKING=1 + +ulimit -a + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with 
open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +mkdir -p "$RESULT_DIR" + +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
+python3 -c " +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 
--port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From 09e6ec1c9746f86563e178aebde9681e04899cae Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:09:13 -0500 Subject: [PATCH 29/33] add entry for b200 ds --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 63892d202..f371c5625 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -42,12 +42,12 @@ h100-fp8-llama70b-lmcache: users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] -b200-fp4-dsr1: +b200-fp4-dsr1-weka-trace: tp4: ep: 4 - users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128] + users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] tp8: ep: 8 - users: [1, 2, 4, 8, 12, 16, 32, 64, 128] + users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] offload: ["on", "off"] From 951326a2b5cf281c7b057b18b75558eb01a70b20 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:09:47 -0500 Subject: [PATCH 30/33] add expert parallel support to B200 FP4 aiperf script Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 2e8164f3f..5acba8a73 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -146,6 +146,10 @@ VLLM_CMD+=" --config $RESULT_DIR/config.yaml" VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + if [ "$OFFLOAD_MODE" = "on" ]; then 
VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" From 0100fa1bc6ed4326c73de918088feddef542471c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:32:48 -0500 Subject: [PATCH 31/33] skip LMCache trace entries with empty messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataset was updated (24K → 74K rows) and now includes entries with empty message lists, causing aiperf MooncakeTrace validation to fail. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 8 +++++++- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 8 +++++++- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 5acba8a73..0df4efb0c 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -114,10 +114,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -125,7 +129,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git 
a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index ae666c37b..b81105d5b 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -114,10 +114,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -125,7 +129,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 56232cf58..e3acd1bb0 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -110,10 +110,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -121,7 +125,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to 
{out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" From 110dfa4803fcdf6d529baecfb5ca6598bdc8516b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 17:56:15 -0500 Subject: [PATCH 32/33] fix: prioritize aiperf summary CSV over malformed client CSV Both collect_sweep_results.py and plot_pareto.py were trying to load metrics_client_metrics.csv first, which fails with "Expected 15 fields, saw 19" on aiperf runs. Now aiperf summary CSV is checked first. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 4 ++-- .../scripts/collect_sweep_results.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 7da67c8a4..081c98ebd 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -145,8 +145,8 @@ def load_experiment_data(exp_dir: Path) -> dict | None: if final_row["cpu_prefix_cache_queries"] > 0: cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] - # Use aiperf summary CSV directly if available - if aiperf_summary_csv is not None and not client_metrics_file.exists(): + # Use aiperf summary CSV directly if available (preferred over client CSV) + if aiperf_summary_csv is not None: exp_name = exp_dir.name parts = exp_name.split("_") tp = int(parts[0].replace("tp", "")) diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index 9910fb8ff..28f115f47 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ 
b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -155,8 +155,13 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: custom client CSV, aiperf summary CSV, or trace replay CSV - if client_csv.exists(): + # Determine data source: aiperf summary CSV (preferred), custom client CSV, or trace replay CSV + if aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) + elif client_csv.exists(): df = _load_custom_client_csv(client_csv, exp_dir) if df is None or len(df) == 0: return result @@ -199,11 +204,6 @@ def load_experiment(exp_dir: Path) -> dict | None: "p90_latency_ms": df["latency_ms"].quantile(0.9), "p99_latency_ms": df["latency_ms"].quantile(0.99), }) - elif aiperf_summary_csv is not None: - aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) - if aiperf_metrics is None: - return result - result.update(aiperf_metrics) elif trace_replay_csv.exists(): df = _load_trace_replay_csv(trace_replay_csv) if df is None or len(df) == 0: From c64e644b4b1af0a0bd6c7eb0a364d455ec02db71 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 17:58:44 -0500 Subject: [PATCH 33/33] fix aiperf CSV parser: handle multi-section format with different column counts The profile_export_aiperf.csv has 3 sections (per-metric stats, scalar values, GPU metrics) with different column counts. pd.read_csv choked on the GPU section (19 cols vs 14). Parse manually by splitting on column count changes. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 35 +++++++++++---- .../scripts/collect_sweep_results.py | 43 +++++++++++++------ 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 081c98ebd..90b7ed1f8 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -21,21 +21,40 @@ def _load_aiperf_summary_csv(csv_path: Path, exp_dir: Path, tp: int, gpu_hit_rate: float | None, cpu_hit_rate: float | None) -> dict | None: """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv.""" - df = pd.read_csv(csv_path) - if len(df) == 0: + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: return None - per_metric = df[df["avg"].notna()].set_index("Metric") - scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + scalars[parts[0]] = parts[1] + else: + break def metric_stat(metric_name, stat): - if metric_name in per_metric.index: - return float(per_metric.loc[metric_name, stat]) + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 return 0 def scalar_val(metric_name): - if metric_name in scalars.index: - return float(scalars.loc[metric_name, "min"]) + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 return 0 
exp_name = exp_dir.name diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index 28f115f47..89cf990f3 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -39,25 +39,44 @@ def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: Returns a dict with pre-computed metrics matching the result schema, or None if the file can't be parsed. """ - df = pd.read_csv(csv_path) - if len(df) == 0: + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: return None - # The CSV has two sections: - # 1. Per-metric rows with columns: Metric, avg, min, max, sum, p1..p99, std - # 2. Scalar rows with columns: Metric, Value - # Split by finding rows where only Metric and Value are populated - per_metric = df[df["avg"].notna()].set_index("Metric") - scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + # Section 1: per-metric stats (header + data rows with 14 columns) + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + # Per-metric row + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + # Scalar row (Metric, Value) + scalars[parts[0]] = parts[1] + else: + # Different section (GPU metrics) — stop + break def metric_stat(metric_name, stat): - if metric_name in per_metric.index: - return float(per_metric.loc[metric_name, stat]) + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 return 0 def scalar_val(metric_name): - if metric_name in scalars.index: - 
return float(scalars.loc[metric_name, "min"]) # "min" column holds Value + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 return 0 return {