From e332f90acb05c7aeeac38c89577d75b42956bd36 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 15:25:06 -0500 Subject: [PATCH 01/33] add agentic trace replay benchmark infrastructure Trace replay benchmarking for agentic coding workloads using real Claude Code traces. Includes: - Trace replay scripts for H200, MI355X, B200 (vLLM-based) - kv-cache-tester submodule (trace replayer + 522 anonymized traces) - AIPerf submodule (alternative synthetic benchmarking) - Pareto frontier plotting and sweep aggregation - Metrics collector (prometheus scraper + visualization) - Workload distribution analysis - GitHub Actions workflow with per-TP sweep configs - MI355X runner SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 31 + .../workflows/benchmark-multiturn-tmpl.yml | 184 +++ .github/workflows/multiturn-sweep.yml | 231 +++ .gitmodules | 6 + .../multiturn_fp4_b200_trace_replay.sh | 210 +++ .../multiturn_fp8_h200_trace_replay.sh | 206 +++ .../multiturn_fp8_mi355x_trace_replay.sh | 207 +++ .../multiturn/vllm_benchmark/.gitignore | 4 + experimental/multiturn/vllm_benchmark/aiperf | 1 + .../vllm_benchmark/analysis/__init__.py | 0 .../vllm_benchmark/analysis/plot_pareto.py | 1247 +++++++++++++++++ .../vllm_benchmark/bench/__init__.py | 0 .../vllm_benchmark/bench/metrics_collector.py | 957 +++++++++++++ .../bench/run_metrics_collector.py | 124 ++ .../multiturn/vllm_benchmark/kv-cache-tester | 1 + .../multiturn/vllm_benchmark/requirements.txt | 9 + .../analyze_benchmark_distributions.py | 395 ++++++ .../scripts/collect_sweep_results.py | 340 +++++ .../scripts/plot_sweep_overview.py | 222 +++ runners/launch_mi355x-amds.sh | 4 +- 20 files changed, 4377 insertions(+), 2 deletions(-) create mode 100644 .github/configs/multiturn-agentic-trace.yaml create mode 100644 .github/workflows/benchmark-multiturn-tmpl.yml create mode 100644 .github/workflows/multiturn-sweep.yml create mode 100644 .gitmodules 
create mode 100755 benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh create mode 100755 benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh create mode 100755 benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh create mode 100644 experimental/multiturn/vllm_benchmark/.gitignore create mode 160000 experimental/multiturn/vllm_benchmark/aiperf create mode 100644 experimental/multiturn/vllm_benchmark/analysis/__init__.py create mode 100644 experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/__init__.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/metrics_collector.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py create mode 160000 experimental/multiturn/vllm_benchmark/kv-cache-tester create mode 100644 experimental/multiturn/vllm_benchmark/requirements.txt create mode 100644 experimental/multiturn/vllm_benchmark/scripts/analyze_benchmark_distributions.py create mode 100755 experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py create mode 100644 experimental/multiturn/vllm_benchmark/scripts/plot_sweep_overview.py diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml new file mode 100644 index 000000000..5ec98b902 --- /dev/null +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -0,0 +1,31 @@ +h200-fp8-llama70b: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + offload: ["on", "off"] + tp4: + users: [2, 4, 6, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp8: + users: [2, 4, 6, 8, 16, 32, 48, 64, 80, 128, 256] + offload: ["on", "off"] + +mi355x-fp8-llama70b: + tp2: + users: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp4: + users: [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 256] + offload: ["on", "off"] + tp8: + users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] + offload: ["on", "off"] + +b200-fp4-dsr1: + tp4: + 
ep: 4 + users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128] + offload: ["on", "off"] + tp8: + ep: 8 + users: [1, 2, 4, 8, 12, 16, 32, 64, 128] + offload: ["on", "off"] diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml new file mode 100644 index 000000000..a72034b14 --- /dev/null +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -0,0 +1,184 @@ +name: Template - Multi-Turn Benchmark +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + precision: + required: false + type: string + default: 'fp4' + exp-name: + required: true + type: string + tp: + required: true + type: string + users: + required: true + type: string + offload-mode: + description: "on = prefix+offload, off = prefix only, noprefix = no prefix caching" + required: true + type: string + duration: + required: false + type: string + default: '' + request-rate: + description: "Request rate per client (Poisson, req/s). 0 = no delay." + required: false + type: string + default: '0' + total-cpu-dram-gb: + required: false + type: string + default: '300' + script-suffix: + description: "Suffix appended to benchmark script name (e.g. 
'_lmcache')" + required: false + type: string + default: '' + ep: + description: "Expert parallelism size (for MoE models)" + required: false + type: string + default: '0' + ref: + description: "Git ref (branch/sha) to checkout" + required: false + type: string + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + IMAGE: ${{ inputs.image }} + PRECISION: ${{ inputs.precision }} + FRAMEWORK: 'vllm' + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + USERS: ${{ inputs.users }} + OFFLOAD_MODE: ${{ inputs.offload-mode }} + DURATION: ${{ inputs.duration }} + REQUEST_RATE: ${{ inputs.request-rate }} + TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} + SCRIPT_SUFFIX: ${{ inputs.script-suffix }} + SPEC_DECODING: 'off' + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ${{ inputs.runner }} + timeout-minutes: 180 + name: "${{ inputs.exp-name }} tp=${{ inputs.tp }} users=${{ inputs.users }} offload=${{ inputs.offload-mode }}" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + # Cleanup Docker resources + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." 
+ docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + # Cleanup SLURM resources + if command -v squeue >/dev/null 2>&1; then + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." + scancel -u "$USER" || true + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do + squeue -u "$USER" + sleep 5 + done + fi + fi + + - name: Clean stale git locks + run: find . -name 'index.lock' -delete 2>/dev/null || true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + submodules: true + + + - name: Launch job script + env: + RUNNER_NAME: ${{ runner.name }} + RESULT_DIR: /workspace/results + run: | + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + # The runner script doesn't propagate exit codes (scancel masks them). + # Check status.txt to determine if the benchmark actually succeeded. + if [ ! -f results/status.txt ]; then + echo "Run failed: results/status.txt not found." 
>&2 + exit 1 + fi + STATUS=$(cat results/status.txt) + if [ "$STATUS" != "SUCCESS" ]; then + echo "Run failed: status=$STATUS" >&2 + cat results/benchmark.log 2>/dev/null || true + exit 1 + fi + + - name: Upload results + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: "multiturn_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}" + path: | + results/metrics_client_metrics.csv + results/metrics_server_metrics.csv + results/metrics_plots.png + results/benchmark.log + results/server.log + results/config.yaml + results/vllm_command.txt + results/benchmark_command.txt + results/benchmark_metadata.json + results/metrics_workload.png + results/responses.json + results/aiperf_artifacts/ + results/conversations.jsonl + results/workload_distribution_summary.txt + results/workload_distribution_plots.png + results/trace_replay/ + results/status.txt + if-no-files-found: ignore + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: "server_logs_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}" + path: results/server.log + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml new file mode 100644 index 000000000..5ed7bf59e --- /dev/null +++ b/.github/workflows/multiturn-sweep.yml @@ -0,0 +1,231 @@ +name: Multi-Turn Benchmark Sweep +run-name: "${{ inputs.run_name || format('Multi-Turn Sweep - tp={0} users={1} offload={2}', inputs.tp_values, inputs.user_values, inputs.offload_values) }}" + +on: + # push: + # branches: + # - experimental/multi-turn-benchmark + # paths: + # - .github/workflows/multiturn-sweep.yml + workflow_dispatch: + inputs: + run_name: + description: 'Custom run name (optional)' + required: false + default: '' + type: 
string + tp_values: + description: 'TP sizes (JSON array)' + required: true + default: '[1, 2, 4, 8]' + type: string + user_values: + description: 'Concurrent user counts (JSON array). Ignored if config_file is set.' + required: false + default: '[8, 16, 32, 64, 128, 256, 512, 1024, 2048]' + type: string + offload_values: + description: 'Offload modes (JSON array: on/off/noprefix). Ignored if config_file is set.' + required: false + default: '["on", "off", "noprefix"]' + type: string + config_file: + description: 'YAML config with per-TP sweep settings (e.g. .github/configs/multiturn-agentic-trace.yaml). Overrides tp/user/offload values.' + required: false + default: '' + type: string + config_key: + description: 'Top-level key in config_file to use (e.g. h200-fp8-llama70b, b200-fp4-dsr1). Required if config_file has multiple entries.' + required: false + default: '' + type: string + duration: + description: 'Benchmark duration in seconds (optional, runs to completion if omitted)' + required: false + default: '' + type: string + request_rate: + description: 'Request rate per client (Poisson, req/s). 0 = no delay.' + required: false + default: '0' + type: string + total_cpu_dram_gb: + description: 'Total CPU DRAM for KV offload (GB)' + required: true + default: '100' + type: string + image: + description: 'Container image' + required: true + default: 'vllm/vllm-openai:v0.18.0' + type: string + model: + description: 'Model name' + required: true + default: 'nvidia/Llama-3.3-70B-Instruct-FP4' + type: string + precision: + description: 'Model precision (fp4, fp8, etc.) — used to select benchmark script' + required: false + default: 'fp4' + type: string + script_suffix: + description: 'Suffix for benchmark script (e.g. "_lmcache" → multiturn_fp4_b200_lmcache.sh)' + required: false + default: '' + type: string + runner: + description: 'Runner label (e.g. 
b200, h200-dgxc-slurm)' + required: false + default: 'b200' + type: string + ep: + description: 'Expert parallelism size (for MoE models, default 0 = disabled)' + required: false + default: '0' + type: string + ref: + description: 'Git ref (branch/sha) to checkout' + required: false + type: string + +jobs: + # --------------------------------------------------------------------------- + # Generate matrix from config file or CLI inputs + # --------------------------------------------------------------------------- + generate-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.gen.outputs.matrix }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + if: ${{ inputs.config_file != '' }} + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 1 + ref: ${{ inputs.ref || github.ref }} + sparse-checkout: ${{ inputs.config_file }} + + - id: gen + run: | + pip install -q pyyaml + python3 << 'PYEOF' + import json, os, sys + + config_file = "${{ inputs.config_file }}".strip() + + if config_file: + import yaml + with open(config_file) as f: + full_config = yaml.safe_load(f) + + config_key = "${{ inputs.config_key }}".strip() + + # If config_key specified, use that section; otherwise auto-detect + if config_key and config_key in full_config: + config = full_config[config_key] + elif config_key: + print(f"ERROR: config_key '{config_key}' not found. Available: {list(full_config.keys())}") + sys.exit(1) + elif len(full_config) == 1: + config = next(iter(full_config.values())) + else: + # Check if top-level keys look like tp entries (tp2, tp4, etc.) + if all(k.startswith("tp") for k in full_config): + config = full_config + else: + print(f"ERROR: Multiple entries in config, specify --config_key. 
Available: {list(full_config.keys())}") + sys.exit(1) + + includes = [] + for key, settings in config.items(): + tp = int(key.replace("tp", "")) + users = settings.get("users", []) + offloads = settings.get("offload", ["on", "off"]) + ep = settings.get("ep", 0) + for u in users: + for o in offloads: + entry = {"tp": tp, "users": u, "offload": o} + if ep > 0: + entry["ep"] = ep + includes.append(entry) + else: + tp_values = json.loads('${{ inputs.tp_values }}') + user_values = json.loads('${{ inputs.user_values }}') + offload_values = json.loads('${{ inputs.offload_values }}') + includes = [] + for tp in tp_values: + for u in user_values: + for o in offload_values: + includes.append({"tp": tp, "users": u, "offload": o}) + + matrix = {"include": includes} + print(f"Generated {len(includes)} matrix entries") + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"matrix={json.dumps(matrix)}\n") + PYEOF + + # --------------------------------------------------------------------------- + # Matrix benchmark jobs — each cell calls the multiturn template + # --------------------------------------------------------------------------- + sweep: + needs: generate-matrix + uses: ./.github/workflows/benchmark-multiturn-tmpl.yml + name: sweep / + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }} + secrets: inherit + with: + runner: ${{ inputs.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + precision: ${{ inputs.precision }} + exp-name: "multiturn_tp${{ matrix.tp }}_users${{ matrix.users }}_offload${{ matrix.offload }}" + tp: "${{ matrix.tp }}" + users: "${{ matrix.users }}" + offload-mode: ${{ matrix.offload }} + duration: ${{ inputs.duration }} + request-rate: ${{ inputs.request_rate }} + total-cpu-dram-gb: ${{ inputs.total_cpu_dram_gb }} + script-suffix: ${{ inputs.script_suffix }} + ep: "${{ matrix.ep || inputs.ep }}" + ref: ${{ inputs.ref }} + + # 
--------------------------------------------------------------------------- + # Collect & aggregate results + # --------------------------------------------------------------------------- + collect: + runs-on: ubuntu-latest + needs: sweep + if: always() + name: Collect results + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 1 + ref: ${{ inputs.ref || github.ref }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install pandas matplotlib numpy + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + pattern: 'multiturn_*' + path: results/ + + - name: Run aggregation + run: | + python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/ + + - name: Upload aggregated results + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: multiturn_aggregated + path: aggregated/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..c45593c07 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "experimental/multiturn/vllm_benchmark/aiperf"] + path = experimental/multiturn/vllm_benchmark/aiperf + url = https://github.com/cquil11/aiperf.git +[submodule "experimental/multiturn/vllm_benchmark/kv-cache-tester"] + path = experimental/multiturn/vllm_benchmark/kv-cache-tester + url = https://github.com/cquil11/kv-cache-tester.git diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh new file mode 100755 index 000000000..d22448892 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP4 models on B200. +# Replays real agentic coding traces at a fixed number of concurrent users. 
+# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n 
)', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +compilation-config: 
'{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh new file mode 100755 index 000000000..f3f967a82 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -0,0 +1,206 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP8 models on H200. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# 
(causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n 
continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." 
+python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh new file mode 100755 index 000000000..4cf20c453 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP8 models on MI355X. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi 2>/dev/null || rocm-smi 2>/dev/null || true + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: 
local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring 
finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +# MI355X is ROCm — no CUDA arch needed +# export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." 
+python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore new file mode 100644 index 000000000..a0c3ca327 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/.gitignore @@ -0,0 +1,4 @@ +*.png +*.json +*.parquet +results/ \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf b/experimental/multiturn/vllm_benchmark/aiperf new file mode 160000 index 000000000..373218fb3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf @@ -0,0 +1 @@ +Subproject commit 373218fb3c3d15fada9c4be6465daf8fb5a70ef6 diff --git a/experimental/multiturn/vllm_benchmark/analysis/__init__.py b/experimental/multiturn/vllm_benchmark/analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py new file mode 100644 index 000000000..277bfca7f --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -0,0 +1,1247 @@ +#!/usr/bin/env python3 +""" +Plot Pareto frontiers for prefix caching modes. +Modes: on (prefix + offload), off (prefix only), noprefix (no prefix caching) +Pareto frontier: throughput vs latency trade-off. 
+ +Usage: + python plot_pareto.py + python plot_pareto.py ~/sweep_results_20260204_062339 +""" + +import json +import sys +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + + +def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from aiperf profile_export JSONL.""" + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + meta = entry.get("metadata", {}) + metrics = entry.get("metrics", {}) + + if meta.get("benchmark_phase") != "profiling": + continue + if meta.get("was_cancelled", False): + continue + + def val(key, default=0): + m = metrics.get(key) + if m is None: + return default + return m.get("value", default) if isinstance(m, dict) else m + + itl = metrics.get("inter_token_latency") + if itl and isinstance(itl, dict): + tpot_ms = itl.get("value", 0) + else: + osl = val("output_sequence_length", 1) + ttft = val("time_to_first_token", 0) + latency = val("request_latency", 0) + tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 + + start_ns = meta.get("request_start_ns", 0) + start_ms = start_ns / 1e6 + + records.append({ + "start_time_ms": start_ms, + "ttft_ms": val("time_to_first_token"), + "tpot_ms": tpot_ms, + "latency_ms": val("request_latency"), + "input_num_tokens": val("input_sequence_length"), + "output_num_tokens": val("output_sequence_length"), + }) + + if not records: + return None + return pd.DataFrame(records) + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + records = 
pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + return records + + +def load_experiment_data(exp_dir: Path) -> dict | None: + """Load and aggregate metrics from an experiment directory.""" + client_metrics_file = exp_dir / "metrics_client_metrics.csv" + server_metrics_file = exp_dir / "metrics_server_metrics.csv" + status_file = exp_dir / "status.txt" + + # Check if experiment completed successfully + if not status_file.exists(): + return None + status = status_file.read_text().strip() + if status != "SUCCESS": + return None + + # Also check for aiperf output + aiperf_jsonl = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) + if not candidates: + candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) + if candidates: + aiperf_jsonl = candidates[0] + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_metrics_file.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + return None + + try: + if client_metrics_file.exists(): + df = pd.read_csv(client_metrics_file) + elif aiperf_jsonl is not None: + df = _load_aiperf_jsonl(aiperf_jsonl) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return None + + # Load server metrics for cache hit rates + gpu_hit_rate = None + cpu_hit_rate = None + if server_metrics_file.exists(): + server_df = pd.read_csv(server_metrics_file) + # Get final cumulative values + final_row = server_df.iloc[-1] + if final_row["prefix_cache_queries"] > 0: + gpu_hit_rate = 100 * final_row["prefix_cache_hits"] / final_row["prefix_cache_queries"] + if final_row["cpu_prefix_cache_queries"] > 0: + 
cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] + if len(df) == 0: + return None + + # Parse experiment name: tp{N}_bs{M}_offload{on|off} + exp_name = exp_dir.name + parts = exp_name.split("_") + tp = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + offload = parts[2].replace("offload", "") + + # Calculate metrics + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + # Fallback: derive from per-request data (first start to last finish) + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 # fallback + + num_requests = len(df) + throughput_rps = num_requests / total_time_sec if total_time_sec > 0 else 0 + + # Input token throughput (prefill) + total_input_tokens = df["input_num_tokens"].sum() + input_throughput_tps = total_input_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Output token throughput (decode only) + total_output_tokens = df["output_num_tokens"].sum() + output_throughput_tps = total_output_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Total token throughput (input + output) + total_tokens = total_input_tokens + total_output_tokens + total_throughput_tps = total_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Normalized throughput (per GPU) + input_tps_per_gpu = input_throughput_tps / tp + output_tps_per_gpu = output_throughput_tps / tp + total_tps_per_gpu = total_throughput_tps / tp + + return { + "exp_name": exp_name, + "tp": tp, + "bs": 
bs, + "offload": offload, + "num_requests": num_requests, + "throughput_rps": throughput_rps, + "input_throughput_tps": input_throughput_tps, + "total_throughput_tps": total_throughput_tps, + "input_tps_per_gpu": input_tps_per_gpu, + "output_tps_per_gpu": output_tps_per_gpu, + "total_tps_per_gpu": total_tps_per_gpu, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "p999_tpot_ms": df["tpot_ms"].quantile(0.999), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + "p999_latency_ms": df["latency_ms"].quantile(0.999), + "p999_ttft_ms": df["ttft_ms"].quantile(0.999), + # Cache hit rates + "gpu_hit_rate": gpu_hit_rate, + "cpu_hit_rate": cpu_hit_rate, + } + except Exception as e: + print(f"Error loading {exp_dir}: {e}") + return None + + +def compute_pareto_frontier(points: list[tuple[float, float]], maximize_x: bool = False) -> list[tuple[float, float]]: + """ + Compute Pareto frontier for (x, y) points. + Y is always maximized. X is minimized by default, or maximized if maximize_x=True. + + For minimize X, maximize Y (e.g., latency vs throughput): + - Frontier goes bottom-left to top-right + - Low latency = low throughput, high latency = high throughput + + For maximize X, maximize Y (e.g., interactivity vs throughput): + - Frontier goes top-left to bottom-right + - Trade-off between the two "goods" + + Returns points sorted by X ascending for plotting. 
+ """ + if not points: + return [] + + # Remove invalid points + points = [(x, y) for x, y in points if x > 0 and y > 0] + if not points: + return [] + + frontier = [] + sorted_points = sorted(points, key=lambda p: p[0]) + + if maximize_x: + # Maximize both X and Y: frontier goes top-left to bottom-right + # Traverse from high X to low X, keep points with increasing Y + max_y = float('-inf') + for x, y in reversed(sorted_points): + if y > max_y: + frontier.append((x, y)) + max_y = y + return sorted(frontier, key=lambda p: p[0]) + else: + # Minimize X, maximize Y: frontier goes bottom-left to top-right + # Traverse from low X to high X, keep points with increasing Y + max_y = float('-inf') + for x, y in sorted_points: + if y > max_y: + frontier.append((x, y)) + max_y = y + return frontier + + +def compute_pareto_frontier_with_metadata(df_subset: pd.DataFrame, x_col: str, y_col: str, maximize_x: bool = False) -> pd.DataFrame: + """ + Compute Pareto frontier and return the rows from the dataframe that are on the frontier. 
+ """ + if len(df_subset) == 0: + return pd.DataFrame() + + # Get valid points + valid_mask = (df_subset[x_col] > 0) & (df_subset[y_col] > 0) + df_valid = df_subset[valid_mask].copy() + + if len(df_valid) == 0: + return pd.DataFrame() + + # Sort by x + df_sorted = df_valid.sort_values(x_col).reset_index(drop=True) + + frontier_indices = [] + max_y = float('-inf') + + if maximize_x: + # Traverse from high X to low X + for i in range(len(df_sorted) - 1, -1, -1): + y = df_sorted.iloc[i][y_col] + if y > max_y: + frontier_indices.append(i) + max_y = y + frontier_indices = frontier_indices[::-1] # Reverse to get ascending X order + else: + # Traverse from low X to high X + for i in range(len(df_sorted)): + y = df_sorted.iloc[i][y_col] + if y > max_y: + frontier_indices.append(i) + max_y = y + + return df_sorted.iloc[frontier_indices] + + +def generate_pareto_only_figure(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with concurrency labels.""" + + # Compute interactivity + df = df.copy() + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers Only (with Concurrency Labels)", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input 
Throughput/GPU (tok/s)", False),
+        (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True),
+        (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
+        (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True),
+    ]
+
+    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs:
+        for col, mode in enumerate(available_modes):
+            ax = axes[row, col]
+            df_subset = df_subsets[mode]
+            title = f"{metric_name} ({mode_titles.get(mode, mode)})"
+
+            # Get Pareto frontier points with metadata
+            frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x)
+
+            if len(frontier_df) > 0:
+                # Plot frontier line
+                ax.plot(frontier_df[x_col], frontier_df[y_col],
+                        linestyle='-', linewidth=2, alpha=0.5, color="black")
+
+                # Plot points colored by TP
+                for tp in sorted(frontier_df["tp"].unique()):
+                    tp_data = frontier_df[frontier_df["tp"] == tp]
+                    ax.scatter(tp_data[x_col], tp_data[y_col],
+                               c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"),
+                               s=150, alpha=0.9, edgecolors="black", linewidths=1,
+                               label=f"TP={tp}", zorder=5)
+
+                # Add concurrency labels
+                for _, point in frontier_df.iterrows():
+                    ax.annotate(f"conc={point['bs']}",
+                                (point[x_col], point[y_col]),
+                                textcoords="offset points",
+                                xytext=(5, 5),
+                                fontsize=8,
+                                alpha=0.8)
+
+            ax.set_xlabel(x_label)
+            ax.set_ylabel(y_label)
+            ax.set_title(title)
+            ax.grid(True, alpha=0.3)
+            if len(frontier_df) > 0:
+                ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right")
+
+    plt.tight_layout()
+
+    output_file = results_dir / "pareto_frontiers_clean.png"
+    plt.savefig(output_file, dpi=150, bbox_inches='tight')
+    
print(f"Saved clean Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_only_figure_p90(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p90 latencies.""" + + df = df.copy() + df["interactivity_p90"] = 1000.0 / df["p90_tpot_ms"] + + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P90 Latencies) with Concurrency Labels", fontsize=14) + + if num_cols == 1: + axes = axes.reshape(-1, 1) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + metrics_configs = [ + (0, "p90_ttft_ms", "input_tps_per_gpu", "TTFT", "P90 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p90", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P90 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p90_latency_ms", "total_tps_per_gpu", "E2E Latency", "P90 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p90", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P90 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + 
ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean_p90.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean P90 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure_p90(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid using p90 latencies.""" + + df = df.copy() + df["interactivity_p90"] = 1000.0 / df["p90_tpot_ms"] + + available_modes = df["offload"].unique() + + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), + "off": ("--", "none", "gray", (5, -12), "italic"), + "noprefix": (":", "red", "red", (5, -25), "oblique"), + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers (P90 Latencies): Mode Comparison", fontsize=14) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + plot_configs = [ + (0, "p90_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P90 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p90", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P90 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p90_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total 
Throughput/GPU", "P90 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p90", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P90 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + label = f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay_p90.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved overlay P90 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_only_figure_p99(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p99 latencies.""" + + # Compute interactivity using p99 + df = 
df.copy() + df["interactivity_p99"] = 1000.0 / df["p99_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P99 Latencies) with Concurrency Labels", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p99_ttft_ms", "input_tps_per_gpu", "TTFT", "P99 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p99", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P99 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p99_latency_ms", "total_tps_per_gpu", "E2E Latency", "P99 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p99", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P99 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + # Get Pareto frontier points with metadata + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + # Plot frontier line + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + # Plot points colored by TP + for tp in 
def generate_pareto_overlay_figure_p99(df: pd.DataFrame, results_dir: Path):
    """Overlay the per-mode Pareto frontiers using P99 latencies.

    Draws a 4x1 figure (TTFT, interactivity, E2E latency, output
    throughput), one frontier line per prefix-cache mode, points colored
    by TP degree and annotated with concurrency. Writes
    ``pareto_frontiers_overlay_p99.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the P99 TPOT.
    data = df.copy()
    data["interactivity_p99"] = 1000.0 / data["p99_tpot_ms"]

    present_modes = data["offload"].unique()

    # Per-mode style: (linestyle, marker edge, line color, label offset, font style).
    style_by_mode = {
        "on": ("-", "black", "black", (5, 8), "normal"),
        "off": ("--", "none", "gray", (5, -12), "italic"),
        "noprefix": (":", "red", "red", (5, -25), "oblique"),
    }
    label_by_mode = {
        "on": "Prefix+Offload",
        "off": "Prefix Only",
        "noprefix": "No Prefix",
    }

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle("Pareto Frontiers (P99 Latencies): Mode Comparison", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, title, x label, y label, maximize_x)
    panel_specs = [
        (0, "p99_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P99 TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity_p99", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P99 TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p99_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P99 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity_p99", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P99 TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, title, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        for mode in ["on", "off", "noprefix"]:
            if mode not in present_modes:
                continue

            ls, marker_edge, line_color, offset, font_style = style_by_mode[mode]
            frontier = compute_pareto_frontier_with_metadata(
                data[data["offload"] == mode], x_col, y_col, maximize_x)
            if len(frontier) == 0:
                continue

            # Frontier line for this mode.
            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle=ls, linewidth=2, alpha=0.6, color=line_color,
                    label=f"Pareto ({label_by_mode[mode]})")

            # Frontier points, colored by TP; legend TP entries only once
            # (for the "on" mode) to avoid triplicated labels.
            for tp in sorted(frontier["tp"].unique()):
                pts = frontier[frontier["tp"] == tp]
                ax.scatter(pts[x_col], pts[y_col],
                           c=color_for_tp.get(tp, "purple"),
                           marker=marker_for_tp.get(tp, "x"),
                           s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5,
                           label=f"TP={tp}" if mode == "on" else None, zorder=5)

            # Concurrency annotations, offset per mode so they don't collide.
            for _, point in frontier.iterrows():
                ax.annotate(f"conc={point['bs']}",
                            (point[x_col], point[y_col]),
                            textcoords="offset points",
                            xytext=offset,
                            fontsize=7,
                            alpha=0.7,
                            style=font_style)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_overlay_p99.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved overlay P99 Pareto plot to {output_file}")
    plt.close()
def generate_pareto_only_figure_p999(df: pd.DataFrame, results_dir: Path):
    """Plot only the Pareto-frontier points at P99.9 latencies.

    One column per prefix-cache mode, four metric rows; frontier points
    are colored by TP and labeled with their concurrency. Writes
    ``pareto_frontiers_clean_p999.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the P99.9 TPOT.
    data = df.copy()
    data["interactivity_p999"] = 1000.0 / data["p999_tpot_ms"]

    modes = sorted(data["offload"].unique())
    mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"}
    per_mode = {mode: data[data["offload"] == mode] for mode in modes}

    num_cols = len(modes)
    fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18))
    fig.suptitle("Pareto Frontiers (P99.9 Latencies) with Concurrency Labels", fontsize=14)

    # With a single mode, subplots returns a 1-D axes array; normalize to 2-D.
    if num_cols == 1:
        axes = axes.reshape(-1, 1)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, metric name, x label, y label, maximize_x)
    panel_specs = [
        (0, "p999_ttft_ms", "input_tps_per_gpu", "TTFT", "P99.9 TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity_p999", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P99.9 TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p999_latency_ms", "total_tps_per_gpu", "E2E Latency", "P99.9 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity_p999", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P99.9 TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in panel_specs:
        for col, mode in enumerate(modes):
            ax = axes[row, col]
            frontier = compute_pareto_frontier_with_metadata(
                per_mode[mode], x_col, y_col, maximize_x)

            if len(frontier) > 0:
                # Frontier line.
                ax.plot(frontier[x_col], frontier[y_col],
                        linestyle='-', linewidth=2, alpha=0.5, color="black")

                # Frontier points, colored by TP.
                for tp in sorted(frontier["tp"].unique()):
                    pts = frontier[frontier["tp"] == tp]
                    ax.scatter(pts[x_col], pts[y_col],
                               c=color_for_tp.get(tp, "purple"),
                               marker=marker_for_tp.get(tp, "x"),
                               s=150, alpha=0.9, edgecolors="black", linewidths=1,
                               label=f"TP={tp}", zorder=5)

                # Concurrency annotations.
                for _, point in frontier.iterrows():
                    ax.annotate(f"conc={point['bs']}",
                                (point[x_col], point[y_col]),
                                textcoords="offset points",
                                xytext=(5, 5),
                                fontsize=8,
                                alpha=0.8)

            ax.set_xlabel(x_label)
            ax.set_ylabel(y_label)
            ax.set_title(f"{metric_name} ({mode_titles.get(mode, mode)})")
            ax.grid(True, alpha=0.3)
            if len(frontier) > 0:
                ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_clean_p999.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved clean P99.9 Pareto plot to {output_file}")
    plt.close()
def generate_pareto_overlay_figure_p999(df: pd.DataFrame, results_dir: Path):
    """Overlay the per-mode Pareto frontiers using P99.9 latencies.

    Same layout as the P99 overlay figure, but every latency/interactivity
    metric uses the P99.9 percentile. Writes
    ``pareto_frontiers_overlay_p999.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the P99.9 TPOT.
    data = df.copy()
    data["interactivity_p999"] = 1000.0 / data["p999_tpot_ms"]

    present_modes = data["offload"].unique()

    # Per-mode style: (linestyle, marker edge, line color, label offset, font style).
    style_by_mode = {
        "on": ("-", "black", "black", (5, 8), "normal"),
        "off": ("--", "none", "gray", (5, -12), "italic"),
        "noprefix": (":", "red", "red", (5, -25), "oblique"),
    }
    label_by_mode = {
        "on": "Prefix+Offload",
        "off": "Prefix Only",
        "noprefix": "No Prefix",
    }

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle("Pareto Frontiers (P99.9 Latencies): Mode Comparison", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, title, x label, y label, maximize_x)
    panel_specs = [
        (0, "p999_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P99.9 TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity_p999", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P99.9 TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p999_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P99.9 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity_p999", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P99.9 TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, title, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        for mode in ["on", "off", "noprefix"]:
            if mode not in present_modes:
                continue

            ls, marker_edge, line_color, offset, font_style = style_by_mode[mode]
            frontier = compute_pareto_frontier_with_metadata(
                data[data["offload"] == mode], x_col, y_col, maximize_x)
            if len(frontier) == 0:
                continue

            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle=ls, linewidth=2, alpha=0.6, color=line_color,
                    label=f"Pareto ({label_by_mode[mode]})")

            # TP legend entries only once (for the "on" mode).
            for tp in sorted(frontier["tp"].unique()):
                pts = frontier[frontier["tp"] == tp]
                ax.scatter(pts[x_col], pts[y_col],
                           c=color_for_tp.get(tp, "purple"),
                           marker=marker_for_tp.get(tp, "x"),
                           s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5,
                           label=f"TP={tp}" if mode == "on" else None, zorder=5)

            for _, point in frontier.iterrows():
                ax.annotate(f"conc={point['bs']}",
                            (point[x_col], point[y_col]),
                            textcoords="offset points",
                            xytext=offset,
                            fontsize=7,
                            alpha=0.7,
                            style=font_style)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_overlay_p999.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved overlay P99.9 Pareto plot to {output_file}")
    plt.close()
def generate_combined_pareto_figure(df: pd.DataFrame, results_dir: Path,
                                    percentile: str = "p50"):
    """Generate a combined Pareto frontier across ALL offload modes.

    Points are colored by TP and edge-styled by offload mode so the viewer
    can see both the overall optimal frontier and which config each point
    comes from.

    percentile: one of "p50", "p90", "p99", "p999"
    """
    from matplotlib.lines import Line2D

    pct = percentile  # e.g. "p50"
    # Intentionally a direct lookup: an unknown percentile raises KeyError.
    pct_label = {"p50": "Median", "p90": "P90", "p99": "P99", "p999": "P99.9"}[pct]
    suffix = f"_{pct}"

    # Interactivity = decode tokens/sec implied by the chosen TPOT percentile.
    data = df.copy()
    interactivity_col = f"interactivity{suffix}"
    data[interactivity_col] = 1000.0 / data[f"{pct}_tpot_ms"]

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle(f"Combined Pareto Frontier — {pct_label} SLA (All Configs)", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # Edge styling encodes which offload mode a frontier point came from.
    edge_for_mode = {
        "on": {"edgecolors": "black", "linewidths": 1.8},
        "off": {"edgecolors": "gray", "linewidths": 1.2},
        "noprefix": {"edgecolors": "#cc0000", "linewidths": 1.2},
    }
    short_mode = {"on": "P+O", "off": "P", "noprefix": "NP"}

    # (row, x column, y column, metric name, x label, y label, maximize_x)
    panel_specs = [
        (0, f"{pct}_ttft_ms", "input_tps_per_gpu", "TTFT", f"{pct_label} TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, interactivity_col, "total_tps_per_gpu", "Interactivity", f"Interactivity (1000/{pct_label} TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, f"{pct}_latency_ms", "total_tps_per_gpu", "E2E Latency", f"{pct_label} E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, interactivity_col, "output_tps_per_gpu", "Output Throughput", f"Interactivity (1000/{pct_label} TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        # Combined Pareto frontier across every mode/TP/concurrency at once.
        frontier = compute_pareto_frontier_with_metadata(data, x_col, y_col, maximize_x)

        if len(frontier) > 0:
            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle='-', linewidth=2, alpha=0.5, color="black",
                    label="Pareto Frontier", zorder=4)

            # One scatter call per point: edge kwargs differ per mode.
            for _, pt in frontier.iterrows():
                edge_kw = edge_for_mode.get(pt["offload"],
                                            {"edgecolors": "black", "linewidths": 1})
                ax.scatter(pt[x_col], pt[y_col],
                           c=color_for_tp.get(pt["tp"], "purple"),
                           marker=marker_for_tp.get(pt["tp"], "x"),
                           s=160, alpha=0.9, zorder=5,
                           **edge_kw)

            for _, pt in frontier.iterrows():
                ax.annotate(
                    f"conc={int(pt['bs'])} {short_mode.get(pt['offload'], '')}",
                    (pt[x_col], pt[y_col]),
                    textcoords="offset points", xytext=(5, 5),
                    fontsize=7, alpha=0.85)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(f"{metric_name} — All Configs Combined")
        ax.grid(True, alpha=0.3)

        # Hand-built legend: frontier line, TP fill colors, then the three
        # edge-color keys for the offload modes.
        handles = [Line2D([0], [0], color="black", lw=2, label="Pareto Frontier")]
        for tp in sorted(data["tp"].unique()):
            handles.append(Line2D([0], [0], marker=marker_for_tp[tp], color="w",
                                  markerfacecolor=color_for_tp[tp], markersize=8,
                                  markeredgecolor="black", label=f"TP={tp}"))
        handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w",
                              markersize=8, markeredgecolor="black", markeredgewidth=1.8,
                              label="Edge: P+Offload"))
        handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w",
                              markersize=8, markeredgecolor="gray", markeredgewidth=1.2,
                              label="Edge: Prefix Only"))
        handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w",
                              markersize=8, markeredgecolor="#cc0000", markeredgewidth=1.2,
                              label="Edge: No Prefix"))
        ax.legend(handles=handles, fontsize=7,
                  loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()
    output_file = results_dir / f"pareto_frontiers_combined{suffix}.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved combined {pct_label} Pareto plot to {output_file}")
    plt.close()


def generate_pareto_overlay_figure(df: pd.DataFrame, results_dir: Path):
    """Overlay the per-mode Pareto frontiers using median (P50) latencies.

    Same layout as the P99/P99.9 overlay figures. Writes
    ``pareto_frontiers_overlay.png`` into *results_dir*.
    """
    # Interactivity = decode tokens/sec implied by the median TPOT.
    data = df.copy()
    data["interactivity"] = 1000.0 / data["p50_tpot_ms"]

    present_modes = data["offload"].unique()

    # Per-mode style: (linestyle, marker edge, line color, label offset, font style).
    style_by_mode = {
        "on": ("-", "black", "black", (5, 8), "normal"),       # Prefix + Offload
        "off": ("--", "none", "gray", (5, -12), "italic"),     # Prefix only
        "noprefix": (":", "red", "red", (5, -25), "oblique"),  # No prefix caching
    }
    label_by_mode = {
        "on": "Prefix+Offload",
        "off": "Prefix Only",
        "noprefix": "No Prefix",
    }

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle("Pareto Frontiers: Prefix Caching Mode Comparison", fontsize=14)

    color_for_tp = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    marker_for_tp = {1: "o", 2: "s", 4: "^", 8: "D"}

    # (row, x column, y column, title, x label, y label, maximize_x)
    panel_specs = [
        (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, title, x_label, y_label, maximize_x in panel_specs:
        ax = axes[row]

        for mode in ["on", "off", "noprefix"]:
            if mode not in present_modes:
                continue

            ls, marker_edge, line_color, offset, font_style = style_by_mode[mode]
            frontier = compute_pareto_frontier_with_metadata(
                data[data["offload"] == mode], x_col, y_col, maximize_x)
            if len(frontier) == 0:
                continue

            ax.plot(frontier[x_col], frontier[y_col],
                    linestyle=ls, linewidth=2, alpha=0.6, color=line_color,
                    label=f"Pareto ({label_by_mode[mode]})")

            # TP legend entries only once (for the "on" mode).
            for tp in sorted(frontier["tp"].unique()):
                pts = frontier[frontier["tp"] == tp]
                ax.scatter(pts[x_col], pts[y_col],
                           c=color_for_tp.get(tp, "purple"),
                           marker=marker_for_tp.get(tp, "x"),
                           s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5,
                           label=f"TP={tp}" if mode == "on" else None, zorder=5)

            for _, point in frontier.iterrows():
                ax.annotate(f"conc={point['bs']}",
                            (point[x_col], point[y_col]),
                            textcoords="offset points",
                            xytext=offset,
                            fontsize=7,
                            alpha=0.7,
                            style=font_style)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8, loc="upper right" if maximize_x else "lower right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers_overlay.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved overlay Pareto plot to {output_file}")
    plt.close()
def main(results_dir: Path):
    """Load all experiments under *results_dir* and generate every figure.

    Experiment directories are those whose name starts with "tp". Writes
    ``pareto_frontiers.png``, ``experiment_summary.csv``, and delegates to
    the percentile-specific / combined / cache-hit-rate figure generators.
    """
    # Load all experiments
    experiments = []
    for exp_dir in results_dir.iterdir():
        if exp_dir.is_dir() and exp_dir.name.startswith("tp"):
            data = load_experiment_data(exp_dir)
            if data:
                experiments.append(data)

    if not experiments:
        print("No experiment data found!")
        return

    df = pd.DataFrame(experiments)
    print(f"Loaded {len(df)} experiments")
    print(df[["exp_name", "tp", "bs", "offload", "input_tps_per_gpu", "total_tps_per_gpu", "p50_ttft_ms"]].to_string())

    # Compute interactivity = 1000 / TPOT (tokens per second for decode)
    df["interactivity"] = 1000.0 / df["p50_tpot_ms"]

    # Get available modes and create subsets
    available_modes = sorted(df["offload"].unique())
    mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"}
    df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes}

    # Create figure with columns for each mode
    num_cols = len(available_modes)
    fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18))
    fig.suptitle("Pareto Frontiers: Throughput/GPU vs Latency (All Points)", fontsize=14)

    # Handle single column case
    if num_cols == 1:
        axes = axes.reshape(-1, 1)

    # Color by TP
    tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"}
    tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"}

    # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x)
    # BUG FIX: the row-3 "Output Throughput" entry was listed twice, which
    # plotted the bottom row twice (duplicated legend entries / overdraw).
    metrics_configs = [
        (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False),
        (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True),
        (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False),
        (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True),
    ]

    for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs:
        for col, mode in enumerate(available_modes):
            ax = axes[row, col]
            df_subset = df_subsets[mode]
            title = f"{metric_name} ({mode_titles.get(mode, mode)})"

            # Compute and plot Pareto frontier
            points = list(zip(df_subset[x_col], df_subset[y_col]))
            frontier = compute_pareto_frontier(points, maximize_x=maximize_x)

            if frontier:
                fx, fy = zip(*frontier)
                ax.plot(fx, fy, linestyle='-', linewidth=2, alpha=0.8, color="black", label="Pareto frontier")

            # Plot all points (not only frontier), colored by TP
            for tp in sorted(df_subset["tp"].unique()):
                tp_data = df_subset[df_subset["tp"] == tp]
                ax.scatter(tp_data[x_col], tp_data[y_col],
                           c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"),
                           s=100, alpha=0.8, edgecolors="black", linewidths=0.5,
                           label=f"TP={tp}")

            ax.set_xlabel(x_label)
            ax.set_ylabel(y_label)
            ax.set_title(title)
            ax.grid(True, alpha=0.3)
            ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right")

    plt.tight_layout()

    output_file = results_dir / "pareto_frontiers.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"\nSaved plot to {output_file}")
    plt.close()

    # Also save summary CSV
    summary_file = results_dir / "experiment_summary.csv"
    df.to_csv(summary_file, index=False)
    print(f"Saved summary to {summary_file}")

    # Generate clean Pareto-only figure
    generate_pareto_only_figure(df, results_dir)

    # Generate combined Pareto frontier (all configs pooled) for each SLA percentile
    for pct in ("p50", "p90", "p99", "p999"):
        generate_combined_pareto_figure(df, results_dir, percentile=pct)

    # Generate overlay figure (on vs off comparison)
    generate_pareto_overlay_figure(df, results_dir)

    # Generate P90 versions
    generate_pareto_only_figure_p90(df, results_dir)
    generate_pareto_overlay_figure_p90(df, results_dir)

    # Generate P99 versions
    generate_pareto_only_figure_p99(df, results_dir)
    generate_pareto_overlay_figure_p99(df, results_dir)

    # Generate P99.9 versions
    generate_pareto_only_figure_p999(df, results_dir)
    generate_pareto_overlay_figure_p999(df, results_dir)

    # Generate cache hit rate plot
    generate_cache_hit_rate_figure(df, results_dir)
CPU).""" + + # Get available modes + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + + # Create 2x3 figure (GPU hit rate row, CPU hit rate row, columns for each mode) + num_cols = len(available_modes) + fig, axes = plt.subplots(2, num_cols, figsize=(6 * num_cols, 10)) + fig.suptitle("Cache Hit Rate vs Throughput", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Plot configs: (row, hit_rate_col, title_prefix) + hit_rate_configs = [ + (0, "gpu_hit_rate", "GPU"), + (1, "cpu_hit_rate", "CPU"), + ] + + for row, hit_rate_col, hit_type in hit_rate_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df[df["offload"] == mode].dropna(subset=[hit_rate_col]) + + if len(df_subset) == 0: + ax.text(0.5, 0.5, "No data", ha='center', va='center', transform=ax.transAxes) + ax.set_title(f"{hit_type} Hit Rate ({mode_titles.get(mode, mode)})") + continue + + # Plot points colored by TP + for tp in sorted(df_subset["tp"].unique()): + tp_data = df_subset[df_subset["tp"] == tp] + ax.scatter(tp_data[hit_rate_col], tp_data["total_tps_per_gpu"], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=100, alpha=0.8, edgecolors="black", linewidths=0.5, + label=f"TP={tp}") + + # Add concurrency labels + for _, point in df_subset.iterrows(): + ax.annotate(f"bs={int(point['bs'])}", + (point[hit_rate_col], point["total_tps_per_gpu"]), + textcoords="offset points", + xytext=(5, 5), + fontsize=7, + alpha=0.7) + + ax.set_xlabel(f"{hit_type} Cache Hit Rate (%)") + ax.set_ylabel("Total Throughput/GPU (tok/s)") + ax.set_title(f"{hit_type} Hit Rate ({mode_titles.get(mode, mode)})") + ax.set_xlim(-5, 105) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right") + + 
    plt.tight_layout()

    output_file = results_dir / "cache_hit_rates.png"
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Saved cache hit rate plot to {output_file}")
    plt.close()


if __name__ == "__main__":
    # CLI entry point: expects a single sweep-results directory argument.
    if len(sys.argv) < 2:
        print("Usage: python plot_pareto.py ")
        print("Example: python plot_pareto.py ~/sweep_results_20260204_062339")
        sys.exit(1)

    # ~ expansion so paths like ~/sweep_results_... work as shown in the example.
    results_dir = Path(sys.argv[1]).expanduser()
    if not results_dir.exists():
        print(f"Error: {results_dir} does not exist")
        sys.exit(1)

    main(results_dir)
diff --git a/experimental/multiturn/vllm_benchmark/bench/__init__.py b/experimental/multiturn/vllm_benchmark/bench/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py
new file mode 100644
index 000000000..c129f38b8
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py
@@ -0,0 +1,957 @@
+"""
+Metrics collector for vLLM server during benchmarks.
+Polls /metrics endpoint and generates visualizations.
+"""
+
+import asyncio
+import csv
+import re
+import subprocess
+import threading
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import aiohttp
+import matplotlib.pyplot as plt
+
+
+@dataclass
+class GpuTransferSnapshot:
+    # One sample of PCIe traffic for a single GPU at a point in time.
+    timestamp: float
+    gpu_id: int = 0
+    tx_pci: float = 0.0  # PCIe TX (MB/s)
+    rx_pci: float = 0.0  # PCIe RX (MB/s)
+
+
+class GpuTransferCollector:
+    """DEPRECATED: Collects GPU transfer stats using nvidia-smi dmon.
+
+    Replaced by vLLM's native kv_offload metrics (vllm:kv_offload_total_bytes_total,
+    vllm:kv_offload_total_time_total) which are more precise and don't require
+    spawning a subprocess.
+ """ + + def __init__(self, gpu_id: int = 0, poll_interval: int = 1): + self.gpu_id = gpu_id + self.poll_interval = poll_interval + self.snapshots: list[GpuTransferSnapshot] = [] + self._process: subprocess.Popen | None = None + self._thread: threading.Thread | None = None + self._running = False + + def _parse_line(self, line: str) -> GpuTransferSnapshot | None: + """Parse a line of nvidia-smi dmon CSV output. + + Format: gpu, rxpci, txpci (values in MB/s) + Example: 0, 406, 32013 + """ + line = line.strip() + if not line or line.startswith('#'): # Skip header/comments + return None + + parts = [p.strip() for p in line.split(',')] + if len(parts) < 3: + return None + + try: + return GpuTransferSnapshot( + timestamp=time.time(), + gpu_id=int(parts[0]), + rx_pci=float(parts[1]) if parts[1] != '-' else 0.0, + tx_pci=float(parts[2]) if parts[2] != '-' else 0.0, + ) + except (ValueError, IndexError): + return None + + def _reader_thread(self) -> None: + """Background thread to read nvidia-smi output.""" + if self._process is None: + return + + for line in iter(self._process.stdout.readline, ''): + if not self._running: + break + snapshot = self._parse_line(line) + if snapshot and snapshot.gpu_id == self.gpu_id: + self.snapshots.append(snapshot) + + def start(self) -> None: + """Start collecting GPU transfer stats.""" + if self._running: + return + + self._running = True + self.snapshots = [] + + try: + self._process = subprocess.Popen( + [ + 'nvidia-smi', 'dmon', + '-i', str(self.gpu_id), + '-s', 't', + '-d', str(self.poll_interval), + '--format', 'csv', + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + self._thread = threading.Thread(target=self._reader_thread, daemon=True) + self._thread.start() + except FileNotFoundError: + print("nvidia-smi not found, GPU transfer monitoring disabled") + self._running = False + + def stop(self) -> None: + """Stop collecting GPU transfer stats.""" + self._running = False + if self._process: + 
    def stop(self) -> None:
        """Stop collecting GPU transfer stats."""
        self._running = False
        if self._process:
            # Terminate first, escalate to kill if dmon doesn't exit promptly.
            self._process.terminate()
            try:
                self._process.wait(timeout=2)
            except subprocess.TimeoutExpired:
                self._process.kill()
            self._process = None

        if self._thread:
            # Bounded join: the reader is a daemon thread, so a stuck
            # readline won't block shutdown.
            self._thread.join(timeout=2)
            self._thread = None


@dataclass
class MetricsSnapshot:
    # One parsed sample of the vLLM /metrics endpoint. Counter-style
    # fields are cumulative since server start; rates are derived later
    # from deltas between consecutive snapshots.
    timestamp: float
    kv_cache_usage: float = 0.0       # GPU KV cache utilization, 0-1 scale
    cpu_kv_cache_usage: float = 0.0   # CPU/offloaded KV cache utilization, 0-1 scale
    num_requests_running: int = 0
    num_requests_waiting: int = 0
    prefix_cache_hits: int = 0        # GPU prefix cache (cumulative)
    prefix_cache_queries: int = 0
    cpu_prefix_cache_hits: int = 0    # external/offloaded prefix cache (cumulative)
    cpu_prefix_cache_queries: int = 0
    prompt_tokens: int = 0
    generation_tokens: int = 0
    num_preemptions: int = 0
    request_success: int = 0          # summed over all finish reasons
    # KV offload transfer metrics (cumulative)
    kv_offload_bytes_gpu_to_cpu: float = 0.0
    kv_offload_bytes_cpu_to_gpu: float = 0.0
    kv_offload_time_gpu_to_cpu: float = 0.0
    kv_offload_time_cpu_to_gpu: float = 0.0
    # Prompt tokens by source (cumulative)
    prompt_tokens_local_compute: int = 0
    prompt_tokens_local_cache_hit: int = 0
    prompt_tokens_external_kv_transfer: int = 0
    # Prefill KV computed tokens (cumulative sum from histogram)
    prefill_kv_computed_tokens_sum: int = 0
    prefill_kv_computed_tokens_count: int = 0


@dataclass
class MetricsCollector:
    # Polls {base_url}/metrics on a fixed interval from an asyncio task
    # and accumulates MetricsSnapshot objects for post-run plotting.
    base_url: str
    poll_interval: float = 1.0
    snapshots: list[MetricsSnapshot] = field(default_factory=list)
    _running: bool = False
    _task: asyncio.Task | None = None
    gpu_transfer_collector: GpuTransferCollector | None = None
    gpu_id: int = 0
    def _parse_metrics(self, text: str) -> MetricsSnapshot:
        """Parse Prometheus metrics text format.

        Extracts the vLLM gauges/counters this benchmark cares about into
        a MetricsSnapshot; any metric absent from *text* keeps its default.

        NOTE(review): every pattern requires a label block ``{...}`` after
        the metric name — assumes vLLM always emits labels (e.g.
        model_name); confirm against the target vLLM version.
        """
        snapshot = MetricsSnapshot(timestamp=time.time())

        # Helper to extract gauge/counter value (first match wins).
        def get_value(pattern: str, default: float = 0.0) -> float:
            match = re.search(pattern, text)
            if match:
                return float(match.group(1))
            return default

        # KV cache usage (0-1 scale)
        snapshot.kv_cache_usage = get_value(
            r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)'
        )
        # Fallback to old metric name if new one not found
        if snapshot.kv_cache_usage == 0.0:
            snapshot.kv_cache_usage = get_value(
                r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)'
            )

        # CPU/offloaded KV cache usage
        snapshot.cpu_kv_cache_usage = get_value(
            r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)'
        )

        # Running/waiting requests
        snapshot.num_requests_running = int(get_value(
            r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.num_requests_waiting = int(get_value(
            r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Prefix cache (cumulative counters) - GPU
        snapshot.prefix_cache_hits = int(get_value(
            r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prefix_cache_queries = int(get_value(
            r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Prefix cache - external/offloaded (KV connector cross-instance cache)
        snapshot.cpu_prefix_cache_hits = int(get_value(
            r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.cpu_prefix_cache_queries = int(get_value(
            r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Token counters
        snapshot.prompt_tokens = int(get_value(
            r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.generation_tokens = int(get_value(
            r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Preemptions
        snapshot.num_preemptions = int(get_value(
            r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)'
        ))

        # Request success (sum all finish reasons)
        for match in re.finditer(
            r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)',
            text
        ):
            snapshot.request_success += int(float(match.group(1)))

        # KV offload bytes transferred (cumulative counters by direction)
        snapshot.kv_offload_bytes_gpu_to_cpu = get_value(
            r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)'
        )
        snapshot.kv_offload_bytes_cpu_to_gpu = get_value(
            r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)'
        )

        # KV offload time (cumulative, seconds)
        snapshot.kv_offload_time_gpu_to_cpu = get_value(
            r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)'
        )
        snapshot.kv_offload_time_cpu_to_gpu = get_value(
            r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)'
        )

        # Prompt tokens by source (cumulative)
        snapshot.prompt_tokens_local_compute = int(get_value(
            r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prompt_tokens_local_cache_hit = int(get_value(
            r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prompt_tokens_external_kv_transfer = int(get_value(
            r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)'
        ))

        # Prefill KV computed tokens (histogram sum and count)
        snapshot.prefill_kv_computed_tokens_sum = int(get_value(
            r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)'
        ))
        snapshot.prefill_kv_computed_tokens_count = int(get_value(
            r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)'
        ))

        return snapshot

    async def _poll_loop(self) -> None:
        """Background polling loop.

        Fetches /metrics every poll_interval seconds until stop() clears
        _running; failed polls are logged and skipped (best-effort).
        """
        metrics_url = f"{self.base_url}/metrics"
        async with aiohttp.ClientSession() as session:
            while self._running:
                try:
                    async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
                        if resp.status == 200:
                            text = await resp.text()
                            snapshot = self._parse_metrics(text)
                            self.snapshots.append(snapshot)
                except Exception as e:
                    # Deliberately broad: a flaky scrape must not abort the benchmark.
                    print(f"Metrics poll error: {e}")

                await asyncio.sleep(self.poll_interval)

    def start(self) -> None:
        """Start background metrics collection.

        Must be called from a running asyncio event loop (create_task).
        Clears any snapshots from a previous run.
        """
        if self._running:
            return
        self._running = True
        self.snapshots = []
        self._task = asyncio.create_task(self._poll_loop())
asyncio.create_task(self._poll_loop()) + + async def stop(self) -> None: + """Stop metrics collection.""" + self._running = False + if self._task: + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + + def generate_plots( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Generate visualization plots from collected metrics. + + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + """ + if len(self.snapshots) < 2: + print("Not enough data points for plots") + return + + # Convert to relative time (seconds from start) + start_time = self.snapshots[0].timestamp + times = [(s.timestamp - start_time) for s in self.snapshots] + + # Create figure with subplots + num_rows = 6 if client_metrics else 4 + fig, axes = plt.subplots(num_rows, 2, figsize=(14, 4 * num_rows)) + fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14) + + # 1. KV Cache Usage vs Time + ax = axes[0, 0] + kv_usage = [min(s.kv_cache_usage * 100, 100.0) for s in self.snapshots] + ax.scatter(times, kv_usage, alpha=0.15, s=2, c='blue') + kv_window = min(50, len(kv_usage) // 10) if len(kv_usage) > 10 else 1 + if kv_window > 1: + rolling_kv = [ + sum(kv_usage[max(0, i - kv_window):i + 1]) / len(kv_usage[max(0, i - kv_window):i + 1]) + for i in range(len(kv_usage)) + ] + ax.plot(times, rolling_kv, 'b-', label=f'GPU (avg n={kv_window})', linewidth=2) + else: + ax.plot(times, kv_usage, 'b-', label='GPU', linewidth=2) + # Add external cache if available + cpu_kv_usage = [s.cpu_kv_cache_usage * 100 for s in self.snapshots] + if any(v > 0 for v in cpu_kv_usage): + ax.plot(times, cpu_kv_usage, 'r--', label='External', linewidth=1.5) + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("KV Cache Usage (%)") + ax.set_title("KV Cache Utilization Over Time") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 2. 
def _rolling_mean(values, window):
    """Trailing moving average; element i averages values[max(0, i-window) : i+1].

    Output has the same length as `values`. Extracted because the identical
    comprehension was repeated for every smoothed series in generate_plots.
    """
    return [
        sum(values[max(0, i - window):i + 1]) / len(values[max(0, i - window):i + 1])
        for i in range(len(values))
    ]


def generate_plots(
    self,
    output_prefix: str = "metrics",
    client_metrics: list | None = None,
) -> None:
    """Generate visualization plots from collected metrics.

    Renders server-side panels (KV cache usage, queue depth, prefix-cache hit
    rate, throughput, KV offload traffic, prefill token sources) plus, when
    `client_metrics` is given, per-request client panels (TTFT, latency,
    decode speed, preemptions). Saves `{output_prefix}_plots.png` and prints
    a text summary via _print_summary().

    Args:
        output_prefix: Prefix for output file names
        client_metrics: Optional list of RequestStats from benchmark clients
    """
    if len(self.snapshots) < 2:
        print("Not enough data points for plots")
        return

    # Relative time axis (seconds since the first snapshot).
    start_time = self.snapshots[0].timestamp
    times = [(s.timestamp - start_time) for s in self.snapshots]

    # Two extra rows of client-side plots when per-request stats are supplied.
    num_rows = 6 if client_metrics else 4
    fig, axes = plt.subplots(num_rows, 2, figsize=(14, 4 * num_rows))
    fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14)

    # 1. KV cache usage vs time
    ax = axes[0, 0]
    kv_usage = [min(s.kv_cache_usage * 100, 100.0) for s in self.snapshots]
    ax.scatter(times, kv_usage, alpha=0.15, s=2, c='blue')
    kv_window = min(50, len(kv_usage) // 10) if len(kv_usage) > 10 else 1
    if kv_window > 1:
        ax.plot(times, _rolling_mean(kv_usage, kv_window), 'b-',
                label=f'GPU (avg n={kv_window})', linewidth=2)
    else:
        ax.plot(times, kv_usage, 'b-', label='GPU', linewidth=2)
    # External (CPU-offloaded) cache usage, if the server reports any.
    cpu_kv_usage = [s.cpu_kv_cache_usage * 100 for s in self.snapshots]
    if any(v > 0 for v in cpu_kv_usage):
        ax.plot(times, cpu_kv_usage, 'r--', label='External', linewidth=1.5)
    ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("KV Cache Usage (%)")
    ax.set_title("KV Cache Utilization Over Time")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    # 2. Running & waiting requests vs time (smoothed + total)
    ax = axes[0, 1]
    running = [s.num_requests_running for s in self.snapshots]
    waiting = [s.num_requests_waiting for s in self.snapshots]
    total_queue = [r + w for r, w in zip(running, waiting)]
    q_window = min(30, len(running) // 10) if len(running) > 10 else 1
    if q_window > 1:
        ax.plot(times, _rolling_mean(running, q_window), 'g-',
                label=f'Running (avg n={q_window})', linewidth=1.5)
        ax.plot(times, _rolling_mean(waiting, q_window), 'r-',
                label=f'Waiting (avg n={q_window})', linewidth=1.5)
        ax.plot(times, _rolling_mean(total_queue, q_window), 'b-',
                label=f'Total (avg n={q_window})', linewidth=1.5)
    else:
        ax.plot(times, running, 'g-', label='Running', linewidth=1.5)
        ax.plot(times, waiting, 'r-', label='Waiting', linewidth=1.5)
        ax.plot(times, total_queue, 'b-', label='Total', linewidth=1.5)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Requests")
    ax.set_title("Request Queue Depth")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

    # 3. Cache hit rate vs time (computed from deltas between polling intervals)
    ax = axes[1, 0]
    gpu_hit_rates = []
    ext_hit_rates = []
    combined_hit_rates = []
    has_ext_cache = any(s.cpu_prefix_cache_queries > 0 for s in self.snapshots)
    for i in range(1, len(self.snapshots)):
        cur, prev = self.snapshots[i], self.snapshots[i - 1]
        # GPU (HBM) cache hit rate for this interval.
        gpu_delta_hits = cur.prefix_cache_hits - prev.prefix_cache_hits
        gpu_delta_queries = cur.prefix_cache_queries - prev.prefix_cache_queries
        if gpu_delta_queries > 0:
            gpu_hit_rates.append(100.0 * gpu_delta_hits / gpu_delta_queries)
        else:
            # Carry the previous value forward when nothing was queried.
            gpu_hit_rates.append(gpu_hit_rates[-1] if gpu_hit_rates else 0)

        if has_ext_cache:
            ext_delta_hits = cur.cpu_prefix_cache_hits - prev.cpu_prefix_cache_hits
            ext_delta_queries = cur.cpu_prefix_cache_queries - prev.cpu_prefix_cache_queries
            if ext_delta_queries > 0:
                ext_hit_rates.append(100.0 * ext_delta_hits / ext_delta_queries)
            else:
                ext_hit_rates.append(ext_hit_rates[-1] if ext_hit_rates else 0)

            # Combined: (gpu_hits + ext_hits) / (gpu_queries + ext_queries)
            total_hits = gpu_delta_hits + ext_delta_hits
            total_queries = gpu_delta_queries + ext_delta_queries
            if total_queries > 0:
                combined_hit_rates.append(100.0 * total_hits / total_queries)
            else:
                combined_hit_rates.append(combined_hit_rates[-1] if combined_hit_rates else 0)

    window = min(50, len(gpu_hit_rates) // 10) if len(gpu_hit_rates) > 10 else 1

    ax.scatter(times[1:], gpu_hit_rates, alpha=0.3, s=5, c='purple', label='GPU (HBM)')
    if window > 1:
        ax.plot(times[1:], _rolling_mean(gpu_hit_rates, window), 'purple',
                linewidth=1.5, label=f'GPU avg (n={window})')

    if has_ext_cache and ext_hit_rates:
        ax.scatter(times[1:], ext_hit_rates, alpha=0.3, s=5, c='orange', label='External')
        if window > 1:
            ax.plot(times[1:], _rolling_mean(ext_hit_rates, window), 'orange',
                    linewidth=1.5, label=f'External avg (n={window})')

    # FIX: the combined series is only populated when an external cache
    # exists; plotting it unconditionally would pass an empty series with a
    # non-empty x axis. Guard it, matching the "only if external exists" intent.
    if has_ext_cache and combined_hit_rates:
        ax.scatter(times[1:], combined_hit_rates, alpha=0.2, s=3, c='green', label='Combined')
        if window > 1:
            ax.plot(times[1:], _rolling_mean(combined_hit_rates, window), 'green',
                    linewidth=2, label=f'Combined avg (n={window})')

    ax.legend(loc='best', fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Hit Rate (%)")
    ax.set_title("Prefix Cache Hit Rate Per Interval (tokens hit / tokens queried)")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    # 4. Throughput vs time (tokens/sec) with rolling average — decode + total
    ax = axes[1, 1]
    decode_throughputs = []
    total_throughputs = []
    for i in range(1, len(self.snapshots)):
        cur, prev = self.snapshots[i], self.snapshots[i - 1]
        delta_gen = cur.generation_tokens - prev.generation_tokens
        delta_prompt = cur.prompt_tokens - prev.prompt_tokens
        delta_time = cur.timestamp - prev.timestamp
        if delta_time > 0:
            decode_throughputs.append(delta_gen / delta_time)
            total_throughputs.append((delta_gen + delta_prompt) / delta_time)
        else:
            decode_throughputs.append(0)
            total_throughputs.append(0)
    # Cumulative running average total throughput (total tokens / elapsed time).
    cumulative_total_avg = []
    t0 = self.snapshots[0].timestamp
    tokens0 = self.snapshots[0].generation_tokens + self.snapshots[0].prompt_tokens
    for i in range(1, len(self.snapshots)):
        elapsed = self.snapshots[i].timestamp - t0
        total_tokens = (self.snapshots[i].generation_tokens + self.snapshots[i].prompt_tokens) - tokens0
        cumulative_total_avg.append(total_tokens / elapsed if elapsed > 0 else 0)

    window = min(30, len(decode_throughputs) // 10) if len(decode_throughputs) > 10 else 1
    if window > 1:
        ax.plot(times[1:], _rolling_mean(total_throughputs, window), 'steelblue',
                linewidth=1.5, label=f'Total (avg n={window})')
        ax.plot(times[1:], _rolling_mean(decode_throughputs, window), 'orange',
                linewidth=1.5, label=f'Decode (avg n={window})')
    else:
        ax.plot(times[1:], total_throughputs, 'steelblue', linewidth=1, alpha=0.8, label='Total')
        ax.plot(times[1:], decode_throughputs, 'orange', linewidth=1, alpha=0.8, label='Decode')
    ax.plot(times[1:], cumulative_total_avg, 'red', linewidth=2, label='Total Running Avg')
    # Single legend after all series are drawn (was invoked redundantly per branch).
    ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Tokens/sec")
    ax.set_title("Throughput (Total & Decode)")
    ax.grid(True, alpha=0.3)

    # 5. KV offload transfer rate (from vLLM metrics)
    ax = axes[2, 0]
    gpu_to_cpu_rates = []
    cpu_to_gpu_rates = []
    for i in range(1, len(self.snapshots)):
        cur, prev = self.snapshots[i], self.snapshots[i - 1]
        dt = cur.timestamp - prev.timestamp
        if dt > 0:
            delta_g2c = cur.kv_offload_bytes_gpu_to_cpu - prev.kv_offload_bytes_gpu_to_cpu
            delta_c2g = cur.kv_offload_bytes_cpu_to_gpu - prev.kv_offload_bytes_cpu_to_gpu
            gpu_to_cpu_rates.append(delta_g2c / dt / 1e6)  # MB/s
            cpu_to_gpu_rates.append(delta_c2g / dt / 1e6)  # MB/s
        else:
            gpu_to_cpu_rates.append(0)
            cpu_to_gpu_rates.append(0)
    if any(r > 0 for r in gpu_to_cpu_rates) or any(r > 0 for r in cpu_to_gpu_rates):
        ax.scatter(times[1:], gpu_to_cpu_rates, alpha=0.15, s=3, c='blue')
        ax.scatter(times[1:], cpu_to_gpu_rates, alpha=0.15, s=3, c='red')
        xfer_window = min(30, len(gpu_to_cpu_rates) // 10) if len(gpu_to_cpu_rates) > 10 else 1
        if xfer_window > 1:
            ax.plot(times[1:], _rolling_mean(gpu_to_cpu_rates, xfer_window), 'b-',
                    linewidth=1.5, label=f'GPU→CPU (avg n={xfer_window})')
            ax.plot(times[1:], _rolling_mean(cpu_to_gpu_rates, xfer_window), 'r-',
                    linewidth=1.5, label=f'CPU→GPU (avg n={xfer_window})')
        else:
            ax.plot(times[1:], gpu_to_cpu_rates, 'b-', linewidth=1, alpha=0.8, label='GPU→CPU')
            ax.plot(times[1:], cpu_to_gpu_rates, 'r-', linewidth=1, alpha=0.8, label='CPU→GPU')
        ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Transfer Rate (MB/s)")
    ax.set_title("KV Offload Transfer Rate")
    ax.grid(True, alpha=0.3)

    # 6. Prompt token sources over time (cumulative percentage)
    ax = axes[2, 1]
    initial = self.snapshots[0]
    cum_compute_pct = []
    cum_cache_pct = []
    cum_ext_pct = []
    for s in self.snapshots:
        c = s.prompt_tokens_local_compute - initial.prompt_tokens_local_compute
        h = s.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit
        e = s.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer
        total = c + h + e
        if total > 0:
            cum_compute_pct.append(100.0 * c / total)
            cum_cache_pct.append(100.0 * h / total)
            cum_ext_pct.append(100.0 * e / total)
        else:
            cum_compute_pct.append(0)
            cum_cache_pct.append(0)
            cum_ext_pct.append(0)
    if any(v > 0 for v in cum_compute_pct):
        ax.stackplot(times, cum_compute_pct, cum_cache_pct, cum_ext_pct,
                     labels=['Prefill', 'HBM Cache Hit', 'Offload Cache Hit'],
                     colors=['coral', 'steelblue', 'mediumseagreen'], alpha=0.8)
        ax.legend(fontsize=8, loc='lower left')
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("% of Prefill Tokens")
    ax.set_title("Cumulative Prefill Token Source Breakdown")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    # 7. Cumulative KV offload transfers (GPU→CPU and CPU→GPU)
    ax = axes[3, 0]
    cum_g2c = [(s.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu) / 1e9
               for s in self.snapshots]
    if any(v > 0 for v in cum_g2c):
        ax.plot(times, cum_g2c, 'b-', linewidth=1.5)
        ax.fill_between(times, cum_g2c, alpha=0.2, color='blue')
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Cumulative Transfer (GB)")
    ax.set_title("KV Offload: GPU → CPU (Cumulative)")
    ax.grid(True, alpha=0.3)

    ax = axes[3, 1]
    cum_c2g = [(s.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu) / 1e9
               for s in self.snapshots]
    if any(v > 0 for v in cum_c2g):
        ax.plot(times, cum_c2g, 'r-', linewidth=1.5)
        ax.fill_between(times, cum_c2g, alpha=0.2, color='red')
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Cumulative Transfer (GB)")
    ax.set_title("KV Offload: CPU → GPU (Cumulative)")
    ax.grid(True, alpha=0.3)

    # 8-11. Client-side panels (only when per-request stats were supplied).
    if client_metrics and len(client_metrics) > 0:
        sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms)
        first_start = sorted_metrics[0].start_time_ms
        request_times = [(m.start_time_ms - first_start) / 1000.0 for m in sorted_metrics]
        ttfts = [m.ttft_ms for m in sorted_metrics]
        latencies = [m.latency_ms for m in sorted_metrics]

        # 8. TTFT vs time
        ax = axes[4, 0]
        ax.scatter(request_times, ttfts, alpha=0.3, s=5, c='blue')
        window = min(50, len(ttfts) // 10) if len(ttfts) > 10 else 1
        if window > 1:
            ax.plot(request_times, _rolling_mean(ttfts, window), 'r-',
                    linewidth=1.5, label=f'Rolling avg (n={window})')
            ax.legend()
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("TTFT (ms)")
        ax.set_title("Time to First Token vs Time")
        ax.grid(True, alpha=0.3)

        # 9. Latency vs time (reuses the TTFT window size)
        ax = axes[4, 1]
        ax.scatter(request_times, latencies, alpha=0.3, s=5, c='green')
        if window > 1:
            ax.plot(request_times, _rolling_mean(latencies, window), 'r-',
                    linewidth=1.5, label=f'Rolling avg (n={window})')
            ax.legend()
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Latency (ms)")
        ax.set_title("Request Latency vs Time")
        ax.grid(True, alpha=0.3)

        # 10. Interactivity (1/TPOT = tokens/sec) vs time
        ax = axes[5, 0]
        tpots = [m.tpot_ms for m in sorted_metrics]
        # Guard against zero TPOT to avoid division by zero.
        interactivity = [1000.0 / t if t > 0 else 0 for t in tpots]
        ax.scatter(request_times, interactivity, alpha=0.3, s=5, c='purple')
        if window > 1:
            ax.plot(request_times, _rolling_mean(interactivity, window), 'r-',
                    linewidth=1.5, label=f'Rolling avg (n={window})')
            ax.legend()
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Interactivity (tokens/sec)")
        ax.set_title("Decode Speed (1/TPOT) vs Time")
        ax.grid(True, alpha=0.3)

        # 11. Preemptions over time (rate + cumulative on a twin axis)
        ax = axes[5, 1]
        preemption_rates = []
        for i in range(1, len(self.snapshots)):
            dt = self.snapshots[i].timestamp - self.snapshots[i - 1].timestamp
            delta = self.snapshots[i].num_preemptions - self.snapshots[i - 1].num_preemptions
            preemption_rates.append(delta / dt if dt > 0 else 0)
        if any(r > 0 for r in preemption_rates):
            ax.scatter(times[1:], preemption_rates, alpha=0.15, s=3, c='red')
            preempt_window = min(30, len(preemption_rates) // 10) if len(preemption_rates) > 10 else 1
            if preempt_window > 1:
                ax.plot(times[1:], _rolling_mean(preemption_rates, preempt_window), 'r-',
                        linewidth=1.5, label=f'Rolling avg (n={preempt_window})')
        # Cumulative on secondary axis.
        ax2 = ax.twinx()
        cumulative = [self.snapshots[i].num_preemptions - self.snapshots[0].num_preemptions
                      for i in range(1, len(self.snapshots))]
        ax2.plot(times[1:], cumulative, 'b--', linewidth=1, alpha=0.5, label='Cumulative')
        ax2.set_ylabel("Cumulative Preemptions", color='blue')
        ax2.tick_params(axis='y', labelcolor='blue')
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Preemptions/sec", color='red')
        ax.tick_params(axis='y', labelcolor='red')
        ax.set_title("Preemptions Over Time")
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f"{output_prefix}_plots.png", dpi=150)
    print(f"Saved plots to {output_prefix}_plots.png")
    plt.close()

    # Also print the textual summary for the same window.
    self._print_summary()
self.snapshots[0] + + print("\n" + "="*60) + print("METRICS SUMMARY") + print("="*60) + print(f"Duration: {duration:.1f}s") + print(f"Total prompt tokens: {total_prompt_tokens:,}") + print(f"Total generation tokens: {total_gen_tokens:,}") + print(f"Avg generation throughput: {total_gen_tokens/duration:.1f} tok/s") + print(f"Peak KV cache usage: {max(s.kv_cache_usage for s in self.snapshots)*100:.1f}%") + print(f"Peak running requests: {max(s.num_requests_running for s in self.snapshots)}") + print(f"Peak waiting requests: {max(s.num_requests_waiting for s in self.snapshots)}") + print(f"Total preemptions: {final.num_preemptions - initial.num_preemptions}") + + if final.prefix_cache_queries > initial.prefix_cache_queries: + delta_hits = final.prefix_cache_hits - initial.prefix_cache_hits + delta_queries = final.prefix_cache_queries - initial.prefix_cache_queries + hit_rate = 100.0 * delta_hits / delta_queries + print(f"Overall GPU cache hit rate: {hit_rate:.1f}%") + print(f" - Cache hits: {delta_hits:,} tokens") + print(f" - Cache queries: {delta_queries:,} tokens") + + # External/offloaded cache stats if available + if final.cpu_prefix_cache_queries > initial.cpu_prefix_cache_queries: + cpu_delta_hits = final.cpu_prefix_cache_hits - initial.cpu_prefix_cache_hits + cpu_delta_queries = final.cpu_prefix_cache_queries - initial.cpu_prefix_cache_queries + cpu_hit_rate = 100.0 * cpu_delta_hits / cpu_delta_queries + print(f"Overall external cache hit rate: {cpu_hit_rate:.1f}%") + print(f" - Cache hits: {cpu_delta_hits:,} tokens") + print(f" - Cache queries: {cpu_delta_queries:,} tokens") + + # Prompt tokens by source + total_compute = final.prompt_tokens_local_compute - initial.prompt_tokens_local_compute + total_cache_hit = final.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit + total_ext = final.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer + total_by_source = total_compute + total_cache_hit + total_ext + if 
total_by_source > 0: + print(f"Prompt token sources:") + print(f" - Prefill: {total_compute:>12,} ({100*total_compute/total_by_source:.1f}%)") + print(f" - HBM cache hit: {total_cache_hit:>12,} ({100*total_cache_hit/total_by_source:.1f}%)") + print(f" - Offload cache hit: {total_ext:>12,} ({100*total_ext/total_by_source:.1f}%)") + + # KV offload transfer stats + g2c_bytes = final.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu + c2g_bytes = final.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu + g2c_time = final.kv_offload_time_gpu_to_cpu - initial.kv_offload_time_gpu_to_cpu + c2g_time = final.kv_offload_time_cpu_to_gpu - initial.kv_offload_time_cpu_to_gpu + if g2c_bytes > 0 or c2g_bytes > 0: + print(f"KV offload transfers:") + print(f" GPU→CPU: {g2c_bytes/1e9:.2f} GB in {g2c_time:.2f}s ({g2c_bytes/g2c_time/1e9:.1f} GB/s)" if g2c_time > 0 else f" GPU→CPU: {g2c_bytes/1e9:.2f} GB") + print(f" CPU→GPU: {c2g_bytes/1e9:.2f} GB in {c2g_time:.2f}s ({c2g_bytes/c2g_time/1e9:.1f} GB/s)" if c2g_time > 0 else f" CPU→GPU: {c2g_bytes/1e9:.2f} GB") + + # Prefill KV computed tokens + delta_kv_sum = final.prefill_kv_computed_tokens_sum - initial.prefill_kv_computed_tokens_sum + delta_kv_count = final.prefill_kv_computed_tokens_count - initial.prefill_kv_computed_tokens_count + if delta_kv_count > 0: + print(f"Prefill KV computed tokens (excluding cached):") + print(f" Total: {delta_kv_sum:,} tokens across {delta_kv_count:,} requests") + print(f" Avg per request: {delta_kv_sum/delta_kv_count:.0f} tokens") + + print("="*60 + "\n") + + def export_csv( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Export all time series data to CSV files. 
+ + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + + Generates: + - {output_prefix}_server_metrics.csv: vLLM server metrics over time + - {output_prefix}_gpu_transfer.csv: GPU PCIe transfer stats + - {output_prefix}_client_metrics.csv: Per-request client metrics (if provided) + """ + output_dir = Path(output_prefix).parent + if output_dir and not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # 1. Export server metrics (from /metrics endpoint) + if self.snapshots: + server_csv = f"{output_prefix}_server_metrics.csv" + start_time = self.snapshots[0].timestamp + + with open(server_csv, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'kv_cache_usage_pct', + 'cpu_kv_cache_usage_pct', + 'num_requests_running', + 'num_requests_waiting', + 'prefix_cache_hits', + 'prefix_cache_queries', + 'cpu_prefix_cache_hits', + 'cpu_prefix_cache_queries', + 'prompt_tokens_total', + 'generation_tokens_total', + 'num_preemptions_total', + 'request_success_total', + # KV offload metrics + 'kv_offload_bytes_gpu_to_cpu', + 'kv_offload_bytes_cpu_to_gpu', + 'kv_offload_time_gpu_to_cpu', + 'kv_offload_time_cpu_to_gpu', + # Prompt tokens by source + 'prompt_tokens_local_compute', + 'prompt_tokens_local_cache_hit', + 'prompt_tokens_external_kv_transfer', + # Prefill KV computed + 'prefill_kv_computed_tokens_sum', + 'prefill_kv_computed_tokens_count', + # Computed per-interval metrics + 'interval_cache_hit_rate_pct', + 'interval_throughput_tok_per_sec', + ]) + + for i, s in enumerate(self.snapshots): + relative_time = s.timestamp - start_time + + # Compute per-interval metrics + cache_hit_rate = 0.0 + throughput = 0.0 + if i > 0: + prev = self.snapshots[i - 1] + delta_hits = s.prefix_cache_hits - prev.prefix_cache_hits + delta_queries = s.prefix_cache_queries - prev.prefix_cache_queries + if delta_queries > 0: + 
cache_hit_rate = 100.0 * delta_hits / delta_queries + + delta_gen = s.generation_tokens - prev.generation_tokens + delta_time = s.timestamp - prev.timestamp + if delta_time > 0: + throughput = delta_gen / delta_time + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + f"{s.kv_cache_usage * 100:.2f}", + f"{s.cpu_kv_cache_usage * 100:.2f}", + s.num_requests_running, + s.num_requests_waiting, + s.prefix_cache_hits, + s.prefix_cache_queries, + s.cpu_prefix_cache_hits, + s.cpu_prefix_cache_queries, + s.prompt_tokens, + s.generation_tokens, + s.num_preemptions, + s.request_success, + f"{s.kv_offload_bytes_gpu_to_cpu:.0f}", + f"{s.kv_offload_bytes_cpu_to_gpu:.0f}", + f"{s.kv_offload_time_gpu_to_cpu:.6f}", + f"{s.kv_offload_time_cpu_to_gpu:.6f}", + s.prompt_tokens_local_compute, + s.prompt_tokens_local_cache_hit, + s.prompt_tokens_external_kv_transfer, + s.prefill_kv_computed_tokens_sum, + s.prefill_kv_computed_tokens_count, + f"{cache_hit_rate:.2f}", + f"{throughput:.2f}", + ]) + + print(f"Exported server metrics to {server_csv}") + + # 2. 
Export GPU transfer stats (DEPRECATED - kept for backward compat) + if self.gpu_transfer_collector and self.gpu_transfer_collector.snapshots: + gpu_csv = f"{output_prefix}_gpu_transfer.csv" + gpu_snaps = self.gpu_transfer_collector.snapshots + gpu_start = gpu_snaps[0].timestamp + + with open(gpu_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'gpu_id', + 'tx_pci_mb_per_sec', + 'rx_pci_mb_per_sec', + 'cumulative_tx_gb', + 'cumulative_rx_gb', + ]) + + cumulative_tx = 0.0 + cumulative_rx = 0.0 + for i, s in enumerate(gpu_snaps): + relative_time = s.timestamp - gpu_start + if i > 0: + dt = s.timestamp - gpu_snaps[i - 1].timestamp + cumulative_tx += s.tx_pci * dt / 1024 # MB to GB + cumulative_rx += s.rx_pci * dt / 1024 + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + s.gpu_id, + f"{s.tx_pci:.2f}", + f"{s.rx_pci:.2f}", + f"{cumulative_tx:.4f}", + f"{cumulative_rx:.4f}", + ]) + + print(f"Exported GPU transfer metrics to {gpu_csv}") + + # 3. 
Export client metrics (per-request stats) + if client_metrics and len(client_metrics) > 0: + client_csv = f"{output_prefix}_client_metrics.csv" + sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms) + first_start = sorted_metrics[0].start_time_ms + + with open(client_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'start_time_ms', + 'relative_time_sec', + 'ttft_ms', + 'tpot_ms', + 'latency_ms', + 'input_num_turns', + 'input_num_tokens', + 'output_num_tokens', + 'output_num_chunks', + 'output_num_first_chunk_tokens', + 'approx_cached_percent', + 'conversation_id', + 'client_id', + 'interactivity_tok_per_sec', + ]) + + for m in sorted_metrics: + relative_time = (m.start_time_ms - first_start) / 1000.0 + interactivity = 1000.0 / m.tpot_ms if m.tpot_ms > 0 else 0 + + writer.writerow([ + f"{m.start_time_ms:.3f}", + f"{relative_time:.3f}", + f"{m.ttft_ms:.3f}", + f"{m.tpot_ms:.3f}", + f"{m.latency_ms:.3f}", + m.input_num_turns, + m.input_num_tokens, + m.output_num_tokens, + m.output_num_chunks, + m.output_num_first_chunk_tokens, + f"{m.approx_cached_percent:.2f}", + m.conversation_id, + m.client_id, + f"{interactivity:.2f}", + ]) + + print(f"Exported client metrics to {client_csv}") diff --git a/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py new file mode 100644 index 000000000..ddf605324 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Standalone metrics collector for vLLM server. + +Polls the vLLM /metrics endpoint and generates server-side plots. +Designed to run alongside any benchmark client (aiperf, custom, etc.). 
import argparse
import asyncio
import os
import signal
import sys

from bench.metrics_collector import MetricsCollector


async def run(args):
    """Collect vLLM /metrics until a signal or --duration, then emit plots and CSVs.

    Args:
        args: Parsed argparse namespace (url, poll_interval, output_prefix,
              duration, pid_file).

    Exits the process with status 1 when fewer than two snapshots were collected.
    """
    collector = MetricsCollector(
        base_url=args.url,
        poll_interval=args.poll_interval,
    )

    collector.start()
    print(f"Metrics collector started (polling {args.url}/metrics every {args.poll_interval}s)")

    # Optionally publish our PID so an external process can signal us.
    if args.pid_file:
        with open(args.pid_file, "w") as f:
            f.write(str(os.getpid()))
        print(f"PID written to {args.pid_file}")

    # Graceful shutdown on SIGINT/SIGTERM.
    stop_event = asyncio.Event()

    def handle_signal(*_):
        print("\nStopping metrics collector...")
        stop_event.set()

    # FIX: asyncio.get_event_loop() is deprecated inside a coroutine
    # (Python 3.10+); get_running_loop() is the correct call here and is
    # guaranteed to return the loop this coroutine runs on.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, handle_signal)

    # Wait for the duration limit or an external signal, whichever first.
    if args.duration:
        try:
            await asyncio.wait_for(stop_event.wait(), timeout=args.duration)
        except asyncio.TimeoutError:
            print(f"Duration limit reached ({args.duration}s)")
    else:
        await stop_event.wait()

    await collector.stop()

    if len(collector.snapshots) < 2:
        print("Not enough data points collected")
        sys.exit(1)

    print(f"Collected {len(collector.snapshots)} snapshots")

    # Server-only outputs (no client metrics in standalone mode).
    collector.generate_plots(output_prefix=args.output_prefix)
    collector.export_csv(output_prefix=args.output_prefix)

    # Clean up the PID file on a normal exit.
    if args.pid_file and os.path.exists(args.pid_file):
        os.remove(args.pid_file)

    print("Done")


def main():
    """CLI entry point: parse arguments and run the async collector."""
    parser = argparse.ArgumentParser(
        description="Standalone vLLM metrics collector"
    )
    parser.add_argument(
        "--url", "-u",
        default="http://localhost:8888",
        help="vLLM server base URL (default: http://localhost:8888)",
    )
    parser.add_argument(
        "--output-prefix", "-o",
        default="metrics",
        help="Output file prefix (default: metrics)",
    )
    parser.add_argument(
        "--poll-interval",
        type=float,
        default=1.0,
        help="Polling interval in seconds (default: 1.0)",
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        default=None,
        help="Max collection duration in seconds (default: unlimited, stop with signal)",
    )
    parser.add_argument(
        "--pid-file",
        default=None,
        help="Write PID to this file for external signaling",
    )
    args = parser.parse_args()

    asyncio.run(run(args))


if __name__ == "__main__":
    main()
from __future__ import annotations

import argparse
import json
import math
from collections import Counter, defaultdict
from pathlib import Path


def load_records(artifacts_dir: Path) -> list[dict]:
    """Load per-request records from profile_export.jsonl."""
    jsonl_path = artifacts_dir / "profile_export.jsonl"
    records: list[dict] = []
    with open(jsonl_path) as fh:
        for raw in fh:
            raw = raw.strip()
            if raw:  # skip blank lines
                records.append(json.loads(raw))
    return records


def load_trace_replay_records(trace_replay_dir: Path) -> list[dict]:
    """Load per-request records from trace_replay detailed_results.csv.

    Converts to the same format as AIPerf JSONL records so the analyze()
    function can process both formats identically.
    """
    import csv
    import sys
    # Trace rows can carry very large payload fields; lift the CSV field cap.
    csv.field_size_limit(sys.maxsize)

    csv_path = trace_replay_dir / "detailed_results.csv"
    records: list[dict] = []
    with open(csv_path) as fh:
        for row in csv.DictReader(fh):
            # Only successful requests are comparable to AIPerf records.
            if row.get("success") != "True":
                continue
            records.append({
                "metadata": {
                    "x_correlation_id": row["trace_id"],
                    "conversation_id": row["trace_id"],
                    "turn_index": int(row["request_idx"]),
                    "benchmark_phase": "profiling",
                },
                "metrics": {
                    "input_sequence_length": {"value": int(row["input_tokens"])},
                    "output_sequence_length": {"value": int(row["output_tokens_actual"])},
                },
            })
    return records
not in metrics: + continue + # Use x_correlation_id (unique per session) not conversation_id (template, reused) + cid = r["metadata"].get("x_correlation_id") or r["metadata"]["conversation_id"] + ti = r["metadata"]["turn_index"] + isl = metrics["input_sequence_length"]["value"] + osl = metrics["output_sequence_length"]["value"] + convos[cid].append({"turn": ti, "isl": isl, "osl": osl}) + + # Sort turns within each conversation + for v in convos.values(): + v.sort(key=lambda x: x["turn"]) + + # Turn count distribution + turn_counts = Counter(len(v) for v in convos.values()) + total_convos = len(convos) + total_requests = len(records) + + lines = [] + lines.append("=" * 70) + lines.append("BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS") + lines.append("=" * 70) + lines.append(f"Total conversations: {total_convos:,}") + lines.append(f"Total requests: {total_requests:,}") + lines.append(f"Avg turns/conv: {total_requests / total_convos:.2f}") + lines.append("") + + lines.append("TURN COUNT DISTRIBUTION:") + lines.append(f" {'Turns':>5s} {'Count':>6s} {'Pct':>6s} Target") + target = {1: 59, 2: 20, 3: 10, 4: 5, 5: 3, 6: 2, 7: 1} + for k in sorted(turn_counts.keys()): + pct = 100 * turn_counts[k] / total_convos + tgt = f"{target.get(k, 0):.0f}%" if k in target else "" + lines.append(f" {k:5d} {turn_counts[k]:6,} {pct:5.1f}% {tgt}") + + # ISL/OSL by turn index + lines.append("") + lines.append("ISL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + max_turn = max(t["turn"] for v in convos.values() for t in v) + for ti in range(max_turn + 1): + vals = sorted(t["isl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} 
{p95:8.0f}" + ) + + lines.append("") + lines.append("OSL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + for ti in range(max_turn + 1): + vals = sorted(t["osl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" + ) + + # Overall ISL/OSL stats + all_isl = sorted(t["isl"] for v in convos.values() for t in v) + all_osl = sorted(t["osl"] for v in convos.values() for t in v) + n = len(all_isl) + isl_mean = sum(all_isl) / n + osl_mean = sum(all_osl) / n + lines.append("") + lines.append("ALL REQUESTS ISL:") + lines.append( + f" n={n:,} mean={isl_mean:.0f} median={all_isl[n//2]} " + f"p5={all_isl[int(n*0.05)]} p95={all_isl[int(n*0.95)]}" + ) + lines.append("ALL REQUESTS OSL:") + lines.append( + f" n={n:,} mean={osl_mean:.0f} median={all_osl[n//2]} " + f"p5={all_osl[int(n*0.05)]} p95={all_osl[int(n*0.95)]}" + ) + + # Per-conversation stats + conv_max_isl = sorted(max(t["isl"] for t in v) for v in convos.values()) + conv_total_osl = sorted(sum(t["osl"] for t in v) for v in convos.values()) + nc = len(conv_max_isl) + lines.append("") + lines.append("PER-CONVERSATION MAX ISL (final context size):") + lines.append( + f" n={nc:,} mean={sum(conv_max_isl)/nc:.0f} median={conv_max_isl[nc//2]} " + f"p5={conv_max_isl[int(nc*0.05)]} p95={conv_max_isl[int(nc*0.95)]}" + ) + lines.append("PER-CONVERSATION TOTAL OSL:") + lines.append( + f" n={nc:,} mean={sum(conv_total_osl)/nc:.0f} median={conv_total_osl[nc//2]} " + f"p5={conv_total_osl[int(nc*0.05)]} p95={conv_total_osl[int(nc*0.95)]}" + ) + + # ISL context growth (shows accumulation across turns) + lines.append("") + lines.append("ISL CONTEXT GROWTH (sample 
def _generate_plots(
    convos: dict[str, list[dict]], records: list[dict], output_dir: Path
) -> None:
    """Render a 3x3 grid of workload-distribution plots.

    Saves ``workload_distribution_plots.png`` into *output_dir*.  Raises
    ImportError when matplotlib is unavailable; the caller treats plots as
    optional and catches that.  *records* is accepted for interface parity
    with the text analysis; all plots derive from *convos*.
    """
    import matplotlib

    matplotlib.use("Agg")  # headless backend: benchmark hosts have no display
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(3, 3, figsize=(18, 15))
    fig.suptitle("Benchmark Workload Distribution Analysis", fontsize=14)

    # (0,0) Turn count distribution
    ax = axes[0, 0]
    turn_counts = Counter(len(v) for v in convos.values())
    turns = sorted(turn_counts.keys())
    counts = [turn_counts[t] for t in turns]
    total = sum(counts)
    bars = ax.bar(turns, [100 * c / total for c in counts], edgecolor="black", alpha=0.7)
    for bar in bars:  # percentage label above each bar
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            f"{bar.get_height():.0f}%",
            ha="center",
            va="bottom",
            fontsize=8,
        )
    ax.set_xlabel("Number of Turns")
    ax.set_ylabel("% of Conversations")
    ax.set_title(f"Turn Count Distribution (n={total:,})")
    ax.grid(True, alpha=0.3, axis="y")

    # (0,1) All requests ISL histogram
    ax = axes[0, 1]
    all_isl = [t["isl"] for v in convos.values() for t in v]
    # Clip at ~1.2x the p99 so a few huge contexts don't flatten the histogram.
    clip = int(sorted(all_isl)[int(len(all_isl) * 0.99)] * 1.2)
    ax.hist([v for v in all_isl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="steelblue")
    all_isl_sorted = sorted(all_isl)
    median_isl = all_isl_sorted[len(all_isl) // 2]
    mean_isl = sum(all_isl) / len(all_isl)
    ax.axvline(median_isl, color="red", linestyle="--", label=f"Median: {median_isl:,}")
    ax.axvline(mean_isl, color="orange", linestyle="--", label=f"Mean: {mean_isl:,.0f}")
    ax.set_xlabel("Input Sequence Length")
    ax.set_ylabel("Count")
    ax.set_title(f"All Requests ISL (n={len(all_isl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (0,2) All requests OSL histogram
    ax = axes[0, 2]
    all_osl = [t["osl"] for v in convos.values() for t in v]
    # OSL clips at 3000 max — output lengths are much shorter than contexts.
    clip = min(3000, int(sorted(all_osl)[int(len(all_osl) * 0.99)] * 1.2))
    ax.hist([v for v in all_osl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="coral")
    all_osl_sorted = sorted(all_osl)
    median_osl = all_osl_sorted[len(all_osl) // 2]
    mean_osl = sum(all_osl) / len(all_osl)
    ax.axvline(median_osl, color="red", linestyle="--", label=f"Median: {median_osl:,}")
    ax.axvline(mean_osl, color="orange", linestyle="--", label=f"Mean: {mean_osl:,.0f}")
    ax.set_xlabel("Output Sequence Length")
    ax.set_ylabel("Count")
    ax.set_title(f"All Requests OSL (n={len(all_osl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (1,0) Average new prefill tokens by turn index (ISL delta per turn)
    ax = axes[1, 0]
    # First turn contributes its whole ISL; later turns contribute the growth
    # over the previous turn's ISL (clamped at 0 in case of resets).
    deltas_by_turn: dict[int, list[int]] = defaultdict(list)
    for v in convos.values():
        for i, t in enumerate(v):
            if i == 0:
                deltas_by_turn[t["turn"]].append(t["isl"])
            else:
                deltas_by_turn[t["turn"]].append(max(0, t["isl"] - v[i - 1]["isl"]))
    if deltas_by_turn:
        turn_indices = sorted(deltas_by_turn.keys())
        means = [sum(deltas_by_turn[ti]) / len(deltas_by_turn[ti]) for ti in turn_indices]
        ns = [len(deltas_by_turn[ti]) for ti in turn_indices]
        ax.plot(turn_indices, means, marker="o", markersize=3, linewidth=1, color="mediumseagreen")
        ax.fill_between(turn_indices, 0, means, alpha=0.2, color="mediumseagreen")
        # Label first and last points
        if len(turn_indices) > 0:
            ax.annotate(f"{means[0]:,.0f}", (turn_indices[0], means[0]), fontsize=7, ha="left", va="bottom")
        if len(turn_indices) > 1:
            ax.annotate(f"{means[-1]:,.0f}\n(n={ns[-1]})", (turn_indices[-1], means[-1]), fontsize=7, ha="right", va="bottom")
        # Overall mean/median across all deltas
        all_deltas = [d for dlist in deltas_by_turn.values() for d in dlist]
        if all_deltas:
            overall_mean = sum(all_deltas) / len(all_deltas)
            all_deltas_sorted = sorted(all_deltas)
            overall_median = all_deltas_sorted[len(all_deltas) // 2]
            ax.axhline(overall_mean, color="orange", linestyle="--", linewidth=1, label=f"Mean: {overall_mean:,.0f}")
            ax.axhline(overall_median, color="red", linestyle="--", linewidth=1, label=f"Median: {overall_median:,}")
            ax.legend(fontsize=7)
    ax.set_xlabel("Turn Index")
    ax.set_ylabel("Mean New Prefill Tokens")
    ax.set_title("Avg New Prefill Tokens by Turn")
    ax.grid(True, alpha=0.3)

    # (1,1) ISL vs OSL scatter
    ax = axes[1, 1]
    ax.scatter(all_isl, all_osl, alpha=0.15, s=3, c="purple")
    ax.set_xlabel("ISL (tokens)")
    ax.set_ylabel("OSL (tokens)")
    ax.set_title("ISL vs OSL (all requests)")
    ax.grid(True, alpha=0.3)

    # (1,2) Per-conversation max ISL vs num turns scatter
    ax = axes[1, 2]
    conv_turns = [len(v) for v in convos.values()]
    conv_max_isl_list = [max(t["isl"] for t in v) for v in convos.values()]
    ax.scatter(conv_turns, conv_max_isl_list, alpha=0.3, s=8, c="steelblue")
    ax.set_xlabel("Number of Turns")
    ax.set_ylabel("Max ISL (tokens)")
    ax.set_title("Final Context Size vs Turn Count")
    ax.grid(True, alpha=0.3)

    # (2,0) Per-conversation max ISL (final context size per conversation)
    ax = axes[2, 0]
    conv_max_isl = [max(t["isl"] for t in v) for v in convos.values()]
    clip = int(sorted(conv_max_isl)[int(len(conv_max_isl) * 0.99)] * 1.2)
    ax.hist([v for v in conv_max_isl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="steelblue")
    conv_max_isl_sorted = sorted(conv_max_isl)
    median_max = conv_max_isl_sorted[len(conv_max_isl) // 2]
    mean_max = sum(conv_max_isl) / len(conv_max_isl)
    ax.axvline(median_max, color="red", linestyle="--", label=f"Median: {median_max:,}")
    ax.axvline(mean_max, color="orange", linestyle="--", label=f"Mean: {mean_max:,.0f}")
    ax.set_xlabel("Max ISL per Conversation (tokens)")
    ax.set_ylabel("Count")
    ax.set_title(f"Per-Conversation Final Context Size (n={len(conv_max_isl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (2,1) Per-conversation total OSL (sum of all output tokens across turns)
    ax = axes[2, 1]
    conv_total_osl = [sum(t["osl"] for t in v) for v in convos.values()]
    clip = int(sorted(conv_total_osl)[int(len(conv_total_osl) * 0.99)] * 1.2)
    ax.hist([v for v in conv_total_osl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="coral")
    conv_total_osl_sorted = sorted(conv_total_osl)
    median_tosl = conv_total_osl_sorted[len(conv_total_osl) // 2]
    mean_tosl = sum(conv_total_osl) / len(conv_total_osl)
    ax.axvline(median_tosl, color="red", linestyle="--", label=f"Median: {median_tosl:,}")
    ax.axvline(mean_tosl, color="orange", linestyle="--", label=f"Mean: {mean_tosl:,.0f}")
    ax.set_xlabel("Total OSL per Conversation (tokens)")
    ax.set_ylabel("Count")
    ax.set_title(f"Per-Conversation Total Output Tokens (n={len(conv_total_osl):,})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

    # (2,2) is empty — the turns-vs-context scatter already sits at (1,2)
    axes[2, 2].axis("off")

    plt.tight_layout()
    out = output_dir / "workload_distribution_plots.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved plots to {out}")


def main() -> None:
    """CLI entry point: auto-detect the artifact format and run the analysis."""
    parser = argparse.ArgumentParser(
        description="Analyze benchmark workload distributions"
    )
    parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ or trace_replay/ directory")
    parser.add_argument(
        "-o", "--output", default=None, help="Output directory (default: same as artifacts_dir)"
    )
    args = parser.parse_args()

    artifacts_dir = Path(args.artifacts_dir)
    output_dir = Path(args.output) if args.output else artifacts_dir

    # Auto-detect which harness produced the artifacts from its marker file.
    trace_replay_csv = artifacts_dir / "detailed_results.csv"
    aiperf_jsonl = artifacts_dir / "profile_export.jsonl"

    if trace_replay_csv.exists():
        records = load_trace_replay_records(artifacts_dir)
        print(f"Loaded {len(records):,} records from {artifacts_dir} (trace replay)")
    elif aiperf_jsonl.exists():
        records = load_records(artifacts_dir)
        print(f"Loaded {len(records):,} records from {artifacts_dir} (AIPerf)")
    else:
        print(f"No recognized data files in {artifacts_dir}")
        return

    analyze(records, output_dir)


if __name__ == "__main__":
    main()
+ return df + + +def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from aiperf profile_export JSONL. + + Converts aiperf's per-record format into the same column schema + used by the custom benchmark client CSV. + """ + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + meta = entry.get("metadata", {}) + metrics = entry.get("metrics", {}) + + # Skip non-profiling records or cancelled requests + if meta.get("benchmark_phase") != "profiling": + continue + if meta.get("was_cancelled", False): + continue + + # Extract values (aiperf stores metrics as {value, unit} dicts) + def val(key, default=0): + m = metrics.get(key) + if m is None: + return default + return m.get("value", default) if isinstance(m, dict) else m + + # Compute TPOT from ITL if available + itl = metrics.get("inter_token_latency") + if itl and isinstance(itl, dict): + tpot_ms = itl.get("value", 0) + else: + # Fallback: (latency - ttft) / (output_tokens - 1) + osl = val("output_sequence_length", 1) + ttft = val("time_to_first_token", 0) + latency = val("request_latency", 0) + tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 + + # Convert request_start_ns to ms (epoch) + start_ns = meta.get("request_start_ns", 0) + start_ms = start_ns / 1e6 + + records.append({ + "start_time_ms": start_ms, + "ttft_ms": val("time_to_first_token"), + "tpot_ms": tpot_ms, + "latency_ms": val("request_latency"), + "input_num_tokens": val("input_sequence_length"), + "output_num_tokens": val("output_sequence_length"), + }) + + if not records: + return None + + return pd.DataFrame(records) + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 
0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + return pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + + +def load_experiment(exp_dir: Path) -> dict | None: + """Load metrics from a single experiment artifact directory.""" + client_csv = exp_dir / "metrics_client_metrics.csv" + server_csv = exp_dir / "metrics_server_metrics.csv" + status_file = exp_dir / "status.txt" + + if not status_file.exists(): + return None + status = status_file.read_text().strip() + + # Also check for aiperf output + aiperf_jsonl = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) + if not candidates: + candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) + if candidates: + aiperf_jsonl = candidates[0] + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_csv.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + return None + + # Parse experiment name from directory: multiturn_tp{N}_users{M}_offload{mode} + # or just tp{N}_users{M}_offload{mode} + name = exp_dir.name + if name.startswith("multiturn_"): + name = name[len("multiturn_"):] + + try: + parts = name.split("_") + tp = int(parts[0].replace("tp", "")) + users = int(parts[1].replace("users", "").replace("bs", "")) + offload = parts[2].replace("offload", "") + except (IndexError, ValueError): + print(f"Warning: cannot parse experiment name '{exp_dir.name}', skipping") + return None + + result = { + "exp_name": name, + "tp": tp, + "users": users, + "offload": offload, + "status": status, + } + + if status != "SUCCESS": + 
return result + + try: + # Determine data source: custom client CSV, aiperf JSONL, or trace replay CSV + if client_csv.exists(): + df = _load_custom_client_csv(client_csv, exp_dir) + elif aiperf_jsonl is not None: + df = _load_aiperf_jsonl(aiperf_jsonl) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return result + + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + # Fallback: derive from per-request data (first start to last finish) + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), 
+ "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + + # Cache hit rates from server metrics + if server_csv.exists(): + try: + sdf = pd.read_csv(server_csv) + if len(sdf) > 0: + final = sdf.iloc[-1] + if final.get("prefix_cache_queries", 0) > 0: + result["gpu_hit_rate"] = 100 * final["prefix_cache_hits"] / final["prefix_cache_queries"] + if final.get("cpu_prefix_cache_queries", 0) > 0: + result["cpu_hit_rate"] = 100 * final["cpu_prefix_cache_hits"] / final["cpu_prefix_cache_queries"] + except Exception as e: + print(f"Warning: failed to load server metrics for {exp_dir.name}: {e}") + + except Exception as e: + print(f"Warning: failed to load client metrics for {exp_dir.name}: {e}") + + return result + + +def run_pareto_analysis(results_dir: Path, output_dir: Path) -> None: + """Run plot_pareto.py if available, restructuring artifacts to match its + expected layout (subdirs named tp{N}_users{M}_offload{mode}).""" + # plot_pareto.py expects direct subdirectories with experiment names + # The artifact download gives us multiturn_tp{N}_users{M}_offload{mode}/ + # We create symlinks with the canonical names + pareto_input = output_dir / "pareto_input" + pareto_input.mkdir(parents=True, exist_ok=True) + + for subdir in sorted(results_dir.iterdir()): + if not subdir.is_dir(): + continue + name = subdir.name + if name.startswith("multiturn_"): + name = name[len("multiturn_"):] + # plot_pareto.py expects "bs" not "users" in directory names + name = name.replace("_users", "_bs") + link = pareto_input / name + if not link.exists(): + link.symlink_to(subdir.resolve()) + + # Try to import and run plot_pareto + analysis_dir = Path(__file__).resolve().parent.parent / "analysis" + sys.path.insert(0, str(analysis_dir)) + try: + import plot_pareto # type: ignore + plot_pareto.main(pareto_input) + + # Move any generated plots to output dir + for f in pareto_input.glob("*.png"): 
+ f.rename(output_dir / f.name) + for f in pareto_input.glob("*.pdf"): + f.rename(output_dir / f.name) + except Exception as e: + print(f"Warning: plot_pareto analysis failed: {e}") + print("Continuing with summary CSV only.") + + +def main() -> None: + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + artifacts_dir = Path(sys.argv[1]) + output_dir = Path(sys.argv[2]) + output_dir.mkdir(parents=True, exist_ok=True) + + if not artifacts_dir.is_dir(): + print(f"Error: {artifacts_dir} is not a directory") + sys.exit(1) + + # Load all experiments + experiments = [] + for subdir in sorted(artifacts_dir.iterdir()): + if not subdir.is_dir(): + continue + result = load_experiment(subdir) + if result is not None: + experiments.append(result) + + if not experiments: + print("No experiments found.") + sys.exit(0) + + # Write summary CSV + summary_path = output_dir / "summary.csv" + df = pd.DataFrame(experiments) + df.to_csv(summary_path, index=False) + print(f"Summary written to {summary_path} ({len(experiments)} experiments)") + + # Print status summary + success = sum(1 for e in experiments if e.get("status") == "SUCCESS") + failed = sum(1 for e in experiments if e.get("status") == "FAILED") + other = len(experiments) - success - failed + print(f" SUCCESS: {success}, FAILED: {failed}, OTHER: {other}") + + # Run Pareto analysis + run_pareto_analysis(artifacts_dir, output_dir) + + # Run overview plots (throughput vs concurrency, workload consistency) + try: + from plot_sweep_overview import plot_throughput_vs_concurrency, plot_workload_consistency + pareto_input = output_dir / "pareto_input" + summary_csv = pareto_input / "experiment_summary.csv" + if summary_csv.exists(): + overview_df = pd.read_csv(summary_csv) + plot_throughput_vs_concurrency(overview_df, output_dir) + plot_workload_consistency(pareto_input, output_dir) + else: + print("Warning: No experiment_summary.csv found, skipping overview plots") + except Exception as e: + print(f"Warning: 
#!/usr/bin/env python3
"""Generate overview plots for sweep results.

Produces:
- throughput_vs_concurrency.png: Throughput & cache hit rate vs concurrent
  sessions per TP
- workload_consistency.png: ISL distribution box plots per experiment to
  verify a consistent workload across concurrency levels

Usage:
    python plot_sweep_overview.py <pareto_input_dir> [<output_dir>]
"""

import csv
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd


def _plt():
    """Import matplotlib lazily (headless) so CSV-only code paths don't need it."""
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    return plt


def plot_throughput_vs_concurrency(df: pd.DataFrame, output_dir: Path) -> None:
    """Throughput and cache hit rate vs concurrent sessions, one column per TP.

    Expects columns: tp, bs, offload, total_tps_per_gpu, gpu_hit_rate and
    optionally cpu_hit_rate. Saves throughput_vs_concurrency.png.
    """
    plt = _plt()
    tps = sorted(df["tp"].unique())
    n = len(tps)
    if n == 0:
        return

    fig, axes = plt.subplots(2, n, figsize=(7 * n, 10))
    if n == 1:
        # subplots() returns a 1-D array for a single column; normalize shape.
        axes = axes.reshape(2, 1)
    fig.suptitle("Throughput & Cache Hit Rate vs Concurrent Sessions", fontsize=15)

    for idx, tp in enumerate(tps):
        tp_df = df[df["tp"] == tp].sort_values("bs")
        off = tp_df[tp_df["offload"] == "off"].sort_values("bs")
        on = tp_df[tp_df["offload"] == "on"].sort_values("bs")

        # --- Top row: Throughput ---
        ax = axes[0, idx]
        if len(off) > 0:
            ax.plot(off["bs"], off["total_tps_per_gpu"], "o-", color="#d62728",
                    linewidth=2.5, markersize=7, label="Offload OFF")
        if len(on) > 0:
            ax.plot(on["bs"], on["total_tps_per_gpu"], "s-", color="#2ca02c",
                    linewidth=2.5, markersize=7, label="Offload ON")

        # Annotate the largest offload gain when it is substantial (>20%).
        if len(off) > 0 and len(on) > 0:
            merged = pd.merge(off[["bs", "total_tps_per_gpu"]], on[["bs", "total_tps_per_gpu"]],
                              on="bs", suffixes=("_off", "_on"))
            if len(merged) > 0:
                merged["gain_pct"] = ((merged["total_tps_per_gpu_on"] - merged["total_tps_per_gpu_off"])
                                      / merged["total_tps_per_gpu_off"] * 100)
                max_row = merged.loc[merged["gain_pct"].idxmax()]
                if max_row["gain_pct"] > 20:
                    ax.annotate(f"+{max_row['gain_pct']:.0f}%",
                                xy=(max_row["bs"], max_row["total_tps_per_gpu_on"]),
                                xytext=(0, 15), textcoords="offset points",
                                fontsize=11, fontweight="bold", color="green", ha="center")

        ax.set_xlabel("Concurrent Sessions", fontsize=10)
        ax.set_ylabel("Throughput/GPU (tok/s)", fontsize=10)
        ax.set_title(f"TP{tp} — Throughput", fontsize=13, fontweight="bold")
        max_tput = df["total_tps_per_gpu"].max()
        ax.set_ylim(0, max_tput * 1.15 if max_tput > 0 else 15000)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2)

        # --- Bottom row: Cache hit rate ---
        ax = axes[1, idx]
        if len(off) > 0:
            ax.plot(off["bs"], off["gpu_hit_rate"], "o-", color="#d62728",
                    linewidth=2, markersize=6, label="GPU Hit — OFF")
        if len(on) > 0:
            ax.plot(on["bs"], on["gpu_hit_rate"], "s-", color="#2ca02c",
                    linewidth=2, markersize=6, label="GPU Hit — ON")
            # cpu_hit_rate is only emitted when the server reported CPU cache
            # queries, so guard against the column being absent entirely.
            if "cpu_hit_rate" in on.columns:
                cpu_hit = on["cpu_hit_rate"].fillna(0)
                if cpu_hit.max() > 1:
                    ax.plot(on["bs"], cpu_hit, "v--", color="#9467bd",
                            linewidth=2, markersize=6, label="CPU Hit — ON")

        ax.set_xlabel("Concurrent Sessions", fontsize=10)
        ax.set_ylabel("Cache Hit Rate (%)", fontsize=10)
        ax.set_title(f"TP{tp} — Cache Hit Rate", fontsize=13, fontweight="bold")
        ax.set_ylim(0, 105)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2)

    plt.tight_layout()
    out = output_dir / "throughput_vs_concurrency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out}")


def plot_workload_consistency(pareto_input_dir: Path, output_dir: Path) -> None:
    """ISL distribution box plots per experiment to verify consistent workload.

    Reads trace_replay/detailed_results.csv from each offload-off tp*/
    experiment directory and saves workload_consistency.png.
    """
    # Trace rows can carry very large fields (full prompts).
    csv.field_size_limit(sys.maxsize)

    # tp -> list of (concurrency, experiment name, ISL samples in k tokens)
    data_by_tp: dict[int, list[tuple[int, str, list[float]]]] = defaultdict(list)

    for exp_dir in sorted(pareto_input_dir.iterdir()):
        if not exp_dir.is_dir() or not exp_dir.name.startswith("tp"):
            continue
        if "offloadon" in exp_dir.name:
            continue  # Only use offload-off for the consistency check

        parts = exp_dir.name.split("_")
        try:
            tp = int(parts[0].replace("tp", ""))
            bs = int(parts[1].replace("bs", ""))
        except (IndexError, ValueError):
            continue

        csv_path = exp_dir / "trace_replay" / "detailed_results.csv"
        if not csv_path.exists():
            continue

        isls = []
        try:
            with open(csv_path) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    if row.get("success") == "True":
                        isls.append(int(row["input_tokens"]) / 1000)  # k tokens
        except Exception:
            continue  # best-effort: skip unreadable experiments

        if isls:
            data_by_tp[tp].append((bs, exp_dir.name, isls))

    if not data_by_tp:
        print("No workload data found for consistency plot")
        return

    plt = _plt()
    sorted_tps = sorted(data_by_tp.keys())
    n = len(sorted_tps)

    fig, axes = plt.subplots(1, n, figsize=(7 * n, 6))
    if n == 1:
        axes = [axes]
    fig.suptitle("Workload Consistency — ISL Distribution Per Experiment (Offload OFF)", fontsize=14)

    for idx, tp in enumerate(sorted_tps):
        ax = axes[idx]
        entries = sorted(data_by_tp[tp], key=lambda x: x[0])

        box_data = [e[2] for e in entries]
        labels = [str(e[0]) for e in entries]
        means = [np.mean(e[2]) for e in entries]

        bp = ax.boxplot(box_data, tick_labels=labels, patch_artist=True,
                        showfliers=False, widths=0.6,
                        medianprops=dict(color="red", linewidth=2))
        for patch in bp["boxes"]:
            patch.set_facecolor("steelblue")
            patch.set_alpha(0.6)

        ax.plot(range(1, len(means) + 1), means, "o--", color="orange", linewidth=2,
                markersize=6, label=f"Mean ({np.mean(means):.0f}k ± {np.std(means):.0f}k)", zorder=5)

        overall_mean = np.mean(means)
        overall_std = np.std(means)
        ax.axhspan(overall_mean - overall_std, overall_mean + overall_std,
                   alpha=0.1, color="orange", label="±1σ band")
        ax.axhline(overall_mean, color="orange", linestyle=":", alpha=0.5)

        ax.set_xlabel("Concurrent Sessions", fontsize=11)
        ax.set_ylabel("ISL (k tokens)", fontsize=11)
        ax.set_title(f"TP{tp}", fontsize=13, fontweight="bold")
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2, axis="y")
        ax.set_ylim(0, 140)

    plt.tight_layout()
    out = output_dir / "workload_consistency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out}")


def main():
    """CLI entry point: load the experiment summary and render both plots."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <pareto_input_dir> [<output_dir>]")
        sys.exit(1)

    pareto_input_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else pareto_input_dir.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load experiment summary, falling back to the collector's summary.csv.
    summary_csv = pareto_input_dir / "experiment_summary.csv"
    if not summary_csv.exists():
        summary_csv = output_dir / "summary.csv"
        if not summary_csv.exists():
            print(f"No summary CSV found in {pareto_input_dir} or {output_dir}")
            return

    df = pd.read_csv(summary_csv)

    # NOTE(review): collect_sweep_results.py's summary.csv uses a "users"
    # column and has no "total_tps_per_gpu", so the fallback path likely always
    # fails this check — confirm whether summary.csv should be remapped.
    required = ["tp", "bs", "offload", "total_tps_per_gpu", "gpu_hit_rate"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        print(f"Missing columns in summary: {missing}")
        return

    plot_throughput_vs_concurrency(df, output_dir)
    plot_workload_consistency(pareto_input_dir, output_dir)


if __name__ == "__main__":
    main()
--job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)" @@ -188,7 +188,7 @@ else --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID From 28991ebac6d1e51c63ffc136d42f40d9d59e2ae7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 15:27:35 -0500 Subject: [PATCH 02/33] remove deprecated GpuTransferCollector from metrics collector Replaced by vLLM's native kv_offload metrics. Removes subprocess/threading imports and ~100 lines of dead code. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/bench/metrics_collector.py | 105 ------------------ 1 file changed, 105 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index c129f38b8..064795f51 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -6,8 +6,6 @@ import asyncio import csv import re -import subprocess -import threading import time from dataclasses import dataclass, field from pathlib import Path @@ -16,109 +14,6 @@ import matplotlib.pyplot as plt -@dataclass -class GpuTransferSnapshot: - timestamp: float - gpu_id: int = 0 - tx_pci: float = 0.0 # PCIe TX (MB/s) - rx_pci: float = 0.0 # PCIe RX (MB/s) - - -class GpuTransferCollector: - """DEPRECATED: Collects GPU transfer stats using nvidia-smi dmon. - - Replaced by vLLM's native kv_offload metrics (vllm:kv_offload_total_bytes_total, - vllm:kv_offload_total_time_total) which are more precise and don't require - spawning a subprocess. 
- """ - - def __init__(self, gpu_id: int = 0, poll_interval: int = 1): - self.gpu_id = gpu_id - self.poll_interval = poll_interval - self.snapshots: list[GpuTransferSnapshot] = [] - self._process: subprocess.Popen | None = None - self._thread: threading.Thread | None = None - self._running = False - - def _parse_line(self, line: str) -> GpuTransferSnapshot | None: - """Parse a line of nvidia-smi dmon CSV output. - - Format: gpu, rxpci, txpci (values in MB/s) - Example: 0, 406, 32013 - """ - line = line.strip() - if not line or line.startswith('#'): # Skip header/comments - return None - - parts = [p.strip() for p in line.split(',')] - if len(parts) < 3: - return None - - try: - return GpuTransferSnapshot( - timestamp=time.time(), - gpu_id=int(parts[0]), - rx_pci=float(parts[1]) if parts[1] != '-' else 0.0, - tx_pci=float(parts[2]) if parts[2] != '-' else 0.0, - ) - except (ValueError, IndexError): - return None - - def _reader_thread(self) -> None: - """Background thread to read nvidia-smi output.""" - if self._process is None: - return - - for line in iter(self._process.stdout.readline, ''): - if not self._running: - break - snapshot = self._parse_line(line) - if snapshot and snapshot.gpu_id == self.gpu_id: - self.snapshots.append(snapshot) - - def start(self) -> None: - """Start collecting GPU transfer stats.""" - if self._running: - return - - self._running = True - self.snapshots = [] - - try: - self._process = subprocess.Popen( - [ - 'nvidia-smi', 'dmon', - '-i', str(self.gpu_id), - '-s', 't', - '-d', str(self.poll_interval), - '--format', 'csv', - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - self._thread = threading.Thread(target=self._reader_thread, daemon=True) - self._thread.start() - except FileNotFoundError: - print("nvidia-smi not found, GPU transfer monitoring disabled") - self._running = False - - def stop(self) -> None: - """Stop collecting GPU transfer stats.""" - self._running = False - if self._process: - 
self._process.terminate() - try: - self._process.wait(timeout=2) - except subprocess.TimeoutExpired: - self._process.kill() - self._process = None - - if self._thread: - self._thread.join(timeout=2) - self._thread = None - - @dataclass class MetricsSnapshot: timestamp: float From 695ec2e03f62e9d0e523cb084f6c72297d3447a8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 16:50:32 -0500 Subject: [PATCH 03/33] modularize metrics collector with backend auto-detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VLLMMetricsParser and SGLangMetricsParser with shared MetricsSnapshot. Backend is auto-detected from metrics prefix (vllm: vs sglang:) on first poll. sglang metrics mapped: - token_usage / num_used_tokens → kv_cache_usage - num_running_reqs → num_requests_running - num_queue_reqs → num_requests_waiting - cache_hit_rate × prompt_tokens → prefix_cache_hits/queries - num_retracted_reqs → num_preemptions - realtime_tokens_total mode=prefill_compute/prefill_cache → token source Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/bench/metrics_collector.py | 235 ++++++++++-------- 1 file changed, 129 insertions(+), 106 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 064795f51..6091318c0 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -1,6 +1,7 @@ """ -Metrics collector for vLLM server during benchmarks. +Metrics collector for inference servers during benchmarks. Polls /metrics endpoint and generates visualizations. +Supports vLLM and sglang backends (auto-detected from metrics prefix). 
""" import asyncio @@ -9,6 +10,7 @@ import time from dataclasses import dataclass, field from pathlib import Path +from typing import Protocol import aiohttp import matplotlib.pyplot as plt @@ -43,123 +45,144 @@ class MetricsSnapshot: prefill_kv_computed_tokens_count: int = 0 -@dataclass -class MetricsCollector: - base_url: str - poll_interval: float = 1.0 - snapshots: list[MetricsSnapshot] = field(default_factory=list) - _running: bool = False - _task: asyncio.Task | None = None - gpu_transfer_collector: GpuTransferCollector | None = None - gpu_id: int = 0 +# ============================================================================= +# Metrics Parsers — one per backend +# ============================================================================= + +def _get_value(text: str, pattern: str, default: float = 0.0) -> float: + """Extract a gauge/counter value from Prometheus text using a regex.""" + match = re.search(pattern, text) + return float(match.group(1)) if match else default - def _parse_metrics(self, text: str) -> MetricsSnapshot: - """Parse Prometheus metrics text format.""" - snapshot = MetricsSnapshot(timestamp=time.time()) - # Helper to extract gauge/counter value - def get_value(pattern: str, default: float = 0.0) -> float: - match = re.search(pattern, text) - if match: - return float(match.group(1)) - return default +class VLLMMetricsParser: + """Parse vLLM Prometheus metrics (prefix: vllm:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) # KV cache usage (0-1 scale) - snapshot.kv_cache_usage = get_value( - r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - # Fallback to old metric name if new one not found + snapshot.kv_cache_usage = g(r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') if snapshot.kv_cache_usage == 0.0: - snapshot.kv_cache_usage = get_value( - r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - - # CPU/offloaded 
KV cache usage - snapshot.cpu_kv_cache_usage = get_value( - r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - - # Running/waiting requests - snapshot.num_requests_running = int(get_value( - r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.num_requests_waiting = int(get_value( - r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefix cache (cumulative counters) - GPU - snapshot.prefix_cache_hits = int(get_value( - r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prefix_cache_queries = int(get_value( - r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefix cache - external/offloaded (KV connector cross-instance cache) - snapshot.cpu_prefix_cache_hits = int(get_value( - r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.cpu_prefix_cache_queries = int(get_value( - r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Token counters - snapshot.prompt_tokens = int(get_value( - r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.generation_tokens = int(get_value( - r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Preemptions - snapshot.num_preemptions = int(get_value( - r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Request success (sum all finish reasons) + snapshot.kv_cache_usage = g(r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.cpu_kv_cache_usage = g(r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.num_requests_running = int(g(r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefix_cache_hits = int(g(r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefix_cache_queries = int(g(r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.cpu_prefix_cache_hits = 
int(g(r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.cpu_prefix_cache_queries = int(g(r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prompt_tokens = int(g(r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.generation_tokens = int(g(r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.num_preemptions = int(g(r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)')) + for match in re.finditer( - r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', - text + r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', text ): snapshot.request_success += int(float(match.group(1))) - # KV offload bytes transferred (cumulative counters by direction) - snapshot.kv_offload_bytes_gpu_to_cpu = get_value( - r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' - ) - snapshot.kv_offload_bytes_cpu_to_gpu = get_value( - r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' - ) - - # KV offload time (cumulative, seconds) - snapshot.kv_offload_time_gpu_to_cpu = get_value( - r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' - ) - snapshot.kv_offload_time_cpu_to_gpu = get_value( - r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' - ) - - # Prompt tokens by source (cumulative) - snapshot.prompt_tokens_local_compute = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prompt_tokens_local_cache_hit = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prompt_tokens_external_kv_transfer = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefill KV computed tokens (histogram sum and 
count) - snapshot.prefill_kv_computed_tokens_sum = int(get_value( - r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prefill_kv_computed_tokens_count = int(get_value( - r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)' - )) + snapshot.kv_offload_bytes_gpu_to_cpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_bytes_cpu_to_gpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_gpu_to_cpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_cpu_to_gpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + + snapshot.prompt_tokens_local_compute = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_external_kv_transfer = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefill_kv_computed_tokens_sum = int(g(r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefill_kv_computed_tokens_count = int(g(r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)')) return snapshot + +class SGLangMetricsParser: + """Parse sglang Prometheus metrics (prefix: sglang:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) + + # KV cache usage — sglang reports token_usage as a ratio (0-1) + snapshot.kv_cache_usage = g(r'sglang:token_usage\{[^}]*\}\s+([\d.e+-]+)') + # Fallback: compute from num_used_tokens / max_total_num_tokens + if snapshot.kv_cache_usage == 0.0: 
+ used = g(r'sglang:num_used_tokens\{[^}]*\}\s+([\d.e+-]+)') + total = g(r'sglang:max_total_num_tokens\{[^}]*\}\s+([\d.e+-]+)') + if total > 0: + snapshot.kv_cache_usage = used / total + + snapshot.num_requests_running = int(g(r'sglang:num_running_reqs\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'sglang:num_queue_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + # sglang exposes cache_hit_rate as a direct gauge (0-1) + # We convert to cumulative-style by tracking hits/queries from token sources + cache_hit_rate = g(r'sglang:cache_hit_rate\{[^}]*\}\s+([\d.e+-]+)') + prompt_tokens = int(g(r'sglang:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens = prompt_tokens + # Approximate cumulative cache hits from rate × total prompts + if prompt_tokens > 0 and cache_hit_rate > 0: + snapshot.prefix_cache_queries = prompt_tokens + snapshot.prefix_cache_hits = int(prompt_tokens * cache_hit_rate) + + snapshot.generation_tokens = int(g(r'sglang:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Preemptions — sglang calls them "retractions" + snapshot.num_preemptions = int(g(r'sglang:num_retracted_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.request_success = int(g(r'sglang:num_requests_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Token source breakdown from realtime_tokens_total + snapshot.prompt_tokens_local_compute = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_cache"[^}]*\}\s+([\d.e+-]+)')) + + return snapshot + + +def detect_backend(text: str) -> str: + """Auto-detect backend from metrics text.""" + if 'vllm:' in text: + return 'vllm' + elif 'sglang:' in text: + return 'sglang' + return 'unknown' + + +def get_parser(backend: str): + """Get the appropriate parser for the backend.""" + if backend == 'sglang': + return SGLangMetricsParser() + return VLLMMetricsParser() # default + + +@dataclass +class 
MetricsCollector: + base_url: str + poll_interval: float = 1.0 + snapshots: list[MetricsSnapshot] = field(default_factory=list) + _running: bool = False + _task: asyncio.Task | None = None + _parser: VLLMMetricsParser | SGLangMetricsParser | None = None + _backend: str = "" + + def _parse_metrics(self, text: str) -> MetricsSnapshot: + """Parse Prometheus metrics text, auto-detecting backend on first call.""" + if self._parser is None: + self._backend = detect_backend(text) + self._parser = get_parser(self._backend) + if self._backend != 'unknown': + print(f"Auto-detected metrics backend: {self._backend}") + return self._parser.parse(text) + async def _poll_loop(self) -> None: """Background polling loop.""" metrics_url = f"{self.base_url}/metrics" From 6a41d49a2345207899e5f8c30e48078abccb25b2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 16:51:19 -0500 Subject: [PATCH 04/33] remove unused Protocol import --- experimental/multiturn/vllm_benchmark/bench/metrics_collector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 6091318c0..7bcdf31a4 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -10,7 +10,6 @@ import time from dataclasses import dataclass, field from pathlib import Path -from typing import Protocol import aiohttp import matplotlib.pyplot as plt From c137677e1f0d5b90617d3578ae99f404ceb2a55c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:09:47 -0500 Subject: [PATCH 05/33] add LMCache agentic trace benchmark for H100 Replays SWE-bench/GAIA/WildClaw traces from sammshen/lmcache-agentic-traces via AIPerf with mooncake_trace format. Downloads and converts traces at runtime. Supports concurrency sweep with offload on/off. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp8_h100_lmcache_aiperf.sh | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh new file mode 100755 index 000000000..fb02a79a1 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP8 models on H100 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. +# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) +# REQUEST_RATE (default: 0 = no rate limit, concurrency-burst mode) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +REQUEST_RATE=${REQUEST_RATE:-0} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by 
non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with 
open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +# ---- Convert LMCache traces to mooncake format ----------------------------- +echo "Downloading and converting LMCache traces..." +python3 -c " +import json, os +try: + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + out_path = '$TRACE_FILE' + sessions = set() + with open(out_path, 'w') as f: + for row in ds: + entry = { + 'session_id': row['session_id'], + 'messages': row['input'], + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) + print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') +except Exception as e: + print(f'ERROR converting traces: {e}') + exit(1) +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ "$REQUEST_RATE" != "0" ]; then + AIPERF_CMD+=" --request-rate $REQUEST_RATE" +fi +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --extra-inputs ignore_eos:true" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > 
"$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From ee767671f52da38c31d355ab359b9a0d8000d532 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:15:20 -0500 Subject: [PATCH 06/33] add H100 LMCache trace sweep config Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 5ec98b902..e19780a21 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -20,6 +20,17 @@ mi355x-fp8-llama70b: users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] offload: ["on", "off"] +h100-fp8-llama70b-lmcache: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20] + offload: ["on", "off"] + tp4: + users: [2, 4, 8, 12, 16, 20, 24, 32, 40] + offload: ["on", "off"] + tp8: + users: [2, 4, 8, 16, 24, 32, 48, 64] + offload: ["on", "off"] + 
b200-fp4-dsr1: tp4: ep: 4 From 839ba0f8de99ef541ca8c652a6bfe087479e5a02 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:30:54 -0500 Subject: [PATCH 07/33] fix LMCache benchmark: use fixed-schedule replay, remove ignore_eos - Add --fixed-schedule to replay at exact trace timestamps - Remove --extra-inputs ignore_eos:true (let model stop naturally) - Remove unused REQUEST_RATE logic Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index fb02a79a1..53d2c03b1 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -11,7 +11,6 @@ set -x # Optional: # PORT (default 8888), REQUEST_TIMEOUT (default 3600) # DURATION (if set, runs for this many seconds; otherwise runs to completion) -# REQUEST_RATE (default: 0 = no rate limit, concurrency-burst mode) source "$(dirname "$0")/../benchmark_lib.sh" @@ -25,7 +24,6 @@ check_env_vars \ PORT=${PORT:-8888} REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} -REQUEST_RATE=${REQUEST_RATE:-0} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -183,17 +181,14 @@ AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --fixed-schedule" AIPERF_CMD+=" --concurrency $USERS" -if [ "$REQUEST_RATE" != "0" ]; then - AIPERF_CMD+=" --request-rate $REQUEST_RATE" -fi if [ -n "${DURATION:-}" ]; then AIPERF_CMD+=" --benchmark-duration $DURATION" AIPERF_CMD+=" --benchmark-grace-period 0" fi AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" -AIPERF_CMD+=" --extra-inputs 
ignore_eos:true" AIPERF_CMD+=" --export-level records" AIPERF_CMD+=" --ui-type simple" AIPERF_CMD+=" --random-seed 42" From fc8e3cf02d7975931233bcd43589030ab036d829 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:36:00 -0500 Subject: [PATCH 08/33] remove --fixed-schedule: use concurrency mode per Samuel's recommendation --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 53d2c03b1..ff10f0252 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -181,7 +181,6 @@ AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" -AIPERF_CMD+=" --fixed-schedule" AIPERF_CMD+=" --concurrency $USERS" if [ -n "${DURATION:-}" ]; then AIPERF_CMD+=" --benchmark-duration $DURATION" From 6bbbfa989d23789385897fb015b2271a89390293 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 21:23:45 -0500 Subject: [PATCH 09/33] update yaml --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index e19780a21..500a6705e 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h100-fp8-llama70b-lmcache: tp2: - users: [2, 4, 6, 8, 10, 12, 16, 20] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] offload: ["on", "off"] tp4: - users: [2, 4, 8, 12, 16, 20, 24, 32, 40] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 24, 32, 48, 64] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", 
"off"] b200-fp4-dsr1: From a2e4fe64351a31f378eb535e903555995b9f9341 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 21:51:24 -0500 Subject: [PATCH 10/33] fix H100 runner: add SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h100-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..28e89e0cb 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -31,7 +31,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID From fee02780917b7755aec804fcea39dc940160ddaf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 22:45:33 -0500 Subject: [PATCH 11/33] fix: mkdir RESULT_DIR before trace conversion --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index ff10f0252..1bec35577 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -81,6 +81,8 @@ with open(sys.argv[1], 'w') as f: PYEOF fi +mkdir -p "$RESULT_DIR" + # ---- Convert LMCache traces to mooncake format ----------------------------- echo "Downloading and converting LMCache traces..." 
python3 -c " From 769532c3985bd24714d65dfdf3ad6e3651c9b60c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:15:28 -0500 Subject: [PATCH 12/33] add H200 LMCache trace benchmark and config Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 11 + .../multiturn_fp8_h200_lmcache_aiperf.sh | 226 ++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 500a6705e..bb0e568d3 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -20,6 +20,17 @@ mi355x-fp8-llama70b: users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] offload: ["on", "off"] +h200-fp8-llama70b-lmcache: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + offload: ["on", "off"] + tp4: + users: [4, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp8: + users: [2, 4, 8, 16, 32, 48, 64, 80] + offload: ["on", "off"] + h100-fp8-llama70b-lmcache: tp2: users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh new file mode 100755 index 000000000..9a0c89e5a --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -0,0 +1,226 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP8 models on H200 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. 
+# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes 
"AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +mkdir -p "$RESULT_DIR" + +# ---- Convert LMCache traces to mooncake format ----------------------------- +echo "Downloading and converting LMCache traces..." 
+python3 -c " +import json, os +try: + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + out_path = '$TRACE_FILE' + sessions = set() + with open(out_path, 'w') as f: + for row in ds: + entry = { + 'session_id': row['session_id'], + 'messages': row['input'], + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) + print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') +except Exception as e: + print(f'ERROR converting traces: {e}') + exit(1) +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > 
"$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From 02876afda83786ab96df394708356f99076d9fe0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:15:47 -0500 Subject: [PATCH 13/33] update yaml --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index bb0e568d3..63892d202 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h200-fp8-llama70b-lmcache: tp2: - users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] offload: ["on", "off"] tp4: - users: [4, 8, 16, 24, 32, 40, 48, 56] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 32, 48, 64, 80] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] h100-fp8-llama70b-lmcache: From 2134fd8664effdb5066834c2e81a5c53a50ce3fd Mon Sep 17 00:00:00 2001 From: Cam Quilici 
Date: Wed, 1 Apr 2026 23:19:08 -0500 Subject: [PATCH 14/33] fix H200-nb runner: add SCRIPT_SUFFIX support --- runners/launch_h200-nb.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..8c75700df 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -19,4 +19,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh From ab2812a8eaea1d52c1d08e37383d7649308ca613 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:23:08 -0500 Subject: [PATCH 15/33] fix all H200 runners: add SCRIPT_SUFFIX support --- runners/launch_h200-cw.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..c4bdad736 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -44,7 +44,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..e09eaeeed 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -258,7 +258,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash 
benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp')${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID From 5aa993f5eef7ecf3625bb861c04530e976d2a1a0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 08:20:01 -0500 Subject: [PATCH 16/33] fix all runners: add SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_b200-dgxc-slurm.sh | 2 +- runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nb.sh | 2 +- runners/launch_gb200-nv.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_mi300x-amds.sh | 2 +- runners/launch_mi325x-amd.sh | 2 +- runners/launch_mi355x-amds.sh | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0d1bd40cc..d2ad4bc5d 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -234,5 +234,5 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f8c614936..8243fd6d0 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -36,7 +36,7 @@ docker run --rm --init --network host --name $server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ 
-benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..eda4b17ba 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -17,4 +17,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh \ No newline at end of file diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f8f0ef26e..8d20ea162 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -63,7 +63,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" && -z "$CONFIG_FILE" ]]; then else BENCHMARK_SUBDIR="single_node" fi - bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" + bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME%.sh}${SCRIPT_SUFFIX:-}.sh" # Wait for all jobs to complete echo "Waiting for all jobs to complete..." 
while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..7539d99db 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -15,4 +15,4 @@ docker run --rm --network=host --name=$server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..98af3caf2 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -247,7 +247,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b654c515a..8b9896e00 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID \ No newline at end of file diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 67f93a309..e6c3ca4e4 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash 
benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index ac91177ca..2069774ba 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -57,7 +57,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then else BENCHMARK_SUBDIR="single_node" fi - JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME%.sh}${SCRIPT_SUFFIX:-}.sh") # Wait for job to complete LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" From d5dd15103276a358988792f7d8d41c37b5ff07d0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 08:35:56 -0500 Subject: [PATCH 17/33] reduce multiturn artifact size: upload only files needed for post-processing Drops ~18GB per artifact by excluding inputs.json, conversations.jsonl, responses.json, GPU telemetry, raw records, and full aiperf_artifacts/. Only uploads the specific files used by collect_sweep_results.py and plot_pareto.py. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index a72034b14..20777d0eb 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -156,18 +156,17 @@ jobs: results/metrics_server_metrics.csv results/metrics_plots.png results/benchmark.log - results/server.log results/config.yaml results/vllm_command.txt results/benchmark_command.txt results/benchmark_metadata.json results/metrics_workload.png - results/responses.json - results/aiperf_artifacts/ - results/conversations.jsonl + results/aiperf_artifacts/profile_export_aiperf.csv + results/aiperf_artifacts/profile_export_aiperf.json + results/aiperf_artifacts/profile_export.jsonl results/workload_distribution_summary.txt results/workload_distribution_plots.png - results/trace_replay/ + results/trace_replay/detailed_results.csv results/status.txt if-no-files-found: ignore From bd4ec30ec4d83fefc403828b61db0fe599c00aab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 10:04:42 -0500 Subject: [PATCH 18/33] add exclusive --- runners/launch_h100-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 98af3caf2..b3190577a 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -242,6 +242,7 @@ else fi srun --jobid=$JOB_ID \ + --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From a12cc9d2498c2571b98e9bb4239a3c2c047901f4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 10:04:57 -0500 Subject: [PATCH 19/33] add exclusive --- runners/launch_b200-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff 
--git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index d2ad4bc5d..3ff289e61 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -229,6 +229,7 @@ else fi srun --jobid=$JOB_ID \ + --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From af49d11635ee979f36b9550bfcf56199671a8ce3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:04:40 -0500 Subject: [PATCH 20/33] add exclusive --- runners/launch_h100-dgxc-slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index b3190577a..124c8de6e 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -3,7 +3,7 @@ # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" -SLURM_EXCLUDED_NODELIST="hpc-gpu-1-7" +SLURM_EXCLUDED_NODELIST="hpc-gpu-1-1,hpc-gpu-1-7,hpc-gpu-1-18" set -x From 48ef44d54c63823ff127c214e0785dc4f70cafb2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:14:32 -0500 Subject: [PATCH 21/33] use aiperf summary CSV instead of per-record JSONL for post-processing The profile_export.jsonl with 233K records was ~10GB per artifact. Switch collect_sweep_results.py and plot_pareto.py to read from the pre-computed profile_export_aiperf.csv (~4KB) instead. Remove the JSONL from the artifact upload. Existing client CSV and trace_replay paths are unchanged. Also exclude low-FreeMem H100 nodes (1, 7, 18) to avoid cudaMallocHost/mlock failures during vLLM CPU KV cache allocation. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workflows/benchmark-multiturn-tmpl.yml | 1 - .../vllm_benchmark/analysis/plot_pareto.py | 172 +++++++------ .../scripts/collect_sweep_results.py | 242 ++++++++++-------- 3 files changed, 223 insertions(+), 192 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 20777d0eb..7c1d5ce0d 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -163,7 +163,6 @@ jobs: results/metrics_workload.png results/aiperf_artifacts/profile_export_aiperf.csv results/aiperf_artifacts/profile_export_aiperf.json - results/aiperf_artifacts/profile_export.jsonl results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 277bfca7f..7da67c8a4 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -17,53 +17,69 @@ from pathlib import Path -def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from aiperf profile_export JSONL.""" - records = [] - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if not line: - continue - entry = json.loads(line) - meta = entry.get("metadata", {}) - metrics = entry.get("metrics", {}) - - if meta.get("benchmark_phase") != "profiling": - continue - if meta.get("was_cancelled", False): - continue - - def val(key, default=0): - m = metrics.get(key) - if m is None: - return default - return m.get("value", default) if isinstance(m, dict) else m - - itl = metrics.get("inter_token_latency") - if itl and isinstance(itl, dict): - tpot_ms = itl.get("value", 0) - else: - osl = val("output_sequence_length", 1) - ttft = 
val("time_to_first_token", 0) - latency = val("request_latency", 0) - tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 - - start_ns = meta.get("request_start_ns", 0) - start_ms = start_ns / 1e6 - - records.append({ - "start_time_ms": start_ms, - "ttft_ms": val("time_to_first_token"), - "tpot_ms": tpot_ms, - "latency_ms": val("request_latency"), - "input_num_tokens": val("input_sequence_length"), - "output_num_tokens": val("output_sequence_length"), - }) - - if not records: +def _load_aiperf_summary_csv(csv_path: Path, exp_dir: Path, tp: int, + gpu_hit_rate: float | None, + cpu_hit_rate: float | None) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: return None - return pd.DataFrame(records) + + per_metric = df[df["avg"].notna()].set_index("Metric") + scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + + def metric_stat(metric_name, stat): + if metric_name in per_metric.index: + return float(per_metric.loc[metric_name, stat]) + return 0 + + def scalar_val(metric_name): + if metric_name in scalars.index: + return float(scalars.loc[metric_name, "min"]) + return 0 + + exp_name = exp_dir.name + parts = exp_name.split("_") + tp_parsed = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + offload = parts[2].replace("offload", "") + + num_requests = int(scalar_val("Request Count")) + throughput_rps = scalar_val("Request Throughput (requests/sec)") + output_throughput_tps = scalar_val("Output Token Throughput (tokens/sec)") + total_throughput_tps = scalar_val("Total Token Throughput (tokens/sec)") + input_throughput_tps = total_throughput_tps - output_throughput_tps + + return { + "exp_name": exp_name, + "tp": tp_parsed, + "bs": bs, + "offload": offload, + "num_requests": num_requests, + "throughput_rps": throughput_rps, + "input_throughput_tps": input_throughput_tps, + "total_throughput_tps": total_throughput_tps, + 
"input_tps_per_gpu": input_throughput_tps / tp_parsed, + "output_tps_per_gpu": output_throughput_tps / tp_parsed, + "total_tps_per_gpu": total_throughput_tps / tp_parsed, + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "p999_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), # p999 not available, use p99 + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + "p999_latency_ms": metric_stat("Request Latency (ms)", "p99"), # p999 not available, use p99 + "p999_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), # p999 not available, use p99 + "gpu_hit_rate": gpu_hit_rate, + "cpu_hit_rate": cpu_hit_rate, + } def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: @@ -103,43 +119,46 @@ def load_experiment_data(exp_dir: Path) -> dict | None: if status != "SUCCESS": return None - # Also check for aiperf output - aiperf_jsonl = None + # Check for aiperf summary CSV (preferred) + aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): - candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) - if not candidates: - candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) - if candidates: - aiperf_jsonl = candidates[0] + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + 
aiperf_summary_csv = candidate # Check for trace replay output trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - if not client_metrics_file.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + if not client_metrics_file.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): return None try: - if client_metrics_file.exists(): - df = pd.read_csv(client_metrics_file) - elif aiperf_jsonl is not None: - df = _load_aiperf_jsonl(aiperf_jsonl) - elif trace_replay_csv.exists(): - df = _load_trace_replay_csv(trace_replay_csv) - else: - return None - # Load server metrics for cache hit rates gpu_hit_rate = None cpu_hit_rate = None if server_metrics_file.exists(): server_df = pd.read_csv(server_metrics_file) - # Get final cumulative values final_row = server_df.iloc[-1] if final_row["prefix_cache_queries"] > 0: gpu_hit_rate = 100 * final_row["prefix_cache_hits"] / final_row["prefix_cache_queries"] if final_row["cpu_prefix_cache_queries"] > 0: cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] + + # Use aiperf summary CSV directly if available + if aiperf_summary_csv is not None and not client_metrics_file.exists(): + exp_name = exp_dir.name + parts = exp_name.split("_") + tp = int(parts[0].replace("tp", "")) + return _load_aiperf_summary_csv(aiperf_summary_csv, exp_dir, tp, gpu_hit_rate, cpu_hit_rate) + + if client_metrics_file.exists(): + df = pd.read_csv(client_metrics_file) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return None + if len(df) == 0: return None @@ -151,7 +170,6 @@ def load_experiment_data(exp_dir: Path) -> dict | None: offload = parts[2].replace("offload", "") # Calculate metrics - # Prefer benchmark_metadata.json for precise wall-clock duration metadata_file = exp_dir / "benchmark_metadata.json" total_time_sec = None if metadata_file.exists(): @@ -162,33 +180,20 @@ def load_experiment_data(exp_dir: Path) -> 
dict | None: except Exception: pass - # Fallback: derive from per-request data (first start to last finish) if not total_time_sec or total_time_sec <= 0: first_start_ms = df["start_time_ms"].min() last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 # fallback + total_time_sec = df["latency_ms"].sum() / 1000 num_requests = len(df) throughput_rps = num_requests / total_time_sec if total_time_sec > 0 else 0 - - # Input token throughput (prefill) total_input_tokens = df["input_num_tokens"].sum() input_throughput_tps = total_input_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Output token throughput (decode only) total_output_tokens = df["output_num_tokens"].sum() output_throughput_tps = total_output_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Total token throughput (input + output) - total_tokens = total_input_tokens + total_output_tokens - total_throughput_tps = total_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Normalized throughput (per GPU) - input_tps_per_gpu = input_throughput_tps / tp - output_tps_per_gpu = output_throughput_tps / tp - total_tps_per_gpu = total_throughput_tps / tp + total_throughput_tps = (total_input_tokens + total_output_tokens) / total_time_sec if total_time_sec > 0 else 0 return { "exp_name": exp_name, @@ -199,9 +204,9 @@ def load_experiment_data(exp_dir: Path) -> dict | None: "throughput_rps": throughput_rps, "input_throughput_tps": input_throughput_tps, "total_throughput_tps": total_throughput_tps, - "input_tps_per_gpu": input_tps_per_gpu, - "output_tps_per_gpu": output_tps_per_gpu, - "total_tps_per_gpu": total_tps_per_gpu, + "input_tps_per_gpu": input_throughput_tps / tp, + "output_tps_per_gpu": output_throughput_tps / tp, + "total_tps_per_gpu": total_throughput_tps / tp, "mean_ttft_ms": df["ttft_ms"].mean(), "p50_ttft_ms": df["ttft_ms"].median(), 
"p90_ttft_ms": df["ttft_ms"].quantile(0.9), @@ -217,7 +222,6 @@ def load_experiment_data(exp_dir: Path) -> dict | None: "p99_latency_ms": df["latency_ms"].quantile(0.99), "p999_latency_ms": df["latency_ms"].quantile(0.999), "p999_ttft_ms": df["ttft_ms"].quantile(0.999), - # Cache hit rates "gpu_hit_rate": gpu_hit_rate, "cpu_hit_rate": cpu_hit_rate, } diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index fc02b1865..9910fb8ff 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -33,63 +33,52 @@ def _load_custom_client_csv(client_csv: Path, exp_dir: Path) -> pd.DataFrame | N return df -def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from aiperf profile_export JSONL. +def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv. - Converts aiperf's per-record format into the same column schema - used by the custom benchmark client CSV. + Returns a dict with pre-computed metrics matching the result schema, + or None if the file can't be parsed. 
""" - records = [] - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if not line: - continue - entry = json.loads(line) - meta = entry.get("metadata", {}) - metrics = entry.get("metrics", {}) - - # Skip non-profiling records or cancelled requests - if meta.get("benchmark_phase") != "profiling": - continue - if meta.get("was_cancelled", False): - continue - - # Extract values (aiperf stores metrics as {value, unit} dicts) - def val(key, default=0): - m = metrics.get(key) - if m is None: - return default - return m.get("value", default) if isinstance(m, dict) else m - - # Compute TPOT from ITL if available - itl = metrics.get("inter_token_latency") - if itl and isinstance(itl, dict): - tpot_ms = itl.get("value", 0) - else: - # Fallback: (latency - ttft) / (output_tokens - 1) - osl = val("output_sequence_length", 1) - ttft = val("time_to_first_token", 0) - latency = val("request_latency", 0) - tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 - - # Convert request_start_ns to ms (epoch) - start_ns = meta.get("request_start_ns", 0) - start_ms = start_ns / 1e6 - - records.append({ - "start_time_ms": start_ms, - "ttft_ms": val("time_to_first_token"), - "tpot_ms": tpot_ms, - "latency_ms": val("request_latency"), - "input_num_tokens": val("input_sequence_length"), - "output_num_tokens": val("output_sequence_length"), - }) - - if not records: + df = pd.read_csv(csv_path) + if len(df) == 0: return None - return pd.DataFrame(records) + # The CSV has two sections: + # 1. Per-metric rows with columns: Metric, avg, min, max, sum, p1..p99, std + # 2. 
Scalar rows with columns: Metric, Value + # Split by finding rows where only Metric and Value are populated + per_metric = df[df["avg"].notna()].set_index("Metric") + scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + + def metric_stat(metric_name, stat): + if metric_name in per_metric.index: + return float(per_metric.loc[metric_name, stat]) + return 0 + + def scalar_val(metric_name): + if metric_name in scalars.index: + return float(scalars.loc[metric_name, "min"]) # "min" column holds Value + return 0 + + return { + "num_requests": int(scalar_val("Request Count")), + "throughput_rps": scalar_val("Request Throughput (requests/sec)"), + "output_throughput_tps": scalar_val("Output Token Throughput (tokens/sec)"), + "total_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)"), + "input_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)") - scalar_val("Output Token Throughput (tokens/sec)"), + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + } def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: @@ -125,20 +114,18 @@ def load_experiment(exp_dir: Path) -> dict | None: return None status = status_file.read_text().strip() - # Also check for aiperf output - aiperf_jsonl = None + # Check for 
aiperf summary CSV (preferred) or per-record JSONL (fallback) + aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): - candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) - if not candidates: - candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) - if candidates: - aiperf_jsonl = candidates[0] + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + aiperf_summary_csv = candidate # Check for trace replay output trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - if not client_csv.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + if not client_csv.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): return None # Parse experiment name from directory: multiturn_tp{N}_users{M}_offload{mode} @@ -168,59 +155,100 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: custom client CSV, aiperf JSONL, or trace replay CSV + # Determine data source: custom client CSV, aiperf summary CSV, or trace replay CSV if client_csv.exists(): df = _load_custom_client_csv(client_csv, exp_dir) - elif aiperf_jsonl is not None: - df = _load_aiperf_jsonl(aiperf_jsonl) + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + 
"num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + elif aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) elif trace_replay_csv.exists(): df = _load_trace_replay_csv(trace_replay_csv) + if df is None or len(df) == 0: + return result + + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if 
total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) else: return result - if df is None or len(df) == 0: - return result - - # Prefer benchmark_metadata.json for precise wall-clock duration - metadata_file = exp_dir / "benchmark_metadata.json" - total_time_sec = None - if metadata_file.exists(): - try: - with open(metadata_file) as f: - metadata = json.load(f) - total_time_sec = metadata.get("benchmark_runtime_sec") - except Exception: - pass - - # Fallback: derive from per-request data (first start to last finish) - if not total_time_sec or total_time_sec <= 0: - first_start_ms = df["start_time_ms"].min() - last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() - total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 - if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 - - num_requests = len(df) - result.update({ - "num_requests": num_requests, - "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, - "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "output_throughput_tps": 
df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, - "mean_ttft_ms": df["ttft_ms"].mean(), - "p50_ttft_ms": df["ttft_ms"].median(), - "p90_ttft_ms": df["ttft_ms"].quantile(0.9), - "p99_ttft_ms": df["ttft_ms"].quantile(0.99), - "mean_tpot_ms": df["tpot_ms"].mean(), - "p50_tpot_ms": df["tpot_ms"].median(), - "p90_tpot_ms": df["tpot_ms"].quantile(0.9), - "p99_tpot_ms": df["tpot_ms"].quantile(0.99), - "mean_latency_ms": df["latency_ms"].mean(), - "p50_latency_ms": df["latency_ms"].median(), - "p90_latency_ms": df["latency_ms"].quantile(0.9), - "p99_latency_ms": df["latency_ms"].quantile(0.99), - }) - # Cache hit rates from server metrics if server_csv.exists(): try: From 4f106b8fdc9e27b30ca843eaf699510204e28216 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:26:03 -0500 Subject: [PATCH 22/33] debug --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 1bec35577..926bda021 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -14,6 +14,10 @@ set -x source "$(dirname "$0")/../benchmark_lib.sh" +export CUDA_LAUNCH_BLOCKING=1 + +ulimit -a + check_env_vars \ MODEL \ TP \ From cfb25fb509e7a87b0d8f8dadb4b60821f06eb072 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:31:49 -0500 Subject: [PATCH 23/33] fix LMCache traces: convert system role to developer for vLLM v0.18+ vLLM v0.18.0 follows the newer OpenAI API spec where the 'system' message role was renamed to 'developer'. The LMCache traces use 'system', causing 100% 400 Bad Request errors. 
Also drop the 15GB profile_export_aiperf.json from artifact uploads. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 1 - .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 9 ++++++++- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 9 ++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 7c1d5ce0d..f366564d3 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -162,7 +162,6 @@ jobs: results/benchmark_metadata.json results/metrics_workload.png results/aiperf_artifacts/profile_export_aiperf.csv - results/aiperf_artifacts/profile_export_aiperf.json results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 926bda021..1d1c3154d 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,9 +98,16 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role + # was renamed to 'developer'. Convert to avoid 400 validation errors. 
+ messages = [] + for msg in row['input']: + if msg.get('role') == 'system': + msg = {**msg, 'role': 'developer'} + messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 9a0c89e5a..03fd4402e 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,9 +94,16 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role + # was renamed to 'developer'. Convert to avoid 400 validation errors. + messages = [] + for msg in row['input']: + if msg.get('role') == 'system': + msg = {**msg, 'role': 'developer'} + messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From ede9bde6e081eff22aad6683a9472d8babb2be86 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:33:59 -0500 Subject: [PATCH 24/33] revert system->developer role conversion in LMCache traces Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 9 +-------- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 9 +-------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 1d1c3154d..926bda021 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,16 +98,9 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: - # vLLM v0.18+ follows 
the newer OpenAI API spec where 'system' role - # was renamed to 'developer'. Convert to avoid 400 validation errors. - messages = [] - for msg in row['input']: - if msg.get('role') == 'system': - msg = {**msg, 'role': 'developer'} - messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': messages, + 'messages': row['input'], 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 03fd4402e..9a0c89e5a 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,16 +94,9 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: - # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role - # was renamed to 'developer'. Convert to avoid 400 validation errors. - messages = [] - for msg in row['input']: - if msg.get('role') == 'system': - msg = {**msg, 'role': 'developer'} - messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': messages, + 'messages': row['input'], 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From a7ac440570908ca5f64e71b06b83fec3ea2da444 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:34:47 -0500 Subject: [PATCH 25/33] fix MetricsCollector missing gpu_transfer_collector attribute Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/bench/metrics_collector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 7bcdf31a4..b38653ea8 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -172,6 +172,7 @@ class MetricsCollector: _task: 
asyncio.Task | None = None _parser: VLLMMetricsParser | SGLangMetricsParser | None = None _backend: str = "" + gpu_transfer_collector: object = None def _parse_metrics(self, text: str) -> MetricsSnapshot: """Parse Prometheus metrics text, auto-detecting backend on first call.""" From db87b95fc7eb55e19ec318569e1771369fa6ac28 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:44:27 -0500 Subject: [PATCH 26/33] fix LMCache traces: strip null fields to pass vLLM Pydantic validation The LMCache traces include explicit null values for optional fields (tool_calls, tool_call_id, name) on every message. vLLM's strict Pydantic validation rejects these, causing 100% HTTP 400 errors. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 5 ++++- benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 926bda021..034df4d89 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,9 +98,12 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + # for optional fields like tool_calls, tool_call_id, name + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 9a0c89e5a..c4d26dd7e 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ 
b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,9 +94,12 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + # for optional fields like tool_calls, tool_call_id, name + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From 07ce85de133bf608d48bd635b3816458a2e5db53 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 13:13:29 -0500 Subject: [PATCH 27/33] use hf download for LMCache traces instead of datasets.load_dataset Avoids flaky streaming downloads that fail mid-transfer. The dataset is now cached via hf download (same as model weights) and read from the local parquet files. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp8_h100_lmcache_aiperf.sh | 58 ++++++++++++------- .../multiturn_fp8_h200_lmcache_aiperf.sh | 58 ++++++++++++------- 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 034df4d89..ae666c37b 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -87,31 +87,45 @@ fi mkdir -p "$RESULT_DIR" -# ---- Convert LMCache traces to mooncake format ----------------------------- -echo "Downloading and converting LMCache traces..." +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
python3 -c " -import json, os -try: +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache from datasets import load_dataset ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') - out_path = '$TRACE_FILE' - sessions = set() - with open(out_path, 'w') as f: - for row in ds: - # Strip None fields — vLLM's Pydantic validation rejects explicit nulls - # for optional fields like tool_calls, tool_call_id, name - messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] - entry = { - 'session_id': row['session_id'], - 'messages': messages, - 'output_length': row['output_length'], - } - f.write(json.dumps(entry) + '\n') - sessions.add(row['session_id']) - print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') -except Exception as e: - print(f'ERROR converting traces: {e}') - exit(1) + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + 
sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index c4d26dd7e..56232cf58 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -83,31 +83,45 @@ fi mkdir -p "$RESULT_DIR" -# ---- Convert LMCache traces to mooncake format ----------------------------- -echo "Downloading and converting LMCache traces..." +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." python3 -c " -import json, os -try: +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache from datasets import load_dataset ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') - out_path = '$TRACE_FILE' - sessions = set() - with open(out_path, 'w') as f: - for row in ds: - # Strip None fields — vLLM's Pydantic validation rejects explicit nulls - # for optional fields like tool_calls, tool_call_id, name - messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] - entry = { - 'session_id': row['session_id'], - 'messages': messages, - 'output_length': row['output_length'], - } - 
f.write(json.dumps(entry) + '\n') - sessions.add(row['session_id']) - print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') -except Exception as e: - print(f'ERROR converting traces: {e}') - exit(1) + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" From 195ca66d90e2dd14412bbeee38fa1ee612949832 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:00:04 -0500 Subject: [PATCH 28/33] add B200 FP4 multiturn benchmark script using aiperf Based on H100 aiperf script with B200-specific changes: - TORCH_CUDA_ARCH_LIST=10.0 (Blackwell) - B200 compilation config (FULL_DECODE_ONLY cudagraphs, custom ops) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_b200_lmcache_aiperf.sh | 248 ++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh new file mode 100755 index 000000000..2e8164f3f --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -0,0 +1,248 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP4 
models on B200 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. +# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) + +source "$(dirname "$0")/../benchmark_lib.sh" + +export CUDA_LAUNCH_BLOCKING=1 + +ulimit -a + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with 
open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +mkdir -p "$RESULT_DIR" + +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
+python3 -c " +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 
--port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From 09e6ec1c9746f86563e178aebde9681e04899cae Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:09:13 -0500 Subject: [PATCH 29/33] add entry for b200 ds --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 63892d202..f371c5625 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -42,12 +42,12 @@ h100-fp8-llama70b-lmcache: users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] -b200-fp4-dsr1: +b200-fp4-dsr1-weka-trace: tp4: ep: 4 - users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128] + users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] tp8: ep: 8 - users: [1, 2, 4, 8, 12, 16, 32, 64, 128] + users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] offload: ["on", "off"] From 951326a2b5cf281c7b057b18b75558eb01a70b20 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:09:47 -0500 Subject: [PATCH 30/33] add expert parallel support to B200 FP4 aiperf script Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 2e8164f3f..5acba8a73 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -146,6 +146,10 @@ VLLM_CMD+=" --config $RESULT_DIR/config.yaml" VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + if [ "$OFFLOAD_MODE" = "on" ]; then 
VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" From 0100fa1bc6ed4326c73de918088feddef542471c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:32:48 -0500 Subject: [PATCH 31/33] skip LMCache trace entries with empty messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataset was updated (24K → 74K rows) and now includes entries with empty message lists, causing aiperf MooncakeTrace validation to fail. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 8 +++++++- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 8 +++++++- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 5acba8a73..0df4efb0c 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -114,10 +114,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -125,7 +129,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git 
a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index ae666c37b..b81105d5b 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -114,10 +114,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -125,7 +129,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 56232cf58..e3acd1bb0 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -110,10 +110,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -121,7 +125,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to 
{out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" From 110dfa4803fcdf6d529baecfb5ca6598bdc8516b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 17:56:15 -0500 Subject: [PATCH 32/33] fix: prioritize aiperf summary CSV over malformed client CSV Both collect_sweep_results.py and plot_pareto.py were trying to load metrics_client_metrics.csv first, which fails with "Expected 15 fields, saw 19" on aiperf runs. Now aiperf summary CSV is checked first. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 4 ++-- .../scripts/collect_sweep_results.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 7da67c8a4..081c98ebd 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -145,8 +145,8 @@ def load_experiment_data(exp_dir: Path) -> dict | None: if final_row["cpu_prefix_cache_queries"] > 0: cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] - # Use aiperf summary CSV directly if available - if aiperf_summary_csv is not None and not client_metrics_file.exists(): + # Use aiperf summary CSV directly if available (preferred over client CSV) + if aiperf_summary_csv is not None: exp_name = exp_dir.name parts = exp_name.split("_") tp = int(parts[0].replace("tp", "")) diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index 9910fb8ff..28f115f47 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ 
b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -155,8 +155,13 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: custom client CSV, aiperf summary CSV, or trace replay CSV - if client_csv.exists(): + # Determine data source: aiperf summary CSV (preferred), custom client CSV, or trace replay CSV + if aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) + elif client_csv.exists(): df = _load_custom_client_csv(client_csv, exp_dir) if df is None or len(df) == 0: return result @@ -199,11 +204,6 @@ def load_experiment(exp_dir: Path) -> dict | None: "p90_latency_ms": df["latency_ms"].quantile(0.9), "p99_latency_ms": df["latency_ms"].quantile(0.99), }) - elif aiperf_summary_csv is not None: - aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) - if aiperf_metrics is None: - return result - result.update(aiperf_metrics) elif trace_replay_csv.exists(): df = _load_trace_replay_csv(trace_replay_csv) if df is None or len(df) == 0: From c64e644b4b1af0a0bd6c7eb0a364d455ec02db71 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 17:58:44 -0500 Subject: [PATCH 33/33] fix aiperf CSV parser: handle multi-section format with different column counts The profile_export_aiperf.csv has 3 sections (per-metric stats, scalar values, GPU metrics) with different column counts. pd.read_csv choked on the GPU section (19 cols vs 14). Parse manually by splitting on column count changes. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 35 +++++++++++---- .../scripts/collect_sweep_results.py | 43 +++++++++++++------ 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 081c98ebd..90b7ed1f8 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -21,21 +21,40 @@ def _load_aiperf_summary_csv(csv_path: Path, exp_dir: Path, tp: int, gpu_hit_rate: float | None, cpu_hit_rate: float | None) -> dict | None: """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv.""" - df = pd.read_csv(csv_path) - if len(df) == 0: + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: return None - per_metric = df[df["avg"].notna()].set_index("Metric") - scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + scalars[parts[0]] = parts[1] + else: + break def metric_stat(metric_name, stat): - if metric_name in per_metric.index: - return float(per_metric.loc[metric_name, stat]) + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 return 0 def scalar_val(metric_name): - if metric_name in scalars.index: - return float(scalars.loc[metric_name, "min"]) + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 return 0 
exp_name = exp_dir.name diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index 28f115f47..89cf990f3 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -39,25 +39,44 @@ def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: Returns a dict with pre-computed metrics matching the result schema, or None if the file can't be parsed. """ - df = pd.read_csv(csv_path) - if len(df) == 0: + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: return None - # The CSV has two sections: - # 1. Per-metric rows with columns: Metric, avg, min, max, sum, p1..p99, std - # 2. Scalar rows with columns: Metric, Value - # Split by finding rows where only Metric and Value are populated - per_metric = df[df["avg"].notna()].set_index("Metric") - scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + # Section 1: per-metric stats (header + data rows with 14 columns) + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + # Per-metric row + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + # Scalar row (Metric, Value) + scalars[parts[0]] = parts[1] + else: + # Different section (GPU metrics) — stop + break def metric_stat(metric_name, stat): - if metric_name in per_metric.index: - return float(per_metric.loc[metric_name, stat]) + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 return 0 def scalar_val(metric_name): - if metric_name in scalars.index: - 
return float(scalars.loc[metric_name, "min"]) # "min" column holds Value + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 return 0 return {