From 679dfdbab685e4fdeee4731d0bc15102275211d2 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 08:25:42 +0000 Subject: [PATCH 01/15] feat(agentv-bench): add executor subagent for non-CLI targets (#797) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add executor subagent that performs eval test cases directly when the target is a non-CLI provider. Mirrors the grader pattern — one executor per test case, all dispatched in parallel. - Create agents/executor.md with workspace isolation guardrails - Update SKILL.md: executor dispatch flow, subagent_mode_allowed docs, targets.yaml location and ${{ ENV_VAR }} security note - Update eval-yaml-spec.md: manifest-based opt-out, two invoke.json kinds - Update run_tests.py docstring Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agentv-dev/skills/agentv-bench/SKILL.md | 68 +++++++++++++-- .../skills/agentv-bench/agents/executor.md | 87 +++++++++++++++++++ .../agentv-bench/references/eval-yaml-spec.md | 4 +- .../skills/agentv-bench/scripts/run_tests.py | 5 +- 4 files changed, 151 insertions(+), 13 deletions(-) create mode 100644 plugins/agentv-dev/skills/agentv-bench/agents/executor.md diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index f325a61c5..02f5d3008 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -41,7 +41,7 @@ This skill ships with a Python scripts layer in `plugins/agentv-dev/skills/agent These scripts break the eval pipeline into discrete steps. The agent runs them in order, only handling LLM grading directly: -- `scripts/run_tests.py --out ` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target, only extracts inputs (agent handles execution). +- `scripts/run_tests.py --out ` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target (`kind: "agent"`), only extracts inputs — executor subagents handle execution. - `scripts/run_code_graders.py ` — Run code-grader assertions on existing responses. Writes per-grader results. - `scripts/bench.py < llm_scores.json` — Merge code-grader + LLM scores, compute weighted pass_rate, write `grading.json` + `index.jsonl` + `benchmark.json`. @@ -87,7 +87,7 @@ Before running or optimizing, understand what you're working with. 2. **Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? Talk to the user if this isn't clear from the artifacts alone. -3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what evaluator types are available and how to run tests. +3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what evaluator types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail. 4. **Challenge assumptions** — if evals already exist, review their quality before running: - Are the test cases testing the right things? @@ -287,14 +287,28 @@ Write artifacts to `.agentv/artifacts/` or the iteration directory. ### Subagent mode: Running eval.yaml without CLI -When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. This mode spends LLM tokens only on grading, not on YAML parsing or target invocation. +When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. This mode dispatches `executor` subagents to perform each test case, then `grader` subagents to evaluate the outputs. + +**Executor subagent eligibility:** All providers except `cli` are eligible for executor subagents by default. To opt out a specific target, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`: + +```yaml +# .agentv/targets.yaml +targets: + - name: my-target + provider: openai + model: ${{ OPENAI_MODEL }} + api_key: ${{ OPENAI_API_KEY }} + subagent_mode_allowed: false # forces CLI invocation instead of executor subagent +``` + +When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval` even in subagent mode. **Prerequisites:** - The eval.yaml file exists and contains valid test definitions - `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) - Read `references/eval-yaml-spec.md` for the full schema -**Recommended: Single command for CLI targets** +**CLI targets: Single command** For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step. When `--out` is omitted, the output directory defaults to `.agentv/results/runs/` (same convention as `agentv eval`): @@ -315,20 +329,35 @@ agentv results validate That's the entire pipeline: **2 commands** + LLM grading + optional validation. -**Alternative: Step-by-step (subagent-as-target or fine-grained control)** +**Non-CLI targets: Executor subagents** + +When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`. If `true` (default for all non-CLI providers), the subagent IS the target. If `false` (user opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to `agentv eval` CLI mode instead. -Use individual commands when the subagent IS the target or you need control over each step: +For executor subagent targets, use `pipeline input` to extract inputs, then dispatch `executor` subagents to perform each test case: ```bash # Step 1: Extract inputs (defaults to .agentv/results/runs/) agentv pipeline input evals/repro.eval.yaml +``` + +This creates a run directory with per-test `input.json`, `invoke.json` (with `kind: "agent"`), `criteria.md`, and grader configs. -# Step 2: Subagent invokes each test (reads input.json, writes response.md) +**Step 2: Dispatch executor subagents** — read `agents/executor.md`. Launch one `executor` subagent **per test case**, all in parallel. Each subagent receives the test directory path, reads `input.json`, performs the task using its own tools, and writes `response.md`. For example, 5 tests = 5 executor subagents launched simultaneously. +``` +# Per executor subagent: +# - Reads //input.json +# - Performs the task +# - Writes //response.md +``` + +**Step 3 onward: Grade and merge** — same as CLI targets: + +```bash # Step 3: Run code graders agentv pipeline grade -# Step 4: Subagent does LLM grading, writes llm_scores.json +# Step 4: Subagent does LLM grading, writes llm_scores.json (see below) # Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) agentv pipeline bench --llm-scores llm_scores.json @@ -337,7 +366,27 @@ agentv pipeline bench --llm-scores llm_scores.json agentv results validate ``` -This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`). +**Step-by-step (fine-grained control for CLI targets)** + +Use individual commands when you need control over each step with CLI targets: + +```bash +# Step 1: Extract inputs (defaults to .agentv/results/runs/) +agentv pipeline input evals/repro.eval.yaml + +# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead) + +# Step 3: Run code graders +agentv pipeline grade + +# Step 4: Subagent does LLM grading, writes llm_scores.json + +# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) +agentv pipeline bench --llm-scores llm_scores.json + +# Step 6: Validate +agentv results validate +``` **Step 3 (LLM grading): agent performs directly** @@ -641,6 +690,7 @@ The `agents/` directory contains instructions for specialized subagents. Read th | Agent | File | Purpose | When to dispatch | |-------|------|---------|-----------------| +| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | | grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading LLM-judged assertions) | | comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | | analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md new file mode 100644 index 000000000..45cf99520 --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -0,0 +1,87 @@ +--- +name: executor +description: >- + Execute an AgentV evaluation test case by performing the task described in the + input. Reads input.json from the test directory, carries out the task using + available tools, and writes response.md with the result. Dispatch one executor + subagent per test case, all in parallel. +model: inherit +color: cyan +tools: ["Read", "Bash", "Glob", "Grep", "Write", "Edit", "Agent", "WebFetch", "WebSearch"] +--- + +You are the executor for an AgentV evaluation test case. Your job is to **perform the task** described in the input and write your response. + +You are the target agent being evaluated. Do the task to the best of your ability — your output will be graded by a separate grader agent. + +**You will receive these parameters:** +- `test-dir`: Path to the test case directory (e.g., `.agentv/results/runs///`) +- `workspace-dir`: (optional) Path to the workspace directory where the task should be performed + +## Process + +### Step 1: Read Input + +1. **Read `input.json`** from `test-dir`. It contains: + - `input_text` — the task prompt as plain text + - `input_messages` — the task as a message array `[{role, content}]` + - `file_paths` — (optional) files referenced by the task + +2. **Read `invoke.json`** from `test-dir`. Confirm `kind` is `"agent"`. If it contains `instructions`, read them as additional context for how to approach the task. + +3. If `file_paths` are listed, read those files to understand the full context. + +### Step 2: Perform the Task + +Execute the task described in `input_text`. Use all available tools as needed: + +- **Read/Glob/Grep** to explore code and find information +- **Write/Edit** to create or modify files +- **Bash** to run commands, tests, builds +- **Agent** to delegate subtasks if needed +- **WebFetch/WebSearch** for external information if the task requires it + +Work in `workspace-dir` if provided. If no workspace directory is given, restrict file modifications to `test-dir` only — do not modify files in the repository root or other directories. + +**Guidelines:** +- Treat the input as a real user request — do what it asks +- Be thorough but focused — do what's needed, don't over-engineer +- If the task asks you to produce output (code, analysis, answer), capture it clearly +- If the task asks you to modify files, make the modifications and describe what you did +- If you encounter errors or uncertainties, note them but keep going + +### Step 3: Write Response + +Write your complete response to `{test-dir}/response.md`. + +The response should contain everything a grader needs to evaluate your work: +- Your answer, analysis, or explanation +- What actions you took (files created/modified, commands run) +- Any code you produced +- Errors or issues encountered + +If the task asked you to modify files, include a summary of the changes in `response.md` so the grader can evaluate without reading every file. + +### Step 4: Write Notes (if applicable) + +If you encountered uncertainties, made assumptions, or used workarounds, write them to `{test-dir}/user_notes.md`: + +```markdown +## Uncertainties +- [anything you weren't sure about] + +## Assumptions +- [decisions you made when the task was ambiguous] + +## Workarounds +- [issues you worked around] +``` + +Only create this file if you have something to note. Don't create an empty notes file. + +## Important + +- You are being evaluated. Do your best work. +- Do NOT read grading criteria, assertions, or expected outputs — those are for the grader, not for you. Only read `input.json` and `invoke.json`. +- Do NOT modify files outside `test-dir` and `workspace-dir`. +- Write `response.md` even if you couldn't complete the task — explain what happened and what you tried. diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md index 3723f3fda..b7e2338aa 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md @@ -285,14 +285,14 @@ Extracts inputs, target commands, and grader configs from an eval YAML file. { "eval_file": "path/to/eval.yaml", "timestamp": "2026-03-24T...", - "target": {"name": "target-name", "kind": "cli"}, + "target": {"name": "target-name", "kind": "cli", "subagent_mode_allowed": false}, "test_ids": ["test-01", "test-02"] } ``` **`invoke.json` kinds:** - `kind: "cli"` — has `command`, `cwd`, `timeout_ms`. Use the command to run the target. -- `kind: "agent"` — agent executes directly, no CLI invocation needed. +- `kind: "agent"` — non-CLI provider. Check `manifest.json` `target.subagent_mode_allowed` to decide whether to dispatch executor subagents or fall back to `agentv eval` CLI. ### `agentv pipeline grade ` diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py index 871cd86b1..baaac57ef 100644 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py +++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py @@ -20,8 +20,9 @@ │ ├── response.md ← target output (written by this script) │ └── timing.json ← execution timing (written by this script) -For agent-as-target mode (invoke.json has kind=agent), this script only runs -`agentv pipeline input`. The agent handles execution directly. +For agent-as-target mode (invoke.json has kind=agent — all non-CLI providers +unless subagent_mode_allowed=false in targets.yaml), this script only runs +`agentv pipeline input`. Executor subagents handle execution directly. """ import argparse import json From 8e928df0066dd4f4ad79d0c05899e627c4efe25e Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 08:30:46 +0000 Subject: [PATCH 02/15] fix(agentv-bench): remove tools restriction from executor subagent Executor should have access to all tools, not a hardcoded subset. Co-Authored-By: Claude Opus 4.6 (1M context) --- plugins/agentv-dev/skills/agentv-bench/agents/executor.md | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md index 45cf99520..bdccdc9b0 100644 --- a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -7,7 +7,6 @@ description: >- subagent per test case, all in parallel. model: inherit color: cyan -tools: ["Read", "Bash", "Glob", "Grep", "Write", "Edit", "Agent", "WebFetch", "WebSearch"] --- You are the executor for an AgentV evaluation test case. Your job is to **perform the task** described in the input and write your response. From a5e20850119b19cce132e4ff8690887fb1f3fa9e Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 08:33:07 +0000 Subject: [PATCH 03/15] docs(agentv-bench): add metadata field to executor input.json docs Co-Authored-By: Claude Opus 4.6 (1M context) --- plugins/agentv-dev/skills/agentv-bench/agents/executor.md | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md index bdccdc9b0..a9a69ddd0 100644 --- a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -25,6 +25,7 @@ You are the target agent being evaluated. Do the task to the best of your abilit - `input_text` — the task prompt as plain text - `input_messages` — the task as a message array `[{role, content}]` - `file_paths` — (optional) files referenced by the task + - `metadata` — (optional) additional context from the eval definition 2. **Read `invoke.json`** from `test-dir`. Confirm `kind` is `"agent"`. If it contains `instructions`, read them as additional context for how to approach the task. From 08d369fb6ccf66c22ad024607c41b26ded0ceb40 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 08:40:01 +0000 Subject: [PATCH 04/15] refactor(pipeline): align input.json fields with eval YAML and SDK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename input.json fields to match the eval YAML schema and code grader SDK naming conventions: - input_text + input_messages → input (Message[]) - file_paths → input_files (string[]) Drop redundant input_text — derive it from input[0].content where needed. Remove unnecessary user_notes step from executor subagent. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/grade.ts | 15 ++++++++--- apps/cli/src/commands/pipeline/input.ts | 8 +++--- apps/cli/src/commands/pipeline/run.ts | 26 ++++++++++++------- .../test/commands/eval/pipeline/grade.test.ts | 5 ++-- .../test/commands/eval/pipeline/input.test.ts | 6 ++--- .../skills/agentv-bench/agents/executor.md | 24 +++-------------- .../skills/agentv-bench/scripts/run_tests.py | 10 +++++-- 7 files changed, 46 insertions(+), 48 deletions(-) diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 44e0d42c1..bb8d4db95 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -15,6 +15,12 @@ import { join } from 'node:path'; import { executeScript } from '@agentv/core'; import { command, positional, string } from 'cmd-ts'; +/** Extract the first user message content as plain text from a Message[] array. */ +function extractInputText(input: Array<{ role: string; content: string }>): string { + const userMsg = input.find((m) => m.role === 'user'); + return typeof userMsg?.content === 'string' ? userMsg.content : ''; +} + export const evalGradeCommand = command({ name: 'grade', description: 'Run code-grader assertions on responses in an export directory', @@ -60,14 +66,15 @@ export const evalGradeCommand = command({ const graderName = graderConfig.name; // Build stdin payload matching CodeEvaluator format (snake_case) + const questionText = extractInputText(inputData.input); const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], - input: inputData.input_messages, - question: inputData.input_text, + input: inputData.input, + question: questionText, criteria: '', expected_output: [], reference_answer: '', - input_files: [], + input_files: inputData.input_files ?? [], trace: null, token_usage: null, cost_usd: null, @@ -78,7 +85,7 @@ export const evalGradeCommand = command({ workspace_path: null, config: graderConfig.config ?? null, metadata: {}, - input_text: inputData.input_text, + input_text: questionText, output_text: responseText, expected_output_text: '', }); diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 859a13a99..90d1fd712 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -117,16 +117,14 @@ export const evalInputCommand = command({ await mkdir(testDir, { recursive: true }); testIds.push(test.id); - // input.json - const inputText = test.question; + // input.json — aligned with eval YAML schema and code grader SDK field names const inputMessages = test.input.map((m) => ({ role: m.role, content: typeof m.content === 'string' ? m.content : m.content, })); await writeJson(join(testDir, 'input.json'), { - input_text: inputText, - input_messages: inputMessages, - file_paths: test.file_paths, + input: inputMessages, + input_files: test.file_paths, metadata: test.metadata ?? {}, }); diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index e5cbabfe2..9d530679c 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -25,6 +25,12 @@ import { buildDefaultRunDir } from '../eval/result-layout.js'; import { findRepoRoot } from '../eval/shared.js'; import { selectTarget } from '../eval/targets.js'; +/** Extract the first user message content as plain text from a Message[] array. */ +function extractInputText(input: Array<{ role: string; content: string }>): string { + const userMsg = input.find((m) => m.role === 'user'); + return typeof userMsg?.content === 'string' ? userMsg.content : ''; +} + /** Load key=value pairs from a .env file. Ignores comments and blank lines. */ function loadEnvFile(dir: string): Record { let current = resolve(dir); @@ -134,15 +140,13 @@ export const evalRunCommand = command({ await mkdir(testDir, { recursive: true }); testIds.push(test.id); - const inputText = test.question; const inputMessages = test.input.map((m) => ({ role: m.role, content: typeof m.content === 'string' ? m.content : m.content, })); await writeJson(join(testDir, 'input.json'), { - input_text: inputText, - input_messages: inputMessages, - file_paths: test.file_paths, + input: inputMessages, + input_files: test.file_paths, metadata: test.metadata ?? {}, }); @@ -216,12 +220,13 @@ export const evalRunCommand = command({ // Write temp prompt file const promptFile = join(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`); const outputFile = join(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`); - await writeFile(promptFile, inputData.input_text, 'utf8'); + const inputText = extractInputText(inputData.input); + await writeFile(promptFile, inputText, 'utf8'); let rendered = template; rendered = rendered.replace('{PROMPT_FILE}', promptFile); rendered = rendered.replace('{OUTPUT_FILE}', outputFile); - rendered = rendered.replace('{PROMPT}', inputData.input_text); + rendered = rendered.replace('{PROMPT}', inputText); const start = performance.now(); try { @@ -313,14 +318,15 @@ export const evalRunCommand = command({ const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8')); const graderName = graderConfig.name; + const questionText = extractInputText(inputData.input); const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], - input: inputData.input_messages, - question: inputData.input_text, + input: inputData.input, + question: questionText, criteria: '', expected_output: [], reference_answer: '', - input_files: [], + input_files: inputData.input_files ?? [], trace: null, token_usage: null, cost_usd: null, @@ -331,7 +337,7 @@ export const evalRunCommand = command({ workspace_path: null, config: graderConfig.config ?? null, metadata: {}, - input_text: inputData.input_text, + input_text: questionText, output_text: responseText, expected_output_text: '', }); diff --git a/apps/cli/test/commands/eval/pipeline/grade.test.ts b/apps/cli/test/commands/eval/pipeline/grade.test.ts index e129d0e7f..66aa45b12 100644 --- a/apps/cli/test/commands/eval/pipeline/grade.test.ts +++ b/apps/cli/test/commands/eval/pipeline/grade.test.ts @@ -15,9 +15,8 @@ describe('pipeline grade', () => { await writeFile( join(testDir, 'input.json'), JSON.stringify({ - input_text: 'say hello', - input_messages: [{ role: 'user', content: 'say hello' }], - file_paths: [], + input: [{ role: 'user', content: 'say hello' }], + input_files: [], }), ); await writeFile( diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index 12194f4dc..445c05a0d 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -21,15 +21,15 @@ describe('pipeline input', () => { expect(manifest.eval_file).toContain('input-test.eval.yaml'); }); - it('writes per-test input.json with input_text', async () => { + it('writes per-test input.json with input and input_files', async () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); const input = JSON.parse( await readFile(join(OUT_DIR, 'input-test', 'test-01', 'input.json'), 'utf8'), ); - expect(input.input_text).toBe('hello world'); - expect(input.input_messages).toHaveLength(1); + expect(input.input).toHaveLength(1); + expect(input.input[0].content).toBe('hello world'); }); it('writes code_graders/.json with resolved command', async () => { diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md index a9a69ddd0..ec7feec28 100644 --- a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -22,14 +22,13 @@ You are the target agent being evaluated. Do the task to the best of your abilit ### Step 1: Read Input 1. **Read `input.json`** from `test-dir`. It contains: - - `input_text` — the task prompt as plain text - - `input_messages` — the task as a message array `[{role, content}]` - - `file_paths` — (optional) files referenced by the task + - `input` — the task as a message array `[{role, content}]` + - `input_files` — (optional) files referenced by the task - `metadata` — (optional) additional context from the eval definition 2. **Read `invoke.json`** from `test-dir`. Confirm `kind` is `"agent"`. If it contains `instructions`, read them as additional context for how to approach the task. -3. If `file_paths` are listed, read those files to understand the full context. +3. If `input_files` are listed, read those files to understand the full context. ### Step 2: Perform the Task @@ -62,23 +61,6 @@ The response should contain everything a grader needs to evaluate your work: If the task asked you to modify files, include a summary of the changes in `response.md` so the grader can evaluate without reading every file. -### Step 4: Write Notes (if applicable) - -If you encountered uncertainties, made assumptions, or used workarounds, write them to `{test-dir}/user_notes.md`: - -```markdown -## Uncertainties -- [anything you weren't sure about] - -## Assumptions -- [decisions you made when the task was ambiguous] - -## Workarounds -- [issues you worked around] -``` - -Only create this file if you have something to note. Don't create an empty notes file. - ## Important - You are being evaluated. Do your best work. diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py index baaac57ef..aff81612e 100644 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py +++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py @@ -81,9 +81,15 @@ def invoke_cli_target(test_dir: Path, extra_env: dict | None = None) -> None: timeout_s = invoke.get("timeout_ms", 30000) / 1000 merged_env = {**os.environ, **(extra_env or {})} + # Extract prompt text from input messages + prompt_text = next( + (m["content"] for m in input_data.get("input", []) if m.get("role") == "user"), + "", + ) + # Write prompt to temp file for {PROMPT_FILE} placeholder with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as pf: - pf.write(input_data["input_text"]) + pf.write(prompt_text) prompt_file = pf.name # Create output file path for {OUTPUT_FILE} placeholder @@ -91,7 +97,7 @@ def invoke_cli_target(test_dir: Path, extra_env: dict | None = None) -> None: # Render template rendered = command_template - rendered = rendered.replace("{PROMPT}", input_data["input_text"]) + rendered = rendered.replace("{PROMPT}", prompt_text) rendered = rendered.replace("{PROMPT_FILE}", prompt_file) rendered = rendered.replace("{OUTPUT_FILE}", output_file) From 2661ee17e699c48b516a5006928c7a23ba041571 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 09:36:19 +0000 Subject: [PATCH 05/15] refactor(pipeline): rename questionText to inputText for consistency Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/grade.ts | 6 +++--- apps/cli/src/commands/pipeline/run.ts | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index bb8d4db95..497cdb635 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -66,11 +66,11 @@ export const evalGradeCommand = command({ const graderName = graderConfig.name; // Build stdin payload matching CodeEvaluator format (snake_case) - const questionText = extractInputText(inputData.input); + const inputText = extractInputText(inputData.input); const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], input: inputData.input, - question: questionText, + question: inputText, criteria: '', expected_output: [], reference_answer: '', @@ -85,7 +85,7 @@ export const evalGradeCommand = command({ workspace_path: null, config: graderConfig.config ?? null, metadata: {}, - input_text: questionText, + input_text: inputText, output_text: responseText, expected_output_text: '', }); diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 9d530679c..bd170cf05 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -318,11 +318,11 @@ export const evalRunCommand = command({ const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8')); const graderName = graderConfig.name; - const questionText = extractInputText(inputData.input); + const inputText = extractInputText(inputData.input); const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], input: inputData.input, - question: questionText, + question: inputText, criteria: '', expected_output: [], reference_answer: '', @@ -337,7 +337,7 @@ export const evalRunCommand = command({ workspace_path: null, config: graderConfig.config ?? null, metadata: {}, - input_text: questionText, + input_text: inputText, output_text: responseText, expected_output_text: '', }); From 75bffd9be3470699c88c444ccb8a146fb15c881f Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 09:41:45 +0000 Subject: [PATCH 06/15] refactor(pipeline): remove legacy question/reference_answer from grader payload The code grader SDK uses input_text and expected_output_text (via toSnakeCaseDeep). The question and reference_answer fields were redundant legacy names not present in the real orchestrator payload. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/grade.ts | 4 +--- apps/cli/src/commands/pipeline/run.ts | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 497cdb635..3b42e0f8e 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -70,10 +70,8 @@ export const evalGradeCommand = command({ const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], input: inputData.input, - question: inputText, criteria: '', expected_output: [], - reference_answer: '', input_files: inputData.input_files ?? [], trace: null, token_usage: null, @@ -84,7 +82,7 @@ export const evalGradeCommand = command({ file_changes: null, workspace_path: null, config: graderConfig.config ?? null, - metadata: {}, + metadata: inputData.metadata ?? {}, input_text: inputText, output_text: responseText, expected_output_text: '', diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index bd170cf05..68492559e 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -322,10 +322,8 @@ export const evalRunCommand = command({ const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], input: inputData.input, - question: inputText, criteria: '', expected_output: [], - reference_answer: '', input_files: inputData.input_files ?? [], trace: null, token_usage: null, @@ -336,7 +334,7 @@ export const evalRunCommand = command({ file_changes: null, workspace_path: null, config: graderConfig.config ?? null, - metadata: {}, + metadata: inputData.metadata ?? {}, input_text: inputText, output_text: responseText, expected_output_text: '', From e3db11ac8795dd42995d6789fe65ac2766104dfa Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 09:47:07 +0000 Subject: [PATCH 07/15] fix(agentv-bench): update stale input_text/input_messages references Co-Authored-By: Claude Opus 4.6 (1M context) --- .../skills/agentv-bench/agents/executor.md | 45 ++----------------- .../agentv-bench/references/eval-yaml-spec.md | 2 +- 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md index ec7feec28..fd32ff365 100644 --- a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -19,51 +19,14 @@ You are the target agent being evaluated. Do the task to the best of your abilit ## Process -### Step 1: Read Input +1. **Read `{test-dir}/input.json`**. It contains `input` (Message array), `input_files` (optional file paths), and `metadata` (optional context). If `input_files` are listed, read those files too. -1. **Read `input.json`** from `test-dir`. It contains: - - `input` — the task as a message array `[{role, content}]` - - `input_files` — (optional) files referenced by the task - - `metadata` — (optional) additional context from the eval definition +2. **Perform the task** described in the input. Work in `workspace-dir` if provided; otherwise restrict file modifications to `test-dir` only. -2. **Read `invoke.json`** from `test-dir`. Confirm `kind` is `"agent"`. If it contains `instructions`, read them as additional context for how to approach the task. - -3. If `input_files` are listed, read those files to understand the full context. - -### Step 2: Perform the Task - -Execute the task described in `input_text`. Use all available tools as needed: - -- **Read/Glob/Grep** to explore code and find information -- **Write/Edit** to create or modify files -- **Bash** to run commands, tests, builds -- **Agent** to delegate subtasks if needed -- **WebFetch/WebSearch** for external information if the task requires it - -Work in `workspace-dir` if provided. If no workspace directory is given, restrict file modifications to `test-dir` only — do not modify files in the repository root or other directories. - -**Guidelines:** -- Treat the input as a real user request — do what it asks -- Be thorough but focused — do what's needed, don't over-engineer -- If the task asks you to produce output (code, analysis, answer), capture it clearly -- If the task asks you to modify files, make the modifications and describe what you did -- If you encounter errors or uncertainties, note them but keep going - -### Step 3: Write Response - -Write your complete response to `{test-dir}/response.md`. - -The response should contain everything a grader needs to evaluate your work: -- Your answer, analysis, or explanation -- What actions you took (files created/modified, commands run) -- Any code you produced -- Errors or issues encountered - -If the task asked you to modify files, include a summary of the changes in `response.md` so the grader can evaluate without reading every file. +3. **Write `{test-dir}/response.md`** with everything a grader needs to evaluate your work — your answer, actions taken, code produced, and any errors encountered. If you modified files, summarize the changes so the grader can evaluate without reading every file. ## Important -- You are being evaluated. Do your best work. -- Do NOT read grading criteria, assertions, or expected outputs — those are for the grader, not for you. Only read `input.json` and `invoke.json`. +- Do NOT read grading criteria, assertions, or expected outputs — those are for the grader, not for you. - Do NOT modify files outside `test-dir` and `workspace-dir`. - Write `response.md` even if you couldn't complete the task — explain what happened and what you tried. diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md index b7e2338aa..87794493f 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md @@ -272,7 +272,7 @@ Extracts inputs, target commands, and grader configs from an eval YAML file. / ├── manifest.json ├── / -│ ├── input.json ← {input_text, input_messages, file_paths} +│ ├── input.json ← {input, input_files, metadata} │ ├── invoke.json ← {kind, command?, cwd?, timeout_ms?} │ ├── criteria.md ← human-readable success criteria │ ├── expected_output.json ← (if present) From 72e7cace66ea908ff6df06cbf838acaef81a1540 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 09:56:12 +0000 Subject: [PATCH 08/15] =?UTF-8?q?fix(agentv-bench):=20simplify=20executor?= =?UTF-8?q?=20=E2=80=94=20remove=20workspace-dir=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The executor works in the current workspace naturally. No need to restrict or parameterize the working directory. Co-Authored-By: Claude Opus 4.6 (1M context) --- plugins/agentv-dev/skills/agentv-bench/agents/executor.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md index fd32ff365..2543e0ec1 100644 --- a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -15,18 +15,16 @@ You are the target agent being evaluated. Do the task to the best of your abilit **You will receive these parameters:** - `test-dir`: Path to the test case directory (e.g., `.agentv/results/runs///`) -- `workspace-dir`: (optional) Path to the workspace directory where the task should be performed ## Process 1. **Read `{test-dir}/input.json`**. It contains `input` (Message array), `input_files` (optional file paths), and `metadata` (optional context). If `input_files` are listed, read those files too. -2. **Perform the task** described in the input. Work in `workspace-dir` if provided; otherwise restrict file modifications to `test-dir` only. +2. **Perform the task** described in the input. 3. **Write `{test-dir}/response.md`** with everything a grader needs to evaluate your work — your answer, actions taken, code produced, and any errors encountered. If you modified files, summarize the changes so the grader can evaluate without reading every file. ## Important - Do NOT read grading criteria, assertions, or expected outputs — those are for the grader, not for you. -- Do NOT modify files outside `test-dir` and `workspace-dir`. - Write `response.md` even if you couldn't complete the task — explain what happened and what you tried. From f795ac4171dfc401a25c796b122f3cd160fe8f0b Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 10:06:11 +0000 Subject: [PATCH 09/15] docs(agentv-bench): add workspace detection guidance for executor subagents Clarify that the current workspace is the target workspace (multi-repo), not the eval repo. Warn if the user opened the wrong workspace since executor subagents won't have access to the agent's skills and repos. Co-Authored-By: Claude Opus 4.6 (1M context) --- plugins/agentv-dev/skills/agentv-bench/SKILL.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 02f5d3008..6d6d9b015 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -308,6 +308,10 @@ When `subagent_mode_allowed: false`, the target falls back to CLI invocation via - `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) - Read `references/eval-yaml-spec.md` for the full schema +**Workspace detection:** The current workspace is the **target workspace** — where executor subagents perform tasks. This is typically a multi-repo workspace (e.g., set up by allagents) containing the agent's skills, repos, and tools. It is NOT the eval repo itself. Eval files and `.agentv/` artifacts may live in a different repo within the workspace. Always use absolute paths when referencing eval files (e.g., `../agentv/evals/test.eval.yaml`). + +If the current directory is the eval repo itself (e.g., the agentv repo) rather than the target workspace, warn the user — executor subagents won't have access to the agent's skills and repos. + **CLI targets: Single command** For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step. When `--out` is omitted, the output directory defaults to `.agentv/results/runs/` (same convention as `agentv eval`): From 771b142a19acebf635edd7a6671111e48ea89b55 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 10:08:27 +0000 Subject: [PATCH 10/15] =?UTF-8?q?docs(agentv-bench):=20clarify=20workspace?= =?UTF-8?q?=20detection=20=E2=80=94=20only=20matters=20for=20workspace-dep?= =?UTF-8?q?endent=20evals?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- plugins/agentv-dev/skills/agentv-bench/SKILL.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 6d6d9b015..70abaf0cf 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -308,9 +308,7 @@ When `subagent_mode_allowed: false`, the target falls back to CLI invocation via - `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) - Read `references/eval-yaml-spec.md` for the full schema -**Workspace detection:** The current workspace is the **target workspace** — where executor subagents perform tasks. This is typically a multi-repo workspace (e.g., set up by allagents) containing the agent's skills, repos, and tools. It is NOT the eval repo itself. Eval files and `.agentv/` artifacts may live in a different repo within the workspace. Always use absolute paths when referencing eval files (e.g., `../agentv/evals/test.eval.yaml`). - -If the current directory is the eval repo itself (e.g., the agentv repo) rather than the target workspace, warn the user — executor subagents won't have access to the agent's skills and repos. +**Workspace matters when evals need it:** Some evals pass prompt files directly and don't require a specific workspace — those run fine from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying repos, using tools across multiple repos) require the user to be in the **target workspace** (e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or expects the agent to use skills, check that the current directory is the target workspace, not just the eval repo — and warn the user if it's wrong. **CLI targets: Single command** From e54efb6b4f6d61a49058464a8000e81e66f7a73e Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 10:14:02 +0000 Subject: [PATCH 11/15] fix(pipeline): extractInputText should use all messages, not just first Single message: returns content directly (no role prefix). Multiple messages: prefixes each with @role for least surprise. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/grade.ts | 11 ++++++++--- apps/cli/src/commands/pipeline/run.ts | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 3b42e0f8e..84af87cc2 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -15,10 +15,15 @@ import { join } from 'node:path'; import { executeScript } from '@agentv/core'; import { command, positional, string } from 'cmd-ts'; -/** Extract the first user message content as plain text from a Message[] array. */ +/** + * Convert a Message[] array to plain text. + * Single message: returns content directly (no role prefix). + * Multiple messages: prefixes each with @role for clarity. + */ function extractInputText(input: Array<{ role: string; content: string }>): string { - const userMsg = input.find((m) => m.role === 'user'); - return typeof userMsg?.content === 'string' ? userMsg.content : ''; + if (!input || input.length === 0) return ''; + if (input.length === 1) return input[0].content; + return input.map((m) => `@${m.role}\n${m.content}`).join('\n\n'); } export const evalGradeCommand = command({ diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 68492559e..7aeb4fe3b 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -25,10 +25,15 @@ import { buildDefaultRunDir } from '../eval/result-layout.js'; import { findRepoRoot } from '../eval/shared.js'; import { selectTarget } from '../eval/targets.js'; -/** Extract the first user message content as plain text from a Message[] array. */ +/** + * Convert a Message[] array to plain text. + * Single message: returns content directly (no role prefix). + * Multiple messages: prefixes each with @role for clarity. + */ function extractInputText(input: Array<{ role: string; content: string }>): string { - const userMsg = input.find((m) => m.role === 'user'); - return typeof userMsg?.content === 'string' ? userMsg.content : ''; + if (!input || input.length === 0) return ''; + if (input.length === 1) return input[0].content; + return input.map((m) => `@${m.role}\n${m.content}`).join('\n\n'); } /** Load key=value pairs from a .env file. Ignores comments and blank lines. */ From bcb87fd071e7feb0d19fbacb68ce2da8e50f9d38 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 10:16:11 +0000 Subject: [PATCH 12/15] =?UTF-8?q?docs(eval):=20fix=20inputText=20jsdoc=20?= =?UTF-8?q?=E2=80=94=20includes=20all=20messages,=20not=20just=20first?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/eval/src/schemas.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index fff5c2741..777772d6b 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -76,9 +76,9 @@ export const CodeGraderInputSchema = z.object({ fileChanges: z.string().nullable().optional(), workspacePath: z.string().nullable().optional(), config: z.record(z.unknown()).nullable().optional(), - /** First user message content as string. */ + /** All input messages as plain text. Single message: content only. Multiple: @role prefixed. */ inputText: z.string(), - /** Expected output content as string. */ + /** All expected output messages as plain text. */ expectedOutputText: z.string().optional(), }); From ae8cbecc01f19914ba017667be58c14a3e1faa76 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 10:25:42 +0000 Subject: [PATCH 13/15] =?UTF-8?q?docs(eval):=20fix=20expectedOutputText=20?= =?UTF-8?q?jsdoc=20=E2=80=94=20last=20message=20content,=20not=20all?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/eval/src/schemas.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 777772d6b..43b541bbc 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -78,7 +78,7 @@ export const CodeGraderInputSchema = z.object({ config: z.record(z.unknown()).nullable().optional(), /** All input messages as plain text. Single message: content only. Multiple: @role prefixed. */ inputText: z.string(), - /** All expected output messages as plain text. */ + /** Last expected output message content as plain text. */ expectedOutputText: z.string().optional(), }); From cf4690b570aa2c613c7134bad223b0071e126e0e Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 10:27:22 +0000 Subject: [PATCH 14/15] fix(pipeline): use @[role] pattern for multi-message inputText Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/grade.ts | 2 +- apps/cli/src/commands/pipeline/run.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 84af87cc2..5e6ad789d 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -23,7 +23,7 @@ import { command, positional, string } from 'cmd-ts'; function extractInputText(input: Array<{ role: string; content: string }>): string { if (!input || input.length === 0) return ''; if (input.length === 1) return input[0].content; - return input.map((m) => `@${m.role}\n${m.content}`).join('\n\n'); + return input.map((m) => `@[${m.role}]\n${m.content}`).join('\n\n'); } export const evalGradeCommand = command({ diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 7aeb4fe3b..26ef54c17 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -33,7 +33,7 @@ import { selectTarget } from '../eval/targets.js'; function extractInputText(input: Array<{ role: string; content: string }>): string { if (!input || input.length === 0) return ''; if (input.length === 1) return input[0].content; - return input.map((m) => `@${m.role}\n${m.content}`).join('\n\n'); + return input.map((m) => `@[${m.role}]\n${m.content}`).join('\n\n'); } /** Load key=value pairs from a .env file. Ignores comments and blank lines. */ From 9c7726c1ffa625a6915ffda2152303caed7c9316 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 11:04:55 +0000 Subject: [PATCH 15/15] fix(pipeline): align @[role]: format with artifact-writer convention Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/grade.ts | 2 +- apps/cli/src/commands/pipeline/run.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 5e6ad789d..f9d015021 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -23,7 +23,7 @@ import { command, positional, string } from 'cmd-ts'; function extractInputText(input: Array<{ role: string; content: string }>): string { if (!input || input.length === 0) return ''; if (input.length === 1) return input[0].content; - return input.map((m) => `@[${m.role}]\n${m.content}`).join('\n\n'); + return input.map((m) => `@[${m.role}]:\n${m.content}`).join('\n\n'); } export const evalGradeCommand = command({ diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 26ef54c17..a4615aaf9 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -33,7 +33,7 @@ import { selectTarget } from '../eval/targets.js'; function extractInputText(input: Array<{ role: string; content: string }>): string { if (!input || input.length === 0) return ''; if (input.length === 1) return input[0].content; - return input.map((m) => `@[${m.role}]\n${m.content}`).join('\n\n'); + return input.map((m) => `@[${m.role}]:\n${m.content}`).join('\n\n'); } /** Load key=value pairs from a .env file. Ignores comments and blank lines. */