diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 44e0d42c1..f9d015021 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -15,6 +15,17 @@ import { join } from 'node:path'; import { executeScript } from '@agentv/core'; import { command, positional, string } from 'cmd-ts'; +/** + * Convert a Message[] array to plain text; returns '' when the array is empty or missing. + * Single message: returns content directly (no role prefix). NOTE(review): assumes every content is a plain string — confirm for structured (non-string) message content. + * Multiple messages: each is rendered as "@[role]:", a newline, then its content; entries are joined by a blank line. + */ +function extractInputText(input: Array<{ role: string; content: string }>): string { + if (!input || input.length === 0) return ''; + if (input.length === 1) return input[0].content; + return input.map((m) => `@[${m.role}]:\n${m.content}`).join('\n\n'); +} + export const evalGradeCommand = command({ name: 'grade', description: 'Run code-grader assertions on responses in an export directory', @@ -60,14 +71,13 @@ export const evalGradeCommand = command({ const graderName = graderConfig.name; // Build stdin payload matching CodeEvaluator format (snake_case) + const inputText = extractInputText(inputData.input); const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], - input: inputData.input_messages, - question: inputData.input_text, + input: inputData.input, criteria: '', expected_output: [], - reference_answer: '', - input_files: [], + input_files: inputData.input_files ?? [], trace: null, token_usage: null, cost_usd: null, @@ -77,8 +87,8 @@ export const evalGradeCommand = command({ file_changes: null, workspace_path: null, config: graderConfig.config ?? null, - metadata: {}, - input_text: inputData.input_text, + metadata: inputData.metadata ?? 
{}, + input_text: inputText, output_text: responseText, expected_output_text: '', }); diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 859a13a99..90d1fd712 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -117,16 +117,14 @@ export const evalInputCommand = command({ await mkdir(testDir, { recursive: true }); testIds.push(test.id); - // input.json - const inputText = test.question; + // input.json — aligned with eval YAML schema and code grader SDK field names const inputMessages = test.input.map((m) => ({ role: m.role, content: typeof m.content === 'string' ? m.content : m.content, })); await writeJson(join(testDir, 'input.json'), { - input_text: inputText, - input_messages: inputMessages, - file_paths: test.file_paths, + input: inputMessages, + input_files: test.file_paths, metadata: test.metadata ?? {}, }); diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index e5cbabfe2..a4615aaf9 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -25,6 +25,17 @@ import { buildDefaultRunDir } from '../eval/result-layout.js'; import { findRepoRoot } from '../eval/shared.js'; import { selectTarget } from '../eval/targets.js'; +/** + * Convert a Message[] array to plain text; returns '' when the array is empty or missing. + * Single message: returns content directly (no role prefix). NOTE(review): assumes every content is a plain string — confirm for structured (non-string) message content. + * Multiple messages: each is rendered as "@[role]:", a newline, then its content; entries are joined by a blank line. + */ +function extractInputText(input: Array<{ role: string; content: string }>): string { + if (!input || input.length === 0) return ''; + if (input.length === 1) return input[0].content; + return input.map((m) => `@[${m.role}]:\n${m.content}`).join('\n\n'); +} + /** Load key=value pairs from a .env file. Ignores comments and blank lines. 
*/ function loadEnvFile(dir: string): Record { let current = resolve(dir); @@ -134,15 +145,13 @@ export const evalRunCommand = command({ await mkdir(testDir, { recursive: true }); testIds.push(test.id); - const inputText = test.question; const inputMessages = test.input.map((m) => ({ role: m.role, content: typeof m.content === 'string' ? m.content : m.content, })); await writeJson(join(testDir, 'input.json'), { - input_text: inputText, - input_messages: inputMessages, - file_paths: test.file_paths, + input: inputMessages, + input_files: test.file_paths, metadata: test.metadata ?? {}, }); @@ -216,12 +225,13 @@ export const evalRunCommand = command({ // Write temp prompt file const promptFile = join(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`); const outputFile = join(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`); - await writeFile(promptFile, inputData.input_text, 'utf8'); + const inputText = extractInputText(inputData.input); + await writeFile(promptFile, inputText, 'utf8'); let rendered = template; rendered = rendered.replace('{PROMPT_FILE}', promptFile); rendered = rendered.replace('{OUTPUT_FILE}', outputFile); - rendered = rendered.replace('{PROMPT}', inputData.input_text); + rendered = rendered.replace('{PROMPT}', inputText); const start = performance.now(); try { @@ -313,14 +323,13 @@ export const evalRunCommand = command({ const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8')); const graderName = graderConfig.name; + const inputText = extractInputText(inputData.input); const payload = JSON.stringify({ output: [{ role: 'assistant', content: responseText }], - input: inputData.input_messages, - question: inputData.input_text, + input: inputData.input, criteria: '', expected_output: [], - reference_answer: '', - input_files: [], + input_files: inputData.input_files ?? 
[], trace: null, token_usage: null, cost_usd: null, @@ -330,8 +339,8 @@ export const evalRunCommand = command({ file_changes: null, workspace_path: null, config: graderConfig.config ?? null, - metadata: {}, - input_text: inputData.input_text, + metadata: inputData.metadata ?? {}, + input_text: inputText, output_text: responseText, expected_output_text: '', }); diff --git a/apps/cli/test/commands/eval/pipeline/grade.test.ts b/apps/cli/test/commands/eval/pipeline/grade.test.ts index e129d0e7f..66aa45b12 100644 --- a/apps/cli/test/commands/eval/pipeline/grade.test.ts +++ b/apps/cli/test/commands/eval/pipeline/grade.test.ts @@ -15,9 +15,8 @@ describe('pipeline grade', () => { await writeFile( join(testDir, 'input.json'), JSON.stringify({ - input_text: 'say hello', - input_messages: [{ role: 'user', content: 'say hello' }], - file_paths: [], + input: [{ role: 'user', content: 'say hello' }], + input_files: [], }), ); await writeFile( diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index 12194f4dc..445c05a0d 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -21,15 +21,15 @@ describe('pipeline input', () => { expect(manifest.eval_file).toContain('input-test.eval.yaml'); }); - it('writes per-test input.json with input_text', async () => { + it('writes per-test input.json with input and input_files', async () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); const input = JSON.parse( await readFile(join(OUT_DIR, 'input-test', 'test-01', 'input.json'), 'utf8'), ); - expect(input.input_text).toBe('hello world'); - expect(input.input_messages).toHaveLength(1); + expect(input.input).toHaveLength(1); + expect(input.input[0].content).toBe('hello world'); }); it('writes code_graders/.json with resolved command', async () => { diff --git 
a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index fff5c2741..43b541bbc 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -76,9 +76,9 @@ export const CodeGraderInputSchema = z.object({ fileChanges: z.string().nullable().optional(), workspacePath: z.string().nullable().optional(), config: z.record(z.unknown()).nullable().optional(), - /** First user message content as string. */ + /** All input messages as plain text. Single message: content only. Multiple: each prefixed with "@[role]:" on its own line. */ inputText: z.string(), - /** Expected output content as string. */ + /** Last expected output message content as plain text. */ expectedOutputText: z.string().optional(), }); diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index f325a61c5..70abaf0cf 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -41,7 +41,7 @@ This skill ships with a Python scripts layer in `plugins/agentv-dev/skills/agent These scripts break the eval pipeline into discrete steps. The agent runs them in order, only handling LLM grading directly: -- `scripts/run_tests.py --out ` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target, only extracts inputs (agent handles execution). +- `scripts/run_tests.py --out ` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target (`kind: "agent"`), only extracts inputs — executor subagents handle execution. - `scripts/run_code_graders.py ` — Run code-grader assertions on existing responses. Writes per-grader results. - `scripts/bench.py < llm_scores.json` — Merge code-grader + LLM scores, compute weighted pass_rate, write `grading.json` + `index.jsonl` + `benchmark.json`. @@ -87,7 +87,7 @@ Before running or optimizing, understand what you're working with. 2. 
**Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? Talk to the user if this isn't clear from the artifacts alone. -3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what evaluator types are available and how to run tests. +3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what evaluator types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail. 4. **Challenge assumptions** — if evals already exist, review their quality before running: - Are the test cases testing the right things? @@ -287,14 +287,30 @@ Write artifacts to `.agentv/artifacts/` or the iteration directory. ### Subagent mode: Running eval.yaml without CLI -When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. This mode spends LLM tokens only on grading, not on YAML parsing or target invocation. +When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. This mode dispatches `executor` subagents to perform each test case, then `grader` subagents to evaluate the outputs. + +**Executor subagent eligibility:** All providers except `cli` are eligible for executor subagents by default. 
To opt out a specific target, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`: + +```yaml +# .agentv/targets.yaml +targets: + - name: my-target + provider: openai + model: ${{ OPENAI_MODEL }} + api_key: ${{ OPENAI_API_KEY }} + subagent_mode_allowed: false # forces CLI invocation instead of executor subagent +``` + +When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval` even in subagent mode. **Prerequisites:** - The eval.yaml file exists and contains valid test definitions - `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) - Read `references/eval-yaml-spec.md` for the full schema -**Recommended: Single command for CLI targets** +**Workspace matters when evals need it:** Some evals pass prompt files directly and don't require a specific workspace — those run fine from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying repos, using tools across multiple repos) require the user to be in the **target workspace** (e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or expects the agent to use skills, check that the current directory is the target workspace, not just the eval repo — and warn the user if it's wrong. + +**CLI targets: Single command** For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step. When `--out` is omitted, the output directory defaults to `.agentv/results/runs/` (same convention as `agentv eval`): @@ -315,20 +331,35 @@ agentv results validate That's the entire pipeline: **2 commands** + LLM grading + optional validation. -**Alternative: Step-by-step (subagent-as-target or fine-grained control)** +**Non-CLI targets: Executor subagents** + +When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`. If `true` (default for all non-CLI providers), the subagent IS the target. 
If `false` (user opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to `agentv eval` CLI mode instead. -Use individual commands when the subagent IS the target or you need control over each step: +For executor subagent targets, use `pipeline input` to extract inputs, then dispatch `executor` subagents to perform each test case: ```bash # Step 1: Extract inputs (defaults to .agentv/results/runs/) agentv pipeline input evals/repro.eval.yaml +``` + +This creates a run directory with per-test `input.json`, `invoke.json` (with `kind: "agent"`), `criteria.md`, and grader configs. -# Step 2: Subagent invokes each test (reads input.json, writes response.md) +**Step 2: Dispatch executor subagents** — read `agents/executor.md`. Launch one `executor` subagent **per test case**, all in parallel. Each subagent receives the test directory path, reads `input.json`, performs the task using its own tools, and writes `response.md`. For example, 5 tests = 5 executor subagents launched simultaneously. +``` +# Per executor subagent: +# - Reads //input.json +# - Performs the task +# - Writes //response.md +``` + +**Step 3 onward: Grade and merge** — same as CLI targets: + +```bash # Step 3: Run code graders agentv pipeline grade -# Step 4: Subagent does LLM grading, writes llm_scores.json +# Step 4: Subagent does LLM grading, writes llm_scores.json (see below) # Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) agentv pipeline bench --llm-scores llm_scores.json @@ -337,7 +368,27 @@ agentv pipeline bench --llm-scores llm_scores.json agentv results validate ``` -This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`). 
+**Step-by-step (fine-grained control for CLI targets)** + +Use individual commands when you need control over each step with CLI targets: + +```bash +# Step 1: Extract inputs (defaults to .agentv/results/runs/) +agentv pipeline input evals/repro.eval.yaml + +# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead) + +# Step 3: Run code graders +agentv pipeline grade + +# Step 4: Subagent does LLM grading, writes llm_scores.json + +# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) +agentv pipeline bench --llm-scores llm_scores.json + +# Step 6: Validate +agentv results validate +``` **Step 3 (LLM grading): agent performs directly** @@ -641,6 +692,7 @@ The `agents/` directory contains instructions for specialized subagents. Read th | Agent | File | Purpose | When to dispatch | |-------|------|---------|-----------------| +| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | | grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading LLM-judged assertions) | | comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | | analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/executor.md b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md new file mode 100644 index 000000000..2543e0ec1 --- /dev/null +++ b/plugins/agentv-dev/skills/agentv-bench/agents/executor.md @@ -0,0 +1,30 @@ +--- +name: executor +description: >- + Execute an AgentV evaluation test case by performing the task described in the + input. Reads input.json from the test directory, carries out the task using + available tools, and writes response.md with the result. Dispatch one executor + subagent per test case, all in parallel. 
+model: inherit +color: cyan +--- + +You are the executor for an AgentV evaluation test case. Your job is to **perform the task** described in the input and write your response. + +You are the target agent being evaluated. Do the task to the best of your ability — your output will be graded by a separate grader agent. + +**You will receive these parameters:** +- `test-dir`: Path to the test case directory (e.g., `.agentv/results/runs///`) + +## Process + +1. **Read `{test-dir}/input.json`**. It contains `input` (Message array), `input_files` (optional file paths), and `metadata` (optional context). If `input_files` are listed, read those files too. + +2. **Perform the task** described in the input. + +3. **Write `{test-dir}/response.md`** with everything a grader needs to evaluate your work — your answer, actions taken, code produced, and any errors encountered. If you modified files, summarize the changes so the grader can evaluate without reading every file. + +## Important + +- Do NOT read grading criteria, assertions, or expected outputs — those are for the grader, not for you. +- Write `response.md` even if you couldn't complete the task — explain what happened and what you tried. diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md index 3723f3fda..87794493f 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md @@ -272,7 +272,7 @@ Extracts inputs, target commands, and grader configs from an eval YAML file. 
/ ├── manifest.json ├── / │ ├── input.json ← {input_text, input_messages, file_paths} +│ ├── input.json ← {input, input_files, metadata} │ ├── invoke.json ← {kind, command?, cwd?, timeout_ms?} │ ├── criteria.md ← human-readable success criteria │ ├── expected_output.json ← (if present) @@ -285,14 +285,14 @@ Extracts inputs, target commands, and grader configs from an eval YAML file. { "eval_file": "path/to/eval.yaml", "timestamp": "2026-03-24T...", - "target": {"name": "target-name", "kind": "cli"}, + "target": {"name": "target-name", "kind": "cli", "subagent_mode_allowed": false}, "test_ids": ["test-01", "test-02"] } ``` **`invoke.json` kinds:** - `kind: "cli"` — has `command`, `cwd`, `timeout_ms`. Use the command to run the target. -- `kind: "agent"` — agent executes directly, no CLI invocation needed. +- `kind: "agent"` — non-CLI provider. Check `manifest.json` `target.subagent_mode_allowed` to decide whether to dispatch executor subagents or fall back to `agentv eval` CLI. ### `agentv pipeline grade ` diff --git a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py index 871cd86b1..aff81612e 100644 --- a/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py +++ b/plugins/agentv-dev/skills/agentv-bench/scripts/run_tests.py @@ -20,8 +20,9 @@ │ ├── response.md ← target output (written by this script) │ └── timing.json ← execution timing (written by this script) -For agent-as-target mode (invoke.json has kind=agent), this script only runs -`agentv pipeline input`. The agent handles execution directly. +For agent-as-target mode (invoke.json has kind=agent — all non-CLI providers +unless subagent_mode_allowed=false in targets.yaml), this script only runs +`agentv pipeline input`. Executor subagents handle execution directly. 
""" import argparse import json @@ -80,9 +81,15 @@ def invoke_cli_target(test_dir: Path, extra_env: dict | None = None) -> None: timeout_s = invoke.get("timeout_ms", 30000) / 1000 merged_env = {**os.environ, **(extra_env or {})} + # Prompt text is the content of the first message whose role is "user" ("" if there is none). NOTE(review): `agentv pipeline run` builds its prompt from all messages (extractInputText) — confirm the difference is intended. + prompt_text = next( + (m["content"] for m in input_data.get("input", []) if m.get("role") == "user"), + "", + ) + # Write prompt to temp file for {PROMPT_FILE} placeholder with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as pf: - pf.write(input_data["input_text"]) + pf.write(prompt_text) prompt_file = pf.name # Create output file path for {OUTPUT_FILE} placeholder @@ -90,7 +97,7 @@ def invoke_cli_target(test_dir: Path, extra_env: dict | None = None) -> None: # Render template rendered = command_template - rendered = rendered.replace("{PROMPT}", input_data["input_text"]) + rendered = rendered.replace("{PROMPT}", prompt_text) rendered = rendered.replace("{PROMPT_FILE}", prompt_file) rendered = rendered.replace("{OUTPUT_FILE}", output_file)