Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
679dfdb
feat(agentv-bench): add executor subagent for non-CLI targets (#797)
christso Mar 28, 2026
8e928df
fix(agentv-bench): remove tools restriction from executor subagent
christso Mar 28, 2026
a5e2085
docs(agentv-bench): add metadata field to executor input.json docs
christso Mar 28, 2026
08d369f
refactor(pipeline): align input.json fields with eval YAML and SDK
christso Mar 28, 2026
2661ee1
refactor(pipeline): rename questionText to inputText for consistency
christso Mar 28, 2026
75bffd9
refactor(pipeline): remove legacy question/reference_answer from grad…
christso Mar 28, 2026
e3db11a
fix(agentv-bench): update stale input_text/input_messages references
christso Mar 28, 2026
72e7cac
fix(agentv-bench): simplify executor — remove workspace-dir parameter
christso Mar 28, 2026
f795ac4
docs(agentv-bench): add workspace detection guidance for executor sub…
christso Mar 28, 2026
771b142
docs(agentv-bench): clarify workspace detection — only matters for wo…
christso Mar 28, 2026
e54efb6
fix(pipeline): extractInputText should use all messages, not just first
christso Mar 28, 2026
bcb87fd
docs(eval): fix inputText jsdoc — includes all messages, not just first
christso Mar 28, 2026
ae8cbec
docs(eval): fix expectedOutputText jsdoc — last message content, not all
christso Mar 28, 2026
cf4690b
fix(pipeline): use @[role] pattern for multi-message inputText
christso Mar 28, 2026
9c7726c
fix(pipeline): align @[role]: format with artifact-writer convention
christso Mar 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ import { join } from 'node:path';
import { executeScript } from '@agentv/core';
import { command, positional, string } from 'cmd-ts';

/**
 * Flatten an ordered list of chat messages into one plain-text string.
 *
 * Behavior:
 * - Missing or empty input yields the empty string.
 * - Exactly one message yields its content verbatim (no role label).
 * - Two or more messages are each prefixed with an "@[role]:" header on
 *   its own line and separated by a blank line, so turns stay distinguishable.
 */
function extractInputText(input: Array<{ role: string; content: string }>): string {
  // Defensive runtime guard: input comes from parsed JSON, so the field may be absent.
  if (!input) return '';
  switch (input.length) {
    case 0:
      return '';
    case 1:
      // Single-turn prompt: keep it clean, no role prefix.
      return input[0].content;
    default: {
      const labeled: string[] = [];
      for (const message of input) {
        labeled.push(`@[${message.role}]:\n${message.content}`);
      }
      return labeled.join('\n\n');
    }
  }
}

export const evalGradeCommand = command({
name: 'grade',
description: 'Run code-grader assertions on responses in an export directory',
Expand Down Expand Up @@ -60,14 +71,13 @@ export const evalGradeCommand = command({
const graderName = graderConfig.name;

// Build stdin payload matching CodeEvaluator format (snake_case)
const inputText = extractInputText(inputData.input);
const payload = JSON.stringify({
output: [{ role: 'assistant', content: responseText }],
input: inputData.input_messages,
question: inputData.input_text,
input: inputData.input,
criteria: '',
expected_output: [],
reference_answer: '',
input_files: [],
input_files: inputData.input_files ?? [],
trace: null,
token_usage: null,
cost_usd: null,
Expand All @@ -77,8 +87,8 @@ export const evalGradeCommand = command({
file_changes: null,
workspace_path: null,
config: graderConfig.config ?? null,
metadata: {},
input_text: inputData.input_text,
metadata: inputData.metadata ?? {},
input_text: inputText,
output_text: responseText,
expected_output_text: '',
});
Expand Down
8 changes: 3 additions & 5 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,14 @@ export const evalInputCommand = command({
await mkdir(testDir, { recursive: true });
testIds.push(test.id);

// input.json
const inputText = test.question;
// input.json β€” aligned with eval YAML schema and code grader SDK field names
const inputMessages = test.input.map((m) => ({
role: m.role,
content: typeof m.content === 'string' ? m.content : m.content,
}));
await writeJson(join(testDir, 'input.json'), {
input_text: inputText,
input_messages: inputMessages,
file_paths: test.file_paths,
input: inputMessages,
input_files: test.file_paths,
metadata: test.metadata ?? {},
});

Expand Down
33 changes: 21 additions & 12 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ import { buildDefaultRunDir } from '../eval/result-layout.js';
import { findRepoRoot } from '../eval/shared.js';
import { selectTarget } from '../eval/targets.js';

/**
 * Render a Message[] array as plain text for prompt files and grader payloads.
 *
 * - No messages (or a nullish array) yields ''.
 * - One message yields its content as-is, without a role marker.
 * - Several messages yield "@[role]:" headers joined by blank lines.
 */
function extractInputText(input: Array<{ role: string; content: string }>): string {
  // input is read from JSON on disk, so guard against a missing field at runtime.
  const messages = input ?? [];
  if (messages.length === 0) return '';
  if (messages.length === 1) return messages[0].content;
  return messages.map(({ role, content }) => `@[${role}]:\n${content}`).join('\n\n');
}

/** Load key=value pairs from a .env file. Ignores comments and blank lines. */
function loadEnvFile(dir: string): Record<string, string> {
let current = resolve(dir);
Expand Down Expand Up @@ -134,15 +145,13 @@ export const evalRunCommand = command({
await mkdir(testDir, { recursive: true });
testIds.push(test.id);

const inputText = test.question;
const inputMessages = test.input.map((m) => ({
role: m.role,
content: typeof m.content === 'string' ? m.content : m.content,
}));
await writeJson(join(testDir, 'input.json'), {
input_text: inputText,
input_messages: inputMessages,
file_paths: test.file_paths,
input: inputMessages,
input_files: test.file_paths,
metadata: test.metadata ?? {},
});

Expand Down Expand Up @@ -216,12 +225,13 @@ export const evalRunCommand = command({
// Write temp prompt file
const promptFile = join(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
const outputFile = join(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
await writeFile(promptFile, inputData.input_text, 'utf8');
const inputText = extractInputText(inputData.input);
await writeFile(promptFile, inputText, 'utf8');

let rendered = template;
rendered = rendered.replace('{PROMPT_FILE}', promptFile);
rendered = rendered.replace('{OUTPUT_FILE}', outputFile);
rendered = rendered.replace('{PROMPT}', inputData.input_text);
rendered = rendered.replace('{PROMPT}', inputText);

const start = performance.now();
try {
Expand Down Expand Up @@ -313,14 +323,13 @@ export const evalRunCommand = command({
const graderConfig = JSON.parse(await readFile(join(codeGradersDir, graderFile), 'utf8'));
const graderName = graderConfig.name;

const inputText = extractInputText(inputData.input);
const payload = JSON.stringify({
output: [{ role: 'assistant', content: responseText }],
input: inputData.input_messages,
question: inputData.input_text,
input: inputData.input,
criteria: '',
expected_output: [],
reference_answer: '',
input_files: [],
input_files: inputData.input_files ?? [],
trace: null,
token_usage: null,
cost_usd: null,
Expand All @@ -330,8 +339,8 @@ export const evalRunCommand = command({
file_changes: null,
workspace_path: null,
config: graderConfig.config ?? null,
metadata: {},
input_text: inputData.input_text,
metadata: inputData.metadata ?? {},
input_text: inputText,
output_text: responseText,
expected_output_text: '',
});
Expand Down
5 changes: 2 additions & 3 deletions apps/cli/test/commands/eval/pipeline/grade.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ describe('pipeline grade', () => {
await writeFile(
join(testDir, 'input.json'),
JSON.stringify({
input_text: 'say hello',
input_messages: [{ role: 'user', content: 'say hello' }],
file_paths: [],
input: [{ role: 'user', content: 'say hello' }],
input_files: [],
}),
);
await writeFile(
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/eval/pipeline/input.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ describe('pipeline input', () => {
expect(manifest.eval_file).toContain('input-test.eval.yaml');
});

it('writes per-test input.json with input_text', async () => {
it('writes per-test input.json with input and input_files', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const input = JSON.parse(
await readFile(join(OUT_DIR, 'input-test', 'test-01', 'input.json'), 'utf8'),
);
expect(input.input_text).toBe('hello world');
expect(input.input_messages).toHaveLength(1);
expect(input.input).toHaveLength(1);
expect(input.input[0].content).toBe('hello world');
});

it('writes code_graders/<name>.json with resolved command', async () => {
Expand Down
4 changes: 2 additions & 2 deletions packages/eval/src/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ export const CodeGraderInputSchema = z.object({
fileChanges: z.string().nullable().optional(),
workspacePath: z.string().nullable().optional(),
config: z.record(z.unknown()).nullable().optional(),
/** First user message content as string. */
/** All input messages as plain text. Single message: content only. Multiple: @role prefixed. */
inputText: z.string(),
/** Expected output content as string. */
/** Last expected output message content as plain text. */
expectedOutputText: z.string().optional(),
});

Expand Down
70 changes: 61 additions & 9 deletions plugins/agentv-dev/skills/agentv-bench/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ This skill ships with a Python scripts layer in `plugins/agentv-dev/skills/agent

These scripts break the eval pipeline into discrete steps. The agent runs them in order, only handling LLM grading directly:

- `scripts/run_tests.py <eval-path> --out <dir>` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target, only extracts inputs (agent handles execution).
- `scripts/run_tests.py <eval-path> --out <dir>` — Extract inputs and invoke CLI targets in parallel. Writes `response.md` per test. For agent-as-target (`kind: "agent"`), only extracts inputs — executor subagents handle execution.
- `scripts/run_code_graders.py <dir>` — Run code-grader assertions on existing responses. Writes per-grader results.
- `scripts/bench.py <dir> < llm_scores.json` — Merge code-grader + LLM scores, compute weighted pass_rate, write `grading.json` + `index.jsonl` + `benchmark.json`.

Expand Down Expand Up @@ -87,7 +87,7 @@ Before running or optimizing, understand what you're working with.

2. **Identify success criteria** β€” what does "good" look like for this agent? What are the edge cases? What would a failure look like? Talk to the user if this isn't clear from the artifacts alone.

3. **Understand the target harness** β€” which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what evaluator types are available and how to run tests.
3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what evaluator types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail.

4. **Challenge assumptions** β€” if evals already exist, review their quality before running:
- Are the test cases testing the right things?
Expand Down Expand Up @@ -287,14 +287,30 @@ Write artifacts to `.agentv/artifacts/` or the iteration directory.

### Subagent mode: Running eval.yaml without CLI

When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. This mode spends LLM tokens only on grading, not on YAML parsing or target invocation.
When `SUBAGENT_EVAL_MODE=subagent` (default), use the pipeline CLI subcommands (`pipeline input`, `pipeline grade`, `pipeline bench`) and Python wrapper scripts. This mode dispatches `executor` subagents to perform each test case, then `grader` subagents to evaluate the outputs.

**Executor subagent eligibility:** All providers except `cli` are eligible for executor subagents by default. To opt out a specific target, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`:

```yaml
# .agentv/targets.yaml
targets:
- name: my-target
provider: openai
model: ${{ OPENAI_MODEL }}
api_key: ${{ OPENAI_API_KEY }}
subagent_mode_allowed: false # forces CLI invocation instead of executor subagent
```

When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval` even in subagent mode.

**Prerequisites:**
- The eval.yaml file exists and contains valid test definitions
- `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`)
- Read `references/eval-yaml-spec.md` for the full schema

**Recommended: Single command for CLI targets**
**Workspace matters when evals need it:** Some evals pass prompt files directly and don't require a specific workspace — those run fine from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying repos, using tools across multiple repos) require the user to be in the **target workspace** (e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or expects the agent to use skills, check that the current directory is the target workspace, not just the eval repo — and warn the user if it's wrong.

**CLI targets: Single command**

For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step. When `--out` is omitted, the output directory defaults to `.agentv/results/runs/<timestamp>` (same convention as `agentv eval`):

Expand All @@ -315,20 +331,35 @@ agentv results validate <run-dir>

That's the entire pipeline: **2 commands** + LLM grading + optional validation.

**Alternative: Step-by-step (subagent-as-target or fine-grained control)**
**Non-CLI targets: Executor subagents**

When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`. If `true` (default for all non-CLI providers), the subagent IS the target. If `false` (user opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to `agentv eval` CLI mode instead.

Use individual commands when the subagent IS the target or you need control over each step:
For executor subagent targets, use `pipeline input` to extract inputs, then dispatch `executor` subagents to perform each test case:

```bash
# Step 1: Extract inputs (defaults to .agentv/results/runs/<timestamp>)
agentv pipeline input evals/repro.eval.yaml
```

This creates a run directory with per-test `input.json`, `invoke.json` (with `kind: "agent"`), `criteria.md`, and grader configs.

# Step 2: Subagent invokes each test (reads input.json, writes response.md)
**Step 2: Dispatch executor subagents** — read `agents/executor.md`. Launch one `executor` subagent **per test case**, all in parallel. Each subagent receives the test directory path, reads `input.json`, performs the task using its own tools, and writes `response.md`. For example, 5 tests = 5 executor subagents launched simultaneously.

```
# Per executor subagent:
# - Reads <run-dir>/<test-id>/input.json
# - Performs the task
# - Writes <run-dir>/<test-id>/response.md
```

**Step 3 onward: Grade and merge** — same as CLI targets:

```bash
# Step 3: Run code graders
agentv pipeline grade <run-dir>

# Step 4: Subagent does LLM grading, writes llm_scores.json
# Step 4: Subagent does LLM grading, writes llm_scores.json (see below)

# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard)
agentv pipeline bench <run-dir> --llm-scores llm_scores.json
Expand All @@ -337,7 +368,27 @@ agentv pipeline bench <run-dir> --llm-scores llm_scores.json
agentv results validate <run-dir>
```

This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`).
**Step-by-step (fine-grained control for CLI targets)**

Use individual commands when you need control over each step with CLI targets:

```bash
# Step 1: Extract inputs (defaults to .agentv/results/runs/<timestamp>)
agentv pipeline input evals/repro.eval.yaml

# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead)

# Step 3: Run code graders
agentv pipeline grade <run-dir>

# Step 4: Subagent does LLM grading, writes llm_scores.json

# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard)
agentv pipeline bench <run-dir> --llm-scores llm_scores.json

# Step 6: Validate
agentv results validate <run-dir>
```

**Step 3 (LLM grading): agent performs directly**

Expand Down Expand Up @@ -641,6 +692,7 @@ The `agents/` directory contains instructions for specialized subagents. Read th

| Agent | File | Purpose | When to dispatch |
|-------|------|---------|-----------------|
| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets β€” one per test case) |
| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading LLM-judged assertions) |
| comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) |
| analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) |
Expand Down
30 changes: 30 additions & 0 deletions plugins/agentv-dev/skills/agentv-bench/agents/executor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
name: executor
description: >-
Execute an AgentV evaluation test case by performing the task described in the
input. Reads input.json from the test directory, carries out the task using
available tools, and writes response.md with the result. Dispatch one executor
subagent per test case, all in parallel.
model: inherit
color: cyan
---

You are the executor for an AgentV evaluation test case. Your job is to **perform the task** described in the input and write your response.

You are the target agent being evaluated. Do the task to the best of your ability — your output will be graded by a separate grader agent.

**You will receive these parameters:**
- `test-dir`: Path to the test case directory (e.g., `.agentv/results/runs/<timestamp>/<test-id>/`)

## Process

1. **Read `{test-dir}/input.json`**. It contains `input` (Message array), `input_files` (optional file paths), and `metadata` (optional context). If `input_files` are listed, read those files too.

2. **Perform the task** described in the input.

3. **Write `{test-dir}/response.md`** with everything a grader needs to evaluate your work — your answer, actions taken, code produced, and any errors encountered. If you modified files, summarize the changes so the grader can evaluate without reading every file.

## Important

- Do NOT read grading criteria, assertions, or expected outputs — those are for the grader, not for you.
- Write `response.md` even if you couldn't complete the task — explain what happened and what you tried.
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ Extracts inputs, target commands, and grader configs from an eval YAML file.
<out-dir>/
β”œβ”€β”€ manifest.json
β”œβ”€β”€ <test-id>/
β”‚ β”œβ”€β”€ input.json ← {input_text, input_messages, file_paths}
β”‚ β”œβ”€β”€ input.json ← {input, input_files, metadata}
β”‚ β”œβ”€β”€ invoke.json ← {kind, command?, cwd?, timeout_ms?}
β”‚ β”œβ”€β”€ criteria.md ← human-readable success criteria
β”‚ β”œβ”€β”€ expected_output.json ← (if present)
Expand All @@ -285,14 +285,14 @@ Extracts inputs, target commands, and grader configs from an eval YAML file.
{
"eval_file": "path/to/eval.yaml",
"timestamp": "2026-03-24T...",
"target": {"name": "target-name", "kind": "cli"},
"target": {"name": "target-name", "kind": "cli", "subagent_mode_allowed": false},
"test_ids": ["test-01", "test-02"]
}
```

**`invoke.json` kinds:**
- `kind: "cli"` β€” has `command`, `cwd`, `timeout_ms`. Use the command to run the target.
- `kind: "agent"` β€” agent executes directly, no CLI invocation needed.
- `kind: "agent"` β€” non-CLI provider. Check `manifest.json` `target.subagent_mode_allowed` to decide whether to dispatch executor subagents or fall back to `agentv eval` CLI.

### `agentv pipeline grade <export-dir>`

Expand Down
Loading
Loading