From 0135be2a93eceb9fb90d41becbbd295d4748dd29 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:40:39 +0000 Subject: [PATCH 01/16] feat(eval): add path-derived category field for hierarchical grouping Placeholder commit to open draft PR. See PR description for implementation plan. Closes #813 From c1fa922cc3a1b6dc5a4718ae3462e93f09ed689c Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:46:50 +0000 Subject: [PATCH 02/16] feat(core): add optional category field to EvalTest and EvaluationResult Add `readonly category?: string` to both interfaces to support path-derived categorization of eval tests throughout the pipeline. --- packages/core/src/evaluation/types.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 8f5d718a..586cf5c2 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -772,6 +772,7 @@ export type EvaluatorConfig = export interface EvalTest { readonly id: string; readonly dataset?: string; + readonly category?: string; readonly conversation_id?: string; readonly question: string; readonly input: readonly TestMessage[]; @@ -894,6 +895,7 @@ export interface EvaluationResult { readonly timestamp: string; readonly testId: string; readonly dataset?: string; + readonly category?: string; readonly conversationId?: string; readonly score: number; readonly assertions: readonly AssertionEntry[]; From 2b465f86c5bbf50ff3f74833c3db6fd4783bc397 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:46:56 +0000 Subject: [PATCH 03/16] refactor(core): extract deriveCategory to @agentv/core Move category derivation logic from CLI discover.ts into a shared core module so it can be reused by the YAML parser and run-eval. 
--- apps/cli/src/commands/eval/discover.ts | 16 +--------------- packages/core/src/evaluation/category.ts | 17 +++++++++++++++++ packages/core/src/index.ts | 1 + 3 files changed, 19 insertions(+), 15 deletions(-) create mode 100644 packages/core/src/evaluation/category.ts diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts index 7ac77231..70578b12 100644 --- a/apps/cli/src/commands/eval/discover.ts +++ b/apps/cli/src/commands/eval/discover.ts @@ -1,5 +1,5 @@ import path from 'node:path'; -import { DEFAULT_EVAL_PATTERNS, loadConfig } from '@agentv/core'; +import { DEFAULT_EVAL_PATTERNS, deriveCategory, loadConfig } from '@agentv/core'; import fg from 'fast-glob'; import { findRepoRoot } from './shared.js'; @@ -52,20 +52,6 @@ export async function discoverEvalFiles(cwd: string): Promise d !== 'evals'); - return dirs.length > 0 ? dirs.join('/') : 'root'; -} /** Get unique categories from discovered eval files. */ export function getCategories(files: readonly DiscoveredEvalFile[]): readonly string[] { diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts new file mode 100644 index 00000000..2cfd2854 --- /dev/null +++ b/packages/core/src/evaluation/category.ts @@ -0,0 +1,17 @@ +import path from 'node:path'; + +/** + * Derive a human-readable category from an eval file's relative path. + * + * Strips the filename and any `evals` directory segments, then joins + * remaining directories with `/`. Returns `'Uncategorized'` for files + * at the root level. + */ +export function deriveCategory(relativePath: string): string { + const parts = relativePath.split(path.sep); + if (parts.length <= 1) { + return 'Uncategorized'; + } + const dirs = parts.slice(0, -1).filter((d) => d !== 'evals'); + return dirs.length > 0 ? 
dirs.join('/') : 'Uncategorized'; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 79454572..f5e71c02 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -54,6 +54,7 @@ export { getWorkspacePoolRoot, } from './paths.js'; export { trimBaselineResult } from './evaluation/baseline.js'; +export { deriveCategory } from './evaluation/category.js'; export * from './observability/index.js'; // Registry exports From 308b1764ab544019660ecd5e012b9562a58d12ab Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:47:03 +0000 Subject: [PATCH 04/16] feat(core): propagate category through YAML parser Add category to LoadOptions and pass it through to constructed EvalTest objects so tests carry their file-derived category. --- packages/core/src/evaluation/yaml-parser.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index ed4d4b83..132baed8 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -77,6 +77,8 @@ type LoadOptions = { readonly verbose?: boolean; /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */ readonly filter?: string; + /** Category derived from the eval file's directory path */ + readonly category?: string; }; type RawTestSuite = JsonObject & { @@ -472,6 +474,7 @@ async function loadTestsFromYaml( const testCase: EvalTest = { id, dataset: evalSetName, + category: options?.category, conversation_id: conversationId, question: question, input: inputMessages, From 02b7146e9ab55f036a80bba1580e842160c812c7 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:47:07 +0000 Subject: [PATCH 05/16] feat(cli): derive and pass category from run-eval to parser Compute category from the eval file's relative path and forward it to loadTestSuite so each test gets its category assigned. 
--- apps/cli/src/commands/eval/run-eval.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a0eaad40..bd107786 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -16,6 +16,7 @@ import { runEvaluation as defaultRunEvaluation, ensureVSCodeSubagents, loadConfig, + deriveCategory, loadTestSuite, loadTsConfig, shouldEnableCache, @@ -444,9 +445,13 @@ async function prepareFileMetadata(params: { verbose: options.verbose, }); + const relativePath = path.relative(cwd, testFilePath); + const category = deriveCategory(relativePath); + const suite = await loadTestSuite(testFilePath, repoRoot, { verbose: options.verbose, filter: options.filter, + category, }); const filteredIds = suite.tests.map((value) => value.id); From e1d388a5940fd48a325b0adfc41792e22b4f0f21 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:47:12 +0000 Subject: [PATCH 06/16] feat(core): pass category through orchestrator to EvaluationResult Include evalCase.category in all result-building paths (success, budget-exceeded, fail-on-error, and error results). --- packages/core/src/evaluation/orchestrator.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index cdd40209..cd3303ee 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -822,6 +822,7 @@ export async function runEvaluation( timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, score: 0, assertions: [], output: [], @@ -861,6 +862,7 @@ export async function runEvaluation( timestamp: (now ?? 
(() => new Date()))().toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, score: 0, assertions: [], output: [], @@ -2108,6 +2110,7 @@ async function evaluateCandidate(options: { timestamp: completedAt.toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, conversationId: evalCase.conversation_id, score: score.score, assertions: score.assertions, @@ -2590,6 +2593,7 @@ function buildErrorResult( timestamp: timestamp.toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, conversationId: evalCase.conversation_id, score: 0, assertions: [{ text: `Error: ${message}`, passed: false }], From c1d58d3d157c6192132e84fe712e2b4abb856aee Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:47:16 +0000 Subject: [PATCH 07/16] feat(cli): include category in artifact writer and manifest Add category field to IndexArtifactEntry and ResultManifestRecord so it flows through JSONL output and manifest hydration. --- apps/cli/src/commands/eval/artifact-writer.ts | 3 +++ apps/cli/src/commands/results/manifest.ts | 2 ++ 2 files changed, 5 insertions(+) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 3089a5c6..b60b1c2d 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -95,6 +95,7 @@ export interface IndexArtifactEntry { readonly timestamp: string; readonly test_id: string; readonly dataset?: string; + readonly category?: string; readonly conversation_id?: string; readonly score: number; readonly target: string; @@ -508,6 +509,7 @@ export function buildIndexArtifactEntry( timestamp: result.timestamp, test_id: result.testId ?? 'unknown', dataset: getDataset(result), + category: result.category, conversation_id: result.conversationId, score: result.score, target: result.target ?? 
'unknown', @@ -539,6 +541,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA timestamp: result.timestamp, test_id: result.testId ?? 'unknown', dataset: getDataset(result), + category: result.category, conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index fe642d36..7a4e3d72 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -12,6 +12,7 @@ export interface ResultManifestRecord { readonly test_id?: string; readonly eval_id?: string; readonly dataset?: string; + readonly category?: string; readonly experiment?: string; readonly target?: string; readonly score: number; @@ -125,6 +126,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E timestamp: record.timestamp, testId, dataset: record.dataset, + category: record.category, target: record.target, score: record.score, executionStatus: record.execution_status, From 9d749b1195da7fa8b85f7be5b9e0af06a84d10fe Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:55:19 +0000 Subject: [PATCH 08/16] feat(studio-api): add categories and category-datasets endpoints --- apps/cli/src/commands/results/serve.ts | 76 ++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index a178359d..c894b73c 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -304,6 +304,82 @@ export function createApp( } }); + // Category summaries for a run + app.get('/api/runs/:filename/categories', (c) => { + const filename = c.req.param('filename'); + const metas = listResultFiles(searchDir); + const meta = metas.find((m) => m.filename === filename); + if (!meta) { + return c.json({ error: 'Run not found' }, 404); + } + try { + const 
loaded = patchTestIds(loadManifestResults(meta.path)); + const categoryMap = new Map< + string, + { total: number; passed: number; scoreSum: number; datasets: Set<string> } + >(); + for (const r of loaded) { + const cat = r.category ?? 'Uncategorized'; + const entry = categoryMap.get(cat) ?? { + total: 0, + passed: 0, + scoreSum: 0, + datasets: new Set<string>(), + }; + entry.total++; + if (r.score >= 1) entry.passed++; + entry.scoreSum += r.score; + entry.datasets.add(r.dataset ?? r.target ?? 'default'); + categoryMap.set(cat, entry); + } + const categories = [...categoryMap.entries()].map(([name, entry]) => ({ + name, + total: entry.total, + passed: entry.passed, + failed: entry.total - entry.passed, + avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0, + dataset_count: entry.datasets.size, + })); + return c.json({ categories }); + } catch { + return c.json({ error: 'Failed to load categories' }, 500); + } + }); + + // Datasets within a category for a run + app.get('/api/runs/:filename/categories/:category/datasets', (c) => { + const filename = c.req.param('filename'); + const category = decodeURIComponent(c.req.param('category')); + const metas = listResultFiles(searchDir); + const meta = metas.find((m) => m.filename === filename); + if (!meta) { + return c.json({ error: 'Run not found' }, 404); + } + try { + const loaded = patchTestIds(loadManifestResults(meta.path)); + const filtered = loaded.filter((r) => (r.category ?? 'Uncategorized') === category); + const datasetMap = new Map(); + for (const r of filtered) { + const ds = r.dataset ?? r.target ?? 'default'; + const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; + entry.total++; + if (r.score >= 1) entry.passed++; + entry.scoreSum += r.score; + datasetMap.set(ds, entry); + } + const datasets = [...datasetMap.entries()].map(([name, entry]) => ({ + name, + total: entry.total, + passed: entry.passed, + failed: entry.total - entry.passed, + avg_score: entry.total > 0 ? 
entry.scoreSum / entry.total : 0, + })); + return c.json({ datasets }); + } catch { + return c.json({ error: 'Failed to load datasets' }, 500); + } + }); + // Full eval detail with hydrated artifacts app.get('/api/runs/:filename/evals/:evalId', (c) => { const filename = c.req.param('filename'); From 32cf5d506150db45f94854d2546ddc35d4833293 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:55:47 +0000 Subject: [PATCH 09/16] feat(studio): add category types and API hooks --- apps/studio/src/lib/api.ts | 29 +++++++++++++++++++++++++++++ apps/studio/src/lib/types.ts | 14 ++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index cc3aa022..a1d6c7f0 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -8,6 +8,7 @@ import { queryOptions, useQuery } from '@tanstack/react-query'; import type { + CategoriesResponse, DatasetsResponse, EvalDetailResponse, ExperimentsResponse, @@ -105,6 +106,26 @@ export function evalFileContentOptions(runId: string, evalId: string, filePath: }); } +export function runCategoriesOptions(runId: string) { + return queryOptions({ + queryKey: ['runs', runId, 'categories'], + queryFn: () => + fetchJson(`/api/runs/${encodeURIComponent(runId)}/categories`), + enabled: !!runId, + }); +} + +export function categoryDatasetsOptions(runId: string, category: string) { + return queryOptions({ + queryKey: ['runs', runId, 'categories', category, 'datasets'], + queryFn: () => + fetchJson( + `/api/runs/${encodeURIComponent(runId)}/categories/${encodeURIComponent(category)}/datasets`, + ), + enabled: !!runId && !!category, + }); +} + // ── Hooks ─────────────────────────────────────────────────────────────── export function useRunList() { @@ -146,3 +167,11 @@ export function useEvalFiles(runId: string, evalId: string) { export function useEvalFileContent(runId: string, evalId: string, filePath: string) { return useQuery(evalFileContentOptions(runId, evalId, 
filePath)); } + +export function useRunCategories(runId: string) { + return useQuery(runCategoriesOptions(runId)); +} + +export function useCategoryDatasets(runId: string, category: string) { + return useQuery(categoryDatasetsOptions(runId, category)); +} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index e6454008..4c1ef6f0 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -49,6 +49,7 @@ export interface EvalResult { testId: string; timestamp?: string; dataset?: string; + category?: string; target?: string; experiment?: string; score: number; @@ -150,3 +151,16 @@ export interface FileContentResponse { content: string; language: string; } + +export interface CategorySummary { + name: string; + total: number; + passed: number; + failed: number; + avg_score: number; + dataset_count: number; +} + +export interface CategoriesResponse { + categories: CategorySummary[]; +} From 3f5457b9517f44c678c1d3e3d79469b296fa2744 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:56:28 +0000 Subject: [PATCH 10/16] feat(studio): group run detail by category with collapsible sections --- apps/studio/src/components/RunDetail.tsx | 178 +++++++++++++++++------ 1 file changed, 136 insertions(+), 42 deletions(-) diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 6c96c381..42a27848 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -1,10 +1,11 @@ /** * Run detail component showing per-eval breakdown with score bars. * - * Displays each eval result as a row with test ID, target, score bar, - * status, duration, and cost. Clicking a row navigates to eval detail. + * Groups results by category (from file path), then by dataset within each category. + * Categories are shown as collapsible sections with dataset cards inside. 
*/ +import { useState } from 'react'; import { Link } from '@tanstack/react-router'; import type { EvalResult } from '~/lib/types'; @@ -17,34 +18,78 @@ interface RunDetailProps { runId: string; } -export function RunDetail({ results, runId }: RunDetailProps) { - const total = results.length; - const passed = results.filter((r) => r.score >= 1).length; - const failed = total - passed; - const passRate = total > 0 ? passed / total : 0; - const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); +interface DatasetStats { + name: string; + passed: number; + failed: number; + total: number; + avgScore: number; +} + +interface CategoryGroup { + name: string; + datasets: DatasetStats[]; + total: number; + passed: number; + failed: number; + avgScore: number; +} - // Dataset breakdown: group by dataset - const datasetMap = new Map< +function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { + const categoryMap = new Map< string, - { passed: number; failed: number; total: number; scoreSum: number } + Map >(); + for (const r of results) { + const cat = r.category ?? 'Uncategorized'; const ds = r.dataset ?? 'Uncategorized'; - const entry = datasetMap.get(ds) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; + if (!categoryMap.has(cat)) categoryMap.set(cat, new Map()); + const dsMap = categoryMap.get(cat)!; + const entry = dsMap.get(ds) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; entry.total += 1; entry.scoreSum += r.score; if (r.score >= 1) entry.passed += 1; else entry.failed += 1; - datasetMap.set(ds, entry); + dsMap.set(ds, entry); } - const datasets = Array.from(datasetMap.entries()) - .map(([name, stats]) => ({ - name, - ...stats, - avgScore: stats.total > 0 ? stats.scoreSum / stats.total : 0, - })) + + return Array.from(categoryMap.entries()) + .map(([catName, dsMap]) => { + const datasets = Array.from(dsMap.entries()) + .map(([dsName, stats]) => ({ + name: dsName, + ...stats, + avgScore: stats.total > 0 ? 
stats.scoreSum / stats.total : 0, + })) + .sort((a, b) => a.name.localeCompare(b.name)); + + const total = datasets.reduce((s, d) => s + d.total, 0); + const passed = datasets.reduce((s, d) => s + d.passed, 0); + const failed = datasets.reduce((s, d) => s + d.failed, 0); + const scoreSum = datasets.reduce((s, d) => s + d.avgScore * d.total, 0); + + return { + name: catName, + datasets, + total, + passed, + failed, + avgScore: total > 0 ? scoreSum / total : 0, + }; + }) .sort((a, b) => a.name.localeCompare(b.name)); +} + +export function RunDetail({ results, runId }: RunDetailProps) { + const total = results.length; + const passed = results.filter((r) => r.score >= 1).length; + const failed = total - passed; + const passRate = total > 0 ? passed / total : 0; + const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); + + const categories = buildCategoryGroups(results); + const hasMultipleCategories = categories.length > 1; if (total === 0) { return ( @@ -68,32 +113,19 @@ export function RunDetail({ results, runId }: RunDetailProps) { totalCost={totalCost > 0 ? totalCost : undefined} /> - {/* Dataset breakdown */} - {datasets.length >= 1 && ( + {hasMultipleCategories ? ( +
+

Categories

+ {categories.map((cat) => ( + + ))} +
+ ) : (

Datasets

- {datasets.map((cat) => ( - -
- {cat.name} - - {cat.passed}/{cat.total} - -
-
- -
-
- {cat.passed} passed - {cat.failed > 0 && {cat.failed} failed} -
- + {categories[0]?.datasets.map((ds) => ( + ))}
@@ -148,6 +180,68 @@ export function RunDetail({ results, runId }: RunDetailProps) { ); } +function CategorySection({ category, runId }: { category: CategoryGroup; runId: string }) { + const [expanded, setExpanded] = useState(true); + + return ( +
+ + {expanded && ( +
+
+ {category.datasets.map((ds) => ( + + ))} +
+
+ )} +
+ ); +} + +function DatasetCard({ dataset, runId }: { dataset: DatasetStats; runId: string }) { + return ( + +
+ {dataset.name} + + {dataset.passed}/{dataset.total} + +
+
+ +
+
+ {dataset.passed} passed + {dataset.failed > 0 && {dataset.failed} failed} +
+ + ); +} + function StatusBadge({ status }: { status?: string }) { if (!status) return -; From a010e7966cf92332272fcdf782a3379e009f97db Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:57:41 +0000 Subject: [PATCH 11/16] feat(studio): add category drill-down route --- apps/studio/src/routeTree.gen.ts | 21 +++++ .../runs/$runId_.category.$category.tsx | 94 +++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 apps/studio/src/routes/runs/$runId_.category.$category.tsx diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 971d6546..8fa17fb4 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -14,6 +14,7 @@ import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId' import { Route as RunsRunIdDatasetDatasetRouteImport } from './routes/runs/$runId_.dataset.$dataset' +import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' const IndexRoute = IndexRouteImport.update({ id: '/', @@ -41,6 +42,11 @@ const RunsRunIdDatasetDatasetRoute = RunsRunIdDatasetDatasetRouteImport.update({ path: '/runs/$runId/dataset/$dataset', getParentRoute: () => rootRouteImport, } as any) +const RunsRunIdCategoryCategoryRoute = RunsRunIdCategoryCategoryRouteImport.update({ + id: '/runs/$runId_/category/$category', + path: '/runs/$runId/category/$category', + getParentRoute: () => rootRouteImport, +} as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute @@ -48,6 +54,7 @@ export interface FileRoutesByFullPath { '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute + '/runs/$runId/category/$category': typeof 
RunsRunIdCategoryCategoryRoute } export interface FileRoutesByTo { '/': typeof IndexRoute @@ -55,6 +62,7 @@ export interface FileRoutesByTo { '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute + '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute } export interface FileRoutesById { __root__: typeof rootRouteImport @@ -63,6 +71,7 @@ export interface FileRoutesById { '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute '/runs/$runId_/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute + '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath @@ -72,6 +81,7 @@ export interface FileRouteTypes { | '/runs/$runId' | '/evals/$runId/$evalId' | '/runs/$runId/dataset/$dataset' + | '/runs/$runId/category/$category' fileRoutesByTo: FileRoutesByTo to: | '/' @@ -79,6 +89,7 @@ export interface FileRouteTypes { | '/runs/$runId' | '/evals/$runId/$evalId' | '/runs/$runId/dataset/$dataset' + | '/runs/$runId/category/$category' id: | '__root__' | '/' @@ -86,6 +97,7 @@ export interface FileRouteTypes { | '/runs/$runId' | '/evals/$runId/$evalId' | '/runs/$runId_/dataset/$dataset' + | '/runs/$runId_/category/$category' fileRoutesById: FileRoutesById } export interface RootRouteChildren { @@ -94,6 +106,7 @@ export interface RootRouteChildren { RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute RunsRunIdDatasetDatasetRoute: typeof RunsRunIdDatasetDatasetRoute + RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute } declare module '@tanstack/react-router' { @@ -133,6 +146,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdDatasetDatasetRouteImport parentRoute: typeof rootRouteImport } + '/runs/$runId_/category/$category': { + id: 
'/runs/$runId_/category/$category' + path: '/runs/$runId/category/$category' + fullPath: '/runs/$runId/category/$category' + preLoaderRoute: typeof RunsRunIdCategoryCategoryRouteImport + parentRoute: typeof rootRouteImport + } } } @@ -142,6 +162,7 @@ const rootRouteChildren: RootRouteChildren = { RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, RunsRunIdDatasetDatasetRoute: RunsRunIdDatasetDatasetRoute, + RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/apps/studio/src/routes/runs/$runId_.category.$category.tsx b/apps/studio/src/routes/runs/$runId_.category.$category.tsx new file mode 100644 index 00000000..b4e4619a --- /dev/null +++ b/apps/studio/src/routes/runs/$runId_.category.$category.tsx @@ -0,0 +1,94 @@ +/** + * Category drill-down route: shows datasets filtered to a single category. + * + * Uses the `$runId_` trailing-underscore convention so that + * `/runs/:runId/category/:category` is a sibling of `/runs/:runId`, + * not a child route. + */ + +import { Link, createFileRoute } from '@tanstack/react-router'; + +import { ScoreBar } from '~/components/ScoreBar'; +import { StatsCards } from '~/components/StatsCards'; +import { useCategoryDatasets } from '~/lib/api'; + +export const Route = createFileRoute('/runs/$runId_/category/$category')({ + component: CategoryPage, +}); + +function CategoryPage() { + const { runId_, category } = Route.useParams(); + const { data, isLoading, error } = useCategoryDatasets(runId_, category); + + if (isLoading) { + return ( +
+
+
+ {['s1', 's2', 's3', 's4', 's5'].map((id) => ( +
+ ))} +
+
+ ); + } + + if (error) { + return ( +
+ Failed to load category: {error.message} +
+ ); + } + + const datasets = data?.datasets ?? []; + const total = datasets.reduce((s, d) => s + d.total, 0); + const passed = datasets.reduce((s, d) => s + d.passed, 0); + const failed = total - passed; + const passRate = total > 0 ? passed / total : 0; + + return ( +
+
+

{category}

+

Category in run: {runId_}

+
+ + + + {datasets.length === 0 ? ( +
+

No datasets in this category

+
+ ) : ( +
+

Datasets

+
+ {datasets.map((ds) => ( + +
+ {ds.name} + + {ds.passed}/{ds.total} + +
+
+ +
+
+ {ds.passed} passed + {ds.failed > 0 && {ds.failed} failed} +
+ + ))} +
+
+ )} +
+ ); +} From 689a467f78a3cee5839d3cb52ee49b67984049b2 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:58:18 +0000 Subject: [PATCH 12/16] feat(studio): update breadcrumbs and sidebar for category navigation --- apps/studio/src/components/Breadcrumbs.tsx | 13 ++++- apps/studio/src/components/Sidebar.tsx | 61 +++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx index 48d5f7c3..602abc37 100644 --- a/apps/studio/src/components/Breadcrumbs.tsx +++ b/apps/studio/src/components/Breadcrumbs.tsx @@ -23,7 +23,18 @@ function deriveSegments(matches: ReturnType): BreadcrumbSegme if (routeId === '/' || routeId === '/_layout') continue; - if (routeId.includes('/runs/$runId/dataset/$dataset')) { + if (routeId.includes('/runs/$runId/category/$category')) { + if (!segments.some((s) => s.label === params.runId)) { + segments.push({ + label: params.runId ?? 'Run', + to: `/runs/${encodeURIComponent(params.runId)}`, + }); + } + segments.push({ + label: params.category ?? 'Category', + to: match.pathname, + }); + } else if (routeId.includes('/runs/$runId/dataset/$dataset')) { segments.push({ label: params.dataset ?? 
'Dataset', to: match.pathname, diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 68da8651..3ca1d43e 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -10,11 +10,15 @@ import { Link, useMatchRoute } from '@tanstack/react-router'; -import { useExperiments, useRunDetail, useRunList } from '~/lib/api'; +import { useCategoryDatasets, useExperiments, useRunDetail, useRunList } from '~/lib/api'; export function Sidebar() { const matchRoute = useMatchRoute(); const evalMatch = matchRoute({ to: '/evals/$runId/$evalId', fuzzy: true }); + const categoryMatch = matchRoute({ + to: '/runs/$runId/category/$category', + fuzzy: true, + }); const datasetMatch = matchRoute({ to: '/runs/$runId/dataset/$dataset', fuzzy: true, @@ -24,6 +28,12 @@ export function Sidebar() { fuzzy: true, }); + // If on a category detail page, show the category sidebar + if (categoryMatch && typeof categoryMatch === 'object' && 'runId' in categoryMatch) { + const { runId, category } = categoryMatch as { runId: string; category: string }; + return ; + } + // If on a dataset detail page, show evals filtered to that dataset if (datasetMatch && typeof datasetMatch === 'object' && 'runId' in datasetMatch) { const { runId, dataset } = datasetMatch as { runId: string; dataset: string }; @@ -206,6 +216,55 @@ function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string }) ); } +function CategorySidebar({ runId, category }: { runId: string; category: string }) { + const { data } = useCategoryDatasets(runId, category); + const datasets = data?.datasets ?? []; + + return ( + + ); +} + function ExperimentSidebar({ currentExperiment }: { currentExperiment: string }) { const { data } = useExperiments(); const experiments = data?.experiments ?? 
[]; From ce3671d0750fe5a84e8bfced3fd43b2ac95cf151 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 13:59:52 +0000 Subject: [PATCH 13/16] fix(studio): use correct runId param name in category route --- apps/studio/src/routeTree.gen.ts | 27 ++++++++++--------- .../runs/$runId_.category.$category.tsx | 8 +++--- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 8fa17fb4..118360b3 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -42,27 +42,28 @@ const RunsRunIdDatasetDatasetRoute = RunsRunIdDatasetDatasetRouteImport.update({ path: '/runs/$runId/dataset/$dataset', getParentRoute: () => rootRouteImport, } as any) -const RunsRunIdCategoryCategoryRoute = RunsRunIdCategoryCategoryRouteImport.update({ - id: '/runs/$runId_/category/$category', - path: '/runs/$runId/category/$category', - getParentRoute: () => rootRouteImport, -} as any) +const RunsRunIdCategoryCategoryRoute = + RunsRunIdCategoryCategoryRouteImport.update({ + id: '/runs/$runId_/category/$category', + path: '/runs/$runId/category/$category', + getParentRoute: () => rootRouteImport, + } as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute + '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRoutesByTo { '/': typeof IndexRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute '/runs/$runId/category/$category': typeof 
RunsRunIdCategoryCategoryRoute + '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRoutesById { __root__: typeof rootRouteImport @@ -70,8 +71,8 @@ export interface FileRoutesById { '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/runs/$runId_/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute + '/runs/$runId_/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath @@ -80,24 +81,24 @@ export interface FileRouteTypes { | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/runs/$runId/dataset/$dataset' | '/runs/$runId/category/$category' + | '/runs/$runId/dataset/$dataset' fileRoutesByTo: FileRoutesByTo to: | '/' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/runs/$runId/dataset/$dataset' | '/runs/$runId/category/$category' + | '/runs/$runId/dataset/$dataset' id: | '__root__' | '/' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/runs/$runId_/dataset/$dataset' | '/runs/$runId_/category/$category' + | '/runs/$runId_/dataset/$dataset' fileRoutesById: FileRoutesById } export interface RootRouteChildren { @@ -105,8 +106,8 @@ export interface RootRouteChildren { ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute - RunsRunIdDatasetDatasetRoute: typeof RunsRunIdDatasetDatasetRoute RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute + RunsRunIdDatasetDatasetRoute: typeof RunsRunIdDatasetDatasetRoute } declare module '@tanstack/react-router' { @@ -161,8 +162,8 @@ const rootRouteChildren: RootRouteChildren = { ExperimentsExperimentNameRoute: 
ExperimentsExperimentNameRoute, RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, - RunsRunIdDatasetDatasetRoute: RunsRunIdDatasetDatasetRoute, RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, + RunsRunIdDatasetDatasetRoute: RunsRunIdDatasetDatasetRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/apps/studio/src/routes/runs/$runId_.category.$category.tsx b/apps/studio/src/routes/runs/$runId_.category.$category.tsx index b4e4619a..813b5b6c 100644 --- a/apps/studio/src/routes/runs/$runId_.category.$category.tsx +++ b/apps/studio/src/routes/runs/$runId_.category.$category.tsx @@ -17,8 +17,8 @@ export const Route = createFileRoute('/runs/$runId_/category/$category')({ }); function CategoryPage() { - const { runId_, category } = Route.useParams(); - const { data, isLoading, error } = useCategoryDatasets(runId_, category); + const { runId, category } = Route.useParams(); + const { data, isLoading, error } = useCategoryDatasets(runId, category); if (isLoading) { return ( @@ -51,7 +51,7 @@ function CategoryPage() {

{category}

-

Category in run: {runId_}

+

Category in run: {runId}

@@ -68,7 +68,7 @@ function CategoryPage() {
From 90eff03c50e9852984912641ce46c93c9b3a82aa Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 14:07:31 +0000 Subject: [PATCH 14/16] fix: resolve lint issues (import sorting, formatting, non-null assertion) --- apps/cli/src/commands/eval/discover.ts | 1 - apps/cli/src/commands/eval/run-eval.ts | 2 +- apps/studio/src/components/RunDetail.tsx | 3 ++- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts index 70578b12..f8ea59e1 100644 --- a/apps/cli/src/commands/eval/discover.ts +++ b/apps/cli/src/commands/eval/discover.ts @@ -52,7 +52,6 @@ export async function discoverEvalFiles(cwd: string): Promise(); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index bd107786..32d3318f 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -14,9 +14,9 @@ import { ResponseCache, type TrialsConfig, runEvaluation as defaultRunEvaluation, + deriveCategory, ensureVSCodeSubagents, loadConfig, - deriveCategory, loadTestSuite, loadTsConfig, shouldEnableCache, diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 42a27848..01961138 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -5,8 +5,8 @@ * Categories are shown as collapsible sections with dataset cards inside. */ -import { useState } from 'react'; import { Link } from '@tanstack/react-router'; +import { useState } from 'react'; import type { EvalResult } from '~/lib/types'; @@ -45,6 +45,7 @@ function buildCategoryGroups(results: EvalResult[]): CategoryGroup[] { const cat = r.category ?? 'Uncategorized'; const ds = r.dataset ?? 
'Uncategorized'; if (!categoryMap.has(cat)) categoryMap.set(cat, new Map()); + // biome-ignore lint/style/noNonNullAssertion: map entry guaranteed by line above const dsMap = categoryMap.get(cat)!; const entry = dsMap.get(ds) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; entry.total += 1; From bf3d3e08b04b9642cc50dd5d7a290e3c87e40456 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 21:18:51 +0000 Subject: [PATCH 15/16] refactor: extract DEFAULT_CATEGORY constant and add unit tests for deriveCategory --- apps/cli/src/commands/results/serve.ts | 6 +-- packages/core/src/evaluation/category.ts | 9 ++-- packages/core/src/index.ts | 2 +- .../core/test/evaluation/category.test.ts | 41 +++++++++++++++++++ 4 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 packages/core/test/evaluation/category.test.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index c894b73c..6253206e 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -31,7 +31,7 @@ import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { command, number, option, optional, positional, string } from 'cmd-ts'; -import type { EvaluationResult } from '@agentv/core'; +import { DEFAULT_CATEGORY, type EvaluationResult } from '@agentv/core'; import { Hono } from 'hono'; import { parseJsonlResults } from '../eval/artifact-writer.js'; @@ -319,7 +319,7 @@ export function createApp( { total: number; passed: number; scoreSum: number; datasets: Set } >(); for (const r of loaded) { - const cat = r.category ?? 'Uncategorized'; + const cat = r.category ?? DEFAULT_CATEGORY; const entry = categoryMap.get(cat) ?? { total: 0, passed: 0, @@ -357,7 +357,7 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); - const filtered = loaded.filter((r) => (r.category ?? 
'Uncategorized') === category); + const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); const datasetMap = new Map(); for (const r of filtered) { const ds = r.dataset ?? r.target ?? 'default'; diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts index 2cfd2854..926e34f8 100644 --- a/packages/core/src/evaluation/category.ts +++ b/packages/core/src/evaluation/category.ts @@ -1,17 +1,20 @@ import path from 'node:path'; +/** Default category for eval files without subdirectory structure. */ +export const DEFAULT_CATEGORY = 'Uncategorized'; + /** * Derive a human-readable category from an eval file's relative path. * * Strips the filename and any `evals` directory segments, then joins - * remaining directories with `/`. Returns `'Uncategorized'` for files + * remaining directories with `/`. Returns {@link DEFAULT_CATEGORY} for files * at the root level. */ export function deriveCategory(relativePath: string): string { const parts = relativePath.split(path.sep); if (parts.length <= 1) { - return 'Uncategorized'; + return DEFAULT_CATEGORY; } const dirs = parts.slice(0, -1).filter((d) => d !== 'evals'); - return dirs.length > 0 ? dirs.join('/') : 'Uncategorized'; + return dirs.length > 0 ? 
dirs.join('/') : DEFAULT_CATEGORY; } diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index f5e71c02..0e457b4d 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -54,7 +54,7 @@ export { getWorkspacePoolRoot, } from './paths.js'; export { trimBaselineResult } from './evaluation/baseline.js'; -export { deriveCategory } from './evaluation/category.js'; +export { DEFAULT_CATEGORY, deriveCategory } from './evaluation/category.js'; export * from './observability/index.js'; // Registry exports diff --git a/packages/core/test/evaluation/category.test.ts b/packages/core/test/evaluation/category.test.ts new file mode 100644 index 00000000..9b8c62d0 --- /dev/null +++ b/packages/core/test/evaluation/category.test.ts @@ -0,0 +1,41 @@ +import { describe, expect, test } from 'bun:test'; + +import { DEFAULT_CATEGORY, deriveCategory } from '../../src/evaluation/category.js'; + +describe('deriveCategory', () => { + test('returns Uncategorized for single-segment path (root-level file)', () => { + expect(deriveCategory('dataset.eval.yaml')).toBe(DEFAULT_CATEGORY); + }); + + test('returns Uncategorized when only directory is evals', () => { + expect(deriveCategory('evals/dataset.eval.yaml')).toBe(DEFAULT_CATEGORY); + }); + + test('strips evals segment and returns remaining directory', () => { + expect(deriveCategory('evals/fundamentals/greetings.eval.yaml')).toBe('fundamentals'); + }); + + test('preserves nested directory paths', () => { + expect(deriveCategory('evals/cargowise-customs/layout-engine/eval.yaml')).toBe( + 'cargowise-customs/layout-engine', + ); + }); + + test('handles paths without evals segment', () => { + expect(deriveCategory('examples/showcase/eval.yaml')).toBe('examples/showcase'); + }); + + test('strips evals from middle of multi-level path', () => { + expect(deriveCategory('examples/showcase/export-screening/evals/dataset.eval.yaml')).toBe( + 'examples/showcase/export-screening', + ); + }); + + test('returns 
Uncategorized for empty string', () => { + expect(deriveCategory('')).toBe(DEFAULT_CATEGORY); + }); + + test('returns Uncategorized for just a filename with no directory', () => { + expect(deriveCategory('eval.yaml')).toBe(DEFAULT_CATEGORY); + }); +}); From f245b5612b87c0a3e1d63b67defb7bc8dec26428 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 21:32:31 +0000 Subject: [PATCH 16/16] fix: pass category through pipeline run and input commands --- apps/cli/src/commands/pipeline/input.ts | 7 ++++--- apps/cli/src/commands/pipeline/run.ts | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 26da5826..ef53fe5e 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -20,10 +20,10 @@ */ import { readFile } from 'node:fs/promises'; import { mkdir, writeFile } from 'node:fs/promises'; -import { dirname, join, resolve } from 'node:path'; +import { dirname, join, relative, resolve } from 'node:path'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; -import { loadTestSuite } from '@agentv/core'; +import { deriveCategory, loadTestSuite } from '@agentv/core'; import { command, option, optional, positional, string } from 'cmd-ts'; import { buildDefaultRunDir } from '../eval/result-layout.js'; @@ -57,7 +57,8 @@ export const evalInputCommand = command({ const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); const evalDir = dirname(resolvedEvalPath); - const suite = await loadTestSuite(resolvedEvalPath, repoRoot); + const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); + const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); const tests = suite.tests; if (tests.length === 0) { diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 09033635..d2c18811 100644 --- 
a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -15,9 +15,9 @@ import { execSync } from 'node:child_process'; import { existsSync, readFileSync, unlinkSync } from 'node:fs'; import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; -import { dirname, join, resolve } from 'node:path'; +import { dirname, join, relative, resolve } from 'node:path'; -import { executeScript, loadTestSuite } from '@agentv/core'; +import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; import { command, number, option, optional, positional, string } from 'cmd-ts'; @@ -91,7 +91,8 @@ export const evalRunCommand = command({ const evalDir = dirname(resolvedEvalPath); // ── Step 1: Extract inputs (same as pipeline input) ────────────── - const suite = await loadTestSuite(resolvedEvalPath, repoRoot); + const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); + const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); const tests = suite.tests; if (tests.length === 0) {