diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index bcbdbd02..feeb7c38 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -26,7 +26,7 @@ * - createApp(results, cwd) — Hono app factory */ -import { existsSync, readFileSync, writeFileSync } from 'node:fs'; +import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { command, number, option, optional, positional, string } from 'cmd-ts'; @@ -37,7 +37,12 @@ import { Hono } from 'hono'; import { parseJsonlResults } from '../eval/artifact-writer.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; import { listResultFiles } from '../trace/utils.js'; -import { loadManifestResults, resolveResultSourcePath } from './manifest.js'; +import { + loadLightweightResults, + loadManifestResults, + parseResultManifest, + resolveResultSourcePath, +} from './manifest.js'; import { patchTestIds } from './shared.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -165,15 +170,31 @@ export function createApp( app.get('/api/runs', (c) => { const metas = listResultFiles(searchDir); return c.json({ - runs: metas.map((m) => ({ - filename: m.filename, - path: m.path, - timestamp: m.timestamp, - test_count: m.testCount, - pass_rate: m.passRate, - avg_score: m.avgScore, - size_bytes: m.sizeBytes, - })), + runs: metas.map((m) => { + // Enrich with target/experiment from lightweight records + let target: string | undefined; + let experiment: string | undefined; + try { + const records = loadLightweightResults(m.path); + if (records.length > 0) { + target = records[0].target; + experiment = records[0].experiment; + } + } catch { + // ignore enrichment errors + } + return { + filename: m.filename, + path: m.path, + timestamp: m.timestamp, + test_count: m.testCount, + pass_rate: m.passRate, + avg_score: m.avgScore, + size_bytes: m.sizeBytes, + ...(target && { target }), + ...(experiment && { experiment }), + }; + }), }); }); @@ -327,6 +348,263 @@ export function createApp( return c.json({ entries }); }); + // ── File tree for eval artifacts ──────────────────────────────────────── + + interface FileNode { + name: string; + path: string; + type: 'file' | 'dir'; + children?: FileNode[]; + } + + function buildFileTree(dirPath: string, relativeTo: string): FileNode[] { + if (!existsSync(dirPath) || !statSync(dirPath).isDirectory()) { + return []; + } + const entries = readdirSync(dirPath, { withFileTypes: true }); + return entries + .sort((a, b) => { + // Directories first, then alphabetical + if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1; + return a.name.localeCompare(b.name); + }) + .map((entry) => { + const fullPath = path.join(dirPath, entry.name); + const relPath = path.relative(relativeTo, fullPath); + if (entry.isDirectory()) { + return { + name: entry.name, + path: relPath, + type: 'dir' as const, + children: buildFileTree(fullPath, relativeTo), + }; + } + return { name: entry.name, path: relPath, type: 'file' as const }; + }); + } + + function inferLanguage(filePath: string): string { + const ext = path.extname(filePath).toLowerCase(); + const langMap: Record = { + '.json': 'json', + '.jsonl': 'json', + '.ts': 'typescript', + '.tsx': 'typescript', + '.js': 'javascript', + '.jsx': 'javascript', + '.md': 'markdown', + '.yaml': 'yaml', + '.yml': 'yaml', + '.log': 'plaintext', + '.txt': 'plaintext', + '.py': 'python', + '.sh': 'shell', + '.bash': 'shell', + '.css': 'css', + '.html': 'html', + '.xml': 'xml', + '.svg': 'xml', + '.toml': 'toml', + '.diff': 'diff', + '.patch': 'diff', + }; + return langMap[ext] ?? 'plaintext'; + } + + // File tree for a specific eval's artifact directory + app.get('/api/runs/:filename/evals/:evalId/files', (c) => { + const filename = c.req.param('filename'); + const evalId = c.req.param('evalId'); + const metas = listResultFiles(searchDir); + const meta = metas.find((m) => m.filename === filename); + if (!meta) { + return c.json({ error: 'Run not found' }, 404); + } + try { + const content = readFileSync(meta.path, 'utf8'); + const records = parseResultManifest(content); + const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId); + if (!record) { + return c.json({ error: 'Eval not found' }, 404); + } + + const baseDir = path.dirname(meta.path); + + // Derive the eval's artifact subdirectory from known paths + const knownPaths = [ + record.grading_path, + record.timing_path, + record.input_path, + record.output_path, + record.response_path, + ].filter((p): p is string => !!p); + + if (knownPaths.length === 0) { + return c.json({ files: [] }); + } + + // Find the common parent directory of all artifact paths + const artifactDirs = knownPaths.map((p) => path.dirname(p)); + let commonDir = artifactDirs[0]; + for (const dir of artifactDirs) { + while (!dir.startsWith(commonDir)) { + commonDir = path.dirname(commonDir); + } + } + + const artifactAbsDir = path.join(baseDir, commonDir); + const files = buildFileTree(artifactAbsDir, baseDir); + return c.json({ files }); + } catch { + return c.json({ error: 'Failed to load file tree' }, 500); + } + }); + + // File content for a specific artifact file + app.get('/api/runs/:filename/evals/:evalId/files/*', (c) => { + const filename = c.req.param('filename'); + const evalId = c.req.param('evalId'); + const metas = listResultFiles(searchDir); + const meta = metas.find((m) => m.filename === filename); + if (!meta) { + return c.json({ error: 'Run not found' }, 404); + } + + // Extract the file path from the wildcard portion + const requestPath = c.req.path; + const prefix = `/api/runs/${filename}/evals/${evalId}/files/`; + const filePath = requestPath.slice(prefix.length); + + if (!filePath) { + return c.json({ error: 'No file path specified' }, 400); + } + + const baseDir = path.dirname(meta.path); + const absolutePath = path.resolve(baseDir, filePath); + + // Security: prevent path traversal — resolved path must be inside baseDir + if ( + !absolutePath.startsWith(path.resolve(baseDir) + path.sep) && + absolutePath !== path.resolve(baseDir) + ) { + return c.json({ error: 'Path traversal not allowed' }, 403); + } + + if (!existsSync(absolutePath) || !statSync(absolutePath).isFile()) { + return c.json({ error: 'File not found' }, 404); + } + + try { + const fileContent = readFileSync(absolutePath, 'utf8'); + const language = inferLanguage(absolutePath); + return c.json({ content: fileContent, language }); + } catch { + return c.json({ error: 'Failed to read file' }, 500); + } + }); + + // ── Aggregate endpoints ───────────────────────────────────────────────── + + // Experiments aggregate (group all runs by experiment) + app.get('/api/experiments', (c) => { + const metas = listResultFiles(searchDir); + const experimentMap = new Map< + string, + { + targets: Set; + runFilenames: Set; + evalCount: number; + passedCount: number; + lastTimestamp: string; + } + >(); + + for (const m of metas) { + try { + const records = loadLightweightResults(m.path); + for (const r of records) { + const experiment = r.experiment ?? 'default'; + const entry = experimentMap.get(experiment) ?? { + targets: new Set(), + runFilenames: new Set(), + evalCount: 0, + passedCount: 0, + lastTimestamp: '', + }; + entry.runFilenames.add(m.filename); + if (r.target) entry.targets.add(r.target); + entry.evalCount++; + if (r.score >= 1) entry.passedCount++; + if (r.timestamp && r.timestamp > entry.lastTimestamp) { + entry.lastTimestamp = r.timestamp; + } + experimentMap.set(experiment, entry); + } + } catch { + // skip runs that fail to load + } + } + + const experiments = [...experimentMap.entries()].map(([name, entry]) => ({ + name, + run_count: entry.runFilenames.size, + target_count: entry.targets.size, + eval_count: entry.evalCount, + passed_count: entry.passedCount, + pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0, + last_run: entry.lastTimestamp || null, + })); + + return c.json({ experiments }); + }); + + // Targets aggregate (group all runs by target) + app.get('/api/targets', (c) => { + const metas = listResultFiles(searchDir); + const targetMap = new Map< + string, + { + experiments: Set; + runFilenames: Set; + evalCount: number; + passedCount: number; + } + >(); + + for (const m of metas) { + try { + const records = loadLightweightResults(m.path); + for (const r of records) { + const target = r.target ?? 'default'; + const entry = targetMap.get(target) ?? { + experiments: new Set(), + runFilenames: new Set(), + evalCount: 0, + passedCount: 0, + }; + entry.runFilenames.add(m.filename); + if (r.experiment) entry.experiments.add(r.experiment); + entry.evalCount++; + if (r.score >= 1) entry.passedCount++; + targetMap.set(target, entry); + } + } catch { + // skip runs that fail to load + } + } + + const targets = [...targetMap.entries()].map(([name, entry]) => ({ + name, + run_count: entry.runFilenames.size, + experiment_count: entry.experiments.size, + eval_count: entry.evalCount, + passed_count: entry.passedCount, + pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0, + })); + + return c.json({ targets }); + }); + // ── Static file serving for Studio SPA ──────────────────────────────── if (studioDistPath) { diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx new file mode 100644 index 00000000..378e81ee --- /dev/null +++ b/apps/studio/src/components/Breadcrumbs.tsx @@ -0,0 +1,94 @@ +/** + * Breadcrumb navigation derived from TanStack Router matches. + * + * Maps route segments to human-readable labels and renders them as + * clickable links (except the last segment, which is the current page). + */ + +import { Link, useMatches } from '@tanstack/react-router'; + +interface BreadcrumbSegment { + label: string; + to?: string; +} + +function deriveSegments(matches: ReturnType): BreadcrumbSegment[] { + const segments: BreadcrumbSegment[] = []; + + // Skip the root match (index 0) + for (let i = 1; i < matches.length; i++) { + const match = matches[i]; + const routeId = match.routeId ?? match.id; + const params = match.params as Record; + + if (routeId === '/' || routeId === '/_layout') continue; + + if (routeId.includes('/runs/$runId/category/$category')) { + segments.push({ + label: params.category ?? 'Category', + to: match.pathname, + }); + } else if (routeId.includes('/runs/$runId')) { + segments.push({ + label: params.runId ?? 'Run', + to: match.pathname, + }); + } else if (routeId.includes('/evals/$runId/$evalId')) { + // For eval pages, show the run as a parent segment too + if (!segments.some((s) => s.label === params.runId)) { + segments.push({ + label: params.runId ?? 'Run', + to: `/runs/${encodeURIComponent(params.runId)}`, + }); + } + segments.push({ + label: params.evalId ?? 'Eval', + to: match.pathname, + }); + } else if (routeId.includes('/experiments/$experimentName')) { + segments.push({ + label: params.experimentName ?? 'Experiment', + to: match.pathname, + }); + } else if (routeId === '/index' || routeId === '/') { + segments.push({ label: 'Home', to: '/' }); + } + } + + return segments; +} + +export function Breadcrumbs() { + const matches = useMatches(); + const segments = deriveSegments(matches); + + if (segments.length === 0) return null; + + return ( +
+ + Home + + + {segments.map((segment, idx) => { + const isLast = idx === segments.length - 1; + + return ( + + > + {isLast ? ( + {segment.label} + ) : ( + + {segment.label} + + )} + + ); + })} +
+ ); +} diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 68259078..4163a6bd 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -2,23 +2,40 @@ * Three-tab eval detail view: Steps (assertions), Output, and Task (input). * * Shows the full evaluation result with score breakdown, assertions list, - * and Monaco viewers for output/input content. + * and Monaco viewers for output/input content. Output and Task tabs include + * a file tree sidebar when artifact files are available. */ import { useState } from 'react'; +import { useEvalFileContent, useEvalFiles } from '~/lib/api'; import type { EvalResult } from '~/lib/types'; +import type { FileNode } from './FileTree'; +import { FileTree } from './FileTree'; import { MonacoViewer } from './MonacoViewer'; import { ScoreBar } from './ScoreBar'; interface EvalDetailProps { eval: EvalResult; + runId: string; } type Tab = 'steps' | 'output' | 'task'; -export function EvalDetail({ eval: result }: EvalDetailProps) { +/** Recursively find the first file node in the tree. */ +function findFirstFile(nodes: FileNode[]): string | null { + for (const node of nodes) { + if (node.type === 'file') return node.path; + if (node.children) { + const found = findFirstFile(node.children); + if (found) return found; + } + } + return null; +} + +export function EvalDetail({ eval: result, runId }: EvalDetailProps) { const [activeTab, setActiveTab] = useState('steps'); const tabs: { id: Tab; label: string }[] = [ @@ -90,8 +107,8 @@ export function EvalDetail({ eval: result }: EvalDetailProps) { {/* Tab content */}
{activeTab === 'steps' && } - {activeTab === 'output' && } - {activeTab === 'task' && } + {activeTab === 'output' && } + {activeTab === 'task' && }
); @@ -154,7 +171,14 @@ function StepsTab({ result }: { result: EvalResult }) { {a.passed ? '\u2713' : '\u2717'}
-

{a.text}

+

+ {a.text} + {a.durationMs != null && ( + + ({(a.durationMs / 1000).toFixed(1)}s) + + )} +

{a.evidence &&

{a.evidence}

}
@@ -179,26 +203,93 @@ function StepsTab({ result }: { result: EvalResult }) { ); } -function OutputTab({ result }: { result: EvalResult }) { +function OutputTab({ result, runId }: { result: EvalResult; runId: string }) { + const evalId = result.testId; + const { data: filesData } = useEvalFiles(runId, evalId); + const files = filesData?.files ?? []; + const hasFiles = files.length > 0; + + const [selectedPath, setSelectedPath] = useState(null); + + // Resolve effective path: selected, or first file, or null + const effectivePath = selectedPath ?? (hasFiles ? findFirstFile(files) : null); + + const { data: fileContentData, isLoading: isLoadingContent } = useEvalFileContent( + runId, + evalId, + effectivePath ?? '', + ); + const output = result.output; + const fallbackText = + output && output.length > 0 ? output.map((m) => `[${m.role}]\n${m.content}`).join('\n\n') : ''; - if (!output || output.length === 0) { - return

No output available.

; + if (!hasFiles) { + if (!output || output.length === 0) { + return

No output available.

; + } + return ; } - const text = output.map((m) => `[${m.role}]\n${m.content}`).join('\n\n'); + const displayValue = effectivePath + ? isLoadingContent + ? 'Loading...' + : (fileContentData?.content ?? fallbackText) + : fallbackText; - return ; + const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'markdown'; + + return ( +
+ +
+ +
+
+ ); } -function TaskTab({ result }: { result: EvalResult }) { +function TaskTab({ result, runId }: { result: EvalResult; runId: string }) { + const evalId = result.testId; + const { data: filesData } = useEvalFiles(runId, evalId); + const files = filesData?.files ?? []; + const hasFiles = files.length > 0; + + const [selectedPath, setSelectedPath] = useState(null); + + const effectivePath = selectedPath ?? (hasFiles ? findFirstFile(files) : null); + + const { data: fileContentData, isLoading: isLoadingContent } = useEvalFileContent( + runId, + evalId, + effectivePath ?? '', + ); + const input = result.input; + const fallbackText = + input && input.length > 0 ? input.map((m) => `[${m.role}]\n${m.content}`).join('\n\n') : ''; - if (!input || input.length === 0) { - return

No task input available.

; + if (!hasFiles) { + if (!input || input.length === 0) { + return

No task input available.

; + } + return ; } - const text = input.map((m) => `[${m.role}]\n${m.content}`).join('\n\n'); + const displayValue = effectivePath + ? isLoadingContent + ? 'Loading...' + : (fileContentData?.content ?? fallbackText) + : fallbackText; + + const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'markdown'; - return ; + return ( +
+ +
+ +
+
+ ); } diff --git a/apps/studio/src/components/ExperimentsTab.tsx b/apps/studio/src/components/ExperimentsTab.tsx new file mode 100644 index 00000000..59335e2f --- /dev/null +++ b/apps/studio/src/components/ExperimentsTab.tsx @@ -0,0 +1,105 @@ +/** + * Experiments table showing experiments grouped across all runs. + * + * Displays experiment name, number of runs, targets, pass rate, and + * last run timestamp. Each row links to the experiment detail page. + */ + +import { Link } from '@tanstack/react-router'; + +import { useExperiments } from '~/lib/api'; +import type { ExperimentSummary } from '~/lib/types'; + +import { ScoreBar } from './ScoreBar'; + +export function ExperimentsTab() { + const { data, isLoading } = useExperiments(); + + if (isLoading) { + return ; + } + + const experiments = data?.experiments ?? []; + + if (experiments.length === 0) { + return ( +
+

No experiments found

+

+ Experiments will appear here once evaluations are run with experiment labels. +

+
+ ); + } + + return ( +
+ + + + + + + + + + + + {experiments.map((exp: ExperimentSummary) => ( + + + + + + + + ))} + +
ExperimentRunsTargetsPass RateLast Run
+ + {exp.name} + + {exp.run_count} + {exp.target_count} + + + {formatTimestamp(exp.last_run)}
+
+ ); +} + +function formatTimestamp(ts: string | undefined | null): string { + if (!ts) return 'N/A'; + try { + const d = new Date(ts); + if (Number.isNaN(d.getTime())) return 'N/A'; + return d.toLocaleString(); + } catch { + return 'N/A'; + } +} + +function LoadingSkeleton() { + return ( +
+
+
+
+
+ {['sk-1', 'sk-2', 'sk-3', 'sk-4', 'sk-5'].map((id) => ( +
+
+
+
+
+
+
+ ))} +
+
+ ); +} diff --git a/apps/studio/src/components/FileTree.tsx b/apps/studio/src/components/FileTree.tsx new file mode 100644 index 00000000..c9c29953 --- /dev/null +++ b/apps/studio/src/components/FileTree.tsx @@ -0,0 +1,148 @@ +/** + * Collapsible file tree component for showing eval artifacts. + * + * Displays a hierarchical file/folder structure with extension-based icons, + * collapsible directories, and selection highlighting. + */ + +import { useState } from 'react'; + +export interface FileNode { + name: string; + path: string; + type: 'file' | 'dir'; + children?: FileNode[]; +} + +interface FileTreeProps { + files: FileNode[]; + selectedPath: string | null; + onSelect: (path: string) => void; +} + +function getFileIcon(name: string): string { + const ext = name.split('.').pop()?.toLowerCase(); + switch (ext) { + case 'ts': + case 'tsx': + return '\u{1F4D8}'; + case 'json': + return '\u{1F4CB}'; + case 'log': + case 'txt': + return '\u{1F4DC}'; + case 'md': + return '\u{1F4DD}'; + default: + return '\u{1F4C4}'; + } +} + +function collectAllDirs(nodes: FileNode[]): string[] { + const dirs: string[] = []; + for (const node of nodes) { + if (node.type === 'dir') { + dirs.push(node.path); + if (node.children) { + dirs.push(...collectAllDirs(node.children)); + } + } + } + return dirs; +} + +function TreeNode({ + node, + selectedPath, + onSelect, + openFolders, + toggleFolder, + depth = 0, +}: { + node: FileNode; + selectedPath: string | null; + onSelect: (path: string) => void; + openFolders: Set; + toggleFolder: (path: string) => void; + depth?: number; +}) { + const isDir = node.type === 'dir'; + const isOpen = openFolders.has(node.path); + const isSelected = selectedPath === node.path; + + return ( +
+ + + {isDir && isOpen && node.children && ( +
+ {node.children.map((child) => ( + + ))} +
+ )} +
+ ); +} + +export function FileTree({ files, selectedPath, onSelect }: FileTreeProps) { + const [openFolders, setOpenFolders] = useState>( + () => new Set(collectAllDirs(files)), + ); + + const toggleFolder = (path: string) => { + setOpenFolders((prev: Set) => { + const next = new Set(prev); + if (next.has(path)) { + next.delete(path); + } else { + next.add(path); + } + return next; + }); + }; + + return ( +
+ {files.length === 0 &&

No files.

} + {files.map((node) => ( + + ))} +
+ ); +} diff --git a/apps/studio/src/components/Layout.tsx b/apps/studio/src/components/Layout.tsx index ea036859..9a983e99 100644 --- a/apps/studio/src/components/Layout.tsx +++ b/apps/studio/src/components/Layout.tsx @@ -1,21 +1,25 @@ /** - * App shell: sidebar + main content area. + * App shell: sidebar + breadcrumbs + main content area. * - * The sidebar provides navigation, and the main area renders the - * active route via the Outlet. + * The sidebar provides navigation, breadcrumbs show the current + * location, and the main area renders the active route via Outlet. */ import { Outlet } from '@tanstack/react-router'; +import { Breadcrumbs } from './Breadcrumbs'; import { Sidebar } from './Sidebar'; export function Layout() { return (
-
- -
+
+ +
+ +
+
); } diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 533ad7a8..d77f042e 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -5,8 +5,6 @@ * status, duration, and cost. Clicking a row navigates to eval detail. */ -import { useState } from 'react'; - import { Link } from '@tanstack/react-router'; import type { EvalResult } from '~/lib/types'; @@ -48,11 +46,6 @@ export function RunDetail({ results, runId }: RunDetailProps) { })) .sort((a, b) => a.name.localeCompare(b.name)); - const [categoryFilter, setCategoryFilter] = useState(null); - const filteredResults = categoryFilter - ? results.filter((r) => (r.eval_set ?? 'Uncategorized') === categoryFilter) - : results; - if (total === 0) { return (
@@ -80,45 +73,29 @@ export function RunDetail({ results, runId }: RunDetailProps) {

Categories

- {categories.map((cat) => { - const isActive = categoryFilter === cat.name; - return ( - - ); - })} + {categories.map((cat) => ( + +
+ {cat.name} + + {cat.passed}/{cat.total} + +
+
+ +
+
+ {cat.passed} passed + {cat.failed > 0 && {cat.failed} failed} +
+ + ))}
- {categoryFilter && ( - - )}
)} @@ -135,7 +112,7 @@ export function RunDetail({ results, runId }: RunDetailProps) { - {filteredResults.map((result, idx) => ( + {results.map((result, idx) => ( Run + Target + Experiment Timestamp Tests + {run.target ?? '-'} + {run.experiment ?? '-'} {formatTimestamp(run.timestamp)} {run.test_count} diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 46d99328..7cde39f7 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -4,15 +4,31 @@ * Adapts its content based on the current route: * - At root or run detail: shows list of runs * - At eval detail: shows list of evals in the current run with pass/fail indicators + * - At category detail: shows evals filtered to that category + * - At experiment detail: shows list of experiments */ import { Link, useMatchRoute } from '@tanstack/react-router'; -import { useRunDetail, useRunList } from '~/lib/api'; +import { useExperiments, useRunDetail, useRunList } from '~/lib/api'; export function Sidebar() { const matchRoute = useMatchRoute(); const evalMatch = matchRoute({ to: '/evals/$runId/$evalId', fuzzy: true }); + const categoryMatch = matchRoute({ + to: '/runs/$runId/category/$category', + fuzzy: true, + }); + const experimentMatch = matchRoute({ + to: '/experiments/$experimentName', + fuzzy: true, + }); + + // If on a category detail page, show evals filtered to that category + if (categoryMatch && typeof categoryMatch === 'object' && 'runId' in categoryMatch) { + const { runId, category } = categoryMatch as { runId: string; category: string }; + return ; + } // If on an eval detail page, show the eval sidebar if (evalMatch && typeof evalMatch === 'object' && 'runId' in evalMatch) { @@ -20,6 +36,16 @@ export function Sidebar() { return ; } + // If on an experiment detail page, show the experiment list + if ( + experimentMatch && + typeof experimentMatch === 'object' && + 'experimentName' in experimentMatch + ) { + const { experimentName } = experimentMatch as { experimentName: string }; + return ; + } + return ; } @@ -125,3 +151,108 @@ function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: s ); } + +function CategorySidebar({ runId, category }: { runId: string; category: string }) { + const { data } = useRunDetail(runId); + const categoryResults = (data?.results ?? []).filter( + (r) => (r.eval_set ?? 'Uncategorized') === category, + ); + + return ( + + ); +} + +function ExperimentSidebar({ currentExperiment }: { currentExperiment: string }) { + const { data } = useExperiments(); + const experiments = data?.experiments ?? []; + + return ( + + ); +} diff --git a/apps/studio/src/components/TargetsTab.tsx b/apps/studio/src/components/TargetsTab.tsx new file mode 100644 index 00000000..4a171d5e --- /dev/null +++ b/apps/studio/src/components/TargetsTab.tsx @@ -0,0 +1,91 @@ +/** + * Targets table showing targets grouped across all runs. + * + * Displays target name, number of runs, experiments, pass rate, and + * eval counts (passed/total). Links are not needed since targets are + * informational groupings. + */ + +import { useTargets } from '~/lib/api'; +import type { TargetSummary } from '~/lib/types'; + +import { ScoreBar } from './ScoreBar'; + +export function TargetsTab() { + const { data, isLoading } = useTargets(); + + if (isLoading) { + return ; + } + + const targets = data?.targets ?? []; + + if (targets.length === 0) { + return ( +
+

No targets found

+

+ Targets will appear here once evaluations are run with target labels. +

+
+ ); + } + + return ( +
+ + + + + + + + + + + + {targets.map((target: TargetSummary) => ( + + + + + + + + ))} + +
TargetRunsExperimentsPass RateEvals
{target.name} + {target.run_count} + + {target.experiment_count} + + + + {target.passed_count} + / + {target.eval_count} +
+
+ ); +} + +function LoadingSkeleton() { + return ( +
+
+
+
+
+ {['sk-1', 'sk-2', 'sk-3', 'sk-4', 'sk-5'].map((id) => ( +
+
+
+
+
+
+
+ ))} +
+
+ ); +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 69aa7ef8..8018ed6e 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -10,10 +10,14 @@ import { queryOptions, useQuery } from '@tanstack/react-query'; import type { CategoriesResponse, EvalDetailResponse, + ExperimentsResponse, FeedbackData, + FileContentResponse, + FileTreeResponse, IndexResponse, RunDetailResponse, RunListResponse, + TargetsResponse, } from './types'; async function fetchJson(url: string): Promise { @@ -70,6 +74,38 @@ export const feedbackOptions = queryOptions({ queryFn: () => fetchJson('/api/feedback'), }); +export const experimentsOptions = queryOptions({ + queryKey: ['experiments'], + queryFn: () => fetchJson('/api/experiments'), +}); + +export const targetsOptions = queryOptions({ + queryKey: ['targets'], + queryFn: () => fetchJson('/api/targets'), +}); + +export function evalFilesOptions(runId: string, evalId: string) { + return queryOptions({ + queryKey: ['runs', runId, 'evals', evalId, 'files'], + queryFn: () => + fetchJson( + `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/files`, + ), + enabled: !!runId && !!evalId, + }); +} + +export function evalFileContentOptions(runId: string, evalId: string, filePath: string) { + return queryOptions({ + queryKey: ['runs', runId, 'evals', evalId, 'files', filePath], + queryFn: () => + fetchJson( + `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/files/${filePath}`, + ), + enabled: !!runId && !!evalId && !!filePath, + }); +} + // ── Hooks ─────────────────────────────────────────────────────────────── export function useRunList() { @@ -95,3 +131,19 @@ export function useIndex() { export function useFeedback() { return useQuery(feedbackOptions); } + +export function useExperiments() { + return useQuery(experimentsOptions); +} + +export function useTargets() { + return useQuery(targetsOptions); +} + +export function useEvalFiles(runId: string, evalId: string) { + return useQuery(evalFilesOptions(runId, evalId)); +} + +export function useEvalFileContent(runId: string, evalId: string, filePath: string) { + return useQuery(evalFileContentOptions(runId, evalId, filePath)); +} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 64c96128..38395093 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -13,6 +13,8 @@ export interface RunMeta { pass_rate: number; avg_score: number; size_bytes: number; + target?: string; + experiment?: string; } export interface RunListResponse { @@ -40,6 +42,7 @@ export interface AssertionEntry { text: string; passed: boolean; evidence?: string; + durationMs?: number; } export interface EvalResult { @@ -106,3 +109,44 @@ export interface FeedbackReview { export interface FeedbackData { reviews: FeedbackReview[]; } + +export interface ExperimentSummary { + name: string; + run_count: number; + target_count: number; + pass_rate: number; + last_run: string; +} + +export interface ExperimentsResponse { + experiments: ExperimentSummary[]; +} + +export interface TargetSummary { + name: string; + run_count: number; + experiment_count: number; + pass_rate: number; + passed_count: number; + eval_count: number; +} + +export interface TargetsResponse { + targets: TargetSummary[]; +} + +export interface FileNode { + name: string; + path: string; + type: 'file' | 'dir'; + children?: FileNode[]; +} + +export interface FileTreeResponse { + files: FileNode[]; +} + +export interface FileContentResponse { + content: string; + language: string; +} diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 4ef4956d..64eda3a5 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -11,7 +11,9 @@ import { Route as rootRouteImport } from './routes/__root' import { Route as IndexRouteImport } from './routes/index' import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' +import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId' +import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' const IndexRoute = IndexRouteImport.update({ id: '/', @@ -23,40 +25,76 @@ const RunsRunIdRoute = RunsRunIdRouteImport.update({ path: '/runs/$runId', getParentRoute: () => rootRouteImport, } as any) +const ExperimentsExperimentNameRoute = + ExperimentsExperimentNameRouteImport.update({ + id: '/experiments/$experimentName', + path: '/experiments/$experimentName', + getParentRoute: () => rootRouteImport, + } as any) const EvalsRunIdEvalIdRoute = EvalsRunIdEvalIdRouteImport.update({ id: '/evals/$runId/$evalId', path: '/evals/$runId/$evalId', getParentRoute: () => rootRouteImport, } as any) +const RunsRunIdCategoryCategoryRoute = + RunsRunIdCategoryCategoryRouteImport.update({ + id: '/runs/$runId_/category/$category', + path: '/runs/$runId/category/$category', + getParentRoute: () => rootRouteImport, + } as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute + '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute + '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute } export interface FileRoutesByTo { '/': typeof IndexRoute + '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute + '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute } export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute + '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute + '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath - fullPaths: '/' | '/runs/$runId' | '/evals/$runId/$evalId' + fullPaths: + | '/' + | '/experiments/$experimentName' + | '/runs/$runId' + | '/evals/$runId/$evalId' + | '/runs/$runId/category/$category' fileRoutesByTo: FileRoutesByTo - to: '/' | '/runs/$runId' | '/evals/$runId/$evalId' - id: '__root__' | '/' | '/runs/$runId' | '/evals/$runId/$evalId' + to: + | '/' + | '/experiments/$experimentName' + | '/runs/$runId' + | '/evals/$runId/$evalId' + | '/runs/$runId/category/$category' + id: + | '__root__' + | '/' + | '/experiments/$experimentName' + | '/runs/$runId' + | '/evals/$runId/$evalId' + | '/runs/$runId_/category/$category' fileRoutesById: FileRoutesById } export interface RootRouteChildren { IndexRoute: typeof IndexRoute + ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute + RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute } declare module '@tanstack/react-router' { @@ -75,6 +113,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdRouteImport parentRoute: typeof rootRouteImport } + '/experiments/$experimentName': { + id: '/experiments/$experimentName' + path: '/experiments/$experimentName' + fullPath: '/experiments/$experimentName' + preLoaderRoute: typeof ExperimentsExperimentNameRouteImport + parentRoute: typeof rootRouteImport + } '/evals/$runId/$evalId': { id: '/evals/$runId/$evalId' path: '/evals/$runId/$evalId' @@ -82,13 +127,22 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof EvalsRunIdEvalIdRouteImport parentRoute: typeof rootRouteImport } + '/runs/$runId_/category/$category': { + id: '/runs/$runId_/category/$category' + path: '/runs/$runId/category/$category' + fullPath: '/runs/$runId/category/$category' + preLoaderRoute: typeof RunsRunIdCategoryCategoryRouteImport + parentRoute: typeof rootRouteImport + } } } const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, + ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, + RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/apps/studio/src/routes/evals/$runId.$evalId.tsx b/apps/studio/src/routes/evals/$runId.$evalId.tsx index 5ad3b5a1..35c2ce6a 100644 --- a/apps/studio/src/routes/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/evals/$runId.$evalId.tsx @@ -57,7 +57,7 @@ function EvalDetailPage() {

{evalId}

- +
); } diff --git a/apps/studio/src/routes/experiments/$experimentName.tsx b/apps/studio/src/routes/experiments/$experimentName.tsx new file mode 100644 index 00000000..29b6868b --- /dev/null +++ b/apps/studio/src/routes/experiments/$experimentName.tsx @@ -0,0 +1,107 @@ +/** + * Experiment detail route: shows aggregate stats and the run list. + * + * Fetches experiment data from the experiments API for stats, + * and the full run list for the table below. + */ + +import { createFileRoute } from '@tanstack/react-router'; + +import { RunList } from '~/components/RunList'; +import { useExperiments, useRunList } from '~/lib/api'; + +export const Route = createFileRoute('/experiments/$experimentName')({ + component: ExperimentDetailPage, +}); + +function ExperimentDetailPage() { + const { experimentName } = Route.useParams(); + const { data: experimentsData, isLoading: expLoading } = useExperiments(); + const { data: runListData, isLoading: runsLoading } = useRunList(); + + const isLoading = expLoading || runsLoading; + + if (isLoading) { + return ( +
+
+
+ {['s1', 's2', 's3', 's4'].map((id) => ( +
+ ))} +
+
+ ); + } + + const experiment = experimentsData?.experiments?.find((e) => e.name === experimentName); + const runs = runListData?.runs ?? []; + + // Derive stats from the experiment summary if available + const passRate = experiment?.pass_rate ?? 0; + const runCount = experiment?.run_count ?? 0; + const targetCount = experiment?.target_count ?? 0; + + return ( +
+
+

{experimentName}

+

+ {runCount} run{runCount !== 1 ? 's' : ''} · {targetCount} target + {targetCount !== 1 ? 's' : ''} + {experiment?.last_run && ( + · Last run: {formatTimestamp(experiment.last_run)} + )} +

+
+ + {experiment && ( +
+ + + + +
+ )} + +
+

All Runs

+ +
+
+ ); +} + +function StatCard({ + label, + value, + accent, +}: { + label: string; + value: string; + accent?: string; +}) { + return ( +
+

{label}

+

+ {value} +

+
+ ); +} + +function formatTimestamp(ts: string | undefined | null): string { + if (!ts) return 'N/A'; + try { + const d = new Date(ts); + if (Number.isNaN(d.getTime())) return 'N/A'; + return d.toLocaleString(); + } catch { + return 'N/A'; + } +} diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 095fb285..1a9d5a30 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -1,19 +1,78 @@ /** - * Home route: displays the run list. + * Home route: tabbed landing page with Recent Runs, Experiments, and Targets. + * + * Uses URL search param `?tab=` for tab persistence. */ -import { createFileRoute } from '@tanstack/react-router'; +import { createFileRoute, useNavigate, useRouterState } from '@tanstack/react-router'; +import { ExperimentsTab } from '~/components/ExperimentsTab'; import { RunList } from '~/components/RunList'; +import { TargetsTab } from '~/components/TargetsTab'; import { useRunList } from '~/lib/api'; +type TabId = 'runs' | 'experiments' | 'targets'; + +const tabs: { id: TabId; label: string }[] = [ + { id: 'runs', label: 'Recent Runs' }, + { id: 'experiments', label: 'Experiments' }, + { id: 'targets', label: 'Targets' }, +]; + export const Route = createFileRoute('/')({ component: HomePage, }); function HomePage() { + const routerState = useRouterState(); + const searchParams = routerState.location.search as Record; + const tab = searchParams.tab as TabId | undefined; + const navigate = useNavigate(); const { data, isLoading, error } = useRunList(); + const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'runs'; + + return ( +
+

Evaluation Runs

+ + {/* Tab navigation */} +
+
+ {tabs.map((t) => ( + + ))} +
+
+ + {/* Tab content */} + {activeTab === 'runs' && } + {activeTab === 'experiments' && } + {activeTab === 'targets' && } +
+ ); +} + +function RunsTabContent({ + data, + isLoading, + error, +}: { + data: ReturnType['data']; + isLoading: boolean; + error: Error | null; +}) { if (isLoading) { return ; } @@ -26,23 +85,15 @@ function HomePage() { ); } - return ( -
-

Evaluation Runs

- -
- ); + return ; } function LoadingSkeleton() { return ( -
-
-
- {['s1', 's2', 's3', 's4', 's5'].map((id) => ( -
- ))} -
+
+ {['s1', 's2', 's3', 's4', 's5'].map((id) => ( +
+ ))}
); } diff --git a/apps/studio/src/routes/runs/$runId_.category.$category.tsx b/apps/studio/src/routes/runs/$runId_.category.$category.tsx new file mode 100644 index 00000000..73a20f12 --- /dev/null +++ b/apps/studio/src/routes/runs/$runId_.category.$category.tsx @@ -0,0 +1,140 @@ +/** + * Category drill-down route: shows evals filtered to a single category. + * + * Uses the `$runId_` trailing-underscore convention so that + * `/runs/:runId/category/:category` is a sibling of `/runs/:runId`, + * not a child route. + */ + +import { Link, createFileRoute } from '@tanstack/react-router'; + +import { ScoreBar } from '~/components/ScoreBar'; +import { StatsCards } from '~/components/StatsCards'; +import { useRunDetail } from '~/lib/api'; + +export const Route = createFileRoute('/runs/$runId_/category/$category')({ + component: CategoryPage, +}); + +function CategoryPage() { + const { runId, category } = Route.useParams(); + const { data, isLoading, error } = useRunDetail(runId); + + if (isLoading) { + return ( +
+
+
+ {['s1', 's2', 's3', 's4', 's5'].map((id) => ( +
+ ))} +
+
+ ); + } + + if (error) { + return ( +
+ Failed to load run: {error.message} +
+ ); + } + + const results = (data?.results ?? []).filter((r) => (r.eval_set ?? 'Uncategorized') === category); + const total = results.length; + const passed = results.filter((r) => r.score >= 1).length; + const failed = total - passed; + const passRate = total > 0 ? passed / total : 0; + const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); + + return ( +
+
+

{category}

+

Category in run: {runId}

+
+ + 0 ? totalCost : undefined} + /> + + {total === 0 ? ( +
+

No evaluations in this category

+
+ ) : ( +
+ + + + + + + + + + + + + {results.map((result, idx) => ( + + + + + + + + + ))} + +
Test IDTargetScoreStatusDurationCost
+ + {result.testId} + + {result.target ?? '-'} + + + + + {result.durationMs != null ? `${(result.durationMs / 1000).toFixed(1)}s` : '-'} + + {result.costUsd != null ? `$${result.costUsd.toFixed(4)}` : '-'} +
+
+ )} +
+ ); +} + +function StatusBadge({ status }: { status?: string }) { + if (!status) return -; + + const isSuccess = status === 'success' || status === 'completed'; + const isError = status === 'error' || status === 'failed'; + + return ( + + {status} + + ); +}