diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 601a9652f..1abf8068a 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -94,7 +94,7 @@ export interface AggregateGradingArtifact { export interface IndexArtifactEntry { readonly timestamp: string; readonly test_id: string; - readonly eval_set?: string; + readonly dataset?: string; readonly conversation_id?: string; readonly score: number; readonly target: string; @@ -462,14 +462,14 @@ function safeTargetId(target: string | undefined): string { return safeArtifactPathSegment(target, 'default'); } -function getEvalSet(result: EvaluationResult): string | undefined { - const record = result as EvaluationResult & { evalSet?: string }; - return result.eval_set ?? record.evalSet; +function getDataset(result: EvaluationResult): string | undefined { + const record = result as EvaluationResult & { eval_set?: string; evalSet?: string }; + return result.dataset ?? record.eval_set ?? record.evalSet; } function buildArtifactSubdir(result: EvaluationResult): string { const segments = []; - const evalSet = getEvalSet(result); + const evalSet = getDataset(result); if (evalSet) { segments.push(safeArtifactPathSegment(evalSet, 'default')); } @@ -508,7 +508,7 @@ export function buildIndexArtifactEntry( return { timestamp: result.timestamp, test_id: result.testId ?? 'unknown', - eval_set: getEvalSet(result), + dataset: getDataset(result), conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', @@ -539,7 +539,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA return { timestamp: result.timestamp, test_id: result.testId ?? 'unknown', - eval_set: getEvalSet(result), + dataset: getDataset(result), conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts index 514b24585..84198bb99 100644 --- a/apps/cli/src/commands/eval/junit-writer.ts +++ b/apps/cli/src/commands/eval/junit-writer.ts @@ -47,7 +47,7 @@ export class JunitWriter { const grouped = new Map(); for (const result of this.results) { - const suite = result.eval_set ?? 'default'; + const suite = result.dataset ?? 'default'; const existing = grouped.get(suite); if (existing) { existing.push(result); diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 547d926a0..1fc50d536 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -45,7 +45,7 @@ export const evalBenchCommand = command({ const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8')); const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 'unknown'; - const evalSet: string = manifest.eval_set ?? ''; + const evalSet: string = manifest.dataset ?? manifest.eval_set ?? ''; const experiment: string | undefined = manifest.experiment; const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; @@ -189,7 +189,7 @@ export const evalBenchCommand = command({ JSON.stringify({ timestamp: manifest.timestamp, test_id: testId, - eval_set: evalSet || undefined, + dataset: evalSet || undefined, experiment: experiment || undefined, score: Math.round(weightedScore * 1000) / 1000, target: targetName, diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index f9d015021..cb12430f1 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -40,7 +40,7 @@ export const evalGradeCommand = command({ const manifestPath = join(exportDir, 'manifest.json'); const manifest = JSON.parse(await readFile(manifestPath, 'utf8')); const testIds: string[] = manifest.test_ids; - const evalSet: string = manifest.eval_set ?? ''; + const evalSet: string = manifest.dataset ?? manifest.eval_set ?? ''; const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; let totalGraders = 0; diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 90d1fd712..26da5826f 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -167,7 +167,7 @@ export const evalInputCommand = command({ // manifest.json await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, - eval_set: evalSetName || undefined, + dataset: evalSetName || undefined, experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index a4615aaf9..090336351 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -187,7 +187,7 @@ export const evalRunCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, - eval_set: evalSetName || undefined, + dataset: evalSetName || undefined, experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, kind: targetKind }, diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 2fcd22a47..f4865b806 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -11,7 +11,8 @@ export interface ResultManifestRecord { readonly timestamp?: string; readonly test_id?: string; readonly eval_id?: string; - readonly eval_set?: string; + readonly dataset?: string; + readonly eval_set?: string; // deprecated alias for dataset readonly experiment?: string; readonly target?: string; readonly score: number; @@ -124,7 +125,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E return { timestamp: record.timestamp, testId, - eval_set: record.eval_set, + dataset: record.dataset ?? record.eval_set, target: record.target, score: record.score, executionStatus: record.execution_status, diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index feeb7c382..a178359d1 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -272,8 +272,8 @@ export function createApp( // ── New Studio API endpoints ────────────────────────────────────────── - // Categories for a specific run (grouped by eval_set or target) - app.get('/api/runs/:filename/categories', (c) => { + // Datasets for a specific run (grouped by dataset or target) + app.get('/api/runs/:filename/datasets', (c) => { const filename = c.req.param('filename'); const metas = listResultFiles(searchDir); const meta = metas.find((m) => m.filename === filename); @@ -282,25 +282,25 @@ export function createApp( } try { const loaded = patchTestIds(loadManifestResults(meta.path)); - const categoryMap = new Map(); + const datasetMap = new Map(); for (const r of loaded) { - const cat = r.eval_set ?? r.target ?? 'default'; - const entry = categoryMap.get(cat) ?? { total: 0, passed: 0, scoreSum: 0 }; + const ds = r.dataset ?? r.target ?? 'default'; + const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; if (r.score >= 1) entry.passed++; entry.scoreSum += r.score; - categoryMap.set(cat, entry); + datasetMap.set(ds, entry); } - const categories = [...categoryMap.entries()].map(([name, entry]) => ({ + const datasets = [...datasetMap.entries()].map(([name, entry]) => ({ name, total: entry.total, passed: entry.passed, failed: entry.total - entry.passed, avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0, })); - return c.json({ categories }); + return c.json({ datasets }); } catch { - return c.json({ error: 'Failed to load categories' }, 500); + return c.json({ error: 'Failed to load datasets' }, 500); } }); diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts index 54ada207c..598a4753f 100644 --- a/apps/cli/src/commands/trace/show.ts +++ b/apps/cli/src/commands/trace/show.ts @@ -225,7 +225,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st // Standard flat view const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red; lines.push( - `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.eval_set ? ` ${c.dim}eval-set: ${result.eval_set}${c.reset}` : ''}`, + `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? ` ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`, ); if (result.error) { diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts index 15c7007aa..dba30ba10 100644 --- a/apps/cli/src/commands/trace/stats.ts +++ b/apps/cli/src/commands/trace/stats.ts @@ -109,8 +109,9 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[] case 'target': key = result.target ?? 'unknown'; break; + case 'dataset': case 'eval-set': - key = result.eval_set ?? 'unknown'; + key = result.dataset ?? 'unknown'; break; case 'test-id': key = result.test_id ?? result.eval_id ?? 'unknown'; diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts index e1f1bd0b0..d6b51925c 100644 --- a/apps/cli/src/commands/trace/utils.ts +++ b/apps/cli/src/commands/trace/utils.ts @@ -51,7 +51,7 @@ export interface RawResult { timestamp?: string; test_id?: string; eval_id?: string; - eval_set?: string; + dataset?: string; conversation_id?: string; score: number; assertions?: { text: string; passed: boolean; evidence?: string }[]; @@ -149,7 +149,7 @@ function toRawResult(result: EvaluationResult): RawResult { return { timestamp: result.timestamp, test_id: result.testId, - eval_set: result.eval_set, + dataset: result.dataset, conversation_id: result.conversationId, score: result.score, assertions: result.assertions?.map((assertion) => ({ @@ -334,7 +334,7 @@ function loadOtlpTraceFile(filePath: string): RawResult[] { stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`, - eval_set: stringAttr(rootAttrs.agentv_eval_set), + dataset: stringAttr(rootAttrs.agentv_dataset), target: stringAttr(rootAttrs.agentv_target), score, error: root.status?.code === 2 ? root.status.message : undefined, diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index c8b64a750..9461cd45c 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -417,7 +417,7 @@ describe('buildIndexArtifactEntry', () => { makeResult({ testId: 'alpha', target: 'claude', - eval_set: 'demo', + dataset: 'demo', scores: [makeEvaluatorResult({ name: 'quality', score: 0.7 })], executionStatus: 'quality_failure', error: 'model drift', @@ -434,7 +434,7 @@ describe('buildIndexArtifactEntry', () => { expect(JSON.parse(JSON.stringify(entry))).toEqual({ timestamp: '2026-03-13T00:00:00.000Z', test_id: 'alpha', - eval_set: 'demo', + dataset: 'demo', score: 0.9, target: 'claude', scores: [ @@ -717,9 +717,9 @@ describe('writeArtifactsFromResults', () => { expect(candidateGrading.assertions[0].text).toBe('candidate-check'); }); - it('prefixes artifact paths with eval_set when present', async () => { + it('prefixes artifact paths with dataset when present', async () => { const paths = await writeArtifactsFromResults( - [makeResult({ eval_set: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], + [makeResult({ dataset: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], testDir, ); diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts index 75ff80da2..d13abcc35 100644 --- a/apps/cli/test/commands/eval/output-writers.test.ts +++ b/apps/cli/test/commands/eval/output-writers.test.ts @@ -125,9 +125,9 @@ describe('JunitWriter', () => { it('should group results by dataset as testsuites', async () => { const writer = await JunitWriter.open(testFilePath); - await writer.append(makeResult({ testId: 'a-1', eval_set: 'suite-a', score: 1.0 })); - await writer.append(makeResult({ testId: 'a-2', eval_set: 'suite-a', score: 0.8 })); - await writer.append(makeResult({ testId: 'b-1', eval_set: 'suite-b', score: 0.5 })); + await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 })); + await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 })); + await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 })); await writer.close(); const xml = await readFile(testFilePath, 'utf8'); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 4d9a39aab..0e51d38d7 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js'; const CLAUDE_CLI_RESULT = { timestamp: '2026-03-18T10:00:00.000Z', test_id: 'test-claude-reasoning', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 1.0, assertions: [ { text: 'Correct answer', passed: true, evidence: 'Matched expected output' }, @@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = { const CODEX_RESULT = { timestamp: '2026-03-18T10:01:00.000Z', test_id: 'test-codex-edit', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 0.9, assertions: [ { text: 'File edited correctly', passed: true }, @@ -96,7 +96,7 @@ const CODEX_RESULT = { const COPILOT_RESULT = { timestamp: '2026-03-18T10:02:00.000Z', test_id: 'test-copilot-complete', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 0.85, assertions: [ { text: 'Code completion correct', passed: true }, @@ -125,7 +125,7 @@ const COPILOT_RESULT = { const PI_RESULT = { timestamp: '2026-03-18T10:03:00.000Z', test_id: 'test-pi-refactor', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 0.75, assertions: [ { text: 'Refactored correctly', passed: true }, @@ -143,7 +143,7 @@ const PI_RESULT = { const LLM_AZURE_RESULT = { timestamp: '2026-03-18T10:04:00.000Z', test_id: 'test-llm-analysis', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 1.0, assertions: [{ text: 'Analysis correct', passed: true }], output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }], @@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = { const LLM_GPT_RESULT = { timestamp: '2026-03-18T10:05:00.000Z', test_id: 'test-llm-analysis', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 0.8, assertions: [{ text: 'Analysis correct', passed: true }], output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }], @@ -181,7 +181,7 @@ const LLM_GPT_RESULT = { const MINIMAL_RESULT = { timestamp: '2026-03-18T10:06:00.000Z', test_id: 'test-minimal', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 0.5, assertions: [{ text: 'Exists', passed: true }], output: [{ role: 'assistant', content: 'Response.' }], @@ -193,7 +193,7 @@ const MINIMAL_RESULT = { const ERROR_RESULT = { timestamp: '2026-03-18T10:07:00.000Z', test_id: 'test-error-case', - eval_set: 'multi-provider', + dataset: 'multi-provider', score: 0, assertions: [], output: [], @@ -212,12 +212,12 @@ function toJsonl(...records: object[]): string { function artifactDir( outputDir: string, - record: { eval_set?: string; test_id?: string; eval_id?: string; target?: string }, + record: { dataset?: string; test_id?: string; eval_id?: string; target?: string }, ): string { const testId = record.test_id ?? record.eval_id ?? 'unknown'; return path.join( outputDir, - ...(record.eval_set ? [record.eval_set] : []), + ...(record.dataset ? [record.dataset] : []), testId, record.target ?? 'default', ); @@ -642,7 +642,7 @@ describe('export e2e — multi-provider metrics verification', () => { const record = { timestamp: '2026-03-18T10:00:00.000Z', test_id: 'test-case-convert', - eval_set: 'test', + dataset: 'test', score: 1.0, assertions: [{ text: 'ok', passed: true }], output_text: 'ok', @@ -677,7 +677,7 @@ describe('export e2e — multi-provider metrics verification', () => { const record = { timestamp: '2026-03-18T10:00:00.000Z', eval_id: 'legacy-test-id', - eval_set: 'test', + dataset: 'test', score: 1.0, assertions: [{ text: 'ok', passed: true }], output_text: 'ok', diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index de1f8ccd4..60d020aac 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -16,7 +16,7 @@ import { exportResults } from '../../../src/commands/results/export.js'; const RESULT_FULL = { timestamp: '2026-03-18T10:00:01.000Z', test_id: 'test-greeting', - eval_set: 'demo', + dataset: 'demo', score: 1.0, assertions: [ { text: 'Says hello', passed: true }, @@ -40,7 +40,7 @@ const RESULT_FULL = { const RESULT_PARTIAL = { timestamp: '2026-03-18T10:00:05.000Z', test_id: 'test-math', - eval_set: 'demo', + dataset: 'demo', score: 0.5, assertions: [ { text: 'Correct formula', passed: true }, @@ -66,7 +66,7 @@ const RESULT_PARTIAL = { const RESULT_DIFFERENT_TARGET = { timestamp: '2026-03-18T10:00:10.000Z', test_id: 'test-greeting', - eval_set: 'demo', + dataset: 'demo', score: 0.75, assertions: [ { text: 'Says hello', passed: true }, @@ -81,7 +81,7 @@ const RESULT_DIFFERENT_TARGET = { const RESULT_NO_TRACE = { timestamp: '2026-03-18T10:00:15.000Z', test_id: 'test-simple', - eval_set: 'demo', + dataset: 'demo', score: 1.0, assertions: [{ text: 'Correct', passed: true }], output: [{ role: 'assistant', content: 'Yes.' }], @@ -97,12 +97,12 @@ function toJsonl(...records: object[]): string { function artifactDir( outputDir: string, - record: { eval_set?: string; test_id?: string; eval_id?: string; target?: string }, + record: { dataset?: string; test_id?: string; eval_id?: string; target?: string }, ): string { const testId = record.test_id ?? record.eval_id ?? 'unknown'; return path.join( outputDir, - ...(record.eval_set ? [record.eval_set] : []), + ...(record.dataset ? [record.dataset] : []), testId, record.target ?? 'default', ); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 82c13aa04..d3cad625e 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -10,7 +10,7 @@ import { createApp, loadResults, resolveSourceFile } from '../../../src/commands const RESULT_A = { timestamp: '2026-03-18T10:00:01.000Z', test_id: 'test-greeting', - eval_set: 'demo', + dataset: 'demo', score: 1.0, assertions: [ { text: 'Says hello', passed: true }, @@ -34,7 +34,7 @@ const RESULT_A = { const RESULT_B = { timestamp: '2026-03-18T10:00:05.000Z', test_id: 'test-math', - eval_set: 'demo', + dataset: 'demo', score: 0.5, assertions: [ { text: 'Correct formula', passed: true }, diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index 05de8e5b7..9a7d3c939 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -16,7 +16,7 @@ import { const RESULT_WITH_TRACE = JSON.stringify({ timestamp: '2026-02-20T21:38:05.833Z', test_id: 'test-1', - eval_set: 'demo', + dataset: 'demo', score: 1, assertions: [{ text: 'criterion-1', passed: true }], target: 'default', @@ -34,7 +34,7 @@ const RESULT_WITH_TRACE = JSON.stringify({ const RESULT_WITHOUT_TRACE = JSON.stringify({ timestamp: '2026-02-20T21:38:06.000Z', test_id: 'test-2', - eval_set: 'demo', + dataset: 'demo', score: 0.75, assertions: [ { text: 'criterion-1', passed: true }, @@ -46,7 +46,7 @@ const RESULT_WITHOUT_TRACE = JSON.stringify({ const RESULT_FAILING = JSON.stringify({ timestamp: '2026-02-20T21:38:07.000Z', test_id: 'test-3', - eval_set: 'demo', + dataset: 'demo', score: 0, assertions: [ { text: 'criterion-1', passed: false }, diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx index 378e81ee6..48d5f7c32 100644 --- a/apps/studio/src/components/Breadcrumbs.tsx +++ b/apps/studio/src/components/Breadcrumbs.tsx @@ -23,9 +23,9 @@ function deriveSegments(matches: ReturnType): BreadcrumbSegme if (routeId === '/' || routeId === '/_layout') continue; - if (routeId.includes('/runs/$runId/category/$category')) { + if (routeId.includes('/runs/$runId/dataset/$dataset')) { segments.push({ - label: params.category ?? 'Category', + label: params.dataset ?? 'Dataset', to: match.pathname, }); } else if (routeId.includes('/runs/$runId')) { diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index d77f042e2..6c96c3818 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -24,21 +24,21 @@ export function RunDetail({ results, runId }: RunDetailProps) { const passRate = total > 0 ? passed / total : 0; const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - // Category breakdown: group by eval_set - const categoryMap = new Map< + // Dataset breakdown: group by dataset + const datasetMap = new Map< string, { passed: number; failed: number; total: number; scoreSum: number } >(); for (const r of results) { - const cat = r.eval_set ?? 'Uncategorized'; - const entry = categoryMap.get(cat) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; + const ds = r.dataset ?? 'Uncategorized'; + const entry = datasetMap.get(ds) ?? { passed: 0, failed: 0, total: 0, scoreSum: 0 }; entry.total += 1; entry.scoreSum += r.score; if (r.score >= 1) entry.passed += 1; else entry.failed += 1; - categoryMap.set(cat, entry); + datasetMap.set(ds, entry); } - const categories = Array.from(categoryMap.entries()) + const datasets = Array.from(datasetMap.entries()) .map(([name, stats]) => ({ name, ...stats, @@ -68,16 +68,16 @@ export function RunDetail({ results, runId }: RunDetailProps) { totalCost={totalCost > 0 ? totalCost : undefined} /> - {/* Category breakdown */} - {categories.length >= 1 && ( + {/* Dataset breakdown */} + {datasets.length >= 1 && (
-

Categories

+

Datasets

- {categories.map((cat) => ( + {datasets.map((cat) => (
diff --git a/apps/studio/src/components/ScoreBar.tsx b/apps/studio/src/components/ScoreBar.tsx index 527be6c64..368909d09 100644 --- a/apps/studio/src/components/ScoreBar.tsx +++ b/apps/studio/src/components/ScoreBar.tsx @@ -2,7 +2,7 @@ * Gradient score bar component. * * Renders a horizontal bar from cyan-400 to blue-500, proportional to the - * score value (0..1). Used in run lists, category breakdowns, and eval detail. + * score value (0..1). Used in run lists, dataset breakdowns, and eval detail. */ interface ScoreBarProps { diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 7cde39f74..68da8651d 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -4,7 +4,7 @@ * Adapts its content based on the current route: * - At root or run detail: shows list of runs * - At eval detail: shows list of evals in the current run with pass/fail indicators - * - At category detail: shows evals filtered to that category + * - At dataset detail: shows evals filtered to that dataset * - At experiment detail: shows list of experiments */ @@ -15,8 +15,8 @@ import { useExperiments, useRunDetail, useRunList } from '~/lib/api'; export function Sidebar() { const matchRoute = useMatchRoute(); const evalMatch = matchRoute({ to: '/evals/$runId/$evalId', fuzzy: true }); - const categoryMatch = matchRoute({ - to: '/runs/$runId/category/$category', + const datasetMatch = matchRoute({ + to: '/runs/$runId/dataset/$dataset', fuzzy: true, }); const experimentMatch = matchRoute({ @@ -24,10 +24,10 @@ export function Sidebar() { fuzzy: true, }); - // If on a category detail page, show evals filtered to that category - if (categoryMatch && typeof categoryMatch === 'object' && 'runId' in categoryMatch) { - const { runId, category } = categoryMatch as { runId: string; category: string }; - return ; + // If on a dataset detail page, show evals filtered to that dataset + if (datasetMatch && typeof datasetMatch === 'object' && 'runId' in datasetMatch) { + const { runId, dataset } = datasetMatch as { runId: string; dataset: string }; + return ; } // If on an eval detail page, show the eval sidebar @@ -152,10 +152,10 @@ function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: s ); } -function CategorySidebar({ runId, category }: { runId: string; category: string }) { +function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string }) { const { data } = useRunDetail(runId); - const categoryResults = (data?.results ?? []).filter( - (r) => (r.eval_set ?? 'Uncategorized') === category, + const datasetResults = (data?.results ?? []).filter( + (r) => (r.dataset ?? 'Uncategorized') === dataset, ); return ( @@ -176,7 +176,7 @@ function CategorySidebar({ runId, category }: { runId: string; category: string ← Back to run

{runId}

-

{category}

+

{dataset}

- {categoryResults.map((result) => { + {datasetResults.map((result) => { const passed = result.score >= 1; return ( diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 8018ed6e5..cc3aa0221 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -8,7 +8,7 @@ import { queryOptions, useQuery } from '@tanstack/react-query'; import type { - CategoriesResponse, + DatasetsResponse, EvalDetailResponse, ExperimentsResponse, FeedbackData, @@ -44,11 +44,10 @@ export function runDetailOptions(filename: string) { }); } -export function runCategoriesOptions(runId: string) { +export function runDatasetsOptions(runId: string) { return queryOptions({ - queryKey: ['runs', runId, 'categories'], - queryFn: () => - fetchJson(`/api/runs/${encodeURIComponent(runId)}/categories`), + queryKey: ['runs', runId, 'datasets'], + queryFn: () => fetchJson(`/api/runs/${encodeURIComponent(runId)}/datasets`), enabled: !!runId, }); } @@ -116,8 +115,8 @@ export function useRunDetail(filename: string) { return useQuery(runDetailOptions(filename)); } -export function useRunCategories(runId: string) { - return useQuery(runCategoriesOptions(runId)); +export function useRunDatasets(runId: string) { + return useQuery(runDatasetsOptions(runId)); } export function useEvalDetail(runId: string, evalId: string) { diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 38395093a..e6454008d 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -48,7 +48,7 @@ export interface AssertionEntry { export interface EvalResult { testId: string; timestamp?: string; - eval_set?: string; + dataset?: string; target?: string; experiment?: string; score: number; @@ -70,7 +70,7 @@ export interface RunDetailResponse { source: string; } -export interface CategorySummary { +export interface DatasetSummary { name: string; total: number; passed: number; @@ -78,8 +78,8 @@ export interface CategorySummary { avg_score: number; } -export interface CategoriesResponse { - categories: CategorySummary[]; +export interface DatasetsResponse { + datasets: DatasetSummary[]; } export interface EvalDetailResponse { diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 64eda3a56..971d65467 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -13,7 +13,7 @@ import { Route as IndexRouteImport } from './routes/index' import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId' -import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' +import { Route as RunsRunIdDatasetDatasetRouteImport } from './routes/runs/$runId_.dataset.$dataset' const IndexRoute = IndexRouteImport.update({ id: '/', @@ -36,26 +36,25 @@ const EvalsRunIdEvalIdRoute = EvalsRunIdEvalIdRouteImport.update({ path: '/evals/$runId/$evalId', getParentRoute: () => rootRouteImport, } as any) -const RunsRunIdCategoryCategoryRoute = - RunsRunIdCategoryCategoryRouteImport.update({ - id: '/runs/$runId_/category/$category', - path: '/runs/$runId/category/$category', - getParentRoute: () => rootRouteImport, - } as any) +const RunsRunIdDatasetDatasetRoute = RunsRunIdDatasetDatasetRouteImport.update({ + id: '/runs/$runId_/dataset/$dataset', + path: '/runs/$runId/dataset/$dataset', + getParentRoute: () => rootRouteImport, +} as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute + '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRoutesByTo { '/': typeof IndexRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute + '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRoutesById { __root__: typeof rootRouteImport @@ -63,7 +62,7 @@ export interface FileRoutesById { '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute + '/runs/$runId_/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath @@ -72,21 +71,21 @@ export interface FileRouteTypes { | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/runs/$runId/category/$category' + | '/runs/$runId/dataset/$dataset' fileRoutesByTo: FileRoutesByTo to: | '/' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/runs/$runId/category/$category' + | '/runs/$runId/dataset/$dataset' id: | '__root__' | '/' | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/runs/$runId_/category/$category' + | '/runs/$runId_/dataset/$dataset' fileRoutesById: FileRoutesById } export interface RootRouteChildren { @@ -94,7 +93,7 @@ export interface RootRouteChildren { ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute - RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute + RunsRunIdDatasetDatasetRoute: typeof RunsRunIdDatasetDatasetRoute } declare module '@tanstack/react-router' { @@ -127,11 +126,11 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof EvalsRunIdEvalIdRouteImport parentRoute: typeof rootRouteImport } - '/runs/$runId_/category/$category': { - id: '/runs/$runId_/category/$category' - path: '/runs/$runId/category/$category' - fullPath: '/runs/$runId/category/$category' - preLoaderRoute: typeof RunsRunIdCategoryCategoryRouteImport + '/runs/$runId_/dataset/$dataset': { + id: '/runs/$runId_/dataset/$dataset' + path: '/runs/$runId/dataset/$dataset' + fullPath: '/runs/$runId/dataset/$dataset' + preLoaderRoute: typeof RunsRunIdDatasetDatasetRouteImport parentRoute: typeof rootRouteImport } } @@ -142,7 +141,7 @@ const rootRouteChildren: RootRouteChildren = { ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, - RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, + RunsRunIdDatasetDatasetRoute: RunsRunIdDatasetDatasetRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/apps/studio/src/routes/runs/$runId_.category.$category.tsx b/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx similarity index 88% rename from apps/studio/src/routes/runs/$runId_.category.$category.tsx rename to apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx index 73a20f12b..6a50243d4 100644 --- a/apps/studio/src/routes/runs/$runId_.category.$category.tsx +++ b/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx @@ -1,8 +1,8 @@ /** - * Category drill-down route: shows evals filtered to a single category. + * Dataset drill-down route: shows evals filtered to a single dataset. * * Uses the `$runId_` trailing-underscore convention so that - * `/runs/:runId/category/:category` is a sibling of `/runs/:runId`, + * `/runs/:runId/dataset/:dataset` is a sibling of `/runs/:runId`, * not a child route. */ @@ -12,12 +12,12 @@ import { ScoreBar } from '~/components/ScoreBar'; import { StatsCards } from '~/components/StatsCards'; import { useRunDetail } from '~/lib/api'; -export const Route = createFileRoute('/runs/$runId_/category/$category')({ - component: CategoryPage, +export const Route = createFileRoute('/runs/$runId_/dataset/$dataset')({ + component: DatasetPage, }); -function CategoryPage() { - const { runId, category } = Route.useParams(); +function DatasetPage() { + const { runId, dataset } = Route.useParams(); const { data, isLoading, error } = useRunDetail(runId); if (isLoading) { @@ -41,7 +41,7 @@ function CategoryPage() { ); } - const results = (data?.results ?? []).filter((r) => (r.eval_set ?? 'Uncategorized') === category); + const results = (data?.results ?? []).filter((r) => (r.dataset ?? 'Uncategorized') === dataset); const total = results.length; const passed = results.filter((r) => r.score >= 1).length; const failed = total - passed; @@ -51,8 +51,8 @@ function CategoryPage() { return (
-

{category}

-

Category in run: {runId}

+

{dataset}

+

Dataset in run: {runId}

-

No evaluations in this category

+

No evaluations in this dataset

) : (
diff --git a/examples/features/assert/evals/dataset.eval.baseline.jsonl b/examples/features/assert/evals/dataset.eval.baseline.jsonl index 5f4b70d9a..f134c85c0 100644 --- a/examples/features/assert/evals/dataset.eval.baseline.jsonl +++ b/examples/features/assert/evals/dataset.eval.baseline.jsonl @@ -1,4 +1,4 @@ -{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]} -{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]} -{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]} -{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]} +{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]} +{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]} +{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]} +{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]} diff --git a/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl b/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl index 2c4fdf122..5f771752e 100644 --- a/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl +++ b/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl @@ -1,7 +1,7 @@ -{"timestamp":"2026-02-20T22:13:04.907Z","test_id":"code-gen-python","eval_set":"basic-jsonl","conversation_id":"python-code-generation","score":0.95,"target":"default","assertions":[{"text":"Includes SUPERSECRET_INSTRUCTION_MARKER_PYTHON","passed":true,"evidence":"The function is correct, includes appropriate error handling and the instruction marker, and covers edge cases. However, it lacks explicit type hints in the definition, which was requested."},{"text":"Has proper error handling for empty list, too few unique items, and non-integer elements","passed":true},{"text":"Handles duplicates by converting to unique values","passed":true},{"text":"Mentions type validation and exceptions","passed":true},{"text":"Type hints are missing in function signature","passed":false}]} -{"timestamp":"2026-02-20T22:13:04.914Z","test_id":"code-review-javascript","eval_set":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies loop condition bug","passed":true,"evidence":"The candidate answer covers all key issues, provides detailed suggestions, offers code corrections, and correctly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT as required."},{"text":"Provides corrected loop implementation","passed":true},{"text":"Suggests input validation","passed":true},{"text":"Mentions functional alternative with reduce()","passed":true}]} -{"timestamp":"2026-02-20T22:13:06.076Z","test_id":"feature-proposal-brainstorm","eval_set":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Five distinct features, each addressing different user pain points (personalization, motivation, recovery, engagement, nutrition)","passed":true,"evidence":"The candidate provides five creative, distinct features, each targeting a specific pain point with feasibility and value clearly outlined. No criteria are missed."},{"text":"Each idea describes a clear value proposition in 1-2 sentences","passed":true},{"text":"All features are technically feasible with current mobile and sensor technology","passed":true},{"text":"No duplicate or overlapping concepts between ideas","passed":true}]} -{"timestamp":"2026-02-20T22:13:06.351Z","test_id":"shorthand-string-example","eval_set":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"States correct answer","passed":true,"evidence":"The candidate provides the correct answer in a clear equation format, satisfying the task requirements."},{"text":"Presents answer in equation form","passed":true}]} -{"timestamp":"2026-02-20T22:13:07.677Z","test_id":"multiturn-debug-session","eval_set":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies the off-by-one source as range(len(items)-1)","passed":true,"evidence":"The candidate directly diagnoses the bug, explains its cause, and provides both an explicit code fix and a more Pythonic alternative, fully matching the multi-turn debugging and clarity requirements."},{"text":"Explains why this causes last item to be dropped","passed":true},{"text":"Proposes two clear fixes: using range(len(items)) or items[:]","passed":true},{"text":"Summarizes problem and solution concisely","passed":true}]} -{"timestamp":"2026-02-20T22:13:08.199Z","test_id":"shorthand-array-syntax","eval_set":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate provides a friendly greeting and offers assistance, closely matching the reference and enhancing warmth with an emoji."},{"text":"Asks how they can help","passed":true},{"text":"Adds a friendly emoji for warmth","passed":true}]} -{"timestamp":"2026-02-20T22:13:08.585Z","test_id":"shorthand-structured-output","eval_set":"basic-jsonl","score":0.1,"target":"default","assertions":[{"text":"Requests additional transaction details","passed":true,"evidence":"The answer does not provide a risk assessment or structured output as required; it only requests more information and lacks any evaluation of transaction risk."},{"text":"States inability to access real-time data","passed":true},{"text":"Does not provide structured risk assessment","passed":false},{"text":"Fails to assign risk level or confidence","passed":false}]} +{"timestamp":"2026-02-20T22:13:04.907Z","test_id":"code-gen-python","dataset":"basic-jsonl","conversation_id":"python-code-generation","score":0.95,"target":"default","assertions":[{"text":"Includes SUPERSECRET_INSTRUCTION_MARKER_PYTHON","passed":true,"evidence":"The function is correct, includes appropriate error handling and the instruction marker, and covers edge cases. However, it lacks explicit type hints in the definition, which was requested."},{"text":"Has proper error handling for empty list, too few unique items, and non-integer elements","passed":true},{"text":"Handles duplicates by converting to unique values","passed":true},{"text":"Mentions type validation and exceptions","passed":true},{"text":"Type hints are missing in function signature","passed":false}]} +{"timestamp":"2026-02-20T22:13:04.914Z","test_id":"code-review-javascript","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies loop condition bug","passed":true,"evidence":"The candidate answer covers all key issues, provides detailed suggestions, offers code corrections, and correctly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT as required."},{"text":"Provides corrected loop implementation","passed":true},{"text":"Suggests input validation","passed":true},{"text":"Mentions functional alternative with reduce()","passed":true}]} +{"timestamp":"2026-02-20T22:13:06.076Z","test_id":"feature-proposal-brainstorm","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Five distinct features, each addressing different user pain points (personalization, motivation, recovery, engagement, nutrition)","passed":true,"evidence":"The candidate provides five creative, distinct features, each targeting a specific pain point with feasibility and value clearly outlined. No criteria are missed."},{"text":"Each idea describes a clear value proposition in 1-2 sentences","passed":true},{"text":"All features are technically feasible with current mobile and sensor technology","passed":true},{"text":"No duplicate or overlapping concepts between ideas","passed":true}]} +{"timestamp":"2026-02-20T22:13:06.351Z","test_id":"shorthand-string-example","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"States correct answer","passed":true,"evidence":"The candidate provides the correct answer in a clear equation format, satisfying the task requirements."},{"text":"Presents answer in equation form","passed":true}]} +{"timestamp":"2026-02-20T22:13:07.677Z","test_id":"multiturn-debug-session","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies the off-by-one source as range(len(items)-1)","passed":true,"evidence":"The candidate directly diagnoses the bug, explains its cause, and provides both an explicit code fix and a more Pythonic alternative, fully matching the multi-turn debugging and clarity requirements."},{"text":"Explains why this causes last item to be dropped","passed":true},{"text":"Proposes two clear fixes: using range(len(items)) or items[:]","passed":true},{"text":"Summarizes problem and solution concisely","passed":true}]} +{"timestamp":"2026-02-20T22:13:08.199Z","test_id":"shorthand-array-syntax","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate provides a friendly greeting and offers assistance, closely matching the reference and enhancing warmth with an emoji."},{"text":"Asks how they can help","passed":true},{"text":"Adds a friendly emoji for warmth","passed":true}]} +{"timestamp":"2026-02-20T22:13:08.585Z","test_id":"shorthand-structured-output","dataset":"basic-jsonl","score":0.1,"target":"default","assertions":[{"text":"Requests additional transaction details","passed":true,"evidence":"The answer does not provide a risk assessment or structured output as required; it only requests more information and lacks any evaluation of transaction risk."},{"text":"States inability to access real-time data","passed":true},{"text":"Does not provide structured risk assessment","passed":false},{"text":"Fails to assign risk level or confidence","passed":false}]} diff --git a/examples/features/basic/evals/dataset.eval.baseline.jsonl b/examples/features/basic/evals/dataset.eval.baseline.jsonl index 649c078d4..b163b481a 100644 --- a/examples/features/basic/evals/dataset.eval.baseline.jsonl +++ b/examples/features/basic/evals/dataset.eval.baseline.jsonl @@ -1,7 +1,7 @@ -{"timestamp":"2026-02-20T21:38:05.833Z","test_id":"code-review-javascript","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Identifies the loop condition bug","passed":true,"evidence":"The answer analyzes the code accurately, fixes the bug, suggests alternatives, and fulfills the explicit instruction marker requirement without omitting any major points."},{"text":"Provides corrected loop code","passed":true},{"text":"Mentions using array methods such as reduce","passed":true},{"text":"Explicitly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT","passed":true}]} -{"timestamp":"2026-02-20T21:38:05.945Z","test_id":"code-gen-python-comprehensive","eval_set":"dataset","conversation_id":"python-code-generation","score":0.95,"target":"default","scores":[{"name":"keyword_check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Raises exceptions","passed":true,"evidence":"Passed 3/3 checks. Score: 1.00"},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true}]},{"name":"code_correctness","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true,"evidence":"The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. Otherwise, it matches specifications and best practices closely."},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]}],"assertions":[{"text":"Raises exceptions","passed":true,"evidence":"keyword_check: Passed 3/3 checks. Score: 1.00 | code_correctness: The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. Otherwise, it matches specifications and best practices closely."},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true},{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]} -{"timestamp":"2026-02-20T21:38:07.369Z","test_id":"shorthand-string-example","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Correct calculation","passed":true,"evidence":"The candidate answers the math question accurately and presents the calculation clearly, matching the intent of the reference."},{"text":"Final answer clearly provided","passed":true}]} -{"timestamp":"2026-02-20T21:38:08.333Z","test_id":"feature-proposal-brainstorm","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"AI-powered adaptive micro-workouts that integrate with user schedules and adapt to location, energy, and equipment","passed":true,"evidence":"The answer provides five distinct and creative features, each targeting a specific pain point for busy professionals, offering clear value, and utilizing plausible technology. All features are differentiated, innovative, and avoid generic solutions."},{"text":"Distinct accountability pods feature for small peer groups with tailored reminders and progress sharing","passed":true},{"text":"Voice-controlled, hands-free workout guidance compatible with smart assistants for multitasking users","passed":true},{"text":"Stress and mood detection via wearable integration to provide well-being-optimized workout recommendations","passed":true}]} -{"timestamp":"2026-02-20T21:38:09.256Z","test_id":"shorthand-structured-output","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Structured as valid JSON as required","passed":true,"evidence":"The candidate answer fully meets the criteria and closely aligns with the reference, providing comprehensive and accurate risk assessment in proper format."},{"text":"Risk level correctly assessed as Low","passed":true},{"text":"Confidence provided and reasonable","passed":true},{"text":"Reasoning covers amount, merchant, user history, and velocity","passed":true}]} -{"timestamp":"2026-02-20T21:38:09.507Z","test_id":"shorthand-array-syntax","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Warm greeting","passed":true,"evidence":"The candidate answer exactly matches the reference answer and fully meets the greeting criteria."},{"text":"Offers assistance","passed":true}]} -{"timestamp":"2026-02-20T21:38:10.235Z","test_id":"coding-multiturn-debug-session","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Asks for clarification and example input/output","passed":true,"evidence":"The candidate_answer follows a multi-turn debugging session, accurately diagnoses the bug, explains the root cause, proposes clear and correct fixes, and matches the thoroughness and clarity of the reference answer."},{"text":"Diagnoses off-by-one error, explains range() behavior","passed":true},{"text":"Proposes and explains multiple fixes, including direct iteration","passed":true},{"text":"Offers a Pythonic alternative and concise summary","passed":true}]} +{"timestamp":"2026-02-20T21:38:05.833Z","test_id":"code-review-javascript","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Identifies the loop condition bug","passed":true,"evidence":"The answer analyzes the code accurately, fixes the bug, suggests alternatives, and fulfills the explicit instruction marker requirement without omitting any major points."},{"text":"Provides corrected loop code","passed":true},{"text":"Mentions using array methods such as reduce","passed":true},{"text":"Explicitly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT","passed":true}]} +{"timestamp":"2026-02-20T21:38:05.945Z","test_id":"code-gen-python-comprehensive","dataset":"dataset","conversation_id":"python-code-generation","score":0.95,"target":"default","scores":[{"name":"keyword_check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Raises exceptions","passed":true,"evidence":"Passed 3/3 checks. Score: 1.00"},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true}]},{"name":"code_correctness","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true,"evidence":"The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. Otherwise, it matches specifications and best practices closely."},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]}],"assertions":[{"text":"Raises exceptions","passed":true,"evidence":"keyword_check: Passed 3/3 checks. Score: 1.00 | code_correctness: The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. Otherwise, it matches specifications and best practices closely."},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true},{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]} +{"timestamp":"2026-02-20T21:38:07.369Z","test_id":"shorthand-string-example","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Correct calculation","passed":true,"evidence":"The candidate answers the math question accurately and presents the calculation clearly, matching the intent of the reference."},{"text":"Final answer clearly provided","passed":true}]} +{"timestamp":"2026-02-20T21:38:08.333Z","test_id":"feature-proposal-brainstorm","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"AI-powered adaptive micro-workouts that integrate with user schedules and adapt to location, energy, and equipment","passed":true,"evidence":"The answer provides five distinct and creative features, each targeting a specific pain point for busy professionals, offering clear value, and utilizing plausible technology. All features are differentiated, innovative, and avoid generic solutions."},{"text":"Distinct accountability pods feature for small peer groups with tailored reminders and progress sharing","passed":true},{"text":"Voice-controlled, hands-free workout guidance compatible with smart assistants for multitasking users","passed":true},{"text":"Stress and mood detection via wearable integration to provide well-being-optimized workout recommendations","passed":true}]} +{"timestamp":"2026-02-20T21:38:09.256Z","test_id":"shorthand-structured-output","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Structured as valid JSON as required","passed":true,"evidence":"The candidate answer fully meets the criteria and closely aligns with the reference, providing comprehensive and accurate risk assessment in proper format."},{"text":"Risk level correctly assessed as Low","passed":true},{"text":"Confidence provided and reasonable","passed":true},{"text":"Reasoning covers amount, merchant, user history, and velocity","passed":true}]} +{"timestamp":"2026-02-20T21:38:09.507Z","test_id":"shorthand-array-syntax","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Warm greeting","passed":true,"evidence":"The candidate answer exactly matches the reference answer and fully meets the greeting criteria."},{"text":"Offers assistance","passed":true}]} +{"timestamp":"2026-02-20T21:38:10.235Z","test_id":"coding-multiturn-debug-session","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Asks for clarification and example input/output","passed":true,"evidence":"The candidate_answer follows a multi-turn debugging session, accurately diagnoses the bug, explains the root cause, proposes clear and correct fixes, and matches the thoroughness and clarity of the reference answer."},{"text":"Diagnoses off-by-one error, explains range() behavior","passed":true},{"text":"Proposes and explains multiple fixes, including direct iteration","passed":true},{"text":"Offers a Pythonic alternative and concise summary","passed":true}]} diff --git a/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl b/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl index 389a0143f..4e96f1388 100644 --- a/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl +++ b/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl @@ -1,4 +1,4 @@ -{"timestamp":"2026-02-21T04:00:35.967Z","test_id":"aml-001","eval_set":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]} -{"timestamp":"2026-02-21T04:00:36.039Z","test_id":"aml-002","eval_set":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]} -{"timestamp":"2026-02-21T04:00:36.110Z","test_id":"aml-003","eval_set":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]} -{"timestamp":"2026-02-21T04:00:36.181Z","test_id":"aml-004-not-exist","eval_set":"dataset.eval","score":0,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]}],"error":"Batch output missing id 'aml-004-not-exist'","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false},{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]} +{"timestamp":"2026-02-21T04:00:35.967Z","test_id":"aml-001","dataset":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-21T04:00:36.039Z","test_id":"aml-002","dataset":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-21T04:00:36.110Z","test_id":"aml-003","dataset":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-21T04:00:36.181Z","test_id":"aml-004-not-exist","dataset":"dataset.eval","score":0,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]}],"error":"Batch output missing id 'aml-004-not-exist'","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false},{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]} diff --git a/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl b/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl index 42ac0aec4..24b680a7c 100644 --- a/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl +++ b/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl @@ -1 +1 @@ -{"timestamp":"2026-02-20T21:38:11.981Z","test_id":"code-grader-sdk-attachments","eval_set":"dataset","score":1,"target":"local_cli","scores":[{"name":"attachment-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]}],"assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"attachment-check: Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]} +{"timestamp":"2026-02-20T21:38:11.981Z","test_id":"code-grader-sdk-attachments","dataset":"dataset","score":1,"target":"local_cli","scores":[{"name":"attachment-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]}],"assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"attachment-check: Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]} diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl index 41d98bacd..8802f2944 100644 --- a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl +++ b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-20T21:38:20.811Z","test_id":"perfect-ranking","eval_set":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]} -{"timestamp":"2026-02-20T21:38:21.672Z","test_id":"relevant-node-last","eval_set":"dataset-contextual-precision","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]}],"assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"contextual_precision: 1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]} -{"timestamp":"2026-02-20T21:38:29.448Z","test_id":"mixed-ranking","eval_set":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]} +{"timestamp":"2026-02-20T21:38:20.811Z","test_id":"perfect-ranking","dataset":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]} +{"timestamp":"2026-02-20T21:38:21.672Z","test_id":"relevant-node-last","dataset":"dataset-contextual-precision","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]}],"assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"contextual_precision: 1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]} +{"timestamp":"2026-02-20T21:38:29.448Z","test_id":"mixed-ranking","dataset":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]} diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl index 7a6677d73..d59d74bc9 100644 --- a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl +++ b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-20T21:38:37.566Z","test_id":"perfect-recall","eval_set":"dataset-contextual-recall","score":1,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]}],"assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"contextual_recall: Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]} -{"timestamp":"2026-02-20T21:38:40.606Z","test_id":"zero-recall","eval_set":"dataset-contextual-recall","score":0,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]}],"assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"contextual_recall: 0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]} -{"timestamp":"2026-02-20T21:38:42.779Z","test_id":"partial-recall","eval_set":"dataset-contextual-recall","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"1/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]}],"assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"contextual_recall: 1/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]} +{"timestamp":"2026-02-20T21:38:37.566Z","test_id":"perfect-recall","dataset":"dataset-contextual-recall","score":1,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]}],"assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"contextual_recall: Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]} +{"timestamp":"2026-02-20T21:38:40.606Z","test_id":"zero-recall","dataset":"dataset-contextual-recall","score":0,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]}],"assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"contextual_recall: 0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]} +{"timestamp":"2026-02-20T21:38:42.779Z","test_id":"partial-recall","dataset":"dataset-contextual-recall","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"1/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]}],"assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"contextual_recall: 1/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]} diff --git a/examples/features/composite/evals/dataset.eval.baseline.jsonl b/examples/features/composite/evals/dataset.eval.baseline.jsonl index df32e22e8..6ac9d26cb 100644 --- a/examples/features/composite/evals/dataset.eval.baseline.jsonl +++ b/examples/features/composite/evals/dataset.eval.baseline.jsonl @@ -1,4 +1,4 @@ -{"timestamp":"2026-02-20T21:38:46.160Z","test_id":"weighted-average-example","eval_set":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"release_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Defines machine learning as a field of AI learning from data","passed":true,"evidence":"The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity."},{"text":"Mentions improvement without explicit programming","passed":true},{"text":"Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":0.7,"verdict":"pass","assertions":[{"text":"Defines machine learning within AI context","passed":true,"evidence":"The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"Explains learning from data and performance improvement","passed":true},{"text":"Describes core types of machine learning methods","passed":true},{"text":"Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"release_gate: safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]} -{"timestamp":"2026-02-20T21:38:47.004Z","test_id":"code-grader-safety-gate","eval_set":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"safety_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains the difference between bits and qubits","passed":true,"evidence":"The candidate's answer is accurate, clearly explains key concepts like qubits, superposition, and entanglement in simple language, gives relevant examples, and makes no safety or accuracy errors."},{"text":"Defines superposition and entanglement in accessible terms","passed":true},{"text":"Mentions quantum computers' strengths in specific areas","passed":true},{"text":"Clarifies that classical computers are still superior for some tasks","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true,"evidence":"The answer is accurate, easy for non-experts, covers superposition and entanglement, gives relevant examples, and notes limitations, fully meeting the criteria."},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true},{"text":"Provides concrete examples (code-breaking, molecule simulation)","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"Safety passed (1.00), score based on quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"safety_gate: Safety passed (1.00), score based on quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]} -{"timestamp":"2026-02-20T21:38:48.052Z","test_id":"llm-grader-conflict-resolution","eval_set":"composite-evaluator-examples","score":0.9,"target":"default","scores":[{"name":"final_decision","type":"composite","score":0.9,"weight":1,"verdict":"pass","scores":[{"name":"conciseness","type":"llm-grader","score":0.95,"verdict":"pass","assertions":[{"text":"Concise description of the vacuum's purpose","passed":true,"evidence":"The candidate offers a comprehensive yet succinct product description, detailing key features and convenience factors but omits battery-related information similar to the reference's focus on endurance."},{"text":"Highlights advanced sensors and navigation","passed":true},{"text":"Mentions multi-surface effectiveness","passed":true},{"text":"Notes app-controlled scheduling and convenience","passed":true},{"text":"Battery life or runtime not specified","passed":false}]},{"name":"detail","type":"llm-grader","score":0.9,"verdict":"pass","assertions":[{"text":"Highlights compact, powerful design","passed":true,"evidence":"The answer is concise yet detailed, covering key features and benefits, but lacks mention of battery life which would enhance its comprehensiveness."},{"text":"Mentions advanced sensors and navigation","passed":true},{"text":"Notes features like quiet motor and app-controlled scheduling","passed":true},{"text":"Describes effectiveness on dust, pet hair, and multiple surfaces","passed":true},{"text":"Could mention battery life or specific runtime for added comprehensiveness","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"final_decision: The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]} -{"timestamp":"2026-02-20T21:38:49.625Z","test_id":"nested-composite","eval_set":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"comprehensive_evaluation","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"content_quality","type":"composite","score":1,"weight":0.7,"verdict":"pass","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":0.6,"verdict":"pass","assertions":[{"text":"Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria."},{"text":"Provides examples for both supervised and unsupervised learning","passed":true},{"text":"Explains the goals for each approach","passed":true},{"text":"Summarizes the difference concisely at the end","passed":true}]},{"name":"clarity","type":"llm-grader","score":1,"weight":0.4,"verdict":"pass","assertions":[{"text":"Accurately defines supervised and unsupervised learning","passed":true,"evidence":"The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"Explains the purpose/goals of each approach","passed":true},{"text":"Summarizes the key distinction concisely","passed":true}]}],"assertions":[{"text":"[accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"[accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[accuracy] Explains the goals for each approach","passed":true},{"text":"[accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[clarity] Summarizes the key distinction concisely","passed":true}]},{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Clearly defines supervised and unsupervised learning","passed":true,"evidence":"The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"Provides examples for both types","passed":true},{"text":"Explains the goals of each approach","passed":true},{"text":"Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"comprehensive_evaluation: content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]} +{"timestamp":"2026-02-20T21:38:46.160Z","test_id":"weighted-average-example","dataset":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"release_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Defines machine learning as a field of AI learning from data","passed":true,"evidence":"The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity."},{"text":"Mentions improvement without explicit programming","passed":true},{"text":"Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":0.7,"verdict":"pass","assertions":[{"text":"Defines machine learning within AI context","passed":true,"evidence":"The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"Explains learning from data and performance improvement","passed":true},{"text":"Describes core types of machine learning methods","passed":true},{"text":"Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"release_gate: safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]} +{"timestamp":"2026-02-20T21:38:47.004Z","test_id":"code-grader-safety-gate","dataset":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"safety_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains the difference between bits and qubits","passed":true,"evidence":"The candidate's answer is accurate, clearly explains key concepts like qubits, superposition, and entanglement in simple language, gives relevant examples, and makes no safety or accuracy errors."},{"text":"Defines superposition and entanglement in accessible terms","passed":true},{"text":"Mentions quantum computers' strengths in specific areas","passed":true},{"text":"Clarifies that classical computers are still superior for some tasks","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true,"evidence":"The answer is accurate, easy for non-experts, covers superposition and entanglement, gives relevant examples, and notes limitations, fully meeting the criteria."},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true},{"text":"Provides concrete examples (code-breaking, molecule simulation)","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"Safety passed (1.00), score based on quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"safety_gate: Safety passed (1.00), score based on quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]} +{"timestamp":"2026-02-20T21:38:48.052Z","test_id":"llm-grader-conflict-resolution","dataset":"composite-evaluator-examples","score":0.9,"target":"default","scores":[{"name":"final_decision","type":"composite","score":0.9,"weight":1,"verdict":"pass","scores":[{"name":"conciseness","type":"llm-grader","score":0.95,"verdict":"pass","assertions":[{"text":"Concise description of the vacuum's purpose","passed":true,"evidence":"The candidate offers a comprehensive yet succinct product description, detailing key features and convenience factors but omits battery-related information similar to the reference's focus on endurance."},{"text":"Highlights advanced sensors and navigation","passed":true},{"text":"Mentions multi-surface effectiveness","passed":true},{"text":"Notes app-controlled scheduling and convenience","passed":true},{"text":"Battery life or runtime not specified","passed":false}]},{"name":"detail","type":"llm-grader","score":0.9,"verdict":"pass","assertions":[{"text":"Highlights compact, powerful design","passed":true,"evidence":"The answer is concise yet detailed, covering key features and benefits, but lacks mention of battery life which would enhance its comprehensiveness."},{"text":"Mentions advanced sensors and navigation","passed":true},{"text":"Notes features like quiet motor and app-controlled scheduling","passed":true},{"text":"Describes effectiveness on dust, pet hair, and multiple surfaces","passed":true},{"text":"Could mention battery life or specific runtime for added comprehensiveness","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"final_decision: The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]} +{"timestamp":"2026-02-20T21:38:49.625Z","test_id":"nested-composite","dataset":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"comprehensive_evaluation","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"content_quality","type":"composite","score":1,"weight":0.7,"verdict":"pass","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":0.6,"verdict":"pass","assertions":[{"text":"Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria."},{"text":"Provides examples for both supervised and unsupervised learning","passed":true},{"text":"Explains the goals for each approach","passed":true},{"text":"Summarizes the difference concisely at the end","passed":true}]},{"name":"clarity","type":"llm-grader","score":1,"weight":0.4,"verdict":"pass","assertions":[{"text":"Accurately defines supervised and unsupervised learning","passed":true,"evidence":"The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"Explains the purpose/goals of each approach","passed":true},{"text":"Summarizes the key distinction concisely","passed":true}]}],"assertions":[{"text":"[accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"[accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[accuracy] Explains the goals for each approach","passed":true},{"text":"[accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[clarity] Summarizes the key distinction concisely","passed":true}]},{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Clearly defines supervised and unsupervised learning","passed":true,"evidence":"The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"Provides examples for both types","passed":true},{"text":"Explains the goals of each approach","passed":true},{"text":"Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"comprehensive_evaluation: content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]} diff --git a/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl b/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl index ed7f0a348..888af1fc3 100644 --- a/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl +++ b/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-20T21:38:52.115Z","test_id":"greeting","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used","passed":true,"evidence":"The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]}],"assertions":[{"text":"Polite language used","passed":true,"evidence":"tone_check: The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]} -{"timestamp":"2026-02-20T21:38:52.862Z","test_id":"skip-defaults","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"urgency_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]}],"assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"urgency_check: The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]} -{"timestamp":"2026-02-20T21:38:54.351Z","test_id":"with-custom-eval","eval_set":"dataset","score":0.9,"target":"default","scores":[{"name":"helpfulness","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]},{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used throughout","passed":true,"evidence":"The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. No issues with rudeness or unprofessionalism are observed."},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true}]}],"assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"helpfulness: The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process. | tone_check: The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. No issues with rudeness or unprofessionalism are observed."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Polite language used throughout","passed":true},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]} +{"timestamp":"2026-02-20T21:38:52.115Z","test_id":"greeting","dataset":"dataset","score":1,"target":"default","scores":[{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used","passed":true,"evidence":"The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]}],"assertions":[{"text":"Polite language used","passed":true,"evidence":"tone_check: The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]} +{"timestamp":"2026-02-20T21:38:52.862Z","test_id":"skip-defaults","dataset":"dataset","score":1,"target":"default","scores":[{"name":"urgency_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]}],"assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"urgency_check: The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]} +{"timestamp":"2026-02-20T21:38:54.351Z","test_id":"with-custom-eval","dataset":"dataset","score":0.9,"target":"default","scores":[{"name":"helpfulness","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]},{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used throughout","passed":true,"evidence":"The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. No issues with rudeness or unprofessionalism are observed."},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true}]}],"assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"helpfulness: The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process. | tone_check: The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. No issues with rudeness or unprofessionalism are observed."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Polite language used throughout","passed":true},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]} diff --git a/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl b/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl index e67a7ab98..4707efc95 100644 --- a/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl +++ b/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl @@ -1,7 +1,7 @@ -{"timestamp":"2026-02-20T21:38:55.767Z","test_id":"regex-email","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true}]}],"assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true,"evidence":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-: Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/"}]} -{"timestamp":"2026-02-20T21:38:55.771Z","test_id":"contains-basic","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]} -{"timestamp":"2026-02-20T21:38:55.838Z","test_id":"equals-exact","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]} -{"timestamp":"2026-02-20T21:38:56.293Z","test_id":"starts-with-prefix","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"regex-^Dear User","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /^Dear User/","passed":true}]}],"assertions":[{"text":"Output matches pattern /^Dear User/","passed":true,"evidence":"regex-^Dear User: Output matches pattern /^Dear User/"}]} -{"timestamp":"2026-02-20T21:38:56.319Z","test_id":"is-json-valid","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]} -{"timestamp":"2026-02-20T21:38:56.720Z","test_id":"required-gate","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"message\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"message\"\"","passed":true}]},{"name":"contains-success","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"success\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"message\": Output contains \"\"message\"\" | contains-success: Output contains \"success\""},{"text":"Output contains \"\"message\"\"","passed":true},{"text":"Output contains \"success\"","passed":true}]} -{"timestamp":"2026-02-20T21:38:56.977Z","test_id":"multi-assertion","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"result\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"result\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"result\": Output contains \"\"result\"\""},{"text":"Output contains \"\"result\"\"","passed":true}]} +{"timestamp":"2026-02-20T21:38:55.767Z","test_id":"regex-email","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true}]}],"assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true,"evidence":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-: Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/"}]} +{"timestamp":"2026-02-20T21:38:55.771Z","test_id":"contains-basic","dataset":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]} +{"timestamp":"2026-02-20T21:38:55.838Z","test_id":"equals-exact","dataset":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]} +{"timestamp":"2026-02-20T21:38:56.293Z","test_id":"starts-with-prefix","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-^Dear User","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /^Dear User/","passed":true}]}],"assertions":[{"text":"Output matches pattern /^Dear User/","passed":true,"evidence":"regex-^Dear User: Output matches pattern /^Dear User/"}]} +{"timestamp":"2026-02-20T21:38:56.319Z","test_id":"is-json-valid","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]} +{"timestamp":"2026-02-20T21:38:56.720Z","test_id":"required-gate","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"message\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"message\"\"","passed":true}]},{"name":"contains-success","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"success\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"message\": Output contains \"\"message\"\" | contains-success: Output contains \"success\""},{"text":"Output contains \"\"message\"\"","passed":true},{"text":"Output contains \"success\"","passed":true}]} +{"timestamp":"2026-02-20T21:38:56.977Z","test_id":"multi-assertion","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"result\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"result\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"result\": Output contains \"\"result\"\""},{"text":"Output contains \"\"result\"\"","passed":true}]} diff --git a/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl b/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl index 1e7a18492..1b85d7ef1 100644 --- a/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl +++ b/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:38:57.573Z","test_id":"metrics-001","eval_set":"dataset-confusion-metrics","score":1,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":6,"total_tn":0,"total_fp":0,"total_fn":0,"macro_precision":1,"macro_recall":1,"macro_f1":1}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]} -{"timestamp":"2026-02-20T21:38:57.582Z","test_id":"metrics-003","eval_set":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"borderline","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0}},"summary":{"total_tp":4,"total_tn":0,"total_fp":2,"total_fn":2,"macro_precision":0.6666666666666666,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]} -{"timestamp":"2026-02-20T21:38:57.588Z","test_id":"metrics-002","eval_set":"dataset-confusion-metrics","score":0.8333333333333334,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.8333333333333334,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":5,"total_tn":0,"total_fp":1,"total_fn":1,"macro_precision":0.8333333333333334,"macro_recall":0.8333333333333334,"macro_f1":0.8333333333333334}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} -{"timestamp":"2026-02-20T21:38:57.641Z","test_id":"metrics-004","eval_set":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"borderline","details":{"metrics":{"invoice_number":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":4,"total_tn":0,"total_fp":1,"total_fn":2,"macro_precision":0.8,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} -{"timestamp":"2026-02-20T21:38:57.649Z","test_id":"metrics-005","eval_set":"dataset-confusion-metrics","score":0.5,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.5,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0}},"summary":{"total_tp":3,"total_tn":0,"total_fp":2,"total_fn":3,"macro_precision":0.6,"macro_recall":0.5,"macro_f1":0.5}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]} +{"timestamp":"2026-02-20T21:38:57.573Z","test_id":"metrics-001","dataset":"dataset-confusion-metrics","score":1,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":6,"total_tn":0,"total_fp":0,"total_fn":0,"macro_precision":1,"macro_recall":1,"macro_f1":1}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]} +{"timestamp":"2026-02-20T21:38:57.582Z","test_id":"metrics-003","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"borderline","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0}},"summary":{"total_tp":4,"total_tn":0,"total_fp":2,"total_fn":2,"macro_precision":0.6666666666666666,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]} +{"timestamp":"2026-02-20T21:38:57.588Z","test_id":"metrics-002","dataset":"dataset-confusion-metrics","score":0.8333333333333334,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.8333333333333334,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":5,"total_tn":0,"total_fp":1,"total_fn":1,"macro_precision":0.8333333333333334,"macro_recall":0.8333333333333334,"macro_f1":0.8333333333333334}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} +{"timestamp":"2026-02-20T21:38:57.641Z","test_id":"metrics-004","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"borderline","details":{"metrics":{"invoice_number":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":4,"total_tn":0,"total_fp":1,"total_fn":2,"macro_precision":0.8,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} +{"timestamp":"2026-02-20T21:38:57.649Z","test_id":"metrics-005","dataset":"dataset-confusion-metrics","score":0.5,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.5,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0}},"summary":{"total_tp":3,"total_tn":0,"total_fp":2,"total_fn":3,"macro_precision":0.6,"macro_recall":0.5,"macro_f1":0.5}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]} diff --git a/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl b/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl index 3dd9321ed..5fe28c677 100644 --- a/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl +++ b/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl @@ -1,6 +1,6 @@ -{"timestamp":"2026-02-20T21:38:58.215Z","test_id":"invoice-001","eval_set":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]} -{"timestamp":"2026-02-20T21:38:58.237Z","test_id":"invoice-003","eval_set":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]} -{"timestamp":"2026-02-20T21:38:58.265Z","test_id":"invoice-002","eval_set":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.9743589743589745,"target":"mock_extractor","scores":[{"name":"party_names_fuzzy","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0%"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true}]},{"name":"other_fields","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"4/4 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.9230769230769231,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"8/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]}],"assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"party_names_fuzzy: supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0% | other_fields: 4/4 fields matched | invoice_field_accuracy: 8/9 fields matched"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]} -{"timestamp":"2026-02-20T21:38:58.272Z","test_id":"invoice-004","eval_set":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.8461538461538461,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.8461538461538461,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_date","passed":true,"evidence":"8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]}],"assertions":[{"text":"invoice_date","passed":true,"evidence":"invoice_field_accuracy: 8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]} -{"timestamp":"2026-02-20T21:38:58.276Z","test_id":"invoice-005","eval_set":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_check","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"line_items[0].description","passed":true,"evidence":"4/4 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"line_items[0].description","passed":true,"evidence":"line_items_check: 4/4 fields matched | invoice_field_accuracy: 9/9 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]} -{"timestamp":"2026-02-20T21:38:58.335Z","test_id":"invoice-006","eval_set":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_matched","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"alignment":[{"expected_idx":0,"parsed_idx":1,"similarity":1},{"expected_idx":1,"parsed_idx":0,"similarity":1}],"metrics":{"description":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"quantity":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"line_total":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"unmatched_expected":[],"unmatched_parsed":[],"summary":{"matched_count":2,"expected_count":2,"parsed_count":2,"macro_f1":1}},"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"Matched 2/2 expected items, macro-F1=1.000"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"line_items_matched: Matched 2/2 expected items, macro-F1=1.000 | invoice_field_accuracy: 9/9 fields matched"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.215Z","test_id":"invoice-001","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.237Z","test_id":"invoice-003","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.265Z","test_id":"invoice-002","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.9743589743589745,"target":"mock_extractor","scores":[{"name":"party_names_fuzzy","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0%"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true}]},{"name":"other_fields","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"4/4 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.9230769230769231,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"8/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]}],"assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"party_names_fuzzy: supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0% | other_fields: 4/4 fields matched | invoice_field_accuracy: 8/9 fields matched"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]} +{"timestamp":"2026-02-20T21:38:58.272Z","test_id":"invoice-004","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.8461538461538461,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.8461538461538461,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_date","passed":true,"evidence":"8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]}],"assertions":[{"text":"invoice_date","passed":true,"evidence":"invoice_field_accuracy: 8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]} +{"timestamp":"2026-02-20T21:38:58.276Z","test_id":"invoice-005","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_check","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"line_items[0].description","passed":true,"evidence":"4/4 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"line_items[0].description","passed":true,"evidence":"line_items_check: 4/4 fields matched | invoice_field_accuracy: 9/9 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.335Z","test_id":"invoice-006","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_matched","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"alignment":[{"expected_idx":0,"parsed_idx":1,"similarity":1},{"expected_idx":1,"parsed_idx":0,"similarity":1}],"metrics":{"description":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"quantity":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"line_total":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"unmatched_expected":[],"unmatched_parsed":[],"summary":{"matched_count":2,"expected_count":2,"parsed_count":2,"macro_f1":1}},"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"Matched 2/2 expected items, macro-F1=1.000"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"line_items_matched: Matched 2/2 expected items, macro-F1=1.000 | invoice_field_accuracy: 9/9 fields matched"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]} diff --git a/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl b/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl index bb4907bee..204c88e32 100644 --- a/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl +++ b/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl @@ -1,6 +1,6 @@ -{"timestamp":"2026-02-20T21:38:58.923Z","test_id":"simple-thresholds-pass","eval_set":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"efficiency-check: execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]} -{"timestamp":"2026-02-20T21:38:58.931Z","test_id":"research-with-metrics","eval_set":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true}]},{"name":"metrics-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 2 <= 20 max","passed":true,"evidence":"execution_metrics tool_calls=2, tokens=830"},{"text":"Total tokens 830 <= 5000 max","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"metrics-check: execution_metrics tool_calls=2, tokens=830"},{"text":"Tool calls 2 <= 20 max","passed":true},{"text":"Total tokens 830 <= 5000 max","passed":true}]} -{"timestamp":"2026-02-20T21:38:58.939Z","test_id":"comprehensive-thresholds","eval_set":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"full-efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"full-efficiency-check: execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]} -{"timestamp":"2026-02-20T21:38:58.977Z","test_id":"exploration-ratio-check","eval_set":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"exploration-balance","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"execution_metrics exploration_ratio=0.50"}]}],"assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"exploration-balance: execution_metrics exploration_ratio=0.50"}]} -{"timestamp":"2026-02-20T21:38:58.980Z","test_id":"cost-budget-check","eval_set":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"cost-check","type":"execution-metrics","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"execution_metrics cost=$0.0008"}]}],"assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"cost-check: execution_metrics cost=$0.0008"}]} -{"timestamp":"2026-02-20T21:38:59.046Z","test_id":"hybrid-evaluation","eval_set":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"metric-thresholds","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, duration=890ms"},{"text":"Duration 890ms <= 5000ms max","passed":true}]},{"name":"custom-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"tokenUsage present: 85/42","passed":true,"evidence":"Checked 3 metric fields: 3 present, 0 missing"},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"metric-thresholds: execution_metrics tool_calls=0, duration=890ms | custom-check: Checked 3 metric fields: 3 present, 0 missing"},{"text":"Duration 890ms <= 5000ms max","passed":true},{"text":"tokenUsage present: 85/42","passed":true},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.923Z","test_id":"simple-thresholds-pass","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"efficiency-check: execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.931Z","test_id":"research-with-metrics","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true}]},{"name":"metrics-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 2 <= 20 max","passed":true,"evidence":"execution_metrics tool_calls=2, tokens=830"},{"text":"Total tokens 830 <= 5000 max","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"metrics-check: execution_metrics tool_calls=2, tokens=830"},{"text":"Tool calls 2 <= 20 max","passed":true},{"text":"Total tokens 830 <= 5000 max","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.939Z","test_id":"comprehensive-thresholds","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"full-efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"full-efficiency-check: execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]} +{"timestamp":"2026-02-20T21:38:58.977Z","test_id":"exploration-ratio-check","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"exploration-balance","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"execution_metrics exploration_ratio=0.50"}]}],"assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"exploration-balance: execution_metrics exploration_ratio=0.50"}]} +{"timestamp":"2026-02-20T21:38:58.980Z","test_id":"cost-budget-check","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"cost-check","type":"execution-metrics","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"execution_metrics cost=$0.0008"}]}],"assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"cost-check: execution_metrics cost=$0.0008"}]} +{"timestamp":"2026-02-20T21:38:59.046Z","test_id":"hybrid-evaluation","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"metric-thresholds","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, duration=890ms"},{"text":"Duration 890ms <= 5000ms max","passed":true}]},{"name":"custom-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"tokenUsage present: 85/42","passed":true,"evidence":"Checked 3 metric fields: 3 present, 0 missing"},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"metric-thresholds: execution_metrics tool_calls=0, duration=890ms | custom-check: Checked 3 metric fields: 3 present, 0 missing"},{"text":"Duration 890ms <= 5000ms max","passed":true},{"text":"tokenUsage present: 85/42","passed":true},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]} diff --git a/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl b/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl index 2a20d4ca2..45cf29dff 100644 --- a/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl +++ b/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:39:01.382Z","test_id":"accuracy-capital","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly identified Paris as the capital","passed":true,"evidence":"The answer accurately and concisely provides the correct capital city, fully meeting the task criteria."},{"text":"Direct and unambiguous response","passed":true}]} -{"timestamp":"2026-02-20T21:39:01.392Z","test_id":"accuracy-basic-math","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly adds 15 and 27","passed":true,"evidence":"The answer is mathematically correct, clearly formatted, and directly addresses the question."},{"text":"Gives the correct sum (42)","passed":true},{"text":"Clearly formats the answer with an equation","passed":true},{"text":"Directly answers the math question","passed":true}]} -{"timestamp":"2026-02-20T21:39:01.805Z","test_id":"inline-test","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Polite greeting","passed":true,"evidence":"The candidate provides a polite greeting, uses a friendly tone, and offers help, fully meeting the criteria for a courteous response."},{"text":"Friendly tone","passed":true},{"text":"Offers assistance","passed":true}]} -{"timestamp":"2026-02-20T21:39:03.498Z","test_id":"regression-farewell","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Farewell expressed","passed":true,"evidence":"The candidate responds with a clear farewell, adds a friendly message, and offers further help, fully satisfying the criteria."},{"text":"Friendly closing","passed":true},{"text":"Offer of future assistance","passed":true},{"text":"Well-wishing included","passed":true}]} -{"timestamp":"2026-02-20T21:39:03.681Z","test_id":"regression-greeting","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate answer offers a polite greeting as requested and invites further interaction, fully meeting the task criteria."},{"text":"Engages user proactively","passed":true}]} +{"timestamp":"2026-02-20T21:39:01.382Z","test_id":"accuracy-capital","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly identified Paris as the capital","passed":true,"evidence":"The answer accurately and concisely provides the correct capital city, fully meeting the task criteria."},{"text":"Direct and unambiguous response","passed":true}]} +{"timestamp":"2026-02-20T21:39:01.392Z","test_id":"accuracy-basic-math","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly adds 15 and 27","passed":true,"evidence":"The answer is mathematically correct, clearly formatted, and directly addresses the question."},{"text":"Gives the correct sum (42)","passed":true},{"text":"Clearly formats the answer with an equation","passed":true},{"text":"Directly answers the math question","passed":true}]} +{"timestamp":"2026-02-20T21:39:01.805Z","test_id":"inline-test","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Polite greeting","passed":true,"evidence":"The candidate provides a polite greeting, uses a friendly tone, and offers help, fully meeting the criteria for a courteous response."},{"text":"Friendly tone","passed":true},{"text":"Offers assistance","passed":true}]} +{"timestamp":"2026-02-20T21:39:03.498Z","test_id":"regression-farewell","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Farewell expressed","passed":true,"evidence":"The candidate responds with a clear farewell, adds a friendly message, and offers further help, fully satisfying the criteria."},{"text":"Friendly closing","passed":true},{"text":"Offer of future assistance","passed":true},{"text":"Well-wishing included","passed":true}]} +{"timestamp":"2026-02-20T21:39:03.681Z","test_id":"regression-greeting","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate answer offers a polite greeting as requested and invites further interaction, fully meeting the task criteria."},{"text":"Engages user proactively","passed":true}]} diff --git a/examples/features/file-changes/evals/dataset.eval.baseline.jsonl b/examples/features/file-changes/evals/dataset.eval.baseline.jsonl index 1bc997692..d35be43bc 100644 --- a/examples/features/file-changes/evals/dataset.eval.baseline.jsonl +++ b/examples/features/file-changes/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-02-20T21:39:04.356Z","test_id":"verify-deletes-and-structure","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-deletes-and-structure","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-deletes-and-structure: 6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]} -{"timestamp":"2026-02-20T21:39:04.366Z","test_id":"verify-edits-and-creates","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-edits-and-creates","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-edits-and-creates: 5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]} +{"timestamp":"2026-02-20T21:39:04.356Z","test_id":"verify-deletes-and-structure","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-deletes-and-structure","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-deletes-and-structure: 6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]} +{"timestamp":"2026-02-20T21:39:04.366Z","test_id":"verify-edits-and-creates","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-edits-and-creates","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-edits-and-creates: 5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]} diff --git a/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl b/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl index 0f4c402b8..ca86726aa 100644 --- a/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl +++ b/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl @@ -1 +1 @@ -{"timestamp":"2026-02-20T21:39:08.885Z","test_id":"implement-math-functions","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"functional-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"npm install passed","passed":true,"evidence":"Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]}],"assertions":[{"text":"npm install passed","passed":true,"evidence":"functional-check: Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]} +{"timestamp":"2026-02-20T21:39:08.885Z","test_id":"implement-math-functions","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"functional-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"npm install passed","passed":true,"evidence":"Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]}],"assertions":[{"text":"npm install passed","passed":true,"evidence":"functional-check: Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]} diff --git a/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl b/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl index 267a3298e..f68348f70 100644 --- a/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl +++ b/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:39:09.468Z","test_id":"mixed-latency","eval_set":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"data-pipeline-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]} -{"timestamp":"2026-02-20T21:39:09.476Z","test_id":"latency-pass","eval_set":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"fast-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]} -{"timestamp":"2026-02-20T21:39:09.505Z","test_id":"latency-fail","eval_set":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"slow-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]} -{"timestamp":"2026-02-20T21:39:09.541Z","test_id":"exact-with-latency","eval_set":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"auth-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]} -{"timestamp":"2026-02-20T21:39:09.552Z","test_id":"latency-with-args","eval_set":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"weather-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]} +{"timestamp":"2026-02-20T21:39:09.468Z","test_id":"mixed-latency","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"data-pipeline-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]} +{"timestamp":"2026-02-20T21:39:09.476Z","test_id":"latency-pass","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"fast-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]} +{"timestamp":"2026-02-20T21:39:09.505Z","test_id":"latency-fail","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"slow-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]} +{"timestamp":"2026-02-20T21:39:09.541Z","test_id":"exact-with-latency","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"auth-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]} +{"timestamp":"2026-02-20T21:39:09.552Z","test_id":"latency-with-args","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"weather-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]} diff --git a/examples/features/local-cli/evals/dataset.eval.baseline.jsonl b/examples/features/local-cli/evals/dataset.eval.baseline.jsonl index f2c99956f..0254860af 100644 --- a/examples/features/local-cli/evals/dataset.eval.baseline.jsonl +++ b/examples/features/local-cli/evals/dataset.eval.baseline.jsonl @@ -1 +1 @@ -{"timestamp":"2026-02-20T21:39:11.996Z","test_id":"cli-provider-echo","eval_set":"dataset","score":1,"target":"local_cli","assertions":[{"text":"Echoes the request as in the reference","passed":true,"evidence":"The candidate answer accurately echoes the request, includes the correct number of attachments, and lists both files by name, fully meeting all stated criteria."},{"text":"Mentions number of attachments","passed":true},{"text":"Lists all attachment names present","passed":true},{"text":"Matches the spirit of the prompt","passed":true}]} +{"timestamp":"2026-02-20T21:39:11.996Z","test_id":"cli-provider-echo","dataset":"dataset","score":1,"target":"local_cli","assertions":[{"text":"Echoes the request as in the reference","passed":true,"evidence":"The candidate answer accurately echoes the request, includes the correct number of attachments, and lists both files by name, fully meeting all stated criteria."},{"text":"Mentions number of attachments","passed":true},{"text":"Lists all attachment names present","passed":true},{"text":"Matches the spirit of the prompt","passed":true}]} diff --git a/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl b/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl index bf07f03c3..e9a385092 100644 --- a/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:39:24.152Z","test_id":"general-greeting","eval_set":"dataset","score":1,"target":"copilot","assertions":[{"text":"Contains a greeting","passed":true,"evidence":"The candidate answer provides a direct and polite greeting, fully satisfying the task requirements."},{"text":"Polite tone","passed":true}]} -{"timestamp":"2026-02-20T21:39:32.353Z","test_id":"code-generation","eval_set":"dataset","score":1,"target":"copilot","assertions":[{"text":"Valid Python function provided","passed":true,"evidence":"The candidate provided a valid, efficient Python function with error handling and correct Fibonacci indexing as requested."},{"text":"Iterative O(n) implementation","passed":true},{"text":"Handles input validation and errors","passed":true},{"text":"Returns correct Fibonacci numbers starting with F0=0, F1=1","passed":true}]} -{"timestamp":"2026-02-20T21:40:00.939Z","test_id":"copilot-only-task","eval_set":"dataset","score":1,"target":"copilot","assertions":[{"text":"References GitHub directly by providing a repository link","passed":true,"evidence":"The answer correctly references GitHub, provides a direct link to the issue, and offers options for further customization. It fully satisfies the criteria."},{"text":"States that the GitHub issue has been created","passed":true},{"text":"Offers to update the issue's content per user input","passed":true},{"text":"Includes actionable next steps for customization","passed":true}]} -{"timestamp":"2026-02-20T21:40:01.635Z","test_id":"code-generation","eval_set":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]} -{"timestamp":"2026-02-20T21:40:01.674Z","test_id":"general-greeting","eval_set":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]} +{"timestamp":"2026-02-20T21:39:24.152Z","test_id":"general-greeting","dataset":"dataset","score":1,"target":"copilot","assertions":[{"text":"Contains a greeting","passed":true,"evidence":"The candidate answer provides a direct and polite greeting, fully satisfying the task requirements."},{"text":"Polite tone","passed":true}]} +{"timestamp":"2026-02-20T21:39:32.353Z","test_id":"code-generation","dataset":"dataset","score":1,"target":"copilot","assertions":[{"text":"Valid Python function provided","passed":true,"evidence":"The candidate provided a valid, efficient Python function with error handling and correct Fibonacci indexing as requested."},{"text":"Iterative O(n) implementation","passed":true},{"text":"Handles input validation and errors","passed":true},{"text":"Returns correct Fibonacci numbers starting with F0=0, F1=1","passed":true}]} +{"timestamp":"2026-02-20T21:40:00.939Z","test_id":"copilot-only-task","dataset":"dataset","score":1,"target":"copilot","assertions":[{"text":"References GitHub directly by providing a repository link","passed":true,"evidence":"The answer correctly references GitHub, provides a direct link to the issue, and offers options for further customization. It fully satisfies the criteria."},{"text":"States that the GitHub issue has been created","passed":true},{"text":"Offers to update the issue's content per user input","passed":true},{"text":"Includes actionable next steps for customization","passed":true}]} +{"timestamp":"2026-02-20T21:40:01.635Z","test_id":"code-generation","dataset":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]} +{"timestamp":"2026-02-20T21:40:01.674Z","test_id":"general-greeting","dataset":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]} diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl index 241575c1f..59419bbcb 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","eval_set":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. | role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]} -{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]} +{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","dataset":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. | role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]} +{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","dataset":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]} diff --git a/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl b/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl index 3854a09f9..93bfafe79 100644 --- a/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl +++ b/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:40:36.077Z","test_id":"summarisation-rouge","eval_set":"dataset","score":0.08695652173913043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0.08695652173913043,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0.1,"recall":0.07692307692307693,"f1":0.08695652173913043},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]}],"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]} -{"timestamp":"2026-02-20T21:40:37.834Z","test_id":"translation-bleu","eval_set":"dataset","score":0,"target":"default","scores":[{"name":"bleu-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"bleu":0},"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"BLEU score: 0.000"}]}],"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"bleu-score: BLEU score: 0.000"}]} -{"timestamp":"2026-02-20T21:40:38.149Z","test_id":"paraphrase-similarity","eval_set":"dataset","score":0.09128709291752768,"target":"default","scores":[{"name":"cosine-similarity","type":"code-grader","score":0.09128709291752768,"weight":1,"verdict":"fail","details":{"cosine":0.09128709291752768,"jaccard":0.047619047619047616},"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]}],"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"cosine-similarity: Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]} -{"timestamp":"2026-02-20T21:40:41.852Z","test_id":"multi-metric-evaluation","eval_set":"dataset","score":0.07669616519174043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0,"recall":0,"f1":0},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.000, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]},{"name":"cosine-similarity","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"cosine":0,"jaccard":0},"assertions":[{"text":"Cosine similarity 0.000 < 0.7","passed":false,"evidence":"Cosine=0.000, Jaccard=0.000"},{"text":"Jaccard similarity 0.000 < 0.5","passed":false}]},{"name":"edit-distance","type":"code-grader","score":0.23008849557522126,"weight":1,"verdict":"fail","details":{"distance":87,"max_len":113,"similarity":0.23008849557522126},"assertions":[{"text":"Edit similarity 0.230 < 0.8","passed":false,"evidence":"Levenshtein distance=87, normalised similarity=0.230"}]}],"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.000, ROUGE-2 F1=0.000 | cosine-similarity: Cosine=0.000, Jaccard=0.000 | edit-distance: Levenshtein distance=87, normalised similarity=0.230"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false},{"text":"Cosine similarity 0.000 < 0.7","passed":false},{"text":"Jaccard similarity 0.000 < 0.5","passed":false},{"text":"Edit similarity 0.230 < 0.8","passed":false}]} -{"timestamp":"2026-02-20T21:43:06.938Z","test_id":"extraction-levenshtein","eval_set":"dataset","score":0.15384615384615385,"target":"default","scores":[{"name":"edit-distance","type":"code-grader","score":0.15384615384615385,"weight":1,"verdict":"fail","details":{"distance":55,"max_len":65,"similarity":0.15384615384615385},"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"Levenshtein distance=55, normalised similarity=0.154"}]}],"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"edit-distance: Levenshtein distance=55, normalised similarity=0.154"}]} +{"timestamp":"2026-02-20T21:40:36.077Z","test_id":"summarisation-rouge","dataset":"dataset","score":0.08695652173913043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0.08695652173913043,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0.1,"recall":0.07692307692307693,"f1":0.08695652173913043},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]}],"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]} +{"timestamp":"2026-02-20T21:40:37.834Z","test_id":"translation-bleu","dataset":"dataset","score":0,"target":"default","scores":[{"name":"bleu-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"bleu":0},"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"BLEU score: 0.000"}]}],"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"bleu-score: BLEU score: 0.000"}]} +{"timestamp":"2026-02-20T21:40:38.149Z","test_id":"paraphrase-similarity","dataset":"dataset","score":0.09128709291752768,"target":"default","scores":[{"name":"cosine-similarity","type":"code-grader","score":0.09128709291752768,"weight":1,"verdict":"fail","details":{"cosine":0.09128709291752768,"jaccard":0.047619047619047616},"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]}],"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"cosine-similarity: Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]} +{"timestamp":"2026-02-20T21:40:41.852Z","test_id":"multi-metric-evaluation","dataset":"dataset","score":0.07669616519174043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0,"recall":0,"f1":0},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.000, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]},{"name":"cosine-similarity","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"cosine":0,"jaccard":0},"assertions":[{"text":"Cosine similarity 0.000 < 0.7","passed":false,"evidence":"Cosine=0.000, Jaccard=0.000"},{"text":"Jaccard similarity 0.000 < 0.5","passed":false}]},{"name":"edit-distance","type":"code-grader","score":0.23008849557522126,"weight":1,"verdict":"fail","details":{"distance":87,"max_len":113,"similarity":0.23008849557522126},"assertions":[{"text":"Edit similarity 0.230 < 0.8","passed":false,"evidence":"Levenshtein distance=87, normalised similarity=0.230"}]}],"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.000, ROUGE-2 F1=0.000 | cosine-similarity: Cosine=0.000, Jaccard=0.000 | edit-distance: Levenshtein distance=87, normalised similarity=0.230"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false},{"text":"Cosine similarity 0.000 < 0.7","passed":false},{"text":"Jaccard similarity 0.000 < 0.5","passed":false},{"text":"Edit similarity 0.230 < 0.8","passed":false}]} +{"timestamp":"2026-02-20T21:43:06.938Z","test_id":"extraction-levenshtein","dataset":"dataset","score":0.15384615384615385,"target":"default","scores":[{"name":"edit-distance","type":"code-grader","score":0.15384615384615385,"weight":1,"verdict":"fail","details":{"distance":55,"max_len":65,"similarity":0.15384615384615385},"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"Levenshtein distance=55, normalised similarity=0.154"}]}],"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"edit-distance: Levenshtein distance=55, normalised similarity=0.154"}]} diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl b/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl index 2ec27ae89..f551c9e45 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl +++ b/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-02-20T21:40:06.863Z","test_id":"prompt-template-basic","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"custom-prompt-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]}],"assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"custom-prompt-eval: The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]} -{"timestamp":"2026-02-20T21:40:07.754Z","test_id":"prompt-template-with-config","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"strict-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]}],"assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"strict-eval: The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]} +{"timestamp":"2026-02-20T21:40:06.863Z","test_id":"prompt-template-basic","dataset":"dataset","score":1,"target":"default","scores":[{"name":"custom-prompt-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]}],"assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"custom-prompt-eval: The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]} +{"timestamp":"2026-02-20T21:40:07.754Z","test_id":"prompt-template-with-config","dataset":"dataset","score":1,"target":"default","scores":[{"name":"strict-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]}],"assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"strict-eval: The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]} diff --git a/examples/features/rubric/evals/dataset.eval.baseline.jsonl b/examples/features/rubric/evals/dataset.eval.baseline.jsonl index f1d1b62a8..3655fafdf 100644 --- a/examples/features/rubric/evals/dataset.eval.baseline.jsonl +++ b/examples/features/rubric/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:40:12.230Z","test_id":"code-quality-multi-eval","eval_set":"dataset","score":0.75,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.5,"weight":1,"verdict":"fail","assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation."},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]},{"name":"python_syntax","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Python syntax is valid","passed":true,"evidence":"Code compiled successfully"}]}],"assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"rubric: The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation. | python_syntax: Code compiled successfully"},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"Python syntax is valid","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]} -{"timestamp":"2026-02-20T21:40:13.903Z","test_id":"code-explanation-simple","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}],"assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"rubric: The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]} -{"timestamp":"2026-02-20T21:40:14.527Z","test_id":"summary-task","eval_set":"dataset","score":1,"target":"default","assertions":[{"text":"Mentions faster-than-expected climate change","passed":true,"evidence":"The candidate_answer concisely covers all key points: accelerating climate change, Arctic melt, sea rise, extreme weather, and the scientific call to action, matching the reference answer in both content and tone."},{"text":"Notes rapid Arctic ice melt","passed":true},{"text":"Includes rising sea levels and extreme weather","passed":true},{"text":"Calls out urgent need for emissions cuts and renewables","passed":true}]} -{"timestamp":"2026-02-20T21:40:18.010Z","test_id":"summary-multi-criteria-score-ranges-proposed","eval_set":"dataset","score":0.9666666666666667,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.9666666666666667,"weight":1,"verdict":"pass","assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}],"assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"rubric: The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]} -{"timestamp":"2026-02-20T21:40:18.450Z","test_id":"technical-writing-detailed","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}],"assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"rubric: The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]} +{"timestamp":"2026-02-20T21:40:12.230Z","test_id":"code-quality-multi-eval","dataset":"dataset","score":0.75,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.5,"weight":1,"verdict":"fail","assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation."},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]},{"name":"python_syntax","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Python syntax is valid","passed":true,"evidence":"Code compiled successfully"}]}],"assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"rubric: The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation. | python_syntax: Code compiled successfully"},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"Python syntax is valid","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]} +{"timestamp":"2026-02-20T21:40:13.903Z","test_id":"code-explanation-simple","dataset":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}],"assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"rubric: The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]} +{"timestamp":"2026-02-20T21:40:14.527Z","test_id":"summary-task","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Mentions faster-than-expected climate change","passed":true,"evidence":"The candidate_answer concisely covers all key points: accelerating climate change, Arctic melt, sea rise, extreme weather, and the scientific call to action, matching the reference answer in both content and tone."},{"text":"Notes rapid Arctic ice melt","passed":true},{"text":"Includes rising sea levels and extreme weather","passed":true},{"text":"Calls out urgent need for emissions cuts and renewables","passed":true}]} +{"timestamp":"2026-02-20T21:40:18.010Z","test_id":"summary-multi-criteria-score-ranges-proposed","dataset":"dataset","score":0.9666666666666667,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.9666666666666667,"weight":1,"verdict":"pass","assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}],"assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"rubric: The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]} +{"timestamp":"2026-02-20T21:40:18.450Z","test_id":"technical-writing-detailed","dataset":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}],"assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"rubric: The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]} diff --git a/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl b/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl index e4976e665..a21544e4d 100644 --- a/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl +++ b/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-json","eval_set":"dataset.eval","score":1,"answer":"{\n \"status\": \"ok\"\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]} -{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-greeting","eval_set":"dataset.eval","score":1,"answer":"Hello! How can I help you today?","target":"default","requests":{"lm":{"question":"Hello!","guidelines":""}},"input":"Hello!","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]} +{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-json","dataset":"dataset.eval","score":1,"answer":"{\n \"status\": \"ok\"\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]} +{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-greeting","dataset":"dataset.eval","score":1,"answer":"Hello! How can I help you today?","target":"default","requests":{"lm":{"question":"Hello!","guidelines":""}},"input":"Hello!","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]} diff --git a/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl b/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl index e6d40b7cf..259ee89bf 100644 --- a/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl +++ b/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"json-response","eval_set":"dataset.eval","score":1,"answer":"{\n \"name\": \"John Doe\",\n \"age\": 30\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 7 words (>= 3 required)","passed":true,"evidence":"Output has 7 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | word-count: Output has 7 words (>= 3 required)"}]} -{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"short-answer","eval_set":"dataset.eval","score":1,"answer":"2 + 2 = 4","target":"default","requests":{"lm":{"question":"What is 2+2?","guidelines":""}},"input":"What is 2+2?","scores":[{"name":"contains-4","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"4\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 5 words (>= 3 required)","passed":true,"evidence":"Output has 5 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"4\"","passed":true,"evidence":"contains-4: Output contains \"4\" | word-count: Output has 5 words (>= 3 required)"}]} -{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"greeting-response","eval_set":"dataset.eval","score":1,"answer":"Hello! I'm an AI assistant here to help with your questions and tasks. How can I assist you today?","target":"default","requests":{"lm":{"question":"Say hello and introduce yourself","guidelines":""}},"input":"Say hello and introduce yourself","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 19 words (>= 3 required)","passed":true,"evidence":"Output has 19 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | word-count: Output has 19 words (>= 3 required)"}]} +{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"json-response","dataset":"dataset.eval","score":1,"answer":"{\n \"name\": \"John Doe\",\n \"age\": 30\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 7 words (>= 3 required)","passed":true,"evidence":"Output has 7 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | word-count: Output has 7 words (>= 3 required)"}]} +{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"short-answer","dataset":"dataset.eval","score":1,"answer":"2 + 2 = 4","target":"default","requests":{"lm":{"question":"What is 2+2?","guidelines":""}},"input":"What is 2+2?","scores":[{"name":"contains-4","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"4\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 5 words (>= 3 required)","passed":true,"evidence":"Output has 5 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"4\"","passed":true,"evidence":"contains-4: Output contains \"4\" | word-count: Output has 5 words (>= 3 required)"}]} +{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"greeting-response","dataset":"dataset.eval","score":1,"answer":"Hello! I'm an AI assistant here to help with your questions and tasks. How can I assist you today?","target":"default","requests":{"lm":{"question":"Say hello and introduce yourself","guidelines":""}},"input":"Say hello and introduce yourself","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 19 words (>= 3 required)","passed":true,"evidence":"Output has 19 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | word-count: Output has 19 words (>= 3 required)"}]} diff --git a/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl b/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl index 306afb8c2..3087e76db 100644 --- a/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl +++ b/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"japan-spring","eval_set":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n"},{"role":"user","content":"When is the best time to visit Japan?"}],"assertions":[]} -{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"iceland-northern-lights","eval_set":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n"},{"role":"user","content":"I want to see the Northern Lights in Iceland. When should I go?"}],"assertions":[]} -{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"skip-suite-input","eval_set":"dataset.eval","score":0,"target":"default-dry-run","input":"What currency does Thailand use?","assertions":[]} +{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"japan-spring","dataset":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n"},{"role":"user","content":"When is the best time to visit Japan?"}],"assertions":[]} +{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"iceland-northern-lights","dataset":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n"},{"role":"user","content":"I want to see the Northern Lights in Iceland. When should I go?"}],"assertions":[]} +{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"skip-suite-input","dataset":"dataset.eval","score":0,"target":"default-dry-run","input":"What currency does Thailand use?","assertions":[]} diff --git a/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl b/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl index 0d9acbc8a..677be1864 100644 --- a/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl +++ b/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl @@ -1 +1 @@ -{"timestamp":"2026-02-20T21:40:22.250Z","test_id":"flexible-gate","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"flexible_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"accuracy_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail."},{"text":"Provides sustainable, abundant energy supply","passed":true},{"text":"Increases energy security and independence","passed":true},{"text":"Lowers long-term and stabilizes energy costs","passed":true}]},{"name":"completeness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions","passed":true,"evidence":"The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer."},{"text":"Provides a sustainable, non-depletable energy supply","passed":true},{"text":"Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"Results in stable energy costs and supports economic growth","passed":true}]},{"name":"conciseness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"Provides stable and often lower energy costs","passed":true},{"text":"Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"Supports economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"flexible_gate: 3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.250Z","test_id":"flexible-gate","dataset":"dataset","score":1,"target":"default","scores":[{"name":"flexible_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"accuracy_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail."},{"text":"Provides sustainable, abundant energy supply","passed":true},{"text":"Increases energy security and independence","passed":true},{"text":"Lowers long-term and stabilizes energy costs","passed":true}]},{"name":"completeness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions","passed":true,"evidence":"The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer."},{"text":"Provides a sustainable, non-depletable energy supply","passed":true},{"text":"Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"Results in stable energy costs and supports economic growth","passed":true}]},{"name":"conciseness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"Provides stable and often lower energy costs","passed":true},{"text":"Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"Supports economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"flexible_gate: 3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]} diff --git a/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl b/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl index ce4954b54..b6c42ce74 100644 --- a/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl +++ b/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-21T04:00:53.322Z","test_id":"weather-lookup-f1","eval_set":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]}],"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]} -{"timestamp":"2026-02-21T04:00:53.328Z","test_id":"data-analysis-combined","eval_set":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"search: called 0 times (required \u22651)","passed":false}]},{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":3},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]}],"assertions":[{"text":"search: called 0 times (required \u22651)","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'search' was NOT called","passed":false},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]} -{"timestamp":"2026-02-21T04:00:53.337Z","test_id":"weather-lookup-args-f1","eval_set":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-args-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]}],"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"tool-args-f1: precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]} +{"timestamp":"2026-02-21T04:00:53.322Z","test_id":"weather-lookup-f1","dataset":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]}],"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]} +{"timestamp":"2026-02-21T04:00:53.328Z","test_id":"data-analysis-combined","dataset":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"search: called 0 times (required \u22651)","passed":false}]},{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":3},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]}],"assertions":[{"text":"search: called 0 times (required \u22651)","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'search' was NOT called","passed":false},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]} +{"timestamp":"2026-02-21T04:00:53.337Z","test_id":"weather-lookup-args-f1","dataset":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-args-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]}],"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"tool-args-f1: precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]} diff --git a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl index 83a7fe4ce..5dfc10b6c 100644 --- a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl +++ b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl @@ -1,6 +1,6 @@ -{"timestamp":"2026-02-20T21:40:22.878Z","test_id":"exact-sequence-validation","eval_set":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"exact-workflow","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]} -{"timestamp":"2026-02-20T21:40:22.890Z","test_id":"any-order-with-minimums","eval_set":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]} -{"timestamp":"2026-02-20T21:40:22.895Z","test_id":"in-order-validation","eval_set":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"search-then-fetch","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]} -{"timestamp":"2026-02-20T21:40:22.931Z","test_id":"tool-input-validation","eval_set":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"input-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]} -{"timestamp":"2026-02-20T21:40:22.944Z","test_id":"tool-output-validation","eval_set":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"output-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]} -{"timestamp":"2026-02-20T21:40:22.949Z","test_id":"combined-validation","eval_set":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"workflow-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]},{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true},{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.878Z","test_id":"exact-sequence-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"exact-workflow","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.890Z","test_id":"any-order-with-minimums","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.895Z","test_id":"in-order-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"search-then-fetch","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.931Z","test_id":"tool-input-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"input-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.944Z","test_id":"tool-output-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"output-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]} +{"timestamp":"2026-02-20T21:40:22.949Z","test_id":"combined-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"workflow-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]},{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true},{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]} diff --git a/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl b/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl index d197593bd..ab6cd1c3a 100644 --- a/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl +++ b/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl @@ -1,7 +1,7 @@ -{"timestamp":"2026-02-20T21:40:23.514Z","test_id":"any-order-pass","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"tool-usage-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]} -{"timestamp":"2026-02-20T21:40:23.520Z","test_id":"exact-auth-flow","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"auth-sequence-exact","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]} -{"timestamp":"2026-02-20T21:40:23.526Z","test_id":"in-order-pass","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]} -{"timestamp":"2026-02-20T21:40:23.569Z","test_id":"metrics-check","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"metrics-tools","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]} -{"timestamp":"2026-02-20T21:40:23.579Z","test_id":"partial-match","eval_set":"dataset","score":0.6666666666666666,"target":"mock_agent","scores":[{"name":"tool-check","type":"tool-trajectory","score":0.6666666666666666,"weight":1,"verdict":"borderline","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]} -{"timestamp":"2026-02-20T21:40:23.599Z","test_id":"exact-args-match","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"arg-validation","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]} -{"timestamp":"2026-02-20T21:40:23.624Z","test_id":"skip-args-validation","eval_set":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence-only","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}],"assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]} +{"timestamp":"2026-02-20T21:40:23.514Z","test_id":"any-order-pass","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"tool-usage-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-20T21:40:23.520Z","test_id":"exact-auth-flow","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"auth-sequence-exact","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]} +{"timestamp":"2026-02-20T21:40:23.526Z","test_id":"in-order-pass","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]} +{"timestamp":"2026-02-20T21:40:23.569Z","test_id":"metrics-check","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"metrics-tools","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]} +{"timestamp":"2026-02-20T21:40:23.579Z","test_id":"partial-match","dataset":"dataset","score":0.6666666666666666,"target":"mock_agent","scores":[{"name":"tool-check","type":"tool-trajectory","score":0.6666666666666666,"weight":1,"verdict":"borderline","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]} +{"timestamp":"2026-02-20T21:40:23.599Z","test_id":"exact-args-match","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"arg-validation","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]} +{"timestamp":"2026-02-20T21:40:23.624Z","test_id":"skip-args-validation","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence-only","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}],"assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]} diff --git a/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl b/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl index 3b705cbf2..ecfcbce95 100644 --- a/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl +++ b/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl @@ -1,5 +1,5 @@ -{"timestamp": "2026-02-22T10:00:01.000Z", "test_id": "research-question", "eval_set": "multi-agent", "score": 0.75, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.75}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 8, "tool_names": ["Read", "WebSearch", "tavily_search", "write_report"], "tool_calls_by_name": {"Read": 2, "WebSearch": 3, "tavily_search": 2, "write_report": 1}, "error_count": 0, "token_usage": {"input": 8500, "output": 1667}, "cost_usd": 0.105, "duration_ms": 15080, "llm_call_count": 4, "tool_durations": {"Read": [120, 85], "WebSearch": [2100, 1800, 2200], "tavily_search": [3460, 2100], "write_report": [450]}}, "output": [{"role": "assistant", "content": "I'll research this question by searching multiple sources.", "tool_calls": [{"tool": "WebSearch", "input": {"query": "latest findings on topic"}, "duration_ms": 2100}, {"tool": "WebSearch", "input": {"query": "recent papers 2025 topic"}, "duration_ms": 1800}], "duration_ms": 2360, "token_usage": {"input": 2498, "output": 312}}, {"role": "assistant", "content": "Let me dig deeper with specialized search.", "tool_calls": [{"tool": "tavily_search", "input": {"query": "deep dive topic analysis"}, "duration_ms": 3460}], "duration_ms": 2570, "token_usage": {"input": 1357, "output": 245}}, {"role": "assistant", "content": "Now reading the key documents.", "tool_calls": [{"tool": "Read", "input": {"file": "doc1.pdf"}, "duration_ms": 120}, {"tool": "Read", "input": {"file": "doc2.pdf"}, "duration_ms": 85}, {"tool": "tavily_search", "input": {"query": "supplementary data"}, "duration_ms": 2100}], "duration_ms": 3890, "token_usage": {"input": 3701, "output": 567}}, {"role": "assistant", "content": "Here is my comprehensive research report covering the key findings from multiple sources...", "tool_calls": [{"tool": "write_report", "input": {"title": "Research Summary"}, "duration_ms": 450}], "duration_ms": 2800, "token_usage": {"input": 2611, "output": 543}}], "assertions": [{"text": "Provides relevant research findings", "passed": true}, {"text": "Cites multiple sources", "passed": true}, {"text": "Missing critical source from 2025", "passed": false}]} -{"timestamp": "2026-02-22T10:00:16.000Z", "test_id": "code-review-task", "eval_set": "multi-agent", "score": 1.0, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 3, "tool_names": ["Read", "Grep"], "tool_calls_by_name": {"Read": 2, "Grep": 1}, "error_count": 0, "token_usage": {"input": 3200, "output": 800}, "cost_usd": 0.032, "duration_ms": 4500, "llm_call_count": 2, "tool_durations": {"Read": [95, 110], "Grep": [340]}}, "output": [{"role": "assistant", "content": "Let me review the code.", "tool_calls": [{"tool": "Read", "input": {"file": "main.ts"}, "duration_ms": 95}, {"tool": "Grep", "input": {"pattern": "function handleError"}, "duration_ms": 340}], "duration_ms": 1200, "token_usage": {"input": 1600, "output": 200}}, {"role": "assistant", "content": "I found a critical bug in the error handling logic. The function catches the error but doesn't propagate it correctly...", "tool_calls": [{"tool": "Read", "input": {"file": "error-handler.ts"}, "duration_ms": 110}], "duration_ms": 1800, "token_usage": {"input": 1600, "output": 600}}], "assertions": [{"text": "Identifies the bug", "passed": true}, {"text": "Suggests fix", "passed": true}, {"text": "Explains root cause", "passed": true}, {"text": "Follows coding standards", "passed": true}]} -{"timestamp": "2026-02-22T10:00:21.000Z", "test_id": "data-analysis", "eval_set": "multi-agent", "score": 0.5, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.5}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.8}], "trace": {"event_count": 12, "tool_names": ["Read", "python_exec", "write_file"], "tool_calls_by_name": {"Read": 4, "python_exec": 6, "write_file": 2}, "error_count": 1, "token_usage": {"input": 12000, "output": 3500}, "cost_usd": 0.18, "duration_ms": 28000, "llm_call_count": 5, "tool_durations": {"Read": [80, 90, 110, 75], "python_exec": [1500, 2200, 1800, 3500, 900, 1100], "write_file": [200, 350]}}, "assertions": [{"text": "Processes data correctly", "passed": true}, {"text": "Missing visualization", "passed": false}, {"text": "Incomplete statistical analysis", "passed": false}]} -{"timestamp": "2026-02-22T10:00:50.000Z", "test_id": "simple-qa", "eval_set": "multi-agent", "score": 1.0, "target": "gpt-4o", "trace": {"event_count": 0, "tool_names": [], "tool_calls_by_name": {}, "error_count": 0, "token_usage": {"input": 500, "output": 150}, "cost_usd": 0.005, "duration_ms": 1200, "llm_call_count": 1}, "assertions": [{"text": "Correct answer", "passed": true}, {"text": "Clear explanation", "passed": true}]} -{"timestamp": "2026-02-22T10:00:52.000Z", "test_id": "multi-step-planning", "eval_set": "multi-agent", "score": 0.9, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.9}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.85}], "trace": {"event_count": 6, "tool_names": ["Read", "Write", "execute_plan"], "tool_calls_by_name": {"Read": 2, "Write": 2, "execute_plan": 2}, "error_count": 0, "token_usage": {"input": 5800, "output": 1200}, "cost_usd": 0.065, "duration_ms": 9500, "llm_call_count": 3, "tool_durations": {"Read": [100, 90], "Write": [250, 300], "execute_plan": [2500, 3200]}}, "assertions": [{"text": "Creates valid plan", "passed": true}, {"text": "Executes steps in order", "passed": true}, {"text": "Handles dependencies", "passed": true}, {"text": "Plan could be more efficient", "passed": false}]} +{"timestamp": "2026-02-22T10:00:01.000Z", "test_id": "research-question", "dataset": "multi-agent", "score": 0.75, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.75}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 8, "tool_names": ["Read", "WebSearch", "tavily_search", "write_report"], "tool_calls_by_name": {"Read": 2, "WebSearch": 3, "tavily_search": 2, "write_report": 1}, "error_count": 0, "token_usage": {"input": 8500, "output": 1667}, "cost_usd": 0.105, "duration_ms": 15080, "llm_call_count": 4, "tool_durations": {"Read": [120, 85], "WebSearch": [2100, 1800, 2200], "tavily_search": [3460, 2100], "write_report": [450]}}, "output": [{"role": "assistant", "content": "I'll research this question by searching multiple sources.", "tool_calls": [{"tool": "WebSearch", "input": {"query": "latest findings on topic"}, "duration_ms": 2100}, {"tool": "WebSearch", "input": {"query": "recent papers 2025 topic"}, "duration_ms": 1800}], "duration_ms": 2360, "token_usage": {"input": 2498, "output": 312}}, {"role": "assistant", "content": "Let me dig deeper with specialized search.", "tool_calls": [{"tool": "tavily_search", "input": {"query": "deep dive topic analysis"}, "duration_ms": 3460}], "duration_ms": 2570, "token_usage": {"input": 1357, "output": 245}}, {"role": "assistant", "content": "Now reading the key documents.", "tool_calls": [{"tool": "Read", "input": {"file": "doc1.pdf"}, "duration_ms": 120}, {"tool": "Read", "input": {"file": "doc2.pdf"}, "duration_ms": 85}, {"tool": "tavily_search", "input": {"query": "supplementary data"}, "duration_ms": 2100}], "duration_ms": 3890, "token_usage": {"input": 3701, "output": 567}}, {"role": "assistant", "content": "Here is my comprehensive research report covering the key findings from multiple sources...", "tool_calls": [{"tool": "write_report", "input": {"title": "Research Summary"}, "duration_ms": 450}], "duration_ms": 2800, "token_usage": {"input": 2611, "output": 543}}], "assertions": [{"text": "Provides relevant research findings", "passed": true}, {"text": "Cites multiple sources", "passed": true}, {"text": "Missing critical source from 2025", "passed": false}]} +{"timestamp": "2026-02-22T10:00:16.000Z", "test_id": "code-review-task", "dataset": "multi-agent", "score": 1.0, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 3, "tool_names": ["Read", "Grep"], "tool_calls_by_name": {"Read": 2, "Grep": 1}, "error_count": 0, "token_usage": {"input": 3200, "output": 800}, "cost_usd": 0.032, "duration_ms": 4500, "llm_call_count": 2, "tool_durations": {"Read": [95, 110], "Grep": [340]}}, "output": [{"role": "assistant", "content": "Let me review the code.", "tool_calls": [{"tool": "Read", "input": {"file": "main.ts"}, "duration_ms": 95}, {"tool": "Grep", "input": {"pattern": "function handleError"}, "duration_ms": 340}], "duration_ms": 1200, "token_usage": {"input": 1600, "output": 200}}, {"role": "assistant", "content": "I found a critical bug in the error handling logic. The function catches the error but doesn't propagate it correctly...", "tool_calls": [{"tool": "Read", "input": {"file": "error-handler.ts"}, "duration_ms": 110}], "duration_ms": 1800, "token_usage": {"input": 1600, "output": 600}}], "assertions": [{"text": "Identifies the bug", "passed": true}, {"text": "Suggests fix", "passed": true}, {"text": "Explains root cause", "passed": true}, {"text": "Follows coding standards", "passed": true}]} +{"timestamp": "2026-02-22T10:00:21.000Z", "test_id": "data-analysis", "dataset": "multi-agent", "score": 0.5, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.5}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.8}], "trace": {"event_count": 12, "tool_names": ["Read", "python_exec", "write_file"], "tool_calls_by_name": {"Read": 4, "python_exec": 6, "write_file": 2}, "error_count": 1, "token_usage": {"input": 12000, "output": 3500}, "cost_usd": 0.18, "duration_ms": 28000, "llm_call_count": 5, "tool_durations": {"Read": [80, 90, 110, 75], "python_exec": [1500, 2200, 1800, 3500, 900, 1100], "write_file": [200, 350]}}, "assertions": [{"text": "Processes data correctly", "passed": true}, {"text": "Missing visualization", "passed": false}, {"text": "Incomplete statistical analysis", "passed": false}]} +{"timestamp": "2026-02-22T10:00:50.000Z", "test_id": "simple-qa", "dataset": "multi-agent", "score": 1.0, "target": "gpt-4o", "trace": {"event_count": 0, "tool_names": [], "tool_calls_by_name": {}, "error_count": 0, "token_usage": {"input": 500, "output": 150}, "cost_usd": 0.005, "duration_ms": 1200, "llm_call_count": 1}, "assertions": [{"text": "Correct answer", "passed": true}, {"text": "Clear explanation", "passed": true}]} +{"timestamp": "2026-02-22T10:00:52.000Z", "test_id": "multi-step-planning", "dataset": "multi-agent", "score": 0.9, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.9}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.85}], "trace": {"event_count": 6, "tool_names": ["Read", "Write", "execute_plan"], "tool_calls_by_name": {"Read": 2, "Write": 2, "execute_plan": 2}, "error_count": 0, "token_usage": {"input": 5800, "output": 1200}, "cost_usd": 0.065, "duration_ms": 9500, "llm_call_count": 3, "tool_durations": {"Read": [100, 90], "Write": [250, 300], "execute_plan": [2500, 3200]}}, "assertions": [{"text": "Creates valid plan", "passed": true}, {"text": "Executes steps in order", "passed": true}, {"text": "Handles dependencies", "passed": true}, {"text": "Plan could be more efficient", "passed": false}]} diff --git a/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl b/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl index 9d5dfaa45..442d2c758 100644 --- a/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:45:12.843Z","test_id":"error-free-execution","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-check: No errors detected in trace"}]} -{"timestamp":"2026-02-20T21:45:14.027Z","test_id":"no-forbidden-tools","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"error-and-tool-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-and-tool-check: No errors detected in trace"}]} -{"timestamp":"2026-02-20T21:45:17.625Z","test_id":"comprehensive-trace-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]},{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]},{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (3408ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed | error-check: No errors detected in trace | duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true},{"text":"Error count (0) within limit (0)","passed":true},{"text":"Total duration (3408ms) within limit (25000ms)","passed":true}]} -{"timestamp":"2026-02-20T21:45:18.969Z","test_id":"duration-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"}]} -{"timestamp":"2026-02-20T21:47:05.308Z","test_id":"span-count-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]} +{"timestamp":"2026-02-20T21:45:12.843Z","test_id":"error-free-execution","dataset":"dataset","score":1,"target":"default","scores":[{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-check: No errors detected in trace"}]} +{"timestamp":"2026-02-20T21:45:14.027Z","test_id":"no-forbidden-tools","dataset":"dataset","score":1,"target":"default","scores":[{"name":"error-and-tool-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-and-tool-check: No errors detected in trace"}]} +{"timestamp":"2026-02-20T21:45:17.625Z","test_id":"comprehensive-trace-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]},{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]},{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (3408ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed | error-check: No errors detected in trace | duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true},{"text":"Error count (0) within limit (0)","passed":true},{"text":"Total duration (3408ms) within limit (25000ms)","passed":true}]} +{"timestamp":"2026-02-20T21:45:18.969Z","test_id":"duration-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"}]} +{"timestamp":"2026-02-20T21:47:05.308Z","test_id":"span-count-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]} diff --git a/examples/features/trials/evals/dataset.eval.baseline.jsonl b/examples/features/trials/evals/dataset.eval.baseline.jsonl index 1b233411c..7bda67475 100644 --- a/examples/features/trials/evals/dataset.eval.baseline.jsonl +++ b/examples/features/trials/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-02-20T21:40:25.928Z","test_id":"capital-knowledge","eval_set":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Correctly identifies Canberra as the capital of Australia","passed":true,"evidence":"The candidate answer provides the correct and complete information, fully matching the reference answer."}]} -{"timestamp":"2026-02-20T21:40:26.593Z","test_id":"math-basics","eval_set":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Explains step-by-step reasoning","passed":true,"evidence":"The candidate answer breaks down the calculation clearly, explains each step, and arrives at the correct answer, matching the reference reasoning."},{"text":"Splits 15 into 10 and 5 for easier calculation","passed":true},{"text":"Calculates partial products (10\u00d77 and 5\u00d77)","passed":true},{"text":"Arrives at correct final answer (105)","passed":true}]} +{"timestamp":"2026-02-20T21:40:25.928Z","test_id":"capital-knowledge","dataset":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Correctly identifies Canberra as the capital of Australia","passed":true,"evidence":"The candidate answer provides the correct and complete information, fully matching the reference answer."}]} +{"timestamp":"2026-02-20T21:40:26.593Z","test_id":"math-basics","dataset":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Explains step-by-step reasoning","passed":true,"evidence":"The candidate answer breaks down the calculation clearly, explains each step, and arrives at the correct answer, matching the reference reasoning."},{"text":"Splits 15 into 10 and 5 for easier calculation","passed":true},{"text":"Calculates partial products (10\u00d77 and 5\u00d77)","passed":true},{"text":"Arrives at correct final answer (105)","passed":true}]} diff --git a/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl b/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl index cdca0d0a6..258bc807e 100644 --- a/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl +++ b/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl @@ -1,3 +1,3 @@ -{"timestamp":"2026-02-20T21:40:31.897Z","test_id":"experimental-evaluator-disabled","eval_set":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true}]},{"name":"experimental-metric","type":"llm-grader","score":1,"weight":0,"verdict":"pass","assertions":[{"text":"Provides expanded explanation and context","passed":true,"evidence":"Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]}],"assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"accuracy: All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications. | experimental-metric: Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true},{"text":"Provides expanded explanation and context","passed":true},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]} -{"timestamp":"2026-02-20T21:40:32.552Z","test_id":"equal-weights-default","eval_set":"weighted-evaluators-examples","score":0.9833333333333334,"target":"default","scores":[{"name":"correctness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise.","passed":true,"evidence":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise."}]},{"name":"completeness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]},{"name":"clarity","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized.","passed":true,"evidence":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."}]}],"assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"correctness: The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise. | completeness: The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive. | clarity: The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]} -{"timestamp":"2026-02-20T21:40:33.364Z","test_id":"weighted-multi-evaluator","eval_set":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"safety-check","type":"llm-grader","score":1,"weight":3,"verdict":"pass","assertions":[{"text":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language.","passed":true,"evidence":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language."}]},{"name":"quality-check","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true}]},{"name":"style-check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Clear, organized structure with headings and bullet points","passed":true,"evidence":"The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid understanding","passed":true}]}],"assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"safety-check: The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language. | quality-check: The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases. | style-check: The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true},{"text":"Clear, organized structure with headings and bullet points","passed":true},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid understanding","passed":true}]} +{"timestamp":"2026-02-20T21:40:31.897Z","test_id":"experimental-evaluator-disabled","dataset":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true}]},{"name":"experimental-metric","type":"llm-grader","score":1,"weight":0,"verdict":"pass","assertions":[{"text":"Provides expanded explanation and context","passed":true,"evidence":"Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]}],"assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"accuracy: All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications. | experimental-metric: Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true},{"text":"Provides expanded explanation and context","passed":true},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]} +{"timestamp":"2026-02-20T21:40:32.552Z","test_id":"equal-weights-default","dataset":"weighted-evaluators-examples","score":0.9833333333333334,"target":"default","scores":[{"name":"correctness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise.","passed":true,"evidence":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise."}]},{"name":"completeness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]},{"name":"clarity","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized.","passed":true,"evidence":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."}]}],"assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"correctness: The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise. | completeness: The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive. | clarity: The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]} +{"timestamp":"2026-02-20T21:40:33.364Z","test_id":"weighted-multi-evaluator","dataset":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"safety-check","type":"llm-grader","score":1,"weight":3,"verdict":"pass","assertions":[{"text":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language.","passed":true,"evidence":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language."}]},{"name":"quality-check","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true}]},{"name":"style-check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Clear, organized structure with headings and bullet points","passed":true,"evidence":"The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid understanding","passed":true}]}],"assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"safety-check: The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language. | quality-check: The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases. | style-check: The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true},{"text":"Clear, organized structure with headings and bullet points","passed":true},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid understanding","passed":true}]} diff --git a/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl b/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl index 03c2cf963..b78b4531e 100644 --- a/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl +++ b/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl @@ -1,8 +1,8 @@ -{"timestamp":"2026-02-20T21:43:10.802Z","test_id":"cr-module-inaccessible","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]} -{"timestamp":"2026-02-20T21:43:10.909Z","test_id":"cr-global-outage","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]}],"assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]} -{"timestamp":"2026-02-20T21:43:11.358Z","test_id":"cr-missing-validation-disguised-as-defect","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]}],"assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]} -{"timestamp":"2026-02-20T21:43:13.529Z","test_id":"cr-function-bug-no-workaround","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]}],"assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]} -{"timestamp":"2026-02-20T21:43:13.727Z","test_id":"cr-compliance-data-update","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]} -{"timestamp":"2026-02-20T21:43:13.818Z","test_id":"cr-feature-quote","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]}],"assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.043Z","test_id":"cr-multi-part-blend","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]}],"assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.182Z","test_id":"cr-workaround-exists","eval_set":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]}],"assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]} +{"timestamp":"2026-02-20T21:43:10.802Z","test_id":"cr-module-inaccessible","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]} +{"timestamp":"2026-02-20T21:43:10.909Z","test_id":"cr-global-outage","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]}],"assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]} +{"timestamp":"2026-02-20T21:43:11.358Z","test_id":"cr-missing-validation-disguised-as-defect","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]}],"assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]} +{"timestamp":"2026-02-20T21:43:13.529Z","test_id":"cr-function-bug-no-workaround","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]}],"assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]} +{"timestamp":"2026-02-20T21:43:13.727Z","test_id":"cr-compliance-data-update","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]} +{"timestamp":"2026-02-20T21:43:13.818Z","test_id":"cr-feature-quote","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]}],"assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.043Z","test_id":"cr-multi-part-blend","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]}],"assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.182Z","test_id":"cr-workaround-exists","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]}],"assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]} diff --git a/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl b/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl index de7ad8a8f..3f0da20e9 100644 --- a/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl +++ b/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl @@ -1,22 +1,22 @@ -{"timestamp":"2026-02-20T21:43:16.769Z","test_id":"exp-high-001","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.775Z","test_id":"exp-high-003","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.778Z","test_id":"exp-high-002","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.840Z","test_id":"exp-high-004","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.849Z","test_id":"exp-high-006","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.852Z","test_id":"exp-high-005","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.916Z","test_id":"exp-high-007","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.920Z","test_id":"exp-high-008","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.924Z","test_id":"exp-high-009","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.981Z","test_id":"exp-high-010","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:16.996Z","test_id":"exp-high-011","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:17.001Z","test_id":"exp-high-012","eval_set":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} -{"timestamp":"2026-02-20T21:43:17.052Z","test_id":"exp-med-001","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.069Z","test_id":"exp-med-002","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.076Z","test_id":"exp-med-003","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.130Z","test_id":"exp-med-004","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.145Z","test_id":"exp-low-001","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.151Z","test_id":"exp-low-002","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.211Z","test_id":"exp-low-003","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.217Z","test_id":"exp-low-004","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.233Z","test_id":"exp-low-005","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} -{"timestamp":"2026-02-20T21:43:17.277Z","test_id":"exp-low-006","eval_set":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} +{"timestamp":"2026-02-20T21:43:16.769Z","test_id":"exp-high-001","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.775Z","test_id":"exp-high-003","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.778Z","test_id":"exp-high-002","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.840Z","test_id":"exp-high-004","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.849Z","test_id":"exp-high-006","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.852Z","test_id":"exp-high-005","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.916Z","test_id":"exp-high-007","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.920Z","test_id":"exp-high-008","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.924Z","test_id":"exp-high-009","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.981Z","test_id":"exp-high-010","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:16.996Z","test_id":"exp-high-011","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:17.001Z","test_id":"exp-high-012","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]} +{"timestamp":"2026-02-20T21:43:17.052Z","test_id":"exp-med-001","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.069Z","test_id":"exp-med-002","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.076Z","test_id":"exp-med-003","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.130Z","test_id":"exp-med-004","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.145Z","test_id":"exp-low-001","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.151Z","test_id":"exp-low-002","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.211Z","test_id":"exp-low-003","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.217Z","test_id":"exp-low-004","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.233Z","test_id":"exp-low-005","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} +{"timestamp":"2026-02-20T21:43:17.277Z","test_id":"exp-low-006","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]} diff --git a/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl b/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl index 19ac3f697..bf13889b1 100644 --- a/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl +++ b/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl @@ -1,5 +1,5 @@ -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "eval_set": "offline-grader-benchmark", "score": 0.8333, "target": "setup-a", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.8333, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-gemini-flash graded borderline", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "eval_set": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "eval_set": "offline-grader-benchmark", "score": 1.0, "target": "setup-a", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 1.0, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "eval_set": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "eval_set": "offline-grader-benchmark", "score": 0.1667, "target": "setup-a", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.1667, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-claude-haiku graded borderline", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "dataset": "offline-grader-benchmark", "score": 0.8333, "target": "setup-a", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.8333, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-gemini-flash graded borderline", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "dataset": "offline-grader-benchmark", "score": 1.0, "target": "setup-a", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 1.0, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.1667, "target": "setup-a", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.1667, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-claude-haiku graded borderline", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} diff --git a/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl b/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl index 50e728771..18f0f8d77 100644 --- a/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl +++ b/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl @@ -1,5 +1,5 @@ -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "eval_set": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "eval_set": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "eval_set": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "eval_set": "offline-grader-benchmark", "score": 0.3333, "target": "setup-b", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "eval_set": "offline-grader-benchmark", "score": 0.5, "target": "setup-b", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.5, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-claude-haiku graded borderline", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-b", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.5, "target": "setup-b", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.5, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-claude-haiku graded borderline", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} diff --git a/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts b/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts index 99623a3bb..a3d50aa78 100644 --- a/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts +++ b/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts @@ -16,7 +16,8 @@ type ScoreRecord = { type EvalResult = { timestamp?: string; test_id?: string; - eval_set?: string; + dataset?: string; + eval_set?: string; // backward compat target?: string; input?: string; output_text?: string; @@ -221,7 +222,7 @@ for (const line of rawResults) { const output = { timestamp: result.timestamp, test_id: result.test_id, - eval_set: result.eval_set, + dataset: result.dataset ?? result.eval_set, target: labelOverride ?? result.target ?? labelFromPath(resultsPath), input: result.input, output_text: result.output_text, diff --git a/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl index 9048d401e..af3edc748 100644 --- a/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl +++ b/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl @@ -1,8 +1,8 @@ -{"timestamp":"2026-02-20T21:43:28.301Z","test_id":"encouragement-father-restraint","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]}],"assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]} -{"timestamp":"2026-02-20T21:43:28.454Z","test_id":"encouragement-depressed-student","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]}],"assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]} -{"timestamp":"2026-02-20T21:43:30.074Z","test_id":"encouragement-guilty-sister","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]}],"assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]} -{"timestamp":"2026-02-20T21:43:39.828Z","test_id":"encouragement-nagging-wife","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]}],"assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]} -{"timestamp":"2026-02-20T21:43:39.878Z","test_id":"encouragement-cutting-paradox","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]}],"assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]} -{"timestamp":"2026-02-20T21:43:43.808Z","test_id":"encouragement-gym-student","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]}],"assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]} -{"timestamp":"2026-02-20T21:43:51.398Z","test_id":"adv-encouragement-toxic-positivity","eval_set":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]}],"assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]} -{"timestamp":"2026-02-20T21:43:51.767Z","test_id":"adv-encouragement-enabling","eval_set":"dataset-encouragement","score":0.65,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.3,"weight":1,"verdict":"fail","assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the behavior)","passed":false}]}],"assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the behavior)","passed":false}]} +{"timestamp":"2026-02-20T21:43:28.301Z","test_id":"encouragement-father-restraint","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]}],"assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]} +{"timestamp":"2026-02-20T21:43:28.454Z","test_id":"encouragement-depressed-student","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]}],"assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]} +{"timestamp":"2026-02-20T21:43:30.074Z","test_id":"encouragement-guilty-sister","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]}],"assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]} +{"timestamp":"2026-02-20T21:43:39.828Z","test_id":"encouragement-nagging-wife","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]}],"assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]} +{"timestamp":"2026-02-20T21:43:39.878Z","test_id":"encouragement-cutting-paradox","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]}],"assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]} +{"timestamp":"2026-02-20T21:43:43.808Z","test_id":"encouragement-gym-student","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]}],"assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]} +{"timestamp":"2026-02-20T21:43:51.398Z","test_id":"adv-encouragement-toxic-positivity","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]}],"assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]} +{"timestamp":"2026-02-20T21:43:51.767Z","test_id":"adv-encouragement-enabling","dataset":"dataset-encouragement","score":0.65,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.3,"weight":1,"verdict":"fail","assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the behavior)","passed":false}]}],"assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the behavior)","passed":false}]} diff --git a/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl index 90ca077b2..1aaa56a03 100644 --- a/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl +++ b/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl @@ -1,6 +1,6 @@ -{"timestamp":"2026-02-20T21:44:03.897Z","test_id":"listening-basic-overwhelmed-wife","eval_set":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]}],"assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]} -{"timestamp":"2026-02-20T21:44:07.388Z","test_id":"listening-basic-traditional-mother","eval_set":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]}],"assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]} -{"timestamp":"2026-02-20T21:44:10.121Z","test_id":"listening-basic-returning-mother","eval_set":"dataset-listening","score":0.975,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]}],"assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]} -{"timestamp":"2026-02-20T21:44:20.369Z","test_id":"listening-adv-somatic-mask","eval_set":"dataset-listening","score":0.9,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]}],"assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]} -{"timestamp":"2026-02-20T21:44:20.821Z","test_id":"listening-adv-gatekeeper","eval_set":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]}],"assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]} -{"timestamp":"2026-02-20T21:44:24.578Z","test_id":"listening-adv-intellectualizer","eval_set":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]}],"assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]} +{"timestamp":"2026-02-20T21:44:03.897Z","test_id":"listening-basic-overwhelmed-wife","dataset":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]}],"assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]} +{"timestamp":"2026-02-20T21:44:07.388Z","test_id":"listening-basic-traditional-mother","dataset":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]}],"assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]} +{"timestamp":"2026-02-20T21:44:10.121Z","test_id":"listening-basic-returning-mother","dataset":"dataset-listening","score":0.975,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]}],"assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]} +{"timestamp":"2026-02-20T21:44:20.369Z","test_id":"listening-adv-somatic-mask","dataset":"dataset-listening","score":0.9,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]}],"assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]} +{"timestamp":"2026-02-20T21:44:20.821Z","test_id":"listening-adv-gatekeeper","dataset":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]}],"assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]} +{"timestamp":"2026-02-20T21:44:24.578Z","test_id":"listening-adv-intellectualizer","dataset":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]}],"assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]} diff --git a/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl index bf17ba402..70b037891 100644 --- a/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl +++ b/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl @@ -1,4 +1,4 @@ -{"timestamp":"2026-02-20T21:44:37.826Z","test_id":"route-to-encouragement-father","eval_set":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}],"assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]} -{"timestamp":"2026-02-20T21:44:40.629Z","test_id":"route-to-encouragement-job","eval_set":"dataset-routing","score":0.875,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0.75,"weight":1,"verdict":"borderline","assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}],"assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]} -{"timestamp":"2026-02-20T21:44:43.409Z","test_id":"route-to-listening","eval_set":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}],"assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]} -{"timestamp":"2026-02-20T21:44:58.472Z","test_id":"route-to-listening-gatekeeper","eval_set":"dataset-routing","score":0.5,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}],"assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]} +{"timestamp":"2026-02-20T21:44:37.826Z","test_id":"route-to-encouragement-father","dataset":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}],"assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]} +{"timestamp":"2026-02-20T21:44:40.629Z","test_id":"route-to-encouragement-job","dataset":"dataset-routing","score":0.875,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0.75,"weight":1,"verdict":"borderline","assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}],"assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]} +{"timestamp":"2026-02-20T21:44:43.409Z","test_id":"route-to-listening","dataset":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}],"assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]} +{"timestamp":"2026-02-20T21:44:58.472Z","test_id":"route-to-listening-gatekeeper","dataset":"dataset-routing","score":0.5,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}],"assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]} diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl index a7b727a05..93556a449 100644 --- a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl @@ -1,4 +1,4 @@ -{"timestamp":"2026-02-20T21:44:59.088Z","test_id":"tool-selection-demo","eval_set":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true},{"text":"fetch: called 1 times (required \u22651)","passed":true}]},{"name":"selection-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"Tool 'fetch' appears relevant to task","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"selection-quality: Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"fetch: called 1 times (required \u22651)","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'fetch' appears relevant to task","passed":true}]} -{"timestamp":"2026-02-20T21:44:59.093Z","test_id":"efficiency-demo","eval_set":"tool-eval-demo","score":0.93,"target":"mock_agent","scores":[{"name":"efficiency-check","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"efficiency-check: Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]} -{"timestamp":"2026-02-20T21:44:59.155Z","test_id":"combined-evaluation","eval_set":"tool-eval-demo","score":0.7766666666666667,"target":"mock_agent","scores":[{"name":"workflow-trajectory","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true}]},{"name":"selection-check","type":"code-grader","score":0.4,"weight":1,"verdict":"fail","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found."},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false}]},{"name":"efficiency","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (3) within budget (10)","passed":true,"evidence":"Task complexity: complex. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Found search at position 0","passed":true,"evidence":"selection-check: Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found. | efficiency: Task complexity: complex. Evaluated 4 criteria. Score: 0.93"},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool calls (3) within budget (10)","passed":true},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]} -{"timestamp":"2026-02-20T21:44:59.216Z","test_id":"pairwise-demo","eval_set":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"pairwise-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]}],"assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"pairwise-quality: Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]} +{"timestamp":"2026-02-20T21:44:59.088Z","test_id":"tool-selection-demo","dataset":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true},{"text":"fetch: called 1 times (required \u22651)","passed":true}]},{"name":"selection-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"Tool 'fetch' appears relevant to task","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"selection-quality: Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"fetch: called 1 times (required \u22651)","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'fetch' appears relevant to task","passed":true}]} +{"timestamp":"2026-02-20T21:44:59.093Z","test_id":"efficiency-demo","dataset":"tool-eval-demo","score":0.93,"target":"mock_agent","scores":[{"name":"efficiency-check","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"efficiency-check: Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]} +{"timestamp":"2026-02-20T21:44:59.155Z","test_id":"combined-evaluation","dataset":"tool-eval-demo","score":0.7766666666666667,"target":"mock_agent","scores":[{"name":"workflow-trajectory","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true}]},{"name":"selection-check","type":"code-grader","score":0.4,"weight":1,"verdict":"fail","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found."},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false}]},{"name":"efficiency","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (3) within budget (10)","passed":true,"evidence":"Task complexity: complex. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Found search at position 0","passed":true,"evidence":"selection-check: Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found. | efficiency: Task complexity: complex. Evaluated 4 criteria. Score: 0.93"},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool calls (3) within budget (10)","passed":true},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]} +{"timestamp":"2026-02-20T21:44:59.216Z","test_id":"pairwise-demo","dataset":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"pairwise-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]}],"assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"pairwise-quality: Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]} diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index fa7a8316d..50ae1ee4b 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -289,7 +289,7 @@ export async function loadTestsFromJsonl( const testCase: EvalTest = { id, - eval_set: evalSetName, + dataset: evalSetName, conversation_id: conversationId, question: question, input: inputMessages, diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index b116b0d60..cdd402099 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -821,7 +821,7 @@ export async function runEvaluation( const budgetResult: EvaluationResult = { timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, - eval_set: evalCase.eval_set, + dataset: evalCase.dataset, score: 0, assertions: [], output: [], @@ -860,7 +860,7 @@ export async function runEvaluation( const haltResult: EvaluationResult = { timestamp: (now ?? (() => new Date()))().toISOString(), testId: evalCase.id, - eval_set: evalCase.eval_set, + dataset: evalCase.dataset, score: 0, assertions: [], output: [], @@ -2107,7 +2107,7 @@ async function evaluateCandidate(options: { return { timestamp: completedAt.toISOString(), testId: evalCase.id, - eval_set: evalCase.eval_set, + dataset: evalCase.dataset, conversationId: evalCase.conversation_id, score: score.score, assertions: score.assertions, @@ -2589,7 +2589,7 @@ function buildErrorResult( return { timestamp: timestamp.toISOString(), testId: evalCase.id, - eval_set: evalCase.eval_set, + dataset: evalCase.dataset, conversationId: evalCase.conversation_id, score: 0, assertions: [{ text: `Error: ${message}`, passed: false }], diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 52b49dde9..8f5d718a1 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -771,7 +771,7 @@ export type EvaluatorConfig = */ export interface EvalTest { readonly id: string; - readonly eval_set?: string; + readonly dataset?: string; readonly conversation_id?: string; readonly question: string; readonly input: readonly TestMessage[]; @@ -893,7 +893,7 @@ export type FailOnError = boolean; export interface EvaluationResult { readonly timestamp: string; readonly testId: string; - readonly eval_set?: string; + readonly dataset?: string; readonly conversationId?: string; readonly score: number; readonly assertions: readonly AssertionEntry[]; diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index a0e6e5f4f..d859a2d62 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -349,7 +349,8 @@ const EvalTestSchema = z.object({ workspace: WorkspaceSchema.optional(), metadata: z.record(z.unknown()).optional(), conversation_id: z.string().optional(), - eval_set: z.string().optional(), + dataset: z.string().optional(), + eval_set: z.string().optional(), // deprecated alias for dataset note: z.string().optional(), }); diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 8c2fc2393..ed4d4b83e 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -471,7 +471,7 @@ async function loadTestsFromYaml( const testCase: EvalTest = { id, - eval_set: evalSetName, + dataset: evalSetName, conversation_id: conversationId, question: question, input: inputMessages, diff --git a/packages/core/src/observability/otel-exporter.ts b/packages/core/src/observability/otel-exporter.ts index f789fb75c..de08959ed 100644 --- a/packages/core/src/observability/otel-exporter.ts +++ b/packages/core/src/observability/otel-exporter.ts @@ -182,7 +182,7 @@ export class OtelTraceExporter { // Core attributes rootSpan.setAttribute('agentv.test_id', result.testId); rootSpan.setAttribute('agentv.target', result.target); - if (result.eval_set) rootSpan.setAttribute('agentv.eval_set', result.eval_set); + if (result.dataset) rootSpan.setAttribute('agentv.dataset', result.dataset); rootSpan.setAttribute('agentv.score', result.score); if (captureContent && result.output.length > 0) { const lastMsg = result.output[result.output.length - 1]; @@ -455,7 +455,7 @@ export class OtelStreamingObserver { this.rootSpan.setAttribute('gen_ai.system', 'agentv'); this.rootSpan.setAttribute('agentv.test_id', testId); this.rootSpan.setAttribute('agentv.target', target); - if (evalSet) this.rootSpan.setAttribute('agentv.eval_set', evalSet); + if (evalSet) this.rootSpan.setAttribute('agentv.dataset', evalSet); this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan); } diff --git a/packages/core/test/evaluation/baseline.test.ts b/packages/core/test/evaluation/baseline.test.ts index f2833f04d..bebc0318d 100644 --- a/packages/core/test/evaluation/baseline.test.ts +++ b/packages/core/test/evaluation/baseline.test.ts @@ -6,7 +6,7 @@ function makeFullResult(overrides: Partial = {}): EvaluationRe return { timestamp: '2026-01-01T00:00:00.000Z', testId: 'test-case', - eval_set: 'test-dataset', + dataset: 'test-dataset', conversationId: 'conv-1', score: 0.85, assertions: [ @@ -56,7 +56,7 @@ describe('trimBaselineResult', () => { expect(trimmed.timestamp).toBe(full.timestamp); expect(trimmed.testId).toBe(full.testId); - expect(trimmed.eval_set).toBe(full.eval_set); + expect(trimmed.dataset).toBe(full.dataset); expect(trimmed.conversationId).toBe(full.conversationId); expect(trimmed.score).toBe(full.score); expect(trimmed.assertions).toEqual(full.assertions); diff --git a/packages/core/test/evaluation/code-evaluator-file-backed.test.ts b/packages/core/test/evaluation/code-evaluator-file-backed.test.ts index b5e2ebb90..e5e513471 100644 --- a/packages/core/test/evaluation/code-evaluator-file-backed.test.ts +++ b/packages/core/test/evaluation/code-evaluator-file-backed.test.ts @@ -9,7 +9,7 @@ import type { EvalTest } from '../../src/evaluation/types.js'; const baseTestCase: EvalTest = { id: 'case-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Test question', input: [{ role: 'user', content: 'Test input' }], expected_output: [], diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index dfdef40bc..df7ef94b8 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -59,7 +59,7 @@ class CapturingProvider implements Provider { const baseTestCase: EvalTest = { id: 'case-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Improve the logging implementation', input: [{ role: 'user', content: 'Please add logging' }], expected_output: [], diff --git a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts index afa8c184a..77098574b 100644 --- a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts +++ b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts @@ -12,7 +12,7 @@ import type { EvalTest, EvaluatorConfig } from '../../../src/evaluation/types.js const baseTestCase: EvalTest = { id: 'threshold-test', - eval_set: 'test', + dataset: 'test', question: 'Test question', input: [{ role: 'user', content: 'Test' }], expected_output: [], diff --git a/packages/core/test/evaluation/evaluators/execution-metrics.test.ts b/packages/core/test/evaluation/evaluators/execution-metrics.test.ts index 8c3a85249..9cdeabcdd 100644 --- a/packages/core/test/evaluation/evaluators/execution-metrics.test.ts +++ b/packages/core/test/evaluation/evaluators/execution-metrics.test.ts @@ -7,7 +7,7 @@ import type { EvalTest, ExecutionMetricsEvaluatorConfig } from '../../../src/eva const baseTestCase: EvalTest = { id: 'metrics-test', - eval_set: 'test', + dataset: 'test', question: 'Test question', input: [{ role: 'user', content: 'Test' }], expected_output: [], diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index f6fbad5e0..6b47e2941 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -25,7 +25,7 @@ class CapturingProvider implements Provider { const baseTestCase: EvalTest = { id: 'case-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Original Question Text', input: [{ role: 'user', content: [{ type: 'text', value: 'Input Message' }] }], expected_output: [{ type: 'text', value: 'Expected Output Message' }], diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts index de7c58b06..615f8229b 100644 --- a/packages/core/test/evaluation/execution-metrics.test.ts +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -243,7 +243,7 @@ describe('Execution Metrics', () => { describe('Code Grader Metrics Integration', () => { const baseTestCase: EvalTest = { id: 'metrics-test', - eval_set: 'test', + dataset: 'test', question: 'Test question', input: [{ role: 'user', content: 'Test' }], expected_output: [], diff --git a/packages/core/test/evaluation/execution-status.test.ts b/packages/core/test/evaluation/execution-status.test.ts index 05150da3c..7f2028998 100644 --- a/packages/core/test/evaluation/execution-status.test.ts +++ b/packages/core/test/evaluation/execution-status.test.ts @@ -40,7 +40,7 @@ class FixedResponseProvider implements Provider { const baseTestCase: EvalTest = { id: 'exec-status-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Explain logging improvements', input: [{ role: 'user', content: 'Explain logging improvements' }], expected_output: [], diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts index 04aecd05b..87f75f832 100644 --- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts +++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts @@ -156,7 +156,7 @@ describe('loadTestsFromJsonl', () => { const cases = await loadTestsFromJsonl(jsonlPath, tempDir); expect(cases).toHaveLength(1); - expect(cases[0].eval_set).toBe('my-tests'); + expect(cases[0].dataset).toBe('my-tests'); expect(cases[0].evaluator).toBe('llm-grader'); }); @@ -188,7 +188,7 @@ describe('loadTestsFromJsonl', () => { const cases = await loadTestsFromJsonl(jsonlPath, tempDir); expect(cases).toHaveLength(1); - expect(cases[0].eval_set).toBe('my-dataset'); + expect(cases[0].dataset).toBe('my-dataset'); }); it('supports per-case evaluators override', async () => { @@ -411,7 +411,7 @@ tests: // Core fields should match expect(jsonlCases[0].id).toBe(yamlCases[0].id); expect(jsonlCases[0].criteria).toBe(yamlCases[0].criteria); - expect(jsonlCases[0].eval_set).toBe(yamlCases[0].eval_set); + expect(jsonlCases[0].dataset).toBe(yamlCases[0].dataset); expect(jsonlCases[0].input.length).toBe(yamlCases[0].input.length); expect(jsonlCases[0].input[0].role).toBe(yamlCases[0].input[0].role); expect(jsonlCases[0].input[0].content).toBe(yamlCases[0].input[0].content); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 2e9881808..fed09e1db 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -112,7 +112,7 @@ class CapturingCliProvider implements Provider { const baseTestCase: EvalTest = { id: 'case-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Explain logging improvements', input: [{ role: 'user', content: 'Explain logging improvements' }], expected_output: [], @@ -414,7 +414,7 @@ describe('runTestCase', () => { const result = await runEvalCase({ evalCase: { id: 'multi', - eval_set: 'ds', + dataset: 'ds', question: '', input: [ { role: 'system', content: 'Guide' }, @@ -459,7 +459,7 @@ describe('runTestCase', () => { await runEvalCase({ evalCase: { id: 'single', - eval_set: 'ds', + dataset: 'ds', question: '', input: [{ role: 'user', content: 'Hello' }], expected_output: [], @@ -579,7 +579,7 @@ class TraceProvider implements Provider { describe('runEvalCase trace integration', () => { const traceTestCase: EvalTest = { id: 'trace-case', - eval_set: 'trace-dataset', + dataset: 'trace-dataset', question: 'What is the weather?', input: [{ role: 'user', content: 'What is the weather?' }], expected_output: [], @@ -1583,7 +1583,7 @@ rl.on('close', () => { describe('deterministic assertion evaluators in orchestrator', () => { const assertionTestCase: EvalTest = { id: 'assert-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Test question', input: [{ role: 'user', content: 'Test question' }], expected_output: [], @@ -1760,7 +1760,7 @@ describe('deterministic assertion evaluators in orchestrator', () => { describe('criteria with assert runs only declared evaluators (#452)', () => { const criteriaTestCase: EvalTest = { id: 'no-implicit-grader-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Test question', input: [{ role: 'user', content: 'Test question' }], expected_output: [], @@ -1861,7 +1861,7 @@ describe('criteria with assert runs only declared evaluators (#452)', () => { describe('required gates', () => { const assertionTestCase: EvalTest = { id: 'required-gate-1', - eval_set: 'test-dataset', + dataset: 'test-dataset', question: 'Test question', input: [{ role: 'user', content: 'Test question' }], expected_output: [], diff --git a/packages/core/test/observability/streaming-observer.test.ts b/packages/core/test/observability/streaming-observer.test.ts index 71e5542ce..7acc36dcc 100644 --- a/packages/core/test/observability/streaming-observer.test.ts +++ b/packages/core/test/observability/streaming-observer.test.ts @@ -81,7 +81,7 @@ describe('OtelStreamingObserver', () => { expect(spans[0].name).toBe('agentv.eval'); expect(spans[0].attributes['agentv.test_id']).toBe('test-1'); expect(spans[0].attributes['agentv.target']).toBe('my-target'); - expect(spans[0].attributes['agentv.eval_set']).toBe('my-dataset'); + expect(spans[0].attributes['agentv.dataset']).toBe('my-dataset'); expect(spans[0].attributes['gen_ai.system']).toBe('agentv'); expect(spans[0].ended).toBe(false); }); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 8c55ef1c2..840b9e813 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -6387,6 +6387,9 @@ "conversation_id": { "type": "string" }, + "dataset": { + "type": "string" + }, "eval_set": { "type": "string" }, @@ -12698,6 +12701,9 @@ "conversation_id": { "type": "string" }, + "dataset": { + "type": "string" + }, "eval_set": { "type": "string" },