From 0b89f56811fc9ebf80546f22cd4fbde283cc526c Mon Sep 17 00:00:00 2001 From: System Two Date: Mon, 20 Apr 2026 17:42:45 +0000 Subject: [PATCH] feat(eval): add configurable trials, parallelism, and CI threshold support Extends the eval system to run multiple trials per case with bounded concurrency and deterministic pass-rate aggregation. Adds --trials, --ci, and --threshold CLI flags; trial artifacts are written under {run_dir}/{case_id}/trial-N/ with aggregated.json summaries. Existing single-trial configs continue working without change. --- Cargo.lock | 1 + Cargo.toml | 1 + src/cli/commands/eval/run.rs | 287 +++++++++++++++--- src/cli/commands/eval/score.rs | 84 ++++- src/cli/commands/eval/validate.rs | 6 + src/core/manifest.rs | 17 ++ src/eval/artifacts.rs | 141 ++++++++- src/eval/config.rs | 72 +++++ src/eval/runner.rs | 163 +++++++++- tests/cli/eval_tests.rs | 194 ++++++++++++ ..._cli__snapshot_helpers__eval_run_help.snap | 3 + 11 files changed, 894 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7c0df14..fffd377 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1477,6 +1477,7 @@ dependencies = [ "insta", "multer", "notify", + "num_cpus", "once_cell", "predicates", "regex", diff --git a/Cargo.toml b/Cargo.toml index e8f2878..85d7d86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ uuid = { version = "1.0", default-features = false, features = ["v4", "serde"] } chrono = { version = "0.4", default-features = false, features = ["serde", "clock"] } once_cell = "1.19" regex = { version = "1.10", default-features = false, features = ["unicode-perl"] } +num_cpus = "1.16" # File system and I/O tokio-util = { version = "0.7", features = ["io"] } diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs index cc8112e..246e6b6 100644 --- a/src/cli/commands/eval/run.rs +++ b/src/cli/commands/eval/run.rs @@ -7,7 +7,8 @@ use chrono::Utc; use clap::Args; use fastskill::core::project::resolve_project_file; use 
fastskill::eval::artifacts::{ - allocate_run_dir, write_case_artifacts, write_summary, CaseSummary, SummaryResult, + allocate_run_dir, write_case_trials_summary, write_summary, write_trial_artifacts, CaseStatus, + CaseSummary, CaseTrialsResult, SummaryResult, TrialResult, }; use fastskill::eval::checks::load_checks; use fastskill::eval::config::resolve_eval_config; @@ -16,6 +17,9 @@ use fastskill::eval::suite::load_suite; use fastskill::OutputFormat; use std::env; use std::path::PathBuf; +use std::sync::Arc; +use tokio::sync::Semaphore; +use tokio::task::JoinSet; /// Arguments for `fastskill eval run` #[derive(Debug, Args)] @@ -55,6 +59,18 @@ pub struct RunArgs { /// Do not fail with non-zero exit code on suite failure #[arg(long)] pub no_fail: bool, + + /// Override trials per case from config + #[arg(long)] + pub trials: Option, + + /// Enable CI mode: exit non-zero if suite pass rate below threshold + #[arg(long)] + pub ci: bool, + + /// Override pass threshold (0.0-1.0) + #[arg(long)] + pub threshold: Option, } fn validate_agent_key_for_run(s: &str) -> Result { @@ -71,13 +87,13 @@ fn validate_agent_key_for_run(s: &str) -> Result { /// Execute the `eval run` command using the default aikit-backed runner. pub async fn execute_run(args: RunArgs) -> CliResult<()> { - execute_run_with_runner(args, &AikitEvalRunner).await + execute_run_with_runner(args, Arc::new(AikitEvalRunner)).await } /// Execute `eval run` with an injectable [`EvalRunner`] (tests or future adapters). 
-pub async fn execute_run_with_runner( +pub async fn execute_run_with_runner( args: RunArgs, - runner: &R, + runner: Arc, ) -> CliResult<()> { let format = validate_format_args(&args.format, args.json)?; let use_json = format == OutputFormat::Json; @@ -102,6 +118,22 @@ pub async fn execute_run_with_runner( let eval_config = resolve_eval_config(&resolution.path, &project_root) .map_err(|e| CliError::Config(e.to_string()))?; + let trials_per_case = args.trials.unwrap_or(eval_config.trials_per_case); + if !(1..=1000).contains(&trials_per_case) { + return Err(CliError::Config(format!( + "EVAL_INVALID_TRIALS_CONFIG: trials must be in range [1, 1000], got {}", + trials_per_case + ))); + } + + let pass_threshold = args.threshold.unwrap_or(eval_config.pass_threshold); + if !(0.0..=1.0).contains(&pass_threshold) { + return Err(CliError::Config(format!( + "EVAL_INVALID_THRESHOLD: threshold must be in range [0.0, 1.0], got {}", + pass_threshold + ))); + } + // Check agent availability if eval_config.fail_on_missing_agent && !is_agent_available(&args.agent) { return Err(CliError::Config(format!( @@ -141,6 +173,16 @@ pub async fn execute_run_with_runner( vec![] }; + let total_trial_runs = (suite.cases.len() as u64) * (trials_per_case as u64); + if total_trial_runs >= 100 && !use_json { + eprintln!( + "warning: EVAL_COST_WARNING: running {} case(s) × {} trial(s) = {} total trial runs", + suite.cases.len(), + trials_per_case, + total_trial_runs + ); + } + // Allocate run directory let run_id = Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string(); std::fs::create_dir_all(&args.output_dir).map_err(|e| { @@ -158,17 +200,18 @@ pub async fn execute_run_with_runner( model: args.model.clone(), project_root: project_root.clone(), timeout_seconds: eval_config.timeout_seconds, + pass_threshold, }; if !use_json { eprintln!( - "Running {} eval case(s) with agent '{}'...", + "Running {} eval case(s) with agent '{}' ({} trial(s) per case)...", suite.cases.len(), - args.agent + args.agent, + 
trials_per_case ); } - let mut case_results = Vec::new(); let mut case_summaries = Vec::new(); for case in &suite.cases { @@ -176,61 +219,185 @@ pub async fn execute_run_with_runner( eprintln!(" Running case '{}'...", case.id); } - let (run_output, case_result, trace_jsonl) = - runner.run_case(case, &run_opts, &checks).await; - - // Write artifacts - if let Err(e) = write_case_artifacts( - &run_dir, - &case.id, - &run_output.stdout, - &run_output.stderr, - &trace_jsonl, - &case_result, - ) { + let max_parallel = eval_config + .parallel + .unwrap_or_else(|| num_cpus::get().max(1) as u32) + .max(1) as usize; + let semaphore = Arc::new(Semaphore::new(max_parallel)); + let mut join_set: JoinSet< + CliResult<( + u32, + fastskill::eval::runner::CaseRunOutput, + fastskill::eval::artifacts::CaseResult, + String, + )>, + > = JoinSet::new(); + + for trial_id in 1..=trials_per_case { + let permit = Arc::clone(&semaphore); + let runner = Arc::clone(&runner); + let case_clone = case.clone(); + let opts_clone = run_opts.clone(); + let checks_vec = checks.clone(); + + join_set.spawn(async move { + let Ok(_permit) = permit.acquire().await else { + return Err(CliError::Config( + "EVAL_PARALLEL_EXHAUSTION: semaphore closed".to_string(), + )); + }; + let (out, res, trace) = + runner.run_case(&case_clone, &opts_clone, &checks_vec).await; + Ok((trial_id, out, res, trace)) + }); + } + + let mut trials: Vec = Vec::with_capacity(trials_per_case as usize); + let mut pass_count: u32 = 0; + let mut command_count_sum: usize = 0; + let mut input_tokens_sum: u64 = 0; + let mut output_tokens_sum: u64 = 0; + let mut saw_any_command_count = false; + let mut saw_any_input_tokens = false; + let mut saw_any_output_tokens = false; + + while let Some(joined) = join_set.join_next().await { + let (trial_id, out, case_result, trace_jsonl) = joined.map_err(|e| { + CliError::Config(format!( + "EVAL_PARALLEL_EXHAUSTION: trial task failed: {}", + e + )) + })??; + + let trial = TrialResult { + trial_id, + 
status: case_result.status.clone(), + command_count: case_result.command_count, + input_tokens: case_result.input_tokens, + output_tokens: case_result.output_tokens, + check_results: case_result.check_results.clone(), + error_message: case_result.error_message.clone(), + }; + + if trial.status == CaseStatus::Passed { + pass_count += 1; + } + if let Some(cc) = trial.command_count { + saw_any_command_count = true; + command_count_sum = command_count_sum.saturating_add(cc); + } + if let Some(it) = trial.input_tokens { + saw_any_input_tokens = true; + input_tokens_sum = input_tokens_sum.saturating_add(it); + } + if let Some(ot) = trial.output_tokens { + saw_any_output_tokens = true; + output_tokens_sum = output_tokens_sum.saturating_add(ot); + } + + // Write trial artifacts immediately (keeps memory bounded). + if let Err(e) = write_trial_artifacts( + &run_dir, + &case.id, + trial_id, + &out.stdout, + &out.stderr, + &trace_jsonl, + &trial, + ) { + if !use_json { + eprintln!( + " warning: failed to write artifacts for case '{}' trial {}: {}", + case.id, trial_id, e + ); + } + } + + trials.push(trial); + } + + trials.sort_by_key(|t| t.trial_id); + let total_trials = trials_per_case; + let pass_rate = pass_count as f64 / total_trials as f64; + let aggregated_status = if pass_rate >= pass_threshold { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + let aggregated = CaseTrialsResult { + id: case.id.clone(), + trials: trials.clone(), + aggregated_status: aggregated_status.clone(), + pass_count, + total_trials, + pass_rate, + }; + + if let Err(e) = write_case_trials_summary(&run_dir, &case.id, &aggregated) { if !use_json { eprintln!( - " warning: failed to write artifacts for case '{}': {}", + " warning: failed to write aggregated summary for case '{}': {}", case.id, e ); } } - let summary_entry = CaseSummary { - id: case_result.id.clone(), - status: case_result.status.clone(), - command_count: case_result.command_count, - input_tokens: case_result.input_tokens, 
- output_tokens: case_result.output_tokens, - }; - - case_summaries.push(summary_entry); - case_results.push(case_result); + case_summaries.push(CaseSummary { + id: case.id.clone(), + status: aggregated_status, + command_count: if saw_any_command_count { + Some(command_count_sum) + } else { + None + }, + input_tokens: if saw_any_input_tokens { + Some(input_tokens_sum) + } else { + None + }, + output_tokens: if saw_any_output_tokens { + Some(output_tokens_sum) + } else { + None + }, + pass_count: Some(pass_count), + total_trials: Some(total_trials), + pass_rate: Some(pass_rate), + trials, + }); } - let passed = case_results + let passed = case_summaries .iter() .filter(|r| r.status == fastskill::eval::artifacts::CaseStatus::Passed) .count(); - let failed = case_results.len() - passed; - let suite_pass = failed == 0; + let failed = case_summaries.len() - passed; + let suite_pass_rate = if case_summaries.is_empty() { + 0.0 + } else { + passed as f64 / case_summaries.len() as f64 + }; + let suite_pass = if args.ci { + suite_pass_rate >= pass_threshold + } else { + failed == 0 + }; let summary = SummaryResult { suite_pass, + suite_pass_rate: Some(suite_pass_rate), agent: args.agent.clone(), model: args.model.clone(), - total_cases: case_results.len(), + total_cases: case_summaries.len(), passed, failed, + trials_per_case: Some(trials_per_case), + parallel: eval_config.parallel, + pass_threshold: Some(pass_threshold), run_dir: run_dir.clone(), - checks_path: eval_config.checks_path.map(|p| { - if p.is_absolute() { - p - } else { - project_root.join(p) - } - }), - skill_project_root: project_root, + checks_path: eval_config.checks_path.clone(), + skill_project_root: project_root.clone(), cases: case_summaries, }; @@ -249,22 +416,42 @@ pub async fn execute_run_with_runner( } else { println!( "\nEval run complete: {}/{} passed", - passed, - case_results.len() + passed, summary.total_cases ); println!(" run_dir: {}", run_dir.display()); if suite_pass { - println!(" result: 
PASSED"); + if args.ci { + println!( + " result: PASSED (suite pass rate {:.0}% ≥ {:.0}% threshold)", + suite_pass_rate * 100.0, + pass_threshold * 100.0 + ); + } else { + println!(" result: PASSED"); + } } else { - println!(" result: FAILED ({} case(s) failed)", failed); + if args.ci { + println!( + " result: FAILED (suite pass rate {:.0}% < {:.0}% threshold)", + suite_pass_rate * 100.0, + pass_threshold * 100.0 + ); + } else { + println!(" result: FAILED ({} case(s) failed)", failed); + } } } - if !suite_pass && !args.no_fail { + let should_fail = if args.ci { + suite_pass_rate < pass_threshold + } else { + !suite_pass + }; + + if should_fail && !args.no_fail { return Err(CliError::Config(format!( - "Eval suite failed: {}/{} cases passed", - passed, - case_results.len() + "Eval suite failed: {}/{} cases passed (threshold={})", + passed, summary.total_cases, pass_threshold ))); } diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs index 6d9c84d..6a6cee5 100644 --- a/src/cli/commands/eval/score.rs +++ b/src/cli/commands/eval/score.rs @@ -3,7 +3,7 @@ use crate::cli::commands::common::validate_format_args; use crate::cli::error::{CliError, CliResult}; use clap::Args; -use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus}; +use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus, TrialResult}; use fastskill::eval::checks::load_checks; use fastskill::OutputFormat; use std::path::PathBuf; @@ -74,6 +74,7 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { let mut new_failed = 0; let mut updated_cases = summary.cases.clone(); + let pass_threshold = summary.pass_threshold.unwrap_or(1.0); for case_summary in &mut updated_cases { let case_dir = args.run_dir.join(&case_summary.id); @@ -81,27 +82,77 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { continue; } - let stdout_path = case_dir.join("stdout.txt"); - let trace_path = case_dir.join("trace.jsonl"); + let mut trial_dirs: 
Vec<(u32, PathBuf)> = Vec::new(); + if let Ok(entries) = std::fs::read_dir(&case_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + if let Some(name) = path.file_name().and_then(|s| s.to_str()) { + if let Some(suffix) = name.strip_prefix("trial-") { + if let Ok(id) = suffix.parse::() { + trial_dirs.push((id, path)); + } + } + } + } + } + } + trial_dirs.sort_by_key(|(id, _)| *id); - let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default(); - let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default(); + // Legacy fallback: treat case root as a single trial. + if trial_dirs.is_empty() { + trial_dirs.push((1, case_dir.clone())); + } - let check_results = fastskill::eval::checks::run_checks( - &checks, - &stdout_content, - &trace_jsonl, - &summary.skill_project_root, - ); + let mut trials: Vec = Vec::with_capacity(trial_dirs.len()); + for (trial_id, tdir) in &trial_dirs { + let stdout_path = tdir.join("stdout.txt"); + let trace_path = tdir.join("trace.jsonl"); + + let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default(); + let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default(); + + let check_results = fastskill::eval::checks::run_checks( + &checks, + &stdout_content, + &trace_jsonl, + &summary.skill_project_root, + ); + let all_passed = check_results.iter().all(|r| r.passed); + + trials.push(TrialResult { + trial_id: *trial_id, + status: if all_passed { + CaseStatus::Passed + } else { + CaseStatus::Failed + }, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results, + error_message: None, + }); + } - let all_passed = check_results.iter().all(|r| r.passed); - case_summary.status = if all_passed { + let pass_count = trials + .iter() + .filter(|t| t.status == CaseStatus::Passed) + .count() as u32; + let total_trials = trials.len().max(1) as u32; + let pass_rate = pass_count as f64 / total_trials as f64; + + case_summary.trials 
= trials; + case_summary.pass_count = Some(pass_count); + case_summary.total_trials = Some(total_trials); + case_summary.pass_rate = Some(pass_rate); + case_summary.status = if pass_rate >= pass_threshold { CaseStatus::Passed } else { CaseStatus::Failed }; - if all_passed { + if case_summary.status == CaseStatus::Passed { new_passed += 1; } else { new_failed += 1; @@ -110,6 +161,11 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { summary.passed = new_passed; summary.failed = new_failed; + summary.suite_pass_rate = if summary.total_cases == 0 { + Some(0.0) + } else { + Some(new_passed as f64 / summary.total_cases as f64) + }; summary.suite_pass = new_failed == 0; summary.cases = updated_cases; diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs index d46d743..60bf439 100644 --- a/src/cli/commands/eval/validate.rs +++ b/src/cli/commands/eval/validate.rs @@ -109,6 +109,9 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { "prompts_path": eval_config.prompts_path, "checks_path": eval_config.checks_path, "timeout_seconds": eval_config.timeout_seconds, + "trials_per_case": eval_config.trials_per_case, + "parallel": eval_config.parallel, + "pass_threshold": eval_config.pass_threshold, "fail_on_missing_agent": eval_config.fail_on_missing_agent, "project_root": eval_config.project_root, "case_count": case_count, @@ -127,6 +130,9 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { println!(" check count: {}", check_count); } println!(" timeout: {}s", eval_config.timeout_seconds); + println!(" trials_per_case: {}", eval_config.trials_per_case); + println!(" parallel: {}", eval_config.parallel.unwrap_or(0)); + println!(" pass_threshold: {}", eval_config.pass_threshold); println!( " fail_on_missing_agent: {}", eval_config.fail_on_missing_agent diff --git a/src/core/manifest.rs b/src/core/manifest.rs index 5012001..eba5a40 100644 --- a/src/core/manifest.rs +++ b/src/core/manifest.rs @@ -289,6 
+289,15 @@ pub struct EvalConfigToml { /// Timeout in seconds for each eval case execution #[serde(default = "default_eval_timeout_seconds")] pub timeout_seconds: u64, + /// Trials per case (default: 1) + #[serde(default = "default_trials_per_case")] + pub trials_per_case: u32, + /// Optional maximum parallelism for trials within one case (default: CPU cores) + #[serde(default)] + pub parallel: Option, + /// Pass threshold for trial aggregation (0.0-1.0, default: 1.0) + #[serde(default = "default_pass_threshold")] + pub pass_threshold: f64, /// When true, `eval run` / `eval validate --agent` fail fast if the agent CLI is not available #[serde(default = "default_fail_on_missing_agent")] pub fail_on_missing_agent: bool, @@ -298,6 +307,14 @@ fn default_eval_timeout_seconds() -> u64 { 900 } +fn default_trials_per_case() -> u32 { + 1 +} + +fn default_pass_threshold() -> f64 { + 1.0 +} + fn default_fail_on_missing_agent() -> bool { true } diff --git a/src/eval/artifacts.rs b/src/eval/artifacts.rs index b11a5b1..74cd1de 100644 --- a/src/eval/artifacts.rs +++ b/src/eval/artifacts.rs @@ -39,15 +39,47 @@ pub struct CaseResult { pub error_message: Option, } +/// Per-trial result for a case +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TrialResult { + pub trial_id: u32, + pub status: CaseStatus, + pub command_count: Option, + pub input_tokens: Option, + pub output_tokens: Option, + #[serde(default)] + pub check_results: Vec, + pub error_message: Option, +} + +/// Aggregated results for a case across multiple trials +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CaseTrialsResult { + pub id: String, + pub trials: Vec, + pub aggregated_status: CaseStatus, + pub pass_count: u32, + pub total_trials: u32, + pub pass_rate: f64, +} + /// Aggregated run summary #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SummaryResult { pub suite_pass: bool, + #[serde(default)] + pub suite_pass_rate: Option, pub agent: String, pub model: Option, pub 
total_cases: usize, pub passed: usize, pub failed: usize, + #[serde(default)] + pub trials_per_case: Option, + #[serde(default)] + pub parallel: Option, + #[serde(default)] + pub pass_threshold: Option, pub run_dir: PathBuf, pub checks_path: Option, pub skill_project_root: PathBuf, @@ -62,6 +94,14 @@ pub struct CaseSummary { pub command_count: Option, pub input_tokens: Option, pub output_tokens: Option, + #[serde(default)] + pub pass_count: Option, + #[serde(default)] + pub total_trials: Option, + #[serde(default)] + pub pass_rate: Option, + #[serde(default)] + pub trials: Vec, } /// All artifacts from a completed run @@ -106,26 +146,88 @@ pub fn allocate_run_dir(output_dir: &Path, run_id: &str) -> Result Result { + let trial_dir = run_dir.join(case_id).join(format!("trial-{}", trial_id)); + std::fs::create_dir_all(&trial_dir)?; + + std::fs::write(trial_dir.join("stdout.txt"), stdout)?; + std::fs::write(trial_dir.join("stderr.txt"), stderr)?; + std::fs::write(trial_dir.join("trace.jsonl"), trace_jsonl)?; + + let result_json = serde_json::to_string_pretty(result)?; + std::fs::write(trial_dir.join("result.json"), result_json)?; + + Ok(trial_dir) +} + +/// Write `{run_dir}/{case_id}/aggregated.json` +pub fn write_case_trials_summary( + run_dir: &Path, + case_id: &str, + trials_result: &CaseTrialsResult, +) -> Result<(), ArtifactsError> { let case_dir = run_dir.join(case_id); std::fs::create_dir_all(&case_dir)?; + let aggregated_json = serde_json::to_string_pretty(trials_result)?; + std::fs::write(case_dir.join("aggregated.json"), aggregated_json)?; + Ok(()) +} + +fn case_result_to_trial(case: &CaseResult, trial_id: u32) -> TrialResult { + TrialResult { + trial_id, + status: case.status.clone(), + command_count: case.command_count, + input_tokens: case.input_tokens, + output_tokens: case.output_tokens, + check_results: case.check_results.clone(), + error_message: case.error_message.clone(), + } +} - std::fs::write(case_dir.join("stdout.txt"), stdout)?; - 
std::fs::write(case_dir.join("stderr.txt"), stderr)?; - std::fs::write(case_dir.join("trace.jsonl"), trace_jsonl)?; +/// Write per-case artifacts for backwards-compatible callers. +/// +/// Artifacts are written as trial 1 under `{run_dir}/{case_id}/trial-1/`, and an +/// aggregated `{run_dir}/{case_id}/aggregated.json` is also created. +pub fn write_case_artifacts( + run_dir: &Path, + case_id: &str, + stdout: &[u8], + stderr: &[u8], + trace_jsonl: &str, + result: &CaseResult, +) -> Result { + let trial = case_result_to_trial(result, 1); + let trial_dir = + write_trial_artifacts(run_dir, case_id, 1, stdout, stderr, trace_jsonl, &trial)?; - let result_json = serde_json::to_string_pretty(result)?; - std::fs::write(case_dir.join("result.json"), result_json)?; + let pass_count = if result.status == CaseStatus::Passed { + 1 + } else { + 0 + }; + let pass_rate = pass_count as f64; + let aggregated = CaseTrialsResult { + id: result.id.clone(), + trials: vec![trial], + aggregated_status: result.status.clone(), + pass_count, + total_trials: 1, + pass_rate, + }; + write_case_trials_summary(run_dir, case_id, &aggregated)?; - Ok(case_dir) + Ok(trial_dir) } /// Write summary.json @@ -151,6 +253,23 @@ pub fn read_case_results(run_dir: &Path) -> Result, ArtifactsErr for entry in entries.flatten() { let path = entry.path(); if path.is_dir() { + let aggregated_path = path.join("aggregated.json"); + if aggregated_path.exists() { + let content = std::fs::read_to_string(&aggregated_path)?; + let aggregated: CaseTrialsResult = serde_json::from_str(&content)?; + results.push(CaseResult { + id: aggregated.id, + status: aggregated.aggregated_status, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: None, + }); + continue; + } + + // Legacy layout fallback: `{case_id}/result.json` let result_path = path.join("result.json"); if result_path.exists() { let content = std::fs::read_to_string(&result_path)?; @@ -190,11 +309,15 @@ mod 
tests { let dir = TempDir::new().unwrap(); let summary = SummaryResult { suite_pass: true, + suite_pass_rate: Some(1.0), agent: "codex".to_string(), model: None, total_cases: 2, passed: 2, failed: 0, + trials_per_case: Some(1), + parallel: None, + pass_threshold: Some(1.0), run_dir: dir.path().to_path_buf(), checks_path: None, skill_project_root: dir.path().to_path_buf(), diff --git a/src/eval/config.rs b/src/eval/config.rs index ecb6e5b..eaa4f18 100644 --- a/src/eval/config.rs +++ b/src/eval/config.rs @@ -13,6 +13,12 @@ pub struct EvalConfig { pub checks_path: Option, /// Timeout in seconds for each case pub timeout_seconds: u64, + /// Trials per case (>= 1) + pub trials_per_case: u32, + /// Optional maximum parallelism for trials within one case + pub parallel: Option, + /// Pass threshold for trial aggregation (0.0-1.0) + pub pass_threshold: f64, /// Whether to fail fast if agent is not available pub fail_on_missing_agent: bool, /// Skill project root directory @@ -26,6 +32,10 @@ pub enum EvalConfigError { ConfigMissing, #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV not found: {0}")] PromptsNotFound(PathBuf), + #[error("EVAL_INVALID_TRIALS_CONFIG: trials_per_case must be in range [1, 1000], got {0}")] + InvalidTrialsConfig(u32), + #[error("EVAL_INVALID_THRESHOLD: pass_threshold must be in range [0.0, 1.0], got {0}")] + InvalidPassThreshold(f64), #[error("Failed to read skill-project.toml: {0}")] Io(#[from] std::io::Error), #[error("Failed to parse skill-project.toml: {0}")] @@ -56,6 +66,13 @@ pub fn resolve_from_toml( config: &EvalConfigToml, project_root: &Path, ) -> Result { + if config.trials_per_case < 1 || config.trials_per_case > 1000 { + return Err(EvalConfigError::InvalidTrialsConfig(config.trials_per_case)); + } + if !(0.0..=1.0).contains(&config.pass_threshold) { + return Err(EvalConfigError::InvalidPassThreshold(config.pass_threshold)); + } + let prompts_path = if config.prompts.is_absolute() { config.prompts.clone() } else { @@ -78,6 +95,9 @@ pub fn 
resolve_from_toml( prompts_path, checks_path, timeout_seconds: config.timeout_seconds, + trials_per_case: config.trials_per_case, + parallel: config.parallel, + pass_threshold: config.pass_threshold, fail_on_missing_agent: config.fail_on_missing_agent, project_root: project_root.to_path_buf(), }) @@ -112,6 +132,56 @@ mod tests { assert!(matches!(result, Err(EvalConfigError::PromptsNotFound(_)))); } + #[test] + fn test_resolve_eval_config_rejects_invalid_trials_per_case() { + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + std::fs::create_dir_all(&evals_dir).unwrap(); + std::fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger\ntest-1,hello,true\n", + ) + .unwrap(); + + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntrials_per_case = 0\ntimeout_seconds = 600\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!( + result, + Err(EvalConfigError::InvalidTrialsConfig(0)) + )); + } + + #[test] + fn test_resolve_eval_config_rejects_invalid_pass_threshold() { + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + std::fs::create_dir_all(&evals_dir).unwrap(); + std::fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger\ntest-1,hello,true\n", + ) + .unwrap(); + + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\npass_threshold = 1.5\ntimeout_seconds = 600\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!( + result, + Err(EvalConfigError::InvalidPassThreshold(_)) + )); + } + #[test] fn test_resolve_eval_config_success() { let dir = 
TempDir::new().unwrap(); @@ -135,6 +205,8 @@ mod tests { assert!(result.is_ok()); let config = result.unwrap(); assert_eq!(config.timeout_seconds, 600); + assert_eq!(config.trials_per_case, 1); + assert_eq!(config.pass_threshold, 1.0); assert!(!config.fail_on_missing_agent); } } diff --git a/src/eval/runner.rs b/src/eval/runner.rs index c5495c4..cf902af 100644 --- a/src/eval/runner.rs +++ b/src/eval/runner.rs @@ -1,14 +1,17 @@ //! Eval runner implementation using aikit-sdk -use crate::eval::artifacts::{CaseResult, CaseStatus}; +use crate::eval::artifacts::{CaseResult, CaseStatus, CaseTrialsResult, TrialResult}; use crate::eval::checks::{count_raw_json_events, run_checks, CheckDefinition}; use crate::eval::suite::EvalCase; use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload}; use aikit_sdk::{run_agent_events, AgentEvent, RunOptions}; use async_trait::async_trait; use std::path::PathBuf; +use std::sync::Arc; use std::time::Duration; use thiserror::Error; +use tokio::sync::Semaphore; +use tokio::task::JoinSet; /// Options for running a single eval case #[derive(Debug, Clone)] @@ -21,6 +24,8 @@ pub struct CaseRunOptions { pub project_root: PathBuf, /// Timeout in seconds pub timeout_seconds: u64, + /// Per-case trial aggregation pass threshold (0.0-1.0) + pub pass_threshold: f64, } /// Raw output from running a case @@ -53,6 +58,16 @@ pub trait EvalRunner: Send + Sync { opts: &CaseRunOptions, checks: &[CheckDefinition], ) -> (CaseRunOutput, CaseResult, String); + + /// Run multiple trials for one case, returning the aggregated result. + async fn run_case_trials( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + trial_count: u32, + max_parallelism: Option, + ) -> CaseTrialsResult; } /// Default runner: `aikit_sdk::run_agent_events` inside `spawn_blocking` with SDK timeout/cwd. 
@@ -75,6 +90,99 @@ impl EvalRunner for AikitEvalRunner { ) -> (CaseRunOutput, CaseResult, String) { self.run_case_inner(case, opts, checks).await } + + async fn run_case_trials( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + trial_count: u32, + max_parallelism: Option<u32>, + ) -> CaseTrialsResult { + let max_parallel = max_parallelism + .unwrap_or_else(|| num_cpus::get().max(1) as u32) + .max(1) as usize; + let semaphore = Arc::new(Semaphore::new(max_parallel)); + let mut join_set: JoinSet<TrialResult> = JoinSet::new(); + + for trial_id in 1..=trial_count { + let permit = Arc::clone(&semaphore); + let case_clone = case.clone(); + let opts_clone = opts.clone(); + let checks_vec = checks.to_vec(); + let runner = *self; + + join_set.spawn(async move { + let Ok(_permit) = permit.acquire().await else { + return TrialResult { + trial_id, + status: CaseStatus::Error, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: Some( + "EVAL_PARALLEL_EXHAUSTION: semaphore closed".to_string(), + ), + }; + }; + let (_output, case_result, _trace) = runner + .run_case_inner(&case_clone, &opts_clone, &checks_vec) + .await; + TrialResult { + trial_id, + status: case_result.status, + command_count: case_result.command_count, + input_tokens: case_result.input_tokens, + output_tokens: case_result.output_tokens, + check_results: case_result.check_results, + error_message: case_result.error_message, + } + }); + } + + let mut trials = Vec::with_capacity(trial_count as usize); + while let Some(res) = join_set.join_next().await { + match res { + Ok(trial) => trials.push(trial), + Err(e) => { + // Join errors are treated as failed trials. 
+ let next_id = (trials.len() as u32) + 1; + trials.push(TrialResult { + trial_id: next_id, + status: CaseStatus::Error, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: Some(format!("EVAL_PARALLEL_EXHAUSTION: {}", e)), + }); + } + } + } + + trials.sort_by_key(|t| t.trial_id); + let pass_count = trials + .iter() + .filter(|t| t.status == CaseStatus::Passed) + .count() as u32; + let total_trials = trial_count.max(1); + let pass_rate = pass_count as f64 / total_trials as f64; + let aggregated_status = if pass_rate >= opts.pass_threshold { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + CaseTrialsResult { + id: case.id.clone(), + trials, + aggregated_status, + pass_count, + total_trials, + pass_rate, + } + } } impl AikitEvalRunner { @@ -202,7 +310,14 @@ impl AikitEvalRunner { input_tokens: None, output_tokens: None, check_results, - error_message: None, + error_message: if run_output.timed_out { + Some(format!( + "EVAL_CASE_TIMEOUT: Case timed out after {}s", + timeout_secs + )) + } else { + None + }, }; (run_output, case_result, trace_jsonl) @@ -255,6 +370,48 @@ mod tests { }; (out, result, trace_jsonl) } + + async fn run_case_trials( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + trial_count: u32, + _max_parallelism: Option<u32>, + ) -> CaseTrialsResult { + let mut trials = Vec::new(); + for trial_id in 1..=trial_count { + let (_out, result, _trace) = self.run_case(case, opts, checks).await; + trials.push(TrialResult { + trial_id, + status: result.status, + command_count: result.command_count, + input_tokens: result.input_tokens, + output_tokens: result.output_tokens, + check_results: result.check_results, + error_message: result.error_message, + }); + } + let pass_count = trials + .iter() + .filter(|t| t.status == CaseStatus::Passed) + .count() as u32; + let total_trials = trial_count.max(1); + let pass_rate = pass_count as f64 / total_trials as f64; + let 
aggregated_status = if pass_rate >= opts.pass_threshold { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + CaseTrialsResult { + id: case.id.clone(), + trials, + aggregated_status, + pass_count, + total_trials, + pass_rate, + } + } } #[tokio::test] @@ -271,6 +428,7 @@ mod tests { model: None, project_root: PathBuf::from("/tmp"), timeout_seconds: 1, + pass_threshold: 1.0, }; let runner = StubEvalRunner; let (out, res, trace) = runner.run_case(&case, &opts, &[]).await; @@ -286,6 +444,7 @@ mod tests { model: Some("gpt-4".to_string()), project_root: PathBuf::from("/tmp"), timeout_seconds: 300, + pass_threshold: 1.0, }; assert_eq!(opts.agent_key, "codex"); assert_eq!(opts.model, Some("gpt-4".to_string())); diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs index 4ecea72..4ffe5ef 100644 --- a/tests/cli/eval_tests.rs +++ b/tests/cli/eval_tests.rs @@ -403,6 +403,7 @@ fn test_eval_run_persists_event_trace_jsonl() { let run_dir = summary["run_dir"].as_str().unwrap(); let trace_path = std::path::Path::new(run_dir) .join("trace-case") + .join("trial-1") .join("trace.jsonl"); let trace_jsonl = fs::read_to_string(&trace_path).unwrap(); @@ -414,8 +415,201 @@ fn test_eval_run_persists_event_trace_jsonl() { let result_path = std::path::Path::new(run_dir) .join("trace-case") + .join("trial-1") .join("result.json"); let case_result: Value = serde_json::from_str(&fs::read_to_string(result_path).unwrap()).unwrap(); assert_eq!(case_result["command_count"], 1); } + +#[test] +fn test_eval_run_trials_threshold_and_ci_exit_semantics() { + use serde_json::Value; + use std::env; + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\ntrial-case,\"test prompt\",true,\"basic\",\n", + ) + .unwrap(); + fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap(); + 
fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + // Fake agent that passes the first 3 invocations, then fails. + let bin_dir = dir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let agent_path = bin_dir.join("agent"); + fs::write( + &agent_path, + "#!/usr/bin/env bash\nset -euo pipefail\nif [[ \"${1:-}\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\nstate_dir=\"${FASTSKILL_TEST_STATE_DIR:?}\"\nmkdir -p \"$state_dir\"\nlock=\"$state_dir/lock\"\ncount_file=\"$state_dir/count\"\nexec 9>\"$lock\"\nflock 9\ncount=0\nif [[ -f \"$count_file\" ]]; then count=$(cat \"$count_file\" || echo 0); fi\ncount=$((count+1))\necho \"$count\" > \"$count_file\"\nflock -u 9\n# Emit a raw_json line so trace persists and command_count=1.\necho '{\"event\":\"ok\"}'\nif [[ $count -le 3 ]]; then exit 0; else exit 1; fi\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&agent_path).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&agent_path, perms).unwrap(); + } + + let output_dir = dir.path().join("out"); + let path = env::var("PATH").unwrap_or_default(); + let merged_path = format!("{}:{}", bin_dir.display(), path); + let state_dir = dir.path().join("state"); + let env_vars = vec![ + ("PATH", merged_path.as_str()), + ("FASTSKILL_TEST_STATE_DIR", state_dir.to_str().unwrap()), + ]; + + // Threshold 0.6 should pass for 3/5. 
+ let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "trial-case", + "--trials", + "5", + "--threshold", + "0.6", + "--ci", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + assert!( + result.success, + "Expected eval run to succeed in CI mode at threshold=0.6, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + let json_start = result.stdout.find('{').unwrap(); + let summary: Value = serde_json::from_str(&result.stdout[json_start..]).unwrap(); + assert_eq!(summary["cases"][0]["id"], "trial-case"); + assert_eq!(summary["cases"][0]["status"], "passed"); + assert_eq!(summary["cases"][0]["pass_count"], 3); + assert_eq!(summary["cases"][0]["total_trials"], 5); + + // Reset state and require 100% suite pass rate should fail in --ci mode. + fs::remove_file(state_dir.join("count")).ok(); + let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "trial-case", + "--trials", + "5", + "--threshold", + "1.0", + "--ci", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + assert!( + !result.success, + "Expected eval run to fail in CI mode at threshold=1.0" + ); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("threshold") || combined.contains("Eval suite failed"), + "Expected threshold-related failure, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_parallelism_reduces_wall_time() { + use std::env; + use std::fs; + use std::time::Instant; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\nsleep-case,\"test prompt\",true,\"basic\",\n", + ) + .unwrap(); + fs::write(dir.path().join("SKILL.md"), "# Test 
Skill\n").unwrap(); + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nparallel = 4\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + let bin_dir = dir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let agent_path = bin_dir.join("agent"); + fs::write( + &agent_path, + "#!/usr/bin/env bash\nset -euo pipefail\nif [[ \"${1:-}\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\nsleep 0.5\necho '{\"event\":\"ok\"}'\nexit 0\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&agent_path).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&agent_path, perms).unwrap(); + } + + let output_dir = dir.path().join("out"); + let path = env::var("PATH").unwrap_or_default(); + let merged_path = format!("{}:{}", bin_dir.display(), path); + let env_vars = vec![("PATH", merged_path.as_str())]; + + let start = Instant::now(); + let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "sleep-case", + "--trials", + "4", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + let elapsed = start.elapsed(); + assert!( + result.success, + "Expected eval run to succeed, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + + // If trials executed sequentially with sleep(0.5), 4 trials would take ~2s. + // With parallel=4, it should be comfortably below that. 
+ assert!( + elapsed.as_secs_f64() < 1.6, + "Expected parallel trials to complete faster; elapsed={:?}", + elapsed + ); +} diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap index 0e1c47b..2089e92 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap @@ -15,6 +15,9 @@ Options: --format <FORMAT> Output format: table, json, grid, xml --json Shorthand for --format json --no-fail Do not fail with non-zero exit code on suite failure + --trials <TRIALS> Override trials per case from config + --ci Enable CI mode: exit non-zero if suite pass rate below threshold + --threshold <THRESHOLD> Override pass threshold (0.0-1.0) -h, --help Print help Examples: