From 0b89f56811fc9ebf80546f22cd4fbde283cc526c Mon Sep 17 00:00:00 2001 From: System Two Date: Mon, 20 Apr 2026 17:42:45 +0000 Subject: [PATCH] feat(eval): add configurable trials, parallelism, and CI threshold support Extends the eval system to run multiple trials per case with bounded concurrency and deterministic pass-rate aggregation. Adds --trials, --ci, and --threshold CLI flags; trial artifacts are written under {run_dir}/{case_id}/trial-N/ with aggregated.json summaries. Existing single-trial configs continue working without change. --- Cargo.lock | 1 + Cargo.toml | 1 + src/cli/commands/eval/run.rs | 287 +++++++++++++++--- src/cli/commands/eval/score.rs | 84 ++++- src/cli/commands/eval/validate.rs | 6 + src/core/manifest.rs | 17 ++ src/eval/artifacts.rs | 141 ++++++++- src/eval/config.rs | 72 +++++ src/eval/runner.rs | 163 +++++++++- tests/cli/eval_tests.rs | 194 ++++++++++++ ..._cli__snapshot_helpers__eval_run_help.snap | 3 + 11 files changed, 894 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7c0df14..fffd377 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1477,6 +1477,7 @@ dependencies = [ "insta", "multer", "notify", + "num_cpus", "once_cell", "predicates", "regex", diff --git a/Cargo.toml b/Cargo.toml index e8f2878..85d7d86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ uuid = { version = "1.0", default-features = false, features = ["v4", "serde"] } chrono = { version = "0.4", default-features = false, features = ["serde", "clock"] } once_cell = "1.19" regex = { version = "1.10", default-features = false, features = ["unicode-perl"] } +num_cpus = "1.16" # File system and I/O tokio-util = { version = "0.7", features = ["io"] } diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs index cc8112e..246e6b6 100644 --- a/src/cli/commands/eval/run.rs +++ b/src/cli/commands/eval/run.rs @@ -7,7 +7,8 @@ use chrono::Utc; use clap::Args; use fastskill::core::project::resolve_project_file; use 
fastskill::eval::artifacts::{ - allocate_run_dir, write_case_artifacts, write_summary, CaseSummary, SummaryResult, + allocate_run_dir, write_case_trials_summary, write_summary, write_trial_artifacts, CaseStatus, + CaseSummary, CaseTrialsResult, SummaryResult, TrialResult, }; use fastskill::eval::checks::load_checks; use fastskill::eval::config::resolve_eval_config; @@ -16,6 +17,9 @@ use fastskill::eval::suite::load_suite; use fastskill::OutputFormat; use std::env; use std::path::PathBuf; +use std::sync::Arc; +use tokio::sync::Semaphore; +use tokio::task::JoinSet; /// Arguments for `fastskill eval run` #[derive(Debug, Args)] @@ -55,6 +59,18 @@ pub struct RunArgs { /// Do not fail with non-zero exit code on suite failure #[arg(long)] pub no_fail: bool, + + /// Override trials per case from config + #[arg(long)] + pub trials: Option, + + /// Enable CI mode: exit non-zero if suite pass rate below threshold + #[arg(long)] + pub ci: bool, + + /// Override pass threshold (0.0-1.0) + #[arg(long)] + pub threshold: Option, } fn validate_agent_key_for_run(s: &str) -> Result { @@ -71,13 +87,13 @@ fn validate_agent_key_for_run(s: &str) -> Result { /// Execute the `eval run` command using the default aikit-backed runner. pub async fn execute_run(args: RunArgs) -> CliResult<()> { - execute_run_with_runner(args, &AikitEvalRunner).await + execute_run_with_runner(args, Arc::new(AikitEvalRunner)).await } /// Execute `eval run` with an injectable [`EvalRunner`] (tests or future adapters). 
-pub async fn execute_run_with_runner( +pub async fn execute_run_with_runner( args: RunArgs, - runner: &R, + runner: Arc, ) -> CliResult<()> { let format = validate_format_args(&args.format, args.json)?; let use_json = format == OutputFormat::Json; @@ -102,6 +118,22 @@ pub async fn execute_run_with_runner( let eval_config = resolve_eval_config(&resolution.path, &project_root) .map_err(|e| CliError::Config(e.to_string()))?; + let trials_per_case = args.trials.unwrap_or(eval_config.trials_per_case); + if !(1..=1000).contains(&trials_per_case) { + return Err(CliError::Config(format!( + "EVAL_INVALID_TRIALS_CONFIG: trials must be in range [1, 1000], got {}", + trials_per_case + ))); + } + + let pass_threshold = args.threshold.unwrap_or(eval_config.pass_threshold); + if !(0.0..=1.0).contains(&pass_threshold) { + return Err(CliError::Config(format!( + "EVAL_INVALID_THRESHOLD: threshold must be in range [0.0, 1.0], got {}", + pass_threshold + ))); + } + // Check agent availability if eval_config.fail_on_missing_agent && !is_agent_available(&args.agent) { return Err(CliError::Config(format!( @@ -141,6 +173,16 @@ pub async fn execute_run_with_runner( vec![] }; + let total_trial_runs = (suite.cases.len() as u64) * (trials_per_case as u64); + if total_trial_runs >= 100 && !use_json { + eprintln!( + "warning: EVAL_COST_WARNING: running {} case(s) × {} trial(s) = {} total trial runs", + suite.cases.len(), + trials_per_case, + total_trial_runs + ); + } + // Allocate run directory let run_id = Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string(); std::fs::create_dir_all(&args.output_dir).map_err(|e| { @@ -158,17 +200,18 @@ pub async fn execute_run_with_runner( model: args.model.clone(), project_root: project_root.clone(), timeout_seconds: eval_config.timeout_seconds, + pass_threshold, }; if !use_json { eprintln!( - "Running {} eval case(s) with agent '{}'...", + "Running {} eval case(s) with agent '{}' ({} trial(s) per case)...", suite.cases.len(), - args.agent + args.agent, + 
trials_per_case ); } - let mut case_results = Vec::new(); let mut case_summaries = Vec::new(); for case in &suite.cases { @@ -176,61 +219,185 @@ pub async fn execute_run_with_runner( eprintln!(" Running case '{}'...", case.id); } - let (run_output, case_result, trace_jsonl) = - runner.run_case(case, &run_opts, &checks).await; - - // Write artifacts - if let Err(e) = write_case_artifacts( - &run_dir, - &case.id, - &run_output.stdout, - &run_output.stderr, - &trace_jsonl, - &case_result, - ) { + let max_parallel = eval_config + .parallel + .unwrap_or_else(|| num_cpus::get().max(1) as u32) + .max(1) as usize; + let semaphore = Arc::new(Semaphore::new(max_parallel)); + let mut join_set: JoinSet< + CliResult<( + u32, + fastskill::eval::runner::CaseRunOutput, + fastskill::eval::artifacts::CaseResult, + String, + )>, + > = JoinSet::new(); + + for trial_id in 1..=trials_per_case { + let permit = Arc::clone(&semaphore); + let runner = Arc::clone(&runner); + let case_clone = case.clone(); + let opts_clone = run_opts.clone(); + let checks_vec = checks.clone(); + + join_set.spawn(async move { + let Ok(_permit) = permit.acquire().await else { + return Err(CliError::Config( + "EVAL_PARALLEL_EXHAUSTION: semaphore closed".to_string(), + )); + }; + let (out, res, trace) = + runner.run_case(&case_clone, &opts_clone, &checks_vec).await; + Ok((trial_id, out, res, trace)) + }); + } + + let mut trials: Vec = Vec::with_capacity(trials_per_case as usize); + let mut pass_count: u32 = 0; + let mut command_count_sum: usize = 0; + let mut input_tokens_sum: u64 = 0; + let mut output_tokens_sum: u64 = 0; + let mut saw_any_command_count = false; + let mut saw_any_input_tokens = false; + let mut saw_any_output_tokens = false; + + while let Some(joined) = join_set.join_next().await { + let (trial_id, out, case_result, trace_jsonl) = joined.map_err(|e| { + CliError::Config(format!( + "EVAL_PARALLEL_EXHAUSTION: trial task failed: {}", + e + )) + })??; + + let trial = TrialResult { + trial_id, + 
status: case_result.status.clone(), + command_count: case_result.command_count, + input_tokens: case_result.input_tokens, + output_tokens: case_result.output_tokens, + check_results: case_result.check_results.clone(), + error_message: case_result.error_message.clone(), + }; + + if trial.status == CaseStatus::Passed { + pass_count += 1; + } + if let Some(cc) = trial.command_count { + saw_any_command_count = true; + command_count_sum = command_count_sum.saturating_add(cc); + } + if let Some(it) = trial.input_tokens { + saw_any_input_tokens = true; + input_tokens_sum = input_tokens_sum.saturating_add(it); + } + if let Some(ot) = trial.output_tokens { + saw_any_output_tokens = true; + output_tokens_sum = output_tokens_sum.saturating_add(ot); + } + + // Write trial artifacts immediately (keeps memory bounded). + if let Err(e) = write_trial_artifacts( + &run_dir, + &case.id, + trial_id, + &out.stdout, + &out.stderr, + &trace_jsonl, + &trial, + ) { + if !use_json { + eprintln!( + " warning: failed to write artifacts for case '{}' trial {}: {}", + case.id, trial_id, e + ); + } + } + + trials.push(trial); + } + + trials.sort_by_key(|t| t.trial_id); + let total_trials = trials_per_case; + let pass_rate = pass_count as f64 / total_trials as f64; + let aggregated_status = if pass_rate >= pass_threshold { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + let aggregated = CaseTrialsResult { + id: case.id.clone(), + trials: trials.clone(), + aggregated_status: aggregated_status.clone(), + pass_count, + total_trials, + pass_rate, + }; + + if let Err(e) = write_case_trials_summary(&run_dir, &case.id, &aggregated) { if !use_json { eprintln!( - " warning: failed to write artifacts for case '{}': {}", + " warning: failed to write aggregated summary for case '{}': {}", case.id, e ); } } - let summary_entry = CaseSummary { - id: case_result.id.clone(), - status: case_result.status.clone(), - command_count: case_result.command_count, - input_tokens: case_result.input_tokens, 
- output_tokens: case_result.output_tokens, - }; - - case_summaries.push(summary_entry); - case_results.push(case_result); + case_summaries.push(CaseSummary { + id: case.id.clone(), + status: aggregated_status, + command_count: if saw_any_command_count { + Some(command_count_sum) + } else { + None + }, + input_tokens: if saw_any_input_tokens { + Some(input_tokens_sum) + } else { + None + }, + output_tokens: if saw_any_output_tokens { + Some(output_tokens_sum) + } else { + None + }, + pass_count: Some(pass_count), + total_trials: Some(total_trials), + pass_rate: Some(pass_rate), + trials, + }); } - let passed = case_results + let passed = case_summaries .iter() .filter(|r| r.status == fastskill::eval::artifacts::CaseStatus::Passed) .count(); - let failed = case_results.len() - passed; - let suite_pass = failed == 0; + let failed = case_summaries.len() - passed; + let suite_pass_rate = if case_summaries.is_empty() { + 0.0 + } else { + passed as f64 / case_summaries.len() as f64 + }; + let suite_pass = if args.ci { + suite_pass_rate >= pass_threshold + } else { + failed == 0 + }; let summary = SummaryResult { suite_pass, + suite_pass_rate: Some(suite_pass_rate), agent: args.agent.clone(), model: args.model.clone(), - total_cases: case_results.len(), + total_cases: case_summaries.len(), passed, failed, + trials_per_case: Some(trials_per_case), + parallel: eval_config.parallel, + pass_threshold: Some(pass_threshold), run_dir: run_dir.clone(), - checks_path: eval_config.checks_path.map(|p| { - if p.is_absolute() { - p - } else { - project_root.join(p) - } - }), - skill_project_root: project_root, + checks_path: eval_config.checks_path.clone(), + skill_project_root: project_root.clone(), cases: case_summaries, }; @@ -249,22 +416,42 @@ pub async fn execute_run_with_runner( } else { println!( "\nEval run complete: {}/{} passed", - passed, - case_results.len() + passed, summary.total_cases ); println!(" run_dir: {}", run_dir.display()); if suite_pass { - println!(" result: 
PASSED"); + if args.ci { + println!( + " result: PASSED (suite pass rate {:.0}% ≥ {:.0}% threshold)", + suite_pass_rate * 100.0, + pass_threshold * 100.0 + ); + } else { + println!(" result: PASSED"); + } } else { - println!(" result: FAILED ({} case(s) failed)", failed); + if args.ci { + println!( + " result: FAILED (suite pass rate {:.0}% < {:.0}% threshold)", + suite_pass_rate * 100.0, + pass_threshold * 100.0 + ); + } else { + println!(" result: FAILED ({} case(s) failed)", failed); + } } } - if !suite_pass && !args.no_fail { + let should_fail = if args.ci { + suite_pass_rate < pass_threshold + } else { + !suite_pass + }; + + if should_fail && !args.no_fail { return Err(CliError::Config(format!( - "Eval suite failed: {}/{} cases passed", - passed, - case_results.len() + "Eval suite failed: {}/{} cases passed (threshold={})", + passed, summary.total_cases, pass_threshold ))); } diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs index 6d9c84d..6a6cee5 100644 --- a/src/cli/commands/eval/score.rs +++ b/src/cli/commands/eval/score.rs @@ -3,7 +3,7 @@ use crate::cli::commands::common::validate_format_args; use crate::cli::error::{CliError, CliResult}; use clap::Args; -use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus}; +use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus, TrialResult}; use fastskill::eval::checks::load_checks; use fastskill::OutputFormat; use std::path::PathBuf; @@ -74,6 +74,7 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { let mut new_failed = 0; let mut updated_cases = summary.cases.clone(); + let pass_threshold = summary.pass_threshold.unwrap_or(1.0); for case_summary in &mut updated_cases { let case_dir = args.run_dir.join(&case_summary.id); @@ -81,27 +82,77 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { continue; } - let stdout_path = case_dir.join("stdout.txt"); - let trace_path = case_dir.join("trace.jsonl"); + let mut trial_dirs: 
Vec<(u32, PathBuf)> = Vec::new(); + if let Ok(entries) = std::fs::read_dir(&case_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + if let Some(name) = path.file_name().and_then(|s| s.to_str()) { + if let Some(suffix) = name.strip_prefix("trial-") { + if let Ok(id) = suffix.parse::() { + trial_dirs.push((id, path)); + } + } + } + } + } + } + trial_dirs.sort_by_key(|(id, _)| *id); - let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default(); - let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default(); + // Legacy fallback: treat case root as a single trial. + if trial_dirs.is_empty() { + trial_dirs.push((1, case_dir.clone())); + } - let check_results = fastskill::eval::checks::run_checks( - &checks, - &stdout_content, - &trace_jsonl, - &summary.skill_project_root, - ); + let mut trials: Vec = Vec::with_capacity(trial_dirs.len()); + for (trial_id, tdir) in &trial_dirs { + let stdout_path = tdir.join("stdout.txt"); + let trace_path = tdir.join("trace.jsonl"); + + let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default(); + let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default(); + + let check_results = fastskill::eval::checks::run_checks( + &checks, + &stdout_content, + &trace_jsonl, + &summary.skill_project_root, + ); + let all_passed = check_results.iter().all(|r| r.passed); + + trials.push(TrialResult { + trial_id: *trial_id, + status: if all_passed { + CaseStatus::Passed + } else { + CaseStatus::Failed + }, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results, + error_message: None, + }); + } - let all_passed = check_results.iter().all(|r| r.passed); - case_summary.status = if all_passed { + let pass_count = trials + .iter() + .filter(|t| t.status == CaseStatus::Passed) + .count() as u32; + let total_trials = trials.len().max(1) as u32; + let pass_rate = pass_count as f64 / total_trials as f64; + + case_summary.trials 
= trials; + case_summary.pass_count = Some(pass_count); + case_summary.total_trials = Some(total_trials); + case_summary.pass_rate = Some(pass_rate); + case_summary.status = if pass_rate >= pass_threshold { CaseStatus::Passed } else { CaseStatus::Failed }; - if all_passed { + if case_summary.status == CaseStatus::Passed { new_passed += 1; } else { new_failed += 1; @@ -110,6 +161,11 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { summary.passed = new_passed; summary.failed = new_failed; + summary.suite_pass_rate = if summary.total_cases == 0 { + Some(0.0) + } else { + Some(new_passed as f64 / summary.total_cases as f64) + }; summary.suite_pass = new_failed == 0; summary.cases = updated_cases; diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs index d46d743..60bf439 100644 --- a/src/cli/commands/eval/validate.rs +++ b/src/cli/commands/eval/validate.rs @@ -109,6 +109,9 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { "prompts_path": eval_config.prompts_path, "checks_path": eval_config.checks_path, "timeout_seconds": eval_config.timeout_seconds, + "trials_per_case": eval_config.trials_per_case, + "parallel": eval_config.parallel, + "pass_threshold": eval_config.pass_threshold, "fail_on_missing_agent": eval_config.fail_on_missing_agent, "project_root": eval_config.project_root, "case_count": case_count, @@ -127,6 +130,9 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { println!(" check count: {}", check_count); } println!(" timeout: {}s", eval_config.timeout_seconds); + println!(" trials_per_case: {}", eval_config.trials_per_case); + println!(" parallel: {}", eval_config.parallel.unwrap_or(0)); + println!(" pass_threshold: {}", eval_config.pass_threshold); println!( " fail_on_missing_agent: {}", eval_config.fail_on_missing_agent diff --git a/src/core/manifest.rs b/src/core/manifest.rs index 5012001..eba5a40 100644 --- a/src/core/manifest.rs +++ b/src/core/manifest.rs @@ -289,6 
+289,15 @@ pub struct EvalConfigToml { /// Timeout in seconds for each eval case execution #[serde(default = "default_eval_timeout_seconds")] pub timeout_seconds: u64, + /// Trials per case (default: 1) + #[serde(default = "default_trials_per_case")] + pub trials_per_case: u32, + /// Optional maximum parallelism for trials within one case (default: CPU cores) + #[serde(default)] + pub parallel: Option, + /// Pass threshold for trial aggregation (0.0-1.0, default: 1.0) + #[serde(default = "default_pass_threshold")] + pub pass_threshold: f64, /// When true, `eval run` / `eval validate --agent` fail fast if the agent CLI is not available #[serde(default = "default_fail_on_missing_agent")] pub fail_on_missing_agent: bool, @@ -298,6 +307,14 @@ fn default_eval_timeout_seconds() -> u64 { 900 } +fn default_trials_per_case() -> u32 { + 1 +} + +fn default_pass_threshold() -> f64 { + 1.0 +} + fn default_fail_on_missing_agent() -> bool { true } diff --git a/src/eval/artifacts.rs b/src/eval/artifacts.rs index b11a5b1..74cd1de 100644 --- a/src/eval/artifacts.rs +++ b/src/eval/artifacts.rs @@ -39,15 +39,47 @@ pub struct CaseResult { pub error_message: Option, } +/// Per-trial result for a case +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TrialResult { + pub trial_id: u32, + pub status: CaseStatus, + pub command_count: Option, + pub input_tokens: Option, + pub output_tokens: Option, + #[serde(default)] + pub check_results: Vec, + pub error_message: Option, +} + +/// Aggregated results for a case across multiple trials +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CaseTrialsResult { + pub id: String, + pub trials: Vec, + pub aggregated_status: CaseStatus, + pub pass_count: u32, + pub total_trials: u32, + pub pass_rate: f64, +} + /// Aggregated run summary #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SummaryResult { pub suite_pass: bool, + #[serde(default)] + pub suite_pass_rate: Option, pub agent: String, pub model: Option, pub 
total_cases: usize, pub passed: usize, pub failed: usize, + #[serde(default)] + pub trials_per_case: Option, + #[serde(default)] + pub parallel: Option, + #[serde(default)] + pub pass_threshold: Option, pub run_dir: PathBuf, pub checks_path: Option, pub skill_project_root: PathBuf, @@ -62,6 +94,14 @@ pub struct CaseSummary { pub command_count: Option, pub input_tokens: Option, pub output_tokens: Option, + #[serde(default)] + pub pass_count: Option, + #[serde(default)] + pub total_trials: Option, + #[serde(default)] + pub pass_rate: Option, + #[serde(default)] + pub trials: Vec, } /// All artifacts from a completed run @@ -106,26 +146,88 @@ pub fn allocate_run_dir(output_dir: &Path, run_id: &str) -> Result Result { + let trial_dir = run_dir.join(case_id).join(format!("trial-{}", trial_id)); + std::fs::create_dir_all(&trial_dir)?; + + std::fs::write(trial_dir.join("stdout.txt"), stdout)?; + std::fs::write(trial_dir.join("stderr.txt"), stderr)?; + std::fs::write(trial_dir.join("trace.jsonl"), trace_jsonl)?; + + let result_json = serde_json::to_string_pretty(result)?; + std::fs::write(trial_dir.join("result.json"), result_json)?; + + Ok(trial_dir) +} + +/// Write `{run_dir}/{case_id}/aggregated.json` +pub fn write_case_trials_summary( + run_dir: &Path, + case_id: &str, + trials_result: &CaseTrialsResult, +) -> Result<(), ArtifactsError> { let case_dir = run_dir.join(case_id); std::fs::create_dir_all(&case_dir)?; + let aggregated_json = serde_json::to_string_pretty(trials_result)?; + std::fs::write(case_dir.join("aggregated.json"), aggregated_json)?; + Ok(()) +} + +fn case_result_to_trial(case: &CaseResult, trial_id: u32) -> TrialResult { + TrialResult { + trial_id, + status: case.status.clone(), + command_count: case.command_count, + input_tokens: case.input_tokens, + output_tokens: case.output_tokens, + check_results: case.check_results.clone(), + error_message: case.error_message.clone(), + } +} - std::fs::write(case_dir.join("stdout.txt"), stdout)?; - 
std::fs::write(case_dir.join("stderr.txt"), stderr)?; - std::fs::write(case_dir.join("trace.jsonl"), trace_jsonl)?; +/// Write per-case artifacts for backwards-compatible callers. +/// +/// Artifacts are written as trial 1 under `{run_dir}/{case_id}/trial-1/`, and an +/// aggregated `{run_dir}/{case_id}/aggregated.json` is also created. +pub fn write_case_artifacts( + run_dir: &Path, + case_id: &str, + stdout: &[u8], + stderr: &[u8], + trace_jsonl: &str, + result: &CaseResult, +) -> Result { + let trial = case_result_to_trial(result, 1); + let trial_dir = + write_trial_artifacts(run_dir, case_id, 1, stdout, stderr, trace_jsonl, &trial)?; - let result_json = serde_json::to_string_pretty(result)?; - std::fs::write(case_dir.join("result.json"), result_json)?; + let pass_count = if result.status == CaseStatus::Passed { + 1 + } else { + 0 + }; + let pass_rate = pass_count as f64; + let aggregated = CaseTrialsResult { + id: result.id.clone(), + trials: vec![trial], + aggregated_status: result.status.clone(), + pass_count, + total_trials: 1, + pass_rate, + }; + write_case_trials_summary(run_dir, case_id, &aggregated)?; - Ok(case_dir) + Ok(trial_dir) } /// Write summary.json @@ -151,6 +253,23 @@ pub fn read_case_results(run_dir: &Path) -> Result, ArtifactsErr for entry in entries.flatten() { let path = entry.path(); if path.is_dir() { + let aggregated_path = path.join("aggregated.json"); + if aggregated_path.exists() { + let content = std::fs::read_to_string(&aggregated_path)?; + let aggregated: CaseTrialsResult = serde_json::from_str(&content)?; + results.push(CaseResult { + id: aggregated.id, + status: aggregated.aggregated_status, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: None, + }); + continue; + } + + // Legacy layout fallback: `{case_id}/result.json` let result_path = path.join("result.json"); if result_path.exists() { let content = std::fs::read_to_string(&result_path)?; @@ -190,11 +309,15 @@ mod 
tests { let dir = TempDir::new().unwrap(); let summary = SummaryResult { suite_pass: true, + suite_pass_rate: Some(1.0), agent: "codex".to_string(), model: None, total_cases: 2, passed: 2, failed: 0, + trials_per_case: Some(1), + parallel: None, + pass_threshold: Some(1.0), run_dir: dir.path().to_path_buf(), checks_path: None, skill_project_root: dir.path().to_path_buf(), diff --git a/src/eval/config.rs b/src/eval/config.rs index ecb6e5b..eaa4f18 100644 --- a/src/eval/config.rs +++ b/src/eval/config.rs @@ -13,6 +13,12 @@ pub struct EvalConfig { pub checks_path: Option, /// Timeout in seconds for each case pub timeout_seconds: u64, + /// Trials per case (>= 1) + pub trials_per_case: u32, + /// Optional maximum parallelism for trials within one case + pub parallel: Option, + /// Pass threshold for trial aggregation (0.0-1.0) + pub pass_threshold: f64, /// Whether to fail fast if agent is not available pub fail_on_missing_agent: bool, /// Skill project root directory @@ -26,6 +32,10 @@ pub enum EvalConfigError { ConfigMissing, #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV not found: {0}")] PromptsNotFound(PathBuf), + #[error("EVAL_INVALID_TRIALS_CONFIG: trials_per_case must be in range [1, 1000], got {0}")] + InvalidTrialsConfig(u32), + #[error("EVAL_INVALID_THRESHOLD: pass_threshold must be in range [0.0, 1.0], got {0}")] + InvalidPassThreshold(f64), #[error("Failed to read skill-project.toml: {0}")] Io(#[from] std::io::Error), #[error("Failed to parse skill-project.toml: {0}")] @@ -56,6 +66,13 @@ pub fn resolve_from_toml( config: &EvalConfigToml, project_root: &Path, ) -> Result { + if config.trials_per_case < 1 || config.trials_per_case > 1000 { + return Err(EvalConfigError::InvalidTrialsConfig(config.trials_per_case)); + } + if !(0.0..=1.0).contains(&config.pass_threshold) { + return Err(EvalConfigError::InvalidPassThreshold(config.pass_threshold)); + } + let prompts_path = if config.prompts.is_absolute() { config.prompts.clone() } else { @@ -78,6 +95,9 @@ pub fn 
resolve_from_toml( prompts_path, checks_path, timeout_seconds: config.timeout_seconds, + trials_per_case: config.trials_per_case, + parallel: config.parallel, + pass_threshold: config.pass_threshold, fail_on_missing_agent: config.fail_on_missing_agent, project_root: project_root.to_path_buf(), }) @@ -112,6 +132,56 @@ mod tests { assert!(matches!(result, Err(EvalConfigError::PromptsNotFound(_)))); } + #[test] + fn test_resolve_eval_config_rejects_invalid_trials_per_case() { + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + std::fs::create_dir_all(&evals_dir).unwrap(); + std::fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger\ntest-1,hello,true\n", + ) + .unwrap(); + + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntrials_per_case = 0\ntimeout_seconds = 600\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!( + result, + Err(EvalConfigError::InvalidTrialsConfig(0)) + )); + } + + #[test] + fn test_resolve_eval_config_rejects_invalid_pass_threshold() { + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + std::fs::create_dir_all(&evals_dir).unwrap(); + std::fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger\ntest-1,hello,true\n", + ) + .unwrap(); + + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\npass_threshold = 1.5\ntimeout_seconds = 600\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!( + result, + Err(EvalConfigError::InvalidPassThreshold(_)) + )); + } + #[test] fn test_resolve_eval_config_success() { let dir = 
TempDir::new().unwrap(); @@ -135,6 +205,8 @@ mod tests { assert!(result.is_ok()); let config = result.unwrap(); assert_eq!(config.timeout_seconds, 600); + assert_eq!(config.trials_per_case, 1); + assert_eq!(config.pass_threshold, 1.0); assert!(!config.fail_on_missing_agent); } } diff --git a/src/eval/runner.rs b/src/eval/runner.rs index c5495c4..cf902af 100644 --- a/src/eval/runner.rs +++ b/src/eval/runner.rs @@ -1,14 +1,17 @@ //! Eval runner implementation using aikit-sdk -use crate::eval::artifacts::{CaseResult, CaseStatus}; +use crate::eval::artifacts::{CaseResult, CaseStatus, CaseTrialsResult, TrialResult}; use crate::eval::checks::{count_raw_json_events, run_checks, CheckDefinition}; use crate::eval::suite::EvalCase; use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload}; use aikit_sdk::{run_agent_events, AgentEvent, RunOptions}; use async_trait::async_trait; use std::path::PathBuf; +use std::sync::Arc; use std::time::Duration; use thiserror::Error; +use tokio::sync::Semaphore; +use tokio::task::JoinSet; /// Options for running a single eval case #[derive(Debug, Clone)] @@ -21,6 +24,8 @@ pub struct CaseRunOptions { pub project_root: PathBuf, /// Timeout in seconds pub timeout_seconds: u64, + /// Per-case trial aggregation pass threshold (0.0-1.0) + pub pass_threshold: f64, } /// Raw output from running a case @@ -53,6 +58,16 @@ pub trait EvalRunner: Send + Sync { opts: &CaseRunOptions, checks: &[CheckDefinition], ) -> (CaseRunOutput, CaseResult, String); + + /// Run multiple trials for one case, returning the aggregated result. + async fn run_case_trials( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + trial_count: u32, + max_parallelism: Option, + ) -> CaseTrialsResult; } /// Default runner: `aikit_sdk::run_agent_events` inside `spawn_blocking` with SDK timeout/cwd. 
@@ -75,6 +90,99 @@ impl EvalRunner for AikitEvalRunner { ) -> (CaseRunOutput, CaseResult, String) { self.run_case_inner(case, opts, checks).await } + + async fn run_case_trials( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + trial_count: u32, + max_parallelism: Option<u32>, + ) -> CaseTrialsResult { + let max_parallel = max_parallelism + .unwrap_or_else(|| num_cpus::get().max(1) as u32) + .max(1) as usize; + let semaphore = Arc::new(Semaphore::new(max_parallel)); + let mut join_set: JoinSet<TrialResult> = JoinSet::new(); + + for trial_id in 1..=trial_count { + let permit = Arc::clone(&semaphore); + let case_clone = case.clone(); + let opts_clone = opts.clone(); + let checks_vec = checks.to_vec(); + let runner = *self; + + join_set.spawn(async move { + let Ok(_permit) = permit.acquire().await else { + return TrialResult { + trial_id, + status: CaseStatus::Error, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: Some( + "EVAL_PARALLEL_EXHAUSTION: semaphore closed".to_string(), + ), + }; + }; + let (_output, case_result, _trace) = runner + .run_case_inner(&case_clone, &opts_clone, &checks_vec) + .await; + TrialResult { + trial_id, + status: case_result.status, + command_count: case_result.command_count, + input_tokens: case_result.input_tokens, + output_tokens: case_result.output_tokens, + check_results: case_result.check_results, + error_message: case_result.error_message, + } + }); + } + + let mut trials = Vec::with_capacity(trial_count as usize); + while let Some(res) = join_set.join_next().await { + match res { + Ok(trial) => trials.push(trial), + Err(e) => { + // Join errors are treated as failed trials. 
+ let next_id = (trials.len() as u32) + 1; + trials.push(TrialResult { + trial_id: next_id, + status: CaseStatus::Error, + command_count: None, + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: Some(format!("EVAL_PARALLEL_EXHAUSTION: {}", e)), + }); + } + } + } + + trials.sort_by_key(|t| t.trial_id); + let pass_count = trials + .iter() + .filter(|t| t.status == CaseStatus::Passed) + .count() as u32; + let total_trials = trial_count.max(1); + let pass_rate = pass_count as f64 / total_trials as f64; + let aggregated_status = if pass_rate >= opts.pass_threshold { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + CaseTrialsResult { + id: case.id.clone(), + trials, + aggregated_status, + pass_count, + total_trials, + pass_rate, + } + } } impl AikitEvalRunner { @@ -202,7 +310,14 @@ impl AikitEvalRunner { input_tokens: None, output_tokens: None, check_results, - error_message: None, + error_message: if run_output.timed_out { + Some(format!( + "EVAL_CASE_TIMEOUT: Case timed out after {}s", + timeout_secs + )) + } else { + None + }, }; (run_output, case_result, trace_jsonl) @@ -255,6 +370,48 @@ mod tests { }; (out, result, trace_jsonl) } + + async fn run_case_trials( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + trial_count: u32, + _max_parallelism: Option<u32>, + ) -> CaseTrialsResult { + let mut trials = Vec::new(); + for trial_id in 1..=trial_count { + let (_out, result, _trace) = self.run_case(case, opts, checks).await; + trials.push(TrialResult { + trial_id, + status: result.status, + command_count: result.command_count, + input_tokens: result.input_tokens, + output_tokens: result.output_tokens, + check_results: result.check_results, + error_message: result.error_message, + }); + } + let pass_count = trials + .iter() + .filter(|t| t.status == CaseStatus::Passed) + .count() as u32; + let total_trials = trial_count.max(1); + let pass_rate = pass_count as f64 / total_trials as f64; + let 
aggregated_status = if pass_rate >= opts.pass_threshold { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + CaseTrialsResult { + id: case.id.clone(), + trials, + aggregated_status, + pass_count, + total_trials, + pass_rate, + } + } } #[tokio::test] @@ -271,6 +428,7 @@ mod tests { model: None, project_root: PathBuf::from("/tmp"), timeout_seconds: 1, + pass_threshold: 1.0, }; let runner = StubEvalRunner; let (out, res, trace) = runner.run_case(&case, &opts, &[]).await; @@ -286,6 +444,7 @@ mod tests { model: Some("gpt-4".to_string()), project_root: PathBuf::from("/tmp"), timeout_seconds: 300, + pass_threshold: 1.0, }; assert_eq!(opts.agent_key, "codex"); assert_eq!(opts.model, Some("gpt-4".to_string())); diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs index 4ecea72..4ffe5ef 100644 --- a/tests/cli/eval_tests.rs +++ b/tests/cli/eval_tests.rs @@ -403,6 +403,7 @@ fn test_eval_run_persists_event_trace_jsonl() { let run_dir = summary["run_dir"].as_str().unwrap(); let trace_path = std::path::Path::new(run_dir) .join("trace-case") + .join("trial-1") .join("trace.jsonl"); let trace_jsonl = fs::read_to_string(&trace_path).unwrap(); @@ -414,8 +415,201 @@ fn test_eval_run_persists_event_trace_jsonl() { let result_path = std::path::Path::new(run_dir) .join("trace-case") + .join("trial-1") .join("result.json"); let case_result: Value = serde_json::from_str(&fs::read_to_string(result_path).unwrap()).unwrap(); assert_eq!(case_result["command_count"], 1); } + +#[test] +fn test_eval_run_trials_threshold_and_ci_exit_semantics() { + use serde_json::Value; + use std::env; + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\ntrial-case,\"test prompt\",true,\"basic\",\n", + ) + .unwrap(); + fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap(); + 
fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + // Fake agent that passes the first 3 invocations, then fails. + let bin_dir = dir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let agent_path = bin_dir.join("agent"); + fs::write( + &agent_path, + "#!/usr/bin/env bash\nset -euo pipefail\nif [[ \"${1:-}\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\nstate_dir=\"${FASTSKILL_TEST_STATE_DIR:?}\"\nmkdir -p \"$state_dir\"\nlock=\"$state_dir/lock\"\ncount_file=\"$state_dir/count\"\nexec 9>\"$lock\"\nflock 9\ncount=0\nif [[ -f \"$count_file\" ]]; then count=$(cat \"$count_file\" || echo 0); fi\ncount=$((count+1))\necho \"$count\" > \"$count_file\"\nflock -u 9\n# Emit a raw_json line so trace persists and command_count=1.\necho '{\"event\":\"ok\"}'\nif [[ $count -le 3 ]]; then exit 0; else exit 1; fi\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&agent_path).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&agent_path, perms).unwrap(); + } + + let output_dir = dir.path().join("out"); + let path = env::var("PATH").unwrap_or_default(); + let merged_path = format!("{}:{}", bin_dir.display(), path); + let state_dir = dir.path().join("state"); + let env_vars = vec![ + ("PATH", merged_path.as_str()), + ("FASTSKILL_TEST_STATE_DIR", state_dir.to_str().unwrap()), + ]; + + // Threshold 0.6 should pass for 3/5. 
+ let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "trial-case", + "--trials", + "5", + "--threshold", + "0.6", + "--ci", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + assert!( + result.success, + "Expected eval run to succeed in CI mode at threshold=0.6, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + let json_start = result.stdout.find('{').unwrap(); + let summary: Value = serde_json::from_str(&result.stdout[json_start..]).unwrap(); + assert_eq!(summary["cases"][0]["id"], "trial-case"); + assert_eq!(summary["cases"][0]["status"], "passed"); + assert_eq!(summary["cases"][0]["pass_count"], 3); + assert_eq!(summary["cases"][0]["total_trials"], 5); + + // Reset state and require 100% suite pass rate should fail in --ci mode. + fs::remove_file(state_dir.join("count")).ok(); + let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "trial-case", + "--trials", + "5", + "--threshold", + "1.0", + "--ci", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + assert!( + !result.success, + "Expected eval run to fail in CI mode at threshold=1.0" + ); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("threshold") || combined.contains("Eval suite failed"), + "Expected threshold-related failure, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_parallelism_reduces_wall_time() { + use std::env; + use std::fs; + use std::time::Instant; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\nsleep-case,\"test prompt\",true,\"basic\",\n", + ) + .unwrap(); + fs::write(dir.path().join("SKILL.md"), "# Test 
Skill\n").unwrap(); + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nparallel = 4\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + let bin_dir = dir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let agent_path = bin_dir.join("agent"); + fs::write( + &agent_path, + "#!/usr/bin/env bash\nset -euo pipefail\nif [[ \"${1:-}\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\nsleep 0.5\necho '{\"event\":\"ok\"}'\nexit 0\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&agent_path).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&agent_path, perms).unwrap(); + } + + let output_dir = dir.path().join("out"); + let path = env::var("PATH").unwrap_or_default(); + let merged_path = format!("{}:{}", bin_dir.display(), path); + let env_vars = vec![("PATH", merged_path.as_str())]; + + let start = Instant::now(); + let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "sleep-case", + "--trials", + "4", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + let elapsed = start.elapsed(); + assert!( + result.success, + "Expected eval run to succeed, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + + // If trials executed sequentially with sleep(0.5), 4 trials would take ~2s. + // With parallel=4, it should be comfortably below that. 
+ assert!( + elapsed.as_secs_f64() < 1.6, + "Expected parallel trials to complete faster; elapsed={:?}", + elapsed + ); +} diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap index 0e1c47b..2089e92 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap @@ -15,6 +15,9 @@ Options: --format <FORMAT> Output format: table, json, grid, xml --json Shorthand for --format json --no-fail Do not fail with non-zero exit code on suite failure + --trials <TRIALS> Override trials per case from config + --ci Enable CI mode: exit non-zero if suite pass rate below threshold + --threshold <THRESHOLD> Override pass threshold (0.0-1.0) -h, --help Print help Examples: