Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions src/cli/commands/eval/validate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ use crate::cli::error::{CliError, CliResult};
use aikit_sdk::{is_agent_available, is_runnable, runnable_agents};
use clap::Args;
use fastskill::core::project::resolve_project_file;
use fastskill::eval::checks::load_checks;
use fastskill::eval::config::resolve_eval_config;
use fastskill::eval::suite::load_suite;
use fastskill::OutputFormat;
use std::env;

Expand Down Expand Up @@ -67,6 +69,23 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> {
let eval_config = resolve_eval_config(&resolution.path, &project_root)
.map_err(|e| CliError::Config(e.to_string()))?;

// Parse and validate prompts CSV
let suite =
load_suite(&eval_config.prompts_path).map_err(|e| CliError::Config(e.to_string()))?;
let case_count = suite.cases.len();

// Parse and validate checks TOML if present and exists
let check_count = if let Some(ref checks_path) = eval_config.checks_path {
if checks_path.exists() {
let checks = load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))?;
checks.len()
} else {
0
}
} else {
0
};

// Check agent availability if --agent was specified
if let Some(ref agent_key) = args.agent {
let available = is_agent_available(agent_key);
Expand All @@ -92,6 +111,8 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> {
"timeout_seconds": eval_config.timeout_seconds,
"fail_on_missing_agent": eval_config.fail_on_missing_agent,
"project_root": eval_config.project_root,
"case_count": case_count,
"check_count": check_count,
});
println!(
"{}",
Expand All @@ -100,8 +121,10 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> {
} else {
println!("eval configuration: valid");
println!(" prompts: {}", eval_config.prompts_path.display());
println!(" cases: {}", case_count);
if let Some(ref checks) = eval_config.checks_path {
println!(" checks: {}", checks.display());
println!(" check count: {}", check_count);
}
println!(" timeout: {}s", eval_config.timeout_seconds);
println!(
Expand Down
120 changes: 120 additions & 0 deletions tests/cli/eval_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,126 @@ fn test_eval_validate_with_eval_config() {
);
}

#[test]
fn test_eval_validate_invalid_csv_missing_column() {
    use std::fs;
    use tempfile::TempDir;

    // Project skeleton in a throwaway directory.
    let project = TempDir::new().unwrap();
    let prompts_dir = project.path().join("evals");
    fs::create_dir_all(&prompts_dir).unwrap();

    // Deliberately omit the required 'should_trigger' column from the CSV header.
    let csv_body = "id,prompt\ntest-1,\"Test prompt\"\n";
    fs::write(prompts_dir.join("prompts.csv"), csv_body).unwrap();

    fs::write(project.path().join("SKILL.md"), "# Test Skill\n").unwrap();
    fs::write(
        project.path().join("skill-project.toml"),
        "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 300\nfail_on_missing_agent = false\n",
    )
    .unwrap();

    // Validation must reject the malformed prompts file.
    let outcome = run_fastskill_command(&["eval", "validate"], Some(project.path()));
    assert!(
        !outcome.success,
        "Expected eval validate to fail due to missing CSV column"
    );

    // The error may land on either stream; search both.
    let merged = format!("{}{}", outcome.stdout, outcome.stderr);
    assert!(
        merged.contains("EVAL_INVALID_CSV") || merged.contains("should_trigger"),
        "Expected EVAL_INVALID_CSV error, got: {}",
        merged
    );
}

#[test]
fn test_eval_validate_invalid_checks_toml() {
    use std::fs;
    use tempfile::TempDir;

    // Project skeleton in a throwaway directory.
    let project = TempDir::new().unwrap();
    let prompts_dir = project.path().join("evals");
    fs::create_dir_all(&prompts_dir).unwrap();

    // A well-formed prompts CSV so only the checks file is at fault.
    fs::write(
        prompts_dir.join("prompts.csv"),
        "id,prompt,should_trigger,tags,workspace_subdir\ntest-1,\"Test prompt\",true,\"basic\",\n",
    )
    .unwrap();

    // Deliberately broken TOML syntax (unbalanced table header, bare value).
    let bad_toml = "[[check]\nname = broken toml {\n";
    fs::write(prompts_dir.join("checks.toml"), bad_toml).unwrap();

    fs::write(project.path().join("SKILL.md"), "# Test Skill\n").unwrap();
    fs::write(
        project.path().join("skill-project.toml"),
        "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\nchecks = \"evals/checks.toml\"\ntimeout_seconds = 300\nfail_on_missing_agent = false\n",
    )
    .unwrap();

    // Validation must reject the unparseable checks file.
    let outcome = run_fastskill_command(&["eval", "validate"], Some(project.path()));
    assert!(
        !outcome.success,
        "Expected eval validate to fail due to invalid checks TOML"
    );

    // The error may land on either stream; search both.
    let merged = format!("{}{}", outcome.stdout, outcome.stderr);
    assert!(
        merged.contains("EVAL_CHECKS_INVALID")
            || merged.contains("TOML")
            || merged.contains("toml"),
        "Expected EVAL_CHECKS_INVALID error, got: {}",
        merged
    );
}

#[test]
fn test_eval_validate_with_counts_in_json_output() {
    use std::fs;
    use tempfile::TempDir;

    // Project skeleton: two prompt cases and one check, so the JSON
    // output's case_count / check_count fields have distinct values.
    let dir = TempDir::new().unwrap();

    let evals_dir = dir.path().join("evals");
    fs::create_dir_all(&evals_dir).unwrap();
    fs::write(
        evals_dir.join("prompts.csv"),
        "id,prompt,should_trigger,tags,workspace_subdir\ntest-1,\"Test prompt\",true,\"basic\",\ntest-2,\"Another prompt\",false,\"\",\n",
    )
    .unwrap();
    fs::write(
        evals_dir.join("checks.toml"),
        "[[check]]\nname = \"trigger_expectation\"\npattern = \"fastskill\"\nexpected = true\n",
    )
    .unwrap();

    fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap();
    fs::write(
        dir.path().join("skill-project.toml"),
        "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\nchecks = \"evals/checks.toml\"\ntimeout_seconds = 300\nfail_on_missing_agent = false\n",
    )
    .unwrap();

    let result = run_fastskill_command(&["eval", "validate", "--json"], Some(dir.path()));
    assert!(
        result.success,
        "Expected eval validate to succeed, got stdout: {}, stderr: {}",
        result.stdout, result.stderr
    );

    // Locate the start of the JSON object; the CLI may print leading
    // non-JSON lines. Panic with the captured output instead of a bare
    // unwrap so a failure shows what the CLI actually emitted.
    let json_start = result.stdout.find('{').unwrap_or_else(|| {
        panic!(
            "expected a JSON object in stdout, got: {}",
            result.stdout
        )
    });
    let output: serde_json::Value = serde_json::from_str(&result.stdout[json_start..]).unwrap();
    assert_eq!(output["valid"], true);
    assert_eq!(output["case_count"], 2);
    assert_eq!(output["check_count"], 1);
}

#[test]
fn test_eval_report_requires_run_dir() {
let result = run_fastskill_command(&["eval", "report"], None);
Expand Down
Loading