Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,414 changes: 679 additions & 735 deletions package-lock.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,9 @@ exports[`Assets Directory Snapshots > File listing > should match the expected f
"cdk/tsconfig.json",
"container/python/Dockerfile",
"container/python/dockerignore.template",
"evaluators/python-lambda/execution-role-policy.json",
"evaluators/python-lambda/lambda_function.py",
"evaluators/python-lambda/pyproject.toml",
"mcp/python-lambda/README.md",
"mcp/python-lambda/handler.py",
"mcp/python-lambda/pyproject.toml",
Expand Down
10 changes: 10 additions & 0 deletions src/assets/evaluators/python-lambda/execution-role-policy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": ["logs:CreateLogGroup", "logs:CreateLogStream", "logs:PutLogEvents"],
"Resource": "arn:aws:logs:*:*:log-group:/aws/lambda/*"
}
]
}
19 changes: 19 additions & 0 deletions src/assets/evaluators/python-lambda/lambda_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from bedrock_agentcore.evaluation.custom_code_based_evaluators import (
custom_code_based_evaluator,
EvaluatorInput,
EvaluatorOutput,
)


@custom_code_based_evaluator()
def handler(input: EvaluatorInput, context) -> EvaluatorOutput:
    """Evaluate agent behavior with custom logic.

    Args:
        input: Contains evaluation_level, session_spans, target_trace_id, target_span_id
        context: AWS Lambda context object — assumed, since this template is deployed
            as a Lambda handler (lambda_function.handler); unused by the stub logic.

    Returns:
        EvaluatorOutput with value/label for success, or errorCode/errorMessage for failure.
    """
    # Stub always passes; replace with real scoring over the inputs above.
    # TODO: Replace with your evaluation logic
    return EvaluatorOutput(value=1.0, label="Pass", explanation="Evaluation passed")
15 changes: 15 additions & 0 deletions src/assets/evaluators/python-lambda/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[build-system]
# Build with hatchling so the evaluator code can be packaged as a wheel.
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
# "{{ Name }}" is a template placeholder — presumably substituted with the
# evaluator name when the project is scaffolded; confirm against the renderer.
name = "{{ Name }}"
version = "0.1.0"
description = "AgentCore Code-Based Evaluator"
requires-python = ">=3.10"
dependencies = [
    "bedrock-agentcore>=1.6.0",
]

[tool.hatch.build.targets.wheel]
# Include modules at the project root (e.g. lambda_function.py) in the wheel.
packages = ["."]
13 changes: 13 additions & 0 deletions src/cli/commands/status/__tests__/action.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,19 @@ describe('computeResourceStatuses', () => {
expect(evalEntry!.detail).toBe('TRACE — LLM-as-a-Judge');
});

it('shows Code-based detail for code-based evaluator', () => {
const project = {
...baseProject,
evaluators: [{ name: 'CodeEval', level: 'SESSION', config: { codeBased: { managed: {} } } }],
} as unknown as AgentCoreProjectSpec;

const result = computeResourceStatuses(project, undefined);
const evalEntry = result.find(r => r.resourceType === 'evaluator' && r.name === 'CodeEval');

expect(evalEntry).toBeDefined();
expect(evalEntry!.detail).toBe('SESSION — Code-based');
});

it('marks evaluator as pending-removal when deployed but removed from schema', () => {
const resources: DeployedResourceState = {
evaluators: {
Expand Down
2 changes: 1 addition & 1 deletion src/cli/commands/status/action.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ export function computeResourceStatuses(
localItems: project.evaluators ?? [],
deployedRecord: resources?.evaluators ?? {},
getIdentifier: deployed => deployed.evaluatorArn,
getLocalDetail: item => `${item.level} — LLM-as-a-Judge`,
getLocalDetail: item => `${item.level} — ${item.config.codeBased ? 'Code-based' : 'LLM-as-a-Judge'}`,
});

const onlineEvalConfigs = diffResourceSet({
Expand Down
132 changes: 112 additions & 20 deletions src/cli/primitives/EvaluatorPrimitive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import type { EvaluationLevel, Evaluator, EvaluatorConfig } from '../../schema';
import { EvaluationLevelSchema, EvaluatorSchema } from '../../schema';
import { getErrorMessage } from '../errors';
import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types';
import { renderCodeBasedEvaluatorTemplate } from '../templates/EvaluatorRenderer';
import {
LEVEL_PLACEHOLDERS,
RATING_SCALE_PRESETS,
Expand All @@ -12,6 +13,9 @@ import {
import { BasePrimitive } from './BasePrimitive';
import type { AddResult, AddScreenComponent, RemovableResource } from './types';
import type { Command } from '@commander-js/extra-typings';
import { existsSync } from 'node:fs';
import { rm } from 'node:fs/promises';
import { dirname, join } from 'node:path';

export interface AddEvaluatorOptions {
name: string;
Expand All @@ -22,6 +26,9 @@ export interface AddEvaluatorOptions {

export type RemovableEvaluator = RemovableResource;

const DEFAULT_CODE_ENTRYPOINT = 'lambda_function.handler';
const DEFAULT_CODE_TIMEOUT = 60;

/**
* EvaluatorPrimitive handles all evaluator add/remove operations.
*/
Expand All @@ -31,9 +38,20 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
override readonly article = 'an';
readonly primitiveSchema = EvaluatorSchema;

async add(options: AddEvaluatorOptions): Promise<AddResult<{ evaluatorName: string }>> {
async add(options: AddEvaluatorOptions): Promise<AddResult<{ evaluatorName: string; codePath?: string }>> {
try {
const evaluator = await this.createEvaluator(options);

// Scaffold code for managed code-based evaluators
if (options.config.codeBased?.managed) {
const configRoot = findConfigRoot()!;
const projectRoot = dirname(configRoot);
const codeLocation = options.config.codeBased.managed.codeLocation;
const targetDir = join(projectRoot, codeLocation);
await renderCodeBasedEvaluatorTemplate(options.name, targetDir);
return { success: true, evaluatorName: evaluator.name, codePath: codeLocation };
}

return { success: true, evaluatorName: evaluator.name };
} catch (err) {
return { success: false, error: getErrorMessage(err) };
Expand All @@ -59,6 +77,17 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
};
}

// Delete scaffolded code directory for managed code-based evaluators
const evaluator = project.evaluators[index]!;
if (evaluator.config.codeBased?.managed) {
const configRoot = findConfigRoot()!;
const projectRoot = dirname(configRoot);
const codeDir = join(projectRoot, evaluator.config.codeBased.managed.codeLocation);
if (existsSync(codeDir)) {
await rm(codeDir, { recursive: true, force: true });
}
}

project.evaluators.splice(index, 1);
await this.writeProjectSpec(project);

Expand All @@ -77,6 +106,7 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
}

const summary: string[] = [`Removing evaluator: ${evaluatorName}`];
const directoriesToDelete: string[] = [];
const schemaChanges: SchemaChange[] = [];

const referencingConfigs = project.onlineEvalConfigs.filter(c => c.evaluators.includes(evaluatorName));
Expand All @@ -86,6 +116,18 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
);
}

// Preview code directory deletion for managed code-based evaluators
if (evaluator.config.codeBased?.managed) {
const configRoot = findConfigRoot()!;
const projectRoot = dirname(configRoot);
const codeLocation = evaluator.config.codeBased.managed.codeLocation;
const codeDir = join(projectRoot, codeLocation);
if (existsSync(codeDir)) {
directoriesToDelete.push(codeLocation);
summary.push(`Will delete directory: ${codeLocation}`);
}
}

const afterSpec = {
...project,
evaluators: project.evaluators.filter(e => e.name !== evaluatorName),
Expand All @@ -97,7 +139,7 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
after: afterSpec,
});

return { summary, directoriesToDelete: [], schemaChanges };
return { summary, directoriesToDelete, schemaChanges };
}

async getRemovable(): Promise<RemovableEvaluator[]> {
Expand All @@ -124,17 +166,17 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
addCmd
.command(this.kind)
.description('Add a custom evaluator to the project')
.option('--name <name>', 'Evaluator name [non-interactive]')
.option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL [non-interactive]')
.option('--model <model>', 'Bedrock model ID for LLM-as-a-Judge [non-interactive]')
.option('--name <name>', 'Evaluator name')
.option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL')
.option('--type <type>', 'Evaluator type: llm-as-a-judge (default) or code-based')
.option('--model <model>', '[LLM] Bedrock model ID for LLM-as-a-Judge')
.option(
'--instructions <text>',
'Evaluation prompt instructions (must include level-appropriate placeholders, e.g. {context}) [non-interactive]'
)
.option(
'--rating-scale <preset>',
`Rating scale preset: ${presetIds.join(', ')} (default: 1-5-quality) [non-interactive]`
'[LLM] Evaluation prompt instructions (must include level-appropriate placeholders, e.g. {context})'
)
.option('--rating-scale <preset>', `[LLM] Rating scale preset: ${presetIds.join(', ')} (default: 1-5-quality)`)
.option('--lambda-arn <arn>', '[Code-based] Existing Lambda function ARN (external)')
.option('--timeout <seconds>', '[Code-based] Lambda timeout in seconds, 1-300 (default: 60)')
.option(
'--config <path>',
'Path to evaluator config JSON file (overrides --model, --instructions, --rating-scale) [non-interactive]'
Expand All @@ -144,9 +186,12 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
async (cliOptions: {
name?: string;
level?: string;
type?: string;
model?: string;
instructions?: string;
ratingScale?: string;
lambdaArn?: string;
timeout?: string;
config?: string;
json?: boolean;
}) => {
Expand All @@ -170,21 +215,40 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
fail('--name and --level are required in non-interactive mode');
}

if (!cliOptions.config && !cliOptions.model) {
fail('Either --config or --model is required');
}

const levelResult = EvaluationLevelSchema.safeParse(cliOptions.level);
if (!levelResult.success) {
fail(`Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`);
}

const evalType = cliOptions.type ?? 'llm-as-a-judge';
if (evalType !== 'llm-as-a-judge' && evalType !== 'code-based') {
fail(`Invalid --type "${evalType}". Must be one of: llm-as-a-judge, code-based`);
}

// Cross-validate flags against evaluator type
if (evalType !== 'code-based') {
if (cliOptions.lambdaArn) fail('--lambda-arn requires --type code-based');
if (cliOptions.timeout) fail('--timeout requires --type code-based');
}
if (evalType === 'code-based') {
if (cliOptions.model) fail('--model cannot be used with --type code-based');
if (cliOptions.instructions) fail('--instructions cannot be used with --type code-based');
if (cliOptions.ratingScale) fail('--rating-scale cannot be used with --type code-based');
}

let configJson: EvaluatorConfig;

if (cliOptions.config) {
const { readFileSync } = await import('fs');
configJson = JSON.parse(readFileSync(cliOptions.config, 'utf-8')) as EvaluatorConfig;
} else if (evalType === 'code-based') {
configJson = this.buildCodeBasedConfig(cliOptions.name!, cliOptions.lambdaArn, cliOptions.timeout);
} else {
// --instructions is required when not using --config
// LLM-as-a-Judge flow
if (!cliOptions.model) {
fail('Either --config or --model is required for LLM-as-a-Judge evaluators');
}

if (!cliOptions.instructions) {
const level = levelResult.data!;
const placeholders = LEVEL_PLACEHOLDERS[level].map(p => `{${p}}`).join(', ');
Expand All @@ -194,21 +258,18 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
);
}

// Validate placeholders
const placeholderCheck = validateInstructionPlaceholders(cliOptions.instructions!, levelResult.data!);
if (placeholderCheck !== true) {
fail(placeholderCheck);
}

// Resolve rating scale
let ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale'];
let ratingScale: NonNullable<EvaluatorConfig['llmAsAJudge']>['ratingScale'];
const scaleInput = cliOptions.ratingScale ?? '1-5-quality';

const preset = RATING_SCALE_PRESETS.find(p => p.id === scaleInput);
if (preset) {
ratingScale = preset.ratingScale;
} else {
// Try parsing as custom format: "1:Poor:Fails, 2:Fair:Partially meets" or "Pass:Meets, Fail:Does not"
const isNumerical = /^\d/.test(scaleInput.trim());
const parsed = parseCustomRatingScale(scaleInput, isNumerical ? 'numerical' : 'categorical');
if (!parsed.success) {
Expand Down Expand Up @@ -239,7 +300,16 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
if (cliOptions.json) {
console.log(JSON.stringify(result));
} else if (result.success) {
console.log(`Added evaluator '${result.evaluatorName}'`);
if (result.codePath) {
console.log(`Created evaluator '${result.evaluatorName}'`);
console.log(` Code: ${result.codePath}lambda_function.py`);
console.log(` IAM: ${result.codePath}execution-role-policy.json`);
console.log(
`\n Next: Edit lambda_function.py with your evaluation logic, then run \`agentcore deploy\``
);
} else {
console.log(`Added evaluator '${result.evaluatorName}'`);
}
} else {
console.error(result.error);
}
Expand Down Expand Up @@ -280,6 +350,28 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
return null;
}

private buildCodeBasedConfig(name: string, lambdaArn?: string, timeoutStr?: string): EvaluatorConfig {
if (lambdaArn) {
return {
codeBased: {
external: { lambdaArn },
},
};
}

const timeoutSeconds = timeoutStr ? parseInt(timeoutStr, 10) : DEFAULT_CODE_TIMEOUT;
return {
codeBased: {
managed: {
codeLocation: `app/${name}/`,
entrypoint: DEFAULT_CODE_ENTRYPOINT,
timeoutSeconds,
additionalPolicies: ['execution-role-policy.json'],
},
},
};
}

private async createEvaluator(options: AddEvaluatorOptions): Promise<Evaluator> {
const project = await this.readProjectSpec();

Expand Down
12 changes: 12 additions & 0 deletions src/cli/primitives/OnlineEvalConfigPrimitive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,18 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig

this.checkDuplicate(project.onlineEvalConfigs, options.name, 'Online eval config');

// Block code-based evaluators — only LLM-as-a-Judge evaluators are supported for online evaluation.
// Checks local project config. ARN-based evaluators are filtered in the TUI by API evaluatorType.
// TODO: For ARN-based evaluators in non-interactive mode, call getEvaluator to check type.
for (const evalName of options.evaluators) {
const evaluator = project.evaluators.find(e => e.name === evalName);
if (evaluator?.config.codeBased) {
throw new Error(
`Code-based evaluator "${evalName}" cannot be used in online eval configs. Only LLM-as-a-Judge evaluators are supported for online evaluation.`
);
}
}

const config: OnlineEvalConfig = {
name: options.name,
agent: options.agent,
Expand Down
11 changes: 10 additions & 1 deletion src/cli/primitives/__tests__/EvaluatorPrimitive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ const validConfig: EvaluatorConfig = {
},
};

/**
 * Build a minimal evaluator fixture for tests.
 * Falls back to the shared `validConfig` when no config is supplied.
 */
function makeEvaluator(name: string, config?: EvaluatorConfig) {
  const resolvedConfig = config ?? validConfig;
  return {
    name,
    type: 'CustomEvaluator',
    level: 'SESSION',
    config: resolvedConfig,
  };
}

function makeProject(
evaluators: { name: string }[] = [],
onlineEvalConfigs: { name: string; evaluators: string[] }[] = []
Expand All @@ -37,7 +46,7 @@ function makeProject(
runtimes: [],
memories: [],
credentials: [],
evaluators,
evaluators: evaluators.map(e => ('config' in e ? e : makeEvaluator(e.name))),
onlineEvalConfigs,
};
}
Expand Down
Loading
Loading