diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/answer_length_evaluator/answer_length_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/answer_length_evaluator/answer_length_evaluator.py index 8ff7f07b85af..0e6e564cdef1 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/answer_length_evaluator/answer_length_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/answer_length_evaluator/answer_length_evaluator.py @@ -2,13 +2,12 @@ class AnswerLengthEvaluator: - def __init__(self, *, config: str, threshold, **kwargs): - self.config = config - self.threshold = threshold + def __init__(self, **kwargs): + pass def __call__(self, *args, **kwargs): return { - "result": evaluate_answer_length(kwargs.get("response")), + "score": evaluate_answer_length(kwargs.get("response")), } diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py index aa137276e55c..6b0da0fec4cc 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py @@ -12,44 +12,52 @@ You MUST respond in the following JSON format only: { "score": , - "label": "", "reason": "", - "explanation": "" + "explanation": "", + "tone": "", + "confidence": "" } - -A score of 3 or above is considered "Pass", below 3 is "Fail". """ -def build_evaluation_messages(query: str, response: str) -> list: - """Build the messages list for the LLM evaluation call. +def build_evaluation_instructions() -> str: + """Return the system instructions for the LLM evaluation call. + + :return: The system prompt string for the Responses API. 
+ """ + return FRIENDLINESS_SYSTEM_PROMPT + + +def build_evaluation_input(query: str, response: str) -> str: + """Build the user input for the LLM evaluation call. :param query: The original user query. :param response: The response to evaluate for friendliness. - :return: A list of message dicts for the chat completion API. + :return: A string prompt for the Responses API. """ - return [ - {"role": "system", "content": FRIENDLINESS_SYSTEM_PROMPT}, - { - "role": "user", - "content": ( - f"Please evaluate the friendliness of the following response.\n\n" - f"Original query: {query}\n\n" - f"Response to evaluate: {response}" - ), - }, - ] + return ( + f"Please evaluate the friendliness of the following response.\n\n" + f"Original query: {query}\n\n" + f"Response to evaluate: {response}" + ) def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: """Parse the LLM's JSON response into a structured evaluation result. + The return dict has the standard top-level keys (score, label, reason, + threshold, passed) and a ``properties`` dict for any extra output fields + the evaluator wants to surface. + :param raw_result: The raw string output from the LLM. :param threshold: The minimum score to be considered "Pass". - :return: A dict with score, label, reason, and explanation. + :return: A dict with score, label, reason, threshold, passed, and properties. 
""" import json + # Keys that are promoted to the top level of the result + top_level_keys = {"score", "label", "reason"} + try: # Try to extract JSON from the response (handle markdown code blocks) text = raw_result.strip() @@ -57,17 +65,25 @@ def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: text = text.split("\n", 1)[1] if "\n" in text else text[3:] text = text.rsplit("```", 1)[0] result = json.loads(text.strip()) - score = int(result.get("score", threshold)) + score = max(1, min(5, int(result.get("score", threshold)))) + passed = score >= threshold + + # Collect any extra fields returned by the LLM into properties + properties = {k: v for k, v in result.items() if k not in top_level_keys} + return { - "score": max(1, min(5, score)), - "label": result.get("label", "Pass" if score >= threshold else "Fail"), + "score": score, + "label": "Pass" if passed else "Fail", "reason": result.get("reason", "No reason provided"), - "explanation": result.get("explanation", "No explanation provided"), + "threshold": threshold, + "passed": passed, + "properties": properties, # extra metadata surfaced in the evaluation results } except (json.JSONDecodeError, ValueError, KeyError): return { "score": threshold, - "label": "Pass", + "label": "Fail", "reason": "Could not parse LLM response", - "explanation": f"Raw LLM output: {raw_result}", + "threshold": threshold, + "passed": False, } diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py index 730237af61f5..a72ea35988c6 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py @@ -1,66 +1,42 @@ """Custom evaluator that uses an LLM to assess the friendliness of a response.""" 
-from openai import AzureOpenAI -from common_util.util import build_evaluation_messages, parse_evaluation_result +from openai import OpenAI +from common_util.util import build_evaluation_instructions, build_evaluation_input, parse_evaluation_result class FriendlyEvaluator: """Evaluates how friendly and approachable a response is using an LLM judge. - This evaluator sends the query and response to an LLM, which returns a - friendliness score (1-5), a pass/fail label, a reason, and a detailed explanation. + This evaluator sends the query and response to an LLM via the OpenAI Responses + API, which returns a friendliness score (1-5), a pass/fail label, a reason, + and a detailed explanation. - :param model_config: A dict containing Azure OpenAI connection info. Expected keys: - - azure_endpoint: The Azure OpenAI endpoint URL. - - azure_deployment: The deployment/model name. - - api_version: The API version (default: "2024-06-01"). - - api_key: (Optional) The API key. If not provided, DefaultAzureCredential is used. + :param api_key: The OpenAI API key. + :param model_name: The model_name to use for evaluation (e.g. "gpt-4o"). :param threshold: The minimum score (1-5) to be considered "Pass" (default: 3). 
""" - def __init__(self, *, model_config: dict, threshold: int = 3, **kwargs): - self.model_config = model_config + def __init__(self, *, api_key: str, model_name: str, threshold: int = 3, **kwargs): + self.client = OpenAI(api_key=api_key) + self.model_name = model_name self.threshold = threshold - api_key = model_config.get("api_key") - - if api_key: - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=api_key, - api_version=model_config.get("api_version", "2024-06-01"), - ) - else: - from azure.identity import DefaultAzureCredential, get_bearer_token_provider - - token_provider = get_bearer_token_provider( - DefaultAzureCredential(), - "https://cognitiveservices.azure.com/.default", - ) - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - azure_ad_token_provider=token_provider, - api_version=model_config.get("api_version", "2024-06-01"), - ) - - self.deployment = model_config["azure_deployment"] def __call__(self, *, query: str, response: str, **kwargs) -> dict: """Evaluate the friendliness of a response. :param query: The original user query. :param response: The response to evaluate. - :return: A dict with score, label, reason, and explanation. + :return: A dict with score, label, reason, threshold, passed, and properties. 
""" - messages = build_evaluation_messages(query, response) - - completion = self.client.chat.completions.create( - model=self.deployment, - messages=messages, + result = self.client.responses.create( + model=self.model_name, + instructions=build_evaluation_instructions(), + input=build_evaluation_input(query, response), temperature=0.0, - max_tokens=500, + max_output_tokens=500, ) - raw_result = completion.choices[0].message.content + raw_result = result.output_text if raw_result is None: - raise ValueError("No content in completion response") + raise ValueError("No content in response") return parse_evaluation_result(raw_result, self.threshold) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py index b6d1a9195fbf..8d5abe2d9c55 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py @@ -81,11 +81,8 @@ entry_point="answer_length_evaluator:AnswerLengthEvaluator", init_parameters={ "type": "object", - "properties": { - "config": {"type": "string"}, - "threshold": {"type": "number"} - }, - "required": ["config", "threshold"], + "properties": {}, + "required": [], }, data_schema={ "type": "object", @@ -96,7 +93,7 @@ "required": ["query", "response"], }, metrics={ - "result": EvaluatorMetric( + "score": EvaluatorMetric( type=EvaluatorMetricType.ORDINAL, desirable_direction=EvaluatorMetricDirection.INCREASE, min_value=1, @@ -140,10 +137,7 @@ "type": "azure_ai_evaluator", "name": evaluator_name, "evaluator_name": evaluator_name, - "initialization_parameters": { - "config": "example config value", - "threshold": 3, - }, + "initialization_parameters": {}, } ] @@ -215,13 +209,13 @@ print("Waiting for evaluation run to complete...") # --------------------------------------------------------------- - # 5. 
Cleanup (uncomment to delete) + # 5. Cleanup # --------------------------------------------------------------- - # print("\nCleaning up...") - # project_client.beta.evaluators.delete_version( - # name=code_evaluator.name, - # version=code_evaluator.version, - # ) - # client.evals.delete(eval_id=eval_object.id) - # print("Cleanup done.") + print("\nCleaning up...") + project_client.beta.evaluators.delete_version( + name=code_evaluator.name, + version=code_evaluator.version, + ) + client.evals.delete(eval_id=eval_object.id) + print("Cleanup done.") print("\nDone - upload, eval creation, and eval run verified successfully.") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py index 67e168d3509a..a54443867fb3 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py @@ -7,13 +7,14 @@ """ DESCRIPTION: Given an AIProjectClient, this sample demonstrates how to: - 1. Upload a custom LLM-based evaluator (FriendlyEvaluator) with nested - folder structure (common_util/) using `evaluators.upload()`. - 2. Create an evaluation (eval) that references the uploaded evaluator. - 3. Run the evaluation with inline data and poll for results. + 1. Run the FriendlyEvaluator standalone to verify it works locally. + 2. Upload the evaluator code (with nested folder structure) using + ``evaluators.upload()``. + 3. Create an evaluation (eval) that references the uploaded evaluator. + 4. Run the evaluation with inline data and poll for results. - The FriendlyEvaluator calls Azure OpenAI to judge the friendliness of a - response and returns score, label, reason, and explanation. + The FriendlyEvaluator calls OpenAI Responses API to judge the friendliness + of a response and returns score, label, reason, and explanation. 
USAGE:
    python sample_eval_upload_friendly_evaluator.py

@@ -24,10 +25,13 @@
    Set these environment variables with your own values:
    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint.
-    2) FOUNDRY_MODEL_NAME - Optional. The name of the model deployment to use for evaluation.
+    2) OPENAI_API_KEY - Required. The OpenAI API key.
+    3) OPENAI_MODEL - Optional. The model to use (default: gpt-4o).
"""

import os
+import sys
+import json
import time
import random
import string
@@ -56,12 +60,49 @@
load_dotenv()

endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
-model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME")
-azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
-azure_openai_api_key = os.environ["AZURE_OPENAI_API_KEY"]
+openai_api_key = os.environ["OPENAI_API_KEY"]
+openai_model = os.environ.get("OPENAI_MODEL", "gpt-4o")

-# The folder containing the FriendlyEvaluator code, including common_util/ subfolder
-local_upload_folder = str(Path(__file__).parent / "custom_evaluators" / "friendly_evaluator")
+# Add the evaluator folder to sys.path so we can import it for local testing
+evaluator_folder = str(Path(__file__).parent / "custom_evaluators" / "friendly_evaluator")
+sys.path.insert(0, evaluator_folder)
+
+from friendly_evaluator import FriendlyEvaluator  # noqa: E402
+
+# ---------------------------------------------------------------
+# 1. Run FriendlyEvaluator standalone to verify it works locally
+# ---------------------------------------------------------------
+print(f"=== Step 1: Standalone FriendlyEvaluator test (model={openai_model}) ===\n")
+
+evaluator = FriendlyEvaluator(api_key=openai_api_key, model_name=openai_model, threshold=3)
+
+test_cases = [
+    {
+        "query": "How do I reset my password?",
+        "response": "Go to settings. Click reset. Done.",
+    },
+    {
+        "query": "How do I reset my password?",
+        "response": (
+            "Great question! I'd be happy to help you reset your password. 
" + "Just head over to Settings > Security > Reset Password, and follow " + "the prompts. If you run into any trouble, feel free to ask — I'm here to help! 😊" + ), + }, + { + "query": "Can you help me with my order?", + "response": "Read the FAQ.", + }, +] + +for i, tc in enumerate(test_cases, 1): + print(f"--- Test Case {i} ---") + print(f"Query: {tc['query']}") + print(f"Response: {tc['response'][:80]}...") + result = evaluator(query=tc["query"], response=tc["response"]) + print(f"Result: {json.dumps(result, indent=2)}\n") + +print("Standalone test complete.\n") with ( DefaultAzureCredential() as credential, @@ -69,7 +110,7 @@ project_client.get_openai_client() as client, ): # --------------------------------------------------------------- - # 1. Upload evaluator code and create evaluator version + # 2. Upload evaluator code and create evaluator version # The folder structure uploaded is: # friendly_evaluator/ # friendly_evaluator.py <- entry point @@ -80,6 +121,8 @@ suffix = "".join(random.choices(string.ascii_lowercase, k=5)) evaluator_name = f"friendly_evaluator_{suffix}" + print(f"=== Step 2: Upload evaluator as '{evaluator_name}' ===\n") + evaluator_version = EvaluatorVersion( evaluator_type=EvaluatorType.CUSTOM, categories=[EvaluatorCategory.QUALITY], @@ -90,19 +133,17 @@ init_parameters={ "type": "object", "properties": { - "model_config": { - "type": "object", - "description": "Azure OpenAI configuration for the LLM judge", - "properties": { - "azure_endpoint": {"type": "string"}, - "api_version": {"type": "string"}, - "api_key": {"type": "string"}, - }, - "required": ["azure_endpoint", "api_key"], + "api_key": { + "type": "string", + "description": "OpenAI API key for the LLM judge", + }, + "model_name": { + "type": "string", + "description": "Model name to use for evaluation (e.g. 
gpt-4o)", }, "threshold": {"type": "number"}, }, - "required": ["model_config", "threshold"], + "required": ["api_key", "model_name", "threshold"], }, data_schema={ "type": "object", @@ -123,33 +164,32 @@ ), ) - print("Uploading FriendlyEvaluator (with nested common_util folder)...") friendly_evaluator = project_client.beta.evaluators.upload( name=evaluator_name, evaluator_version=evaluator_version, - folder=local_upload_folder, + folder=evaluator_folder, ) - print(f"\nEvaluator created: name={friendly_evaluator.name}, version={friendly_evaluator.version}") + print(f"Evaluator created: name={friendly_evaluator.name}, version={friendly_evaluator.version}") print(f"Evaluator ID: {friendly_evaluator.id}") pprint(friendly_evaluator) # --------------------------------------------------------------- - # 2. Create an evaluation referencing the uploaded evaluator + # 3. Create an evaluation referencing the uploaded evaluator # --------------------------------------------------------------- + print(f"\n=== Step 3: Create evaluation ===\n") + data_source_config = DataSourceConfigCustom( - { - "type": "custom", - "item_schema": { - "type": "object", - "properties": { - "query": {"type": "string"}, - "response": {"type": "string"}, - }, - "required": ["query", "response"], + type="custom", + item_schema={ + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, }, - "include_sample_schema": True, - } + "required": ["query", "response"], + }, + include_sample_schema=True, ) testing_criteria = [ @@ -158,13 +198,13 @@ "name": evaluator_name, "evaluator_name": evaluator_name, "initialization_parameters": { - "deployment_name": f"{model_deployment_name}", # provide model_config or, deployment name passed is used to construct the model_config for the evaluator. 
- "threshold": 3, + "api_key": openai_api_key, + "model_name": openai_model, + "threshold": 3, }, } ] - print("\nCreating evaluation...") eval_object = client.evals.create( name=f"Friendliness Evaluation - {suffix}", data_source_config=data_source_config, @@ -173,9 +213,10 @@ print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") # --------------------------------------------------------------- - # 3. Run the evaluation with inline data + # 4. Run the evaluation with inline data # --------------------------------------------------------------- - print("\nCreating evaluation run with inline data...") + print(f"\n=== Step 4: Create evaluation run ===\n") + eval_run_object = client.evals.runs.create( eval_id=eval_object.id, name=f"Friendliness Eval Run - {suffix}", @@ -218,12 +259,14 @@ pprint(eval_run_object) # --------------------------------------------------------------- - # 4. Poll for evaluation run completion + # 5. Poll for evaluation run completion # --------------------------------------------------------------- + print("\n=== Step 5: Polling for results ===\n") + while True: run = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) if run.status in ("completed", "failed"): - print(f"\nEvaluation run finished with status: {run.status}") + print(f"Evaluation run finished with status: {run.status}") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) pprint(output_items) print(f"\nEvaluation run Report URL: {run.report_url}") @@ -232,13 +275,13 @@ print("Waiting for evaluation run to complete...") # --------------------------------------------------------------- - # 5. Cleanup (uncomment to delete) + # 6. 
Cleanup # --------------------------------------------------------------- - # print("\nCleaning up...") - # project_client.beta.evaluators.delete_version( - # name=friendly_evaluator.name, - # version=friendly_evaluator.version, - # ) - # client.evals.delete(eval_id=eval_object.id) - # print("Cleanup done.") - print("\nDone - FriendlyEvaluator upload, eval creation, and eval run verified successfully.") + print("\nCleaning up...") + project_client.beta.evaluators.delete_version( + name=friendly_evaluator.name, + version=friendly_evaluator.version, + ) + client.evals.delete(eval_id=eval_object.id) + print("Cleanup done.") + print("\nDone - FriendlyEvaluator standalone test, upload, eval creation, and eval run verified successfully.")