Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def __init__(self, *, config: str, threshold, **kwargs):

def __call__(self, *args, **kwargs):
    """Score the ``response`` keyword argument with ``evaluate_answer_length``.

    Positional arguments are accepted but ignored. When ``response`` is not
    supplied, ``None`` is forwarded to ``evaluate_answer_length``.

    :return: A dict with a single ``"score"`` entry holding the value
        returned by ``evaluate_answer_length``.
    """
    # Evaluate exactly once and report it under "score" only; the stale
    # "result" entry from the previous revision is intentionally dropped.
    return {"score": evaluate_answer_length(kwargs.get("response"))}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,62 +12,78 @@
You MUST respond in the following JSON format only:
{
"score": <integer 1-5>,
"label": "<Pass or Fail>",
"reason": "<brief reason for the score>",
"explanation": "<detailed explanation of why the response received this score>"
"explanation": "<detailed explanation of why the response received this score>",
"tone": "<the overall tone detected, e.g. warm, neutral, dismissive>",
"confidence": "<high, medium, or low confidence in the assessment>"
}

A score of 3 or above is considered "Pass", below 3 is "Fail".
"""


def build_evaluation_instructions() -> str:
    """Return the system instructions for the LLM evaluation call.

    :return: The system prompt string (``FRIENDLINESS_SYSTEM_PROMPT``).
    """
    return FRIENDLINESS_SYSTEM_PROMPT


def build_evaluation_input(query: str, response: str) -> str:
    """Build the user input for the LLM evaluation call.

    :param query: The original user query.
    :param response: The response to evaluate for friendliness.
    :return: A single prompt string containing the query and the response.
    """
    return (
        "Please evaluate the friendliness of the following response.\n\n"
        f"Original query: {query}\n\n"
        f"Response to evaluate: {response}"
    )


def build_evaluation_messages(query: str, response: str) -> list:
    """Build a chat-style messages list for the LLM evaluation call.

    Kept so that callers written against the message-list form keep working.
    It is assembled from the two helpers above, so the system and user text
    cannot drift apart between the two call styles.

    :param query: The original user query.
    :param response: The response to evaluate for friendliness.
    :return: A list of two message dicts (``system`` then ``user``).
    """
    return [
        {"role": "system", "content": build_evaluation_instructions()},
        {"role": "user", "content": build_evaluation_input(query, response)},
    ]


def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict:
    """Parse the LLM's JSON response into a structured evaluation result.

    The return dict has the standard top-level keys (score, label, reason,
    threshold, passed) and a ``properties`` dict for any extra output fields
    the evaluator wants to surface. The label is always derived from the
    clamped score and the threshold; a ``label`` supplied by the LLM is ignored.

    If the output cannot be interpreted (not JSON, not a JSON object, or a
    score that is not convertible to an integer), a failing result is returned
    instead of raising. That fallback keeps ``score`` equal to ``threshold``
    but sets ``passed`` to ``False`` and carries the raw output as explanation.

    :param raw_result: The raw string output from the LLM.
    :param threshold: The minimum score to be considered "Pass".
    :return: A dict with score, label, reason, threshold, passed, and properties.
    """
    import json

    # Keys that are promoted to the top level of the result
    top_level_keys = {"score", "label", "reason"}

    try:
        # Try to extract JSON from the response (handle markdown code blocks)
        text = raw_result.strip()
        if text.startswith("```"):
            # Drop the opening fence line (e.g. "```json") and the closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        result = json.loads(text.strip())
        if not isinstance(result, dict):
            # Valid JSON that is not an object (list, string, number, ...).
            raise ValueError("LLM response is not a JSON object")
        score = max(1, min(5, int(result.get("score", threshold))))
    except (
        json.JSONDecodeError,
        ValueError,
        TypeError,  # e.g. "score": null
        AttributeError,  # e.g. raw_result is not a string
        OverflowError,  # e.g. "score": Infinity
    ):
        explanation = f"Raw LLM output: {raw_result}"
        return {
            "score": threshold,
            "label": "Fail",
            "reason": "Could not parse LLM response",
            # Kept at the top level for existing consumers of the fallback.
            "explanation": explanation,
            "threshold": threshold,
            "passed": False,
            # Always present so consumers can read "properties" unconditionally.
            "properties": {"explanation": explanation},
        }

    passed = score >= threshold

    # Collect any extra fields returned by the LLM into properties
    properties = {k: v for k, v in result.items() if k not in top_level_keys}

    return {
        "score": score,
        "label": "Pass" if passed else "Fail",
        "reason": result.get("reason", "No reason provided"),
        "threshold": threshold,
        "passed": passed,
        "properties": properties,  # extra metadata surfaced in the evaluation results
    }
Original file line number Diff line number Diff line change
@@ -1,66 +1,42 @@
"""Custom evaluator that uses an LLM to assess the friendliness of a response."""

from openai import AzureOpenAI
from common_util.util import build_evaluation_messages, parse_evaluation_result
from openai import OpenAI
from common_util.util import build_evaluation_instructions, build_evaluation_input, parse_evaluation_result


class FriendlyEvaluator:
    """Evaluates how friendly and approachable a response is using an LLM judge.

    This evaluator sends the query and response to an LLM via the OpenAI Responses
    API and parses the model's text output with ``parse_evaluation_result``.

    :param api_key: The OpenAI API key.
    :param model_name: The model name to use for evaluation (e.g. "gpt-4o").
    :param threshold: The minimum score (1-5) to be considered "Pass" (default: 3).
    """

    def __init__(self, *, api_key: str, model_name: str, threshold: int = 3, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.client = OpenAI(api_key=api_key)
        self.model_name = model_name
        self.threshold = threshold

    def __call__(self, *, query: str, response: str, **kwargs) -> dict:
        """Evaluate the friendliness of a response.

        :param query: The original user query.
        :param response: The response to evaluate.
        :return: The dict produced by ``parse_evaluation_result`` for the
            model's output and this evaluator's threshold.
        :raises ValueError: If the model returned no text output.
        """
        result = self.client.responses.create(
            model=self.model_name,
            instructions=build_evaluation_instructions(),
            input=build_evaluation_input(query, response),
            temperature=0.0,
            max_output_tokens=500,
        )

        raw_result = result.output_text
        # NOTE(review): the SDK documents ``output_text`` as an empty string
        # (not None) when the response has no text output, so test falsiness
        # rather than identity with None — confirm against the SDK version in use.
        if not raw_result:
            raise ValueError("No content in response")
        return parse_evaluation_result(raw_result, self.threshold)
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"required": ["query", "response"],
},
metrics={
"result": EvaluatorMetric(
"score": EvaluatorMetric(
type=EvaluatorMetricType.ORDINAL,
desirable_direction=EvaluatorMetricDirection.INCREASE,
min_value=1,
Expand Down Expand Up @@ -215,13 +215,13 @@
print("Waiting for evaluation run to complete...")

# ---------------------------------------------------------------
# 5. Cleanup
# ---------------------------------------------------------------
print("\nCleaning up...")
try:
    project_client.beta.evaluators.delete_version(
        name=code_evaluator.name,
        version=code_evaluator.version,
    )
finally:
    # Still delete the eval if removing the evaluator version fails, so a
    # partial failure does not leave the eval object behind.
    client.evals.delete(eval_id=eval_object.id)
print("Cleanup done.")
print("\nDone - upload, eval creation, and eval run verified successfully.")
Loading
Loading