Azure · w-javed · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
@@ -8,7 +8,7 @@ def __init__(self, *, config: str, threshold, **kwargs):
 
     def __call__(self, *args, **kwargs):
         return {
-            "result": evaluate_answer_length(kwargs.get("response")), 
+            "score": evaluate_answer_length(kwargs.get("response")),  # NOTE(review): kwargs.get() passes None when "response" is absent - confirm evaluate_answer_length handles it
         }
 
 

@@ -12,62 +12,78 @@
 You MUST respond in the following JSON format only:
 {
     "score": <integer 1-5>,
-    "label": "<Pass or Fail>",
     "reason": "<brief reason for the score>",
-    "explanation": "<detailed explanation of why the response received this score>"
+    "explanation": "<detailed explanation of why the response received this score>",
+    "tone": "<the overall tone detected, e.g. warm, neutral, dismissive>",
+    "confidence": "<high, medium, or low confidence in the assessment>"
 }
-
-A score of 3 or above is considered "Pass", below 3 is "Fail".
 """
 
 
-def build_evaluation_messages(query: str, response: str) -> list:
-    """Build the messages list for the LLM evaluation call.
+def build_evaluation_instructions() -> str:
+    """Return the system instructions for the LLM evaluation call.
+
+    :return: ``FRIENDLINESS_SYSTEM_PROMPT`` unchanged, for use as the Responses API instructions.
+    """
+    return FRIENDLINESS_SYSTEM_PROMPT
+
+
+def build_evaluation_input(query: str, response: str) -> str:
+    """Build the user input for the LLM evaluation call.
 
     :param query: The original user query.
     :param response: The response to evaluate for friendliness.
-    :return: A list of message dicts for the chat completion API.
+    :return: One prompt string: the evaluation request, then the query, then the response.
     """
-    return [
-        {"role": "system", "content": FRIENDLINESS_SYSTEM_PROMPT},
-        {
-            "role": "user",
-            "content": (
-                f"Please evaluate the friendliness of the following response.\n\n"
-                f"Original query: {query}\n\n"
-                f"Response to evaluate: {response}"
-            ),
-        },
-    ]
+    return (
+        f"Please evaluate the friendliness of the following response.\n\n"
+        f"Original query: {query}\n\n"
+        f"Response to evaluate: {response}"
+    )
 
 
 def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict:
     """Parse the LLM's JSON response into a structured evaluation result.
 
+    The return dict has the standard top-level keys (score, label, reason,
+    threshold, passed) plus ``properties`` with every other field the LLM sent.
+    Bad JSON, a non-object payload, or a non-numeric score yields a "Fail" result.
+
     :param raw_result: The raw string output from the LLM.
     :param threshold: The minimum score to be considered "Pass".
-    :return: A dict with score, label, reason, and explanation.
+    :return: A dict with score, label, reason, threshold, passed, and properties.
     """
     import json
 
+    # Keys that are promoted to the top level of the result
+    top_level_keys = {"score", "label", "reason"}
+
     try:
         # Try to extract JSON from the response (handle markdown code blocks)
         text = raw_result.strip()
         if text.startswith("```"):
             text = text.split("\n", 1)[1] if "\n" in text else text[3:]
             text = text.rsplit("```", 1)[0]
         result = json.loads(text.strip())
-        score = int(result.get("score", threshold))
+        score = max(1, min(5, int(result.get("score", threshold))))
+        passed = score >= threshold
+        # Collect any extra fields returned by the LLM into properties
+        properties = {k: v for k, v in result.items() if k not in top_level_keys}
+
         return {
-            "score": max(1, min(5, score)),
-            "label": result.get("label", "Pass" if score >= threshold else "Fail"),
+            "score": score,
+            "label": "Pass" if passed else "Fail",
             "reason": result.get("reason", "No reason provided"),
-            "explanation": result.get("explanation", "No explanation provided"),
+            "threshold": threshold,
+            "passed": passed,
+            "properties": properties,  # extra metadata surfaced in the evaluation results
         }
-    except (json.JSONDecodeError, ValueError, KeyError):
+    except (json.JSONDecodeError, ValueError, TypeError, KeyError, AttributeError):
         return {
             "score": threshold,
-            "label": "Pass",
+            "label": "Fail",
             "reason": "Could not parse LLM response",
-            "explanation": f"Raw LLM output: {raw_result}",
+            "threshold": threshold,
+            "passed": False,
+            "properties": {},  # keep the documented shape even when parsing fails
         }
@@ -1,66 +1,42 @@
 """Custom evaluator that uses an LLM to assess the friendliness of a response."""
 
-from openai import AzureOpenAI
-from common_util.util import build_evaluation_messages, parse_evaluation_result
+from openai import OpenAI
+from common_util.util import build_evaluation_instructions, build_evaluation_input, parse_evaluation_result
 
 
 class FriendlyEvaluator:
     """Evaluates how friendly and approachable a response is using an LLM judge.
 
-    This evaluator sends the query and response to an LLM, which returns a
-    friendliness score (1-5), a pass/fail label, a reason, and a detailed explanation.
+    The query and response are sent to an LLM via the OpenAI Responses API; the JSON
+    reply is parsed into a 1-5 score, a pass/fail label, a reason, and extra fields.
 
-    :param model_config: A dict containing Azure OpenAI connection info. Expected keys:
-        - azure_endpoint: The Azure OpenAI endpoint URL.
-        - azure_deployment: The deployment/model name.
-        - api_version: The API version (default: "2024-06-01").
-        - api_key: (Optional) The API key. If not provided, DefaultAzureCredential is used.
+    :param api_key: The OpenAI API key.
+    :param model_name: The model_name to use for evaluation (e.g. "gpt-4o").
     :param threshold: The minimum score (1-5) to be considered "Pass" (default: 3).
     """
 
-    def __init__(self, *, model_config: dict, threshold: int = 3, **kwargs):
-        self.model_config = model_config
+    def __init__(self, *, api_key: str, model_name: str, threshold: int = 3, **kwargs):
+        self.client = OpenAI(api_key=api_key)
+        self.model_name = model_name
         self.threshold = threshold
-        api_key = model_config.get("api_key")
-
-        if api_key:
-            self.client = AzureOpenAI(
-                azure_endpoint=model_config["azure_endpoint"],
-                api_key=api_key,
-                api_version=model_config.get("api_version", "2024-06-01"),
-            )
-        else:
-            from azure.identity import DefaultAzureCredential, get_bearer_token_provider
-
-            token_provider = get_bearer_token_provider(
-                DefaultAzureCredential(),
-                "https://cognitiveservices.azure.com/.default",
-            )
-            self.client = AzureOpenAI(
-                azure_endpoint=model_config["azure_endpoint"],
-                azure_ad_token_provider=token_provider,
-                api_version=model_config.get("api_version", "2024-06-01"),
-            )
-
-        self.deployment = model_config["azure_deployment"]
 
     def __call__(self, *, query: str, response: str, **kwargs) -> dict:
         """Evaluate the friendliness of a response.
 
         :param query: The original user query.
         :param response: The response to evaluate.
-        :return: A dict with score, label, reason, and explanation.
+        :return: A dict with score, label, reason, threshold, passed, and properties.
+        :raises ValueError: If the model returned no text output.
         """
-        messages = build_evaluation_messages(query, response)
-
-        completion = self.client.chat.completions.create(
-            model=self.deployment,
-            messages=messages,
+        result = self.client.responses.create(
+            model=self.model_name,
+            instructions=build_evaluation_instructions(),
+            input=build_evaluation_input(query, response),
             temperature=0.0,
-            max_tokens=500,
+            max_output_tokens=500,
         )
 
-        raw_result = completion.choices[0].message.content
+        raw_result = result.output_text
-        if raw_result is None:
+        if not raw_result:  # output_text is "" rather than None when no text is produced
-            raise ValueError("No content in completion response")
+            raise ValueError("No content in response")
         return parse_evaluation_result(raw_result, self.threshold)
@@ -96,7 +96,7 @@
                 "required": ["query", "response"],
             },
             metrics={
-                "result": EvaluatorMetric(
+                "score": EvaluatorMetric(
                     type=EvaluatorMetricType.ORDINAL,
                     desirable_direction=EvaluatorMetricDirection.INCREASE,
                     min_value=1,
@@ -215,13 +215,13 @@
         print("Waiting for evaluation run to complete...")
 
     # ---------------------------------------------------------------
-    # 5. Cleanup (uncomment to delete)
+    # 5. Cleanup 
     # ---------------------------------------------------------------
-    # print("\nCleaning up...")
-    # project_client.beta.evaluators.delete_version(
-    #     name=code_evaluator.name,
-    #     version=code_evaluator.version,
-    # )
-    # client.evals.delete(eval_id=eval_object.id)
-    # print("Cleanup done.")
+    print("\nCleaning up...")
+    project_client.beta.evaluators.delete_version(
+        name=code_evaluator.name,
+        version=code_evaluator.version,
+    )
+    client.evals.delete(eval_id=eval_object.id)
+    print("Cleanup done.")
     print("\nDone - upload, eval creation, and eval run verified successfully.")