Azure · w-javed · Mar 11, 2026 · Mar 15, 2026 · Mar 16, 2026 · Mar 17, 2026
@@ -1,5 +1,49 @@
 # Release History
 
+## 1.16.4 (Unreleased)
+
+### Features Added
+
+### Breaking Changes
+
+### Bugs Fixed
+
+### Other Changes
+
+## 1.16.3 (2026-04-01)
+
+### Features Added
+
+- Added `extra_headers` support to `OpenAIModelConfiguration` to allow passing custom HTTP headers.
+
+### Bugs Fixed
+- Fixed attack success rate (ASR) always reporting 0% because the sync eval API's `passed` field indicates task completion, not content safety. Replaced `passed`-based logic with score-based threshold comparison matching `_evaluation_processor.py`.
+- Fixed partial red team results being discarded when some objectives fail. Previously, if PyRIT raised due to incomplete objectives (e.g., evaluator model refuses to score), all completed results were lost. Now recovers partial results from PyRIT's memory database.
+- Fixed evaluator token metrics (`promptTokens`, `completionTokens`) not persisted in red teaming output items. The sync eval API returns camelCase keys but the extraction code only checked for snake_case, silently dropping all evaluator token usage data.
+
+## 1.16.2 (2026-03-24)
+
+### Bugs Fixed
+- Fixed adversarial chat target incorrectly using user's callback instead of RAI service, causing callback response to appear as user message in red team scan results when using converter strategies (e.g., `DIFFICULT`, `Tense`).
+- Fixed `hate_unfairness` attack success rate always reporting 0% due to metric name mapping using legacy `hate_fairness` name instead of canonical `hate_unfairness`.
+- Fixed `TypeError` in multi-turn and crescendo attacks caused by PyRIT 0.11+ renaming `send_prompt_async` parameter from `prompt_request` to `message`.
+- Expanded endpoint normalization (`/openai/v1` suffix) to all Azure OpenAI host suffixes (including sovereign clouds), fixing 404 errors for classic AOAI endpoints with PyRIT 0.11+.
+- Added `aml-aca-token` credential fallback for agent-type seed requests when running locally without ACA managed identity.
+- Extract RAI scorer token metrics (prompt_tokens, completion_tokens, total_tokens) into Score metadata and propagate through result processing for downstream aggregation.
+
+## 1.16.1 (2026-03-18)
+
+### Features Added
+
+- Agentic evaluators (Groundedness, ToolCallAccuracy, ToolCallSuccess, ToolInputAccuracy, ToolOutputUtilization, ToolSelection) now accept plain string inputs directly, skipping structured parsing when string format is provided.
+
+### Bugs Fixed
+- Fixed inconsistency where sample data in evaluation result items did not match the generated sample data from corresponding input rows, ensuring proper synchronization between row-level input samples and their associated evaluation output items.
+- Fixed indirect jailbreak (XPIA) `ValueError` when targeting models by converting `binary_path` file-based context prompts to inline text before invoking the callback target.
+- Fixed content-filter responses showing raw JSON API payloads in red team results by detecting blocked responses and replacing them with human-readable messages.
+- Fixed missing `token_usage` on row-level output items for agent targets using the Foundry execution path by extracting usage data from piece labels.
+- Fixed 7 backwards-compatibility bugs with the `_use_legacy_endpoint=True` feature flag including metric name mapping, result parsing, conversation evaluation mode, and Foundry scorer integration.
+
 ## 1.16.0 (2026-03-10)
 
 ### Bugs Fixed

@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_4da51ba6ab"
+  "Tag": "python/evaluation/azure-ai-evaluation_baead44c3f"
 }
@@ -43,6 +43,47 @@
 
 LOGGER = logging.getLogger(__name__)
 
+# Metric names that differ between the sync evals endpoint and the legacy annotation endpoint.
+# Key = sync endpoint metric name, Value = legacy annotation API metric name.
+# Used bidirectionally: forward lookup for sync→legacy, reverse for legacy→sync.
+# Note: only metrics where the API request metric name differs should be here.
+# For XPIA and ECI, the legacy API uses the annotation_task, not MetricList,
+# so the metric name doesn't need remapping — but the response key does.
+_SYNC_TO_LEGACY_METRIC_NAMES: Dict[str, str] = {
+    "hate_unfairness": "hate_fairness",
+    "groundedness": "generic_groundedness",
+}
+
+# Legacy response key lookup: the annotation API may return results under a different
+# key than the sync metric name. This is a superset of _SYNC_TO_LEGACY_METRIC_NAMES.
+_SYNC_TO_LEGACY_RESPONSE_KEYS: Dict[str, str] = {
+    **_SYNC_TO_LEGACY_METRIC_NAMES,
+    "indirect_attack": "xpia",
+    "election_critical_information": "eci",
+}
+
+# Reverse mapping: legacy metric name → sync metric name (built once at module level)
+_LEGACY_TO_SYNC_METRIC_NAMES: Dict[str, str] = {v: k for k, v in _SYNC_TO_LEGACY_METRIC_NAMES.items()}
+
+
+def _normalize_metric_for_endpoint(metric_name, use_legacy_endpoint, metric_display_name=None):
+    """Normalize metric name based on which endpoint is being used.
+
+    Returns (metric_name, metric_display_name) tuple with the correct metric name
+    for the target endpoint, and metric_display_name set to preserve output key names.
+    """
+    metric_name_str = metric_name.value if hasattr(metric_name, "value") else metric_name
+    if use_legacy_endpoint:
+        legacy_name = _SYNC_TO_LEGACY_METRIC_NAMES.get(metric_name_str)
+        if legacy_name:
+            return legacy_name, (metric_display_name or metric_name_str)
+    else:
+        sync_name = _LEGACY_TO_SYNC_METRIC_NAMES.get(metric_name_str)
+        if sync_name:
+            return sync_name, metric_display_name
+    return metric_name, metric_display_name
+
+
 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
 }
@@ -453,9 +494,19 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
                 )
                 result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
             return result
+        # Check for metric_name in response; also check legacy response key name if different.
+        # Note: parse_response is only called from legacy endpoint functions (evaluate_with_rai_service
+        # and evaluate_with_rai_service_multimodal), so this fallback is inherently legacy-only.
+        response_key = metric_name
         if metric_name not in batch_response[0]:
-            return {}
-        response = batch_response[0][metric_name]
+            legacy_key = _SYNC_TO_LEGACY_RESPONSE_KEYS.get(
+                metric_name.value if hasattr(metric_name, "value") else metric_name
+            )
+            if legacy_key and legacy_key in batch_response[0]:
+                response_key = legacy_key
+            else:
+                return {}
+        response = batch_response[0][response_key]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
@@ -547,13 +598,23 @@ def _parse_content_harm_response(
     }
 
     response = batch_response[0]
+    # Check for metric_name in response; also check legacy response key name if different.
+    # Note: _parse_content_harm_response is only called from parse_response, which is
+    # only called from legacy endpoint functions, so this fallback is inherently legacy-only.
+    response_key = metric_name
     if metric_name not in response:
-        return result
+        legacy_key = _SYNC_TO_LEGACY_RESPONSE_KEYS.get(
+            metric_name.value if hasattr(metric_name, "value") else metric_name
+        )
+        if legacy_key and legacy_key in response:
+            response_key = legacy_key
+        else:
+            return result
 
     try:
-        harm_response = literal_eval(response[metric_name])
+        harm_response = literal_eval(response[response_key])
     except Exception:  # pylint: disable=broad-exception-caught
-        harm_response = response[metric_name]
+        harm_response = response[response_key]
 
     total_tokens = 0
     prompt_tokens = 0
@@ -1044,6 +1105,10 @@ async def evaluate_with_rai_service_sync(
     :return: The EvalRunOutputItem containing the evaluation results (or parsed dict if legacy).
     :rtype: Union[EvalRunOutputItem, Dict[str, Union[str, float]]]
     """
+    metric_name, metric_display_name = _normalize_metric_for_endpoint(
+        metric_name, use_legacy_endpoint, metric_display_name
+    )
+
     # Route to legacy endpoint if requested
     if use_legacy_endpoint:
         return await evaluate_with_rai_service(
@@ -1261,13 +1326,16 @@ async def evaluate_with_rai_service_sync_multimodal(
     :return: The EvalRunOutputItem or legacy response payload.
     :rtype: Union[Dict, EvalRunOutputItem]
     """
+    metric_name, metric_display_name = _normalize_metric_for_endpoint(metric_name, use_legacy_endpoint)
+
     # Route to legacy endpoint if requested
     if use_legacy_endpoint:
         return await evaluate_with_rai_service_multimodal(
             messages=messages,
             metric_name=metric_name,
             project_scope=project_scope,
             credential=credential,
+            metric_display_name=metric_display_name,
         )
 
     # Sync evals endpoint implementation (default)
@@ -1316,6 +1384,7 @@ async def evaluate_with_rai_service_multimodal(
     metric_name: str,
     project_scope: Union[str, AzureAIProject],
     credential: TokenCredential,
+    metric_display_name: Optional[str] = None,
 ):
     """Evaluate the content safety of the response using Responsible AI service (legacy endpoint)
     :param messages: The normalized list of messages.
@@ -1327,6 +1396,8 @@ async def evaluate_with_rai_service_multimodal(
     :type project_scope: Union[str, AzureAIProject]
     :param credential: The Azure authentication credential.
     :type credential: ~azure.core.credentials.TokenCredential
+    :param metric_display_name: The display name for the metric in output keys. If None, uses metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
@@ -1341,7 +1412,7 @@ async def evaluate_with_rai_service_multimodal(
         await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
         operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
         annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
-        result = parse_response(annotation_response, metric_name)
+        result = parse_response(annotation_response, metric_name, metric_display_name)
         return result
     else:
         token = await fetch_or_reuse_token(credential)
@@ -1350,5 +1421,5 @@ async def evaluate_with_rai_service_multimodal(
         # Submit annotation request and fetch result
         operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
         annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-        result = parse_response(annotation_response, metric_name)
+        result = parse_response(annotation_response, metric_name, metric_display_name)
         return result
@@ -107,6 +107,28 @@ def get_default_threshold_for_evaluator(evaluator: str) -> int:
     return config.get("default_threshold", 3)
 
 
+def is_attack_successful(score: float, evaluator: str) -> bool:
+    """Determine whether a red team attack succeeded based on the evaluator score.
+
+    Red team scoring uses score-based threshold comparison: higher scores indicate
+    more harmful content, so an attack succeeds when the score exceeds the
+    evaluator's default threshold (e.g. >3 for 0-7 content harm scales, >0 for
+    binary evaluators like task_adherence).
+
+    The sync eval API's 'passed' field indicates whether the eval task completed
+    successfully, NOT whether the content is safe — so it must not be used.
+
+    :param score: The raw evaluation score.
+    :type score: float
+    :param evaluator: The evaluator/metric name used to look up the threshold.
+    :type evaluator: str
+    :return: True if the attack succeeded (score > threshold), False otherwise.
+    :rtype: bool
+    """
+    threshold = get_default_threshold_for_evaluator(evaluator)
+    return float(score) > float(threshold)
+
+
 def convert_binary_to_numeric(value: Union[str, bool, int]) -> int:
     """Convert binary evaluator outputs to numeric scores.
 

@@ -2358,15 +2358,31 @@ def _convert_single_row_to_aoai_format(
     # Convert criteria groups to results
     run_output_results = []
     top_sample = {}
+    if input_data and len(input_data) > 0 and "sample.generated_sample_data" in input_data:
+        top_sample_str = input_data["sample.generated_sample_data"]
+        if top_sample_str and isinstance(top_sample_str, str):
+            try:
+                top_sample_dict = json.loads(top_sample_str)
+                if top_sample_dict and isinstance(top_sample_dict, dict):
+                    top_sample = top_sample_dict
+                    input_data.pop("sample.generated_sample_data", None)
+                    if "sample.output_status" in input_data:
+                        input_data.pop("sample.output_status", None)
+                    if "sample.output_status.status" in input_data:
+                        input_data.pop("sample.output_status.status", None)
+                    if "sample.output_status.message" in input_data:
+                        input_data.pop("sample.output_status.message", None)
+            except Exception as e:
+                logger.error(
+                    f"Failed to parse generated_sample_data as JSON for row {row_idx}, eval_id: {eval_id}, eval_run_id: {eval_run_id}. Storing as string. Error: {e}"
+                )
 
     # Process each criteria group to extract metric results of output items.
     for criteria_name, metrics in criteria_groups.items():
         criteria_results, sample = _process_criteria_metrics(
             criteria_name, metrics, testing_criteria_metadata, logger, eval_id, eval_run_id
         )
         run_output_results.extend(criteria_results)
-        run_output_results.extend(criteria_results)
+        run_output_results.extend(criteria_results)
+        # Fallback: if no top-level sample is set from input_data, use the first non-empty sample from criteria results.
+        if not top_sample and sample:
+            top_sample = sample
-        run_output_results.extend(criteria_results)
+        run_output_results.extend(criteria_results)
+        # Fallback: if no top-level sample is set from input_data, use the first non-empty sample from criteria results.
+        if not top_sample and sample:
+            top_sample = sample
-        if sample:
-            top_sample = sample
 
     # Add error summaries if needed
     _add_error_summaries(run_output_results, eval_run_summary, testing_criteria_metadata, row_idx)
@@ -3428,6 +3444,8 @@ def _calculate_aoai_evaluation_summary(
                     and result_item["metric"] not in dup_usage_list
                 ):
                     sample_data_list.append(result_item["sample"])
+            if "sample" in aoai_result and aoai_result["sample"] and isinstance(aoai_result["sample"], dict):
+                sample_data_list.append(aoai_result["sample"])
 
         for sample_data in sample_data_list:
             if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:

@@ -105,6 +105,14 @@ def __call__(  # pylint: disable=docstring-missing-param
         """
         return super().__call__(*args, **kwargs)
 
+    @override
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if self._use_legacy_endpoint and "conversation" in kwargs and kwargs["conversation"] is not None:
+            # Legacy endpoint: pass conversation through intact so _evaluate_conversation
+            # can send all messages in a single API call (pre-sync-migration behavior).
+            return [kwargs]
+        return super()._convert_kwargs_to_eval_input(**kwargs)
+
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
         """Perform the evaluation using the Azure AI RAI service.
@@ -125,17 +133,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
     async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
         """Evaluates content according to this evaluator's metric.
         Evaluates each turn separately to maintain per-turn granularity.
+        When using the legacy endpoint, sends the entire conversation in a single call
+        (matching pre-sync-migration behavior) via the sync wrapper for metric normalization.
         """
-        validate_conversation(conversation)
         messages = conversation["messages"]
 
         # Convert enum to string value
         metric_value = self._eval_metric.value if hasattr(self._eval_metric, "value") else self._eval_metric
 
-        # Extract conversation turns (user-assistant pairs)
+        if self._use_legacy_endpoint:
+            # Legacy path: send entire conversation in a single call (pre-sync-migration behavior)
+            # Route through evaluate_with_rai_service_sync_multimodal for metric normalization.
+            result = await evaluate_with_rai_service_sync_multimodal(
+                messages=messages,
+                metric_name=metric_value,
+                project_scope=self._azure_ai_project,
+                credential=self._credential,
+                use_legacy_endpoint=True,
+            )
+            # Wrap as single-turn result and aggregate to produce evaluation_per_turn structure
+            return self._aggregate_results([result])
+
+        # Sync path: validate multimodal conversation and evaluate each turn separately
+        validate_conversation(conversation)
         turns = self._extract_turns(messages)
 
-        # Evaluate each turn separately
         per_turn_results = []
         for turn in turns:
             turn_result = await evaluate_with_rai_service_sync_multimodal(
@@ -213,6 +235,10 @@ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
             use_legacy_endpoint=self._use_legacy_endpoint,
         )
 
+        # Legacy endpoint returns a pre-parsed dict from parse_response(); return directly
+        if self._use_legacy_endpoint:
+            return eval_result
+
         # Parse the EvalRunOutputItem format to the expected dict format
         return self._parse_eval_result(eval_result)
 

@@ -375,6 +375,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
             )
+
+        # If response is a string, we can skip the context extraction and just return the eval input
+        if response and isinstance(response, str):
+            return super()._convert_kwargs_to_eval_input(query=query, response=response, context=response)
+
         context = self._get_context_from_agent_response(response, tool_definitions)
 
         if not self._validate_context(context) and self._is_single_entry(response) and self._is_single_entry(query):