From c8f5958a0086cc96e16ed250e4fb47c768782df9 Mon Sep 17 00:00:00 2001
From: Waqas Javed <7674577+w-javed@users.noreply.github.com>
Date: Thu, 2 Apr 2026 00:05:36 -0700
Subject: [PATCH 1/7] feat(evaluation): support properties passthrough in AOAI evaluation results

Pass through evaluator properties dict in AOAI evaluation results. When an
evaluator returns a properties dict, it is included alongside score, label,
reason, threshold, and passed in the result object.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../azure-ai-evaluation/CHANGELOG.md     | 14 +++++-
 .../ai/evaluation/_evaluate/_evaluate.py | 12 +++++
 .../azure/ai/evaluation/_version.py      |  2 +-
 .../tests/unittests/test_evaluate.py     | 49 +++++++++++++++++++
 4 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 969d46b2f081..1f8417b289ed 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -1,6 +1,18 @@
 # Release History
 
-## 1.16.3 (Unreleased)
+## 1.16.4 (Unreleased)
+
+### Features Added
+
+- Added support for evaluator `properties` passthrough in AOAI evaluation results. When an evaluator returns a `properties` dict, it is included alongside `score`, `label`, `reason`, `threshold`, and `passed` in the result object.
+
+### Breaking Changes
+
+### Bugs Fixed
+
+### Other Changes
+
+## 1.16.3 (2026-04-01)
 
 ### Features Added
 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index d46f3dd216fb..1339fd7cde94 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -2591,8 +2591,12 @@ def _extract_metric_values(
         }
     """
     result_per_metric = {}
+    properties = None
     for metric_key, metric_value in metrics.items():
+        if metric_key == "properties" and isinstance(metric_value, dict):
+            properties = metric_value
+            continue
         metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
         temp_result_per_metric = {}
         if metric not in result_per_metric:
@@ -2612,6 +2616,11 @@
         if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
             _append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)
 
+    if properties is not None:
+        for metric_dict in result_per_metric.values():
+            if metric_dict is not None and len(metric_dict) > 0:
+                metric_dict["properties"] = properties
+
     empty_metrics = []
     empty_metrics.extend(
         metric for metric, metric_dict in result_per_metric.items() if metric_dict is None or len(metric_dict) == 0
     )
@@ -2879,6 +2888,7 @@ def _create_result_object(
     threshold = metric_values.get("threshold")
     passed = metric_values.get("passed")
     sample = metric_values.get("sample")
+    properties = metric_values.get("properties")
 
     # Handle decrease boolean metrics
     if is_inverse:
@@ -2898,6 +2908,8 @@
     if sample is not None:
         result_obj["sample"] = sample
+    if properties is not None:
+        result_obj["properties"] = properties
 
     return result_obj
 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
index c8760db90712..bf8c38aa224f 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 
 # represents upcoming version
-VERSION = "1.16.3"
+VERSION = "1.16.4"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 47ef67eb4baa..257e13da7053 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -1353,6 +1353,55 @@ def run_test():
         assert len(empty_converted["_evaluation_results_list"]) == 0
         assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0
 
+        # Test properties passthrough for custom evaluators
+        property_results = {
+            "metrics": {},
+            "rows": [
+                {
+                    "inputs.query": "test query",
+                    "outputs.friendly_eval.score": 4.5,
+                    "outputs.friendly_eval.score_threshold": 3,
+                    "outputs.friendly_eval.score_result": "Pass",
+                    "outputs.friendly_eval.score_reason": "The response was warm",
+                    "outputs.friendly_eval.properties": {
+                        "explanation": "Detailed reasoning about friendliness",
+                        "tone": "warm",
+                        "confidence": "high",
+                    },
+                }
+            ],
+            "studio_url": None,
+        }
+
+        _convert_results_to_aoai_evaluation_results(
+            results=property_results,
+            logger=logger,
+            eval_run_id=eval_run_id,
+            eval_id=eval_id,
+            evaluators={"friendly_eval": lambda **kwargs: {"score": 1}},
+            eval_meta_data={
+                "testing_criteria": [
+                    {
+                        "name": "friendly_eval",
+                        "type": "quality",
+                        "metrics": ["score"],
+                    }
+                ]
+            },
+        )
+
+        property_result = property_results["_evaluation_results_list"][0]["results"][0]
+        assert property_result["score"] == 4.5
+        assert property_result["label"] == "Pass"
+        assert property_result["reason"] == "The response was warm"
+        assert property_result["threshold"] == 3
+        assert property_result["properties"] == {
+            "explanation": "Detailed reasoning about friendliness",
+            "tone": "warm",
+            "confidence": "high",
+        }
+        assert "explanation" not in property_result
+
 @patch(
     "azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
     return_value={},

From e67520c7cd470f15aed9d4c890b328f554f7f209 Mon Sep 17 00:00:00 2001
From: Waqas Javed <7674577+w-javed@users.noreply.github.com>
Date: Thu, 2 Apr 2026 00:24:32 -0700
Subject: [PATCH 2/7] docs: update docstrings for properties passthrough per PR review

Update _extract_metric_values and _create_result_object docstrings to
document the new properties field and its expected dict type.
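
For reference, a rough sketch of the shape the updated docstrings describe
(illustrative values only, mirroring the docstring examples rather than output
from a real evaluator run):

    # Metrics dict produced by an evaluator (hypothetical values)
    metrics = {
        "score": 4.5,
        "coherence_reason": "Good flow",
        "threshold": 3.0,
        "properties": {"explanation": "Detailed analysis...", "confidence": 0.95},
    }
    # After extraction, each per-metric result entry carries the same dict, e.g.
    # {"score": 4.5, "reason": "Good flow", "threshold": 3.0,
    #  "properties": {"explanation": "Detailed analysis...", "confidence": 0.95}}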

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../azure/ai/evaluation/_evaluate/_evaluate.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index 1339fd7cde94..f065911ab7a8 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -2576,7 +2576,8 @@ def _extract_metric_values(
             "score": 4.5,
             "coherence_reason": "Good flow",
             "threshold": 3.0,
-            "sample": {...}
+            "sample": {...},
+            "properties": {"explanation": "Detailed analysis...", "confidence": 0.95}
         }
 
         expected_metrics = ["score"]
@@ -2586,9 +2587,14 @@
                 "score": 4.5,
                 "reason": "Good flow",
                 "threshold": 3.0,
-                "sample": {...}
+                "sample": {...},
+                "properties": {"explanation": "Detailed analysis...", "confidence": 0.95}
             }
         }
+
+    Note: If a ``properties`` key is present in the metrics dict and its value is a dict,
+    it is extracted and attached to every per-metric result entry. This allows evaluators
+    to return additional output fields alongside standard score/reason/threshold values.
     """
     result_per_metric = {}
     properties = None
@@ -2878,8 +2884,13 @@ def _create_result_object(
             "reason": "Good logical flow",
             "threshold": 3.0,
             "passed": None,
-            "sample": {"input": "...", "output": "..."}
+            "sample": {"input": "...", "output": "..."},
+            "properties": {"explanation": "...", "confidence": 0.95}
         }
+
+    Note: The ``properties`` field is included only when the evaluator returned a
+    properties dict. It carries additional output fields beyond the standard
+    score/label/reason/threshold/passed values.
     """
     # Extract values
     score = metric_values.get("score")

From 99b96b4e20b8e046387fee8f621a757a69f9d29b Mon Sep 17 00:00:00 2001
From: Waqas Javed <7674577+w-javed@users.noreply.github.com>
Date: Thu, 2 Apr 2026 18:33:03 -0700
Subject: [PATCH 3/7] fix: log warning when properties is not a dict

Address PR review: warn users when their custom evaluator returns
'properties' as a non-dict type so they can fix the output format. Also add
properties to _create_result_object example input.
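
For illustration, two hypothetical evaluator outputs showing the cases this
change distinguishes (example values only, not tied to any built-in evaluator):

    # Dict-valued properties are passed through to the result object
    good_output = {"score": 3.0, "properties": {"tone": "neutral"}}
    # A non-dict value triggers the warning and is dropped from the result
    bad_output = {"score": 3.0, "properties": "not_a_dict"}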

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../azure/ai/evaluation/_evaluate/_evaluate.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index f065911ab7a8..e99b5f127c9d 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -2600,8 +2600,15 @@ def _extract_metric_values(
     properties = None
     for metric_key, metric_value in metrics.items():
-        if metric_key == "properties" and isinstance(metric_value, dict):
-            properties = metric_value
+        if metric_key == "properties":
+            if isinstance(metric_value, dict):
+                properties = metric_value
+            else:
+                logger.warning(
+                    "Evaluator '%s' returned 'properties' as %s instead of dict; ignoring.",
+                    criteria_name,
+                    type(metric_value).__name__,
+                )
             continue
         metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
         temp_result_per_metric = {}
         if metric not in result_per_metric:
@@ -2870,7 +2877,8 @@ def _create_result_object(
             "score": 4.5,
             "reason": "Good logical flow",
             "threshold": 3.0,
-            "sample": {"input": "...", "output": "..."}
+            "sample": {"input": "...", "output": "..."},
+            "properties": {"explanation": "...", "confidence": 0.95}
         }
 
         criteria_type = "quality"

From 330ec9b85811d3f66726e17f115ae5858ca2d36d Mon Sep 17 00:00:00 2001
From: Waqas Javed <7674577+w-javed@users.noreply.github.com>
Date: Fri, 3 Apr 2026 00:16:23 -0700
Subject: [PATCH 4/7] Release-1-16-4

---
 sdk/evaluation/azure-ai-evaluation/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 1f8417b289ed..9d6556378469 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Release History
 
-## 1.16.4 (Unreleased)
+## 1.16.4 (2026-04-03)
 
 ### Features Added
 

From 38f57817fc93a8a15c9f8522138f0593d2356663 Mon Sep 17 00:00:00 2001
From: Sydney Lister
Date: Fri, 3 Apr 2026 10:25:28 -0400
Subject: [PATCH 5/7] Fix stray space in _eval_metric.value attribute access

Remove the erroneous space in self._eval_metric. value (two occurrences) in
the code that builds the result keys for the _details and _total_tokens
fields, so the attribute access reads consistently with the surrounding usage.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
index f9c5ab099029..ea75955e1c81 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -321,7 +321,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
 
         # Extract details from scoreProperties
         if score_properties:
-            parsed_result[f"{self._eval_metric. value}_details"] = _prepare_details(score_properties)
+            parsed_result[f"{self._eval_metric.value}_details"] = _prepare_details(score_properties)
 
         # Extract token counts from metrics
         metrics = properties.get("metrics", {})
@@ -339,7 +339,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
             total_tokens = ""
 
         # Add token metadata (matching old format)
-        parsed_result[f"{self._eval_metric. value}_total_tokens"] = total_tokens
+        parsed_result[f"{self._eval_metric.value}_total_tokens"] = total_tokens
         parsed_result[f"{self._eval_metric.value}_prompt_tokens"] = prompt_tokens
         parsed_result[f"{self._eval_metric.value}_completion_tokens"] = completion_tokens

From 1aea0821c4a34629e11fab2a179aedb20b48af3d Mon Sep 17 00:00:00 2001
From: Sydney Lister
Date: Fri, 3 Apr 2026 11:15:54 -0400
Subject: [PATCH 6/7] Remove empty changelog sections to fix Build Analyze check

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 sdk/evaluation/azure-ai-evaluation/CHANGELOG.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 9d6556378469..e2e66824f47c 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -6,12 +6,6 @@
 
 - Added support for evaluator `properties` passthrough in AOAI evaluation results. When an evaluator returns a `properties` dict, it is included alongside `score`, `label`, `reason`, `threshold`, and `passed` in the result object.
 
-### Breaking Changes
-
-### Bugs Fixed
-
-### Other Changes
-
 ## 1.16.3 (2026-04-01)
 
 ### Features Added

From 38055cb30ee27dd834d7e1a7bcb6bdbbd4b0ab64 Mon Sep 17 00:00:00 2001
From: Waqas Javed <7674577+w-javed@users.noreply.github.com>
Date: Fri, 3 Apr 2026 11:43:39 -0700
Subject: [PATCH 7/7] Address PR feedback: copy properties dict and add non-dict test

- Use properties.copy() to avoid shared dict reference across metrics
- Add test for non-dict properties logging and omission
- Change properties type mismatch log level from warning to info

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../ai/evaluation/_evaluate/_evaluate.py |  4 +-
 .../tests/unittests/test_evaluate.py     | 42 +++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index e99b5f127c9d..149ad3cf4adc 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -2604,7 +2604,7 @@ def _extract_metric_values(
             if isinstance(metric_value, dict):
                 properties = metric_value
             else:
-                logger.warning(
+                logger.info(
                     "Evaluator '%s' returned 'properties' as %s instead of dict; ignoring.",
                     criteria_name,
                     type(metric_value).__name__,
@@ -2632,7 +2632,7 @@ def _extract_metric_values(
     if properties is not None:
         for metric_dict in result_per_metric.values():
             if metric_dict is not None and len(metric_dict) > 0:
-                metric_dict["properties"] = properties
+                metric_dict["properties"] = properties.copy()
 
     empty_metrics = []
     empty_metrics.extend(
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 257e13da7053..ce19c8c8c1c8 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -1402,6 +1402,48 @@ def run_test():
 
+        # Test that non-dict properties logs an info message and is omitted
+        non_dict_property_results = {
+            "metrics": {},
+            "rows": [
+                {
+                    "inputs.query": "test query",
+                    "outputs.friendly_eval.score": 3.0,
+                    "outputs.friendly_eval.score_threshold": 3,
+                    "outputs.friendly_eval.score_result": "Pass",
+                    "outputs.friendly_eval.score_reason": "Acceptable",
+                    "outputs.friendly_eval.properties": "not_a_dict",
+                }
+            ],
+            "studio_url": None,
+        }
+
+        with patch.object(logger, "info") as mock_info:
+            _convert_results_to_aoai_evaluation_results(
+                results=non_dict_property_results,
+                logger=logger,
+                eval_run_id=eval_run_id,
+                eval_id=eval_id,
+                evaluators={"friendly_eval": lambda **kwargs: {"score": 1}},
+                eval_meta_data={
+                    "testing_criteria": [
+                        {
+                            "name": "friendly_eval",
+                            "type": "quality",
+                            "metrics": ["score"],
+                        }
+                    ]
+                },
+            )
+
+        non_dict_result = non_dict_property_results["_evaluation_results_list"][0]["results"][0]
+        assert "properties" not in non_dict_result
+        mock_info.assert_any_call(
+            "Evaluator '%s' returned 'properties' as %s instead of dict; ignoring.",
+            "friendly_eval",
+            "str",
+        )
+
 @patch(
     "azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
     return_value={},