8 changes: 7 additions & 1 deletion sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -1,6 +1,12 @@
# Release History

## 1.16.3 (Unreleased)
## 1.16.4 (2026-04-03)

### Features Added

- Added support for evaluator `properties` passthrough in AOAI evaluation results. When an evaluator returns a `properties` dict, it is included alongside `score`, `label`, `reason`, `threshold`, and `passed` in the result object.

## 1.16.3 (2026-04-01)

### Features Added

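The changelog entry above can be illustrated with a short sketch. Everything below is hypothetical — the evaluator name, field values, and the exact shape of the result entry are assumptions based on the changelog description and the docstring examples later in this PR, not a verbatim SDK API.

```python
# Hypothetical illustration of the properties passthrough described in the
# 1.16.4 changelog entry; names and values are made up for this sketch.

def friendliness_evaluator(*, query: str, response: str) -> dict:
    """A custom evaluator that returns extra output fields via 'properties'."""
    return {
        "score": 4.5,
        "reason": "The response was warm and helpful.",
        "threshold": 3.0,
        # Anything placed under 'properties' (as a dict) is expected to be
        # carried through to the converted AOAI-style result unchanged.
        "properties": {"explanation": "Detailed analysis...", "confidence": 0.95},
    }

# Per the changelog, the converted per-criteria result entry should then
# include 'properties' alongside the standard fields, roughly:
expected_result_entry = {
    "score": 4.5,
    "label": "Pass",
    "reason": "The response was warm and helpful.",
    "threshold": 3.0,
    "passed": True,
    "properties": {"explanation": "Detailed analysis...", "confidence": 0.95},
}
```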
@@ -2576,7 +2576,8 @@ def _extract_metric_values(
"score": 4.5,
"coherence_reason": "Good flow",
"threshold": 3.0,
"sample": {...}
"sample": {...},
"properties": {"explanation": "Detailed analysis...", "confidence": 0.95}
}
expected_metrics = ["score"]

@@ -2586,13 +2587,29 @@
"score": 4.5,
"reason": "Good flow",
"threshold": 3.0,
"sample": {...}
"sample": {...},
"properties": {"explanation": "Detailed analysis...", "confidence": 0.95}
}
}

Note: If a ``properties`` key is present in the metrics dict and its value is a dict,
it is extracted and attached to every per-metric result entry. This allows evaluators
to return additional output fields alongside standard score/reason/threshold values.
"""
result_per_metric = {}
properties = None

for metric_key, metric_value in metrics.items():
if metric_key == "properties":
if isinstance(metric_value, dict):
properties = metric_value
else:
logger.info(
"Evaluator '%s' returned 'properties' as %s instead of dict; ignoring.",
criteria_name,
type(metric_value).__name__,
)
continue
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
temp_result_per_metric = {}
if metric not in result_per_metric:
@@ -2612,6 +2629,11 @@
if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)

if properties is not None:
for metric_dict in result_per_metric.values():
if metric_dict is not None and len(metric_dict) > 0:
metric_dict["properties"] = properties.copy()

empty_metrics = []
empty_metrics.extend(
metric for metric, metric_dict in result_per_metric.items() if metric_dict is None or len(metric_dict) == 0
@@ -2855,7 +2877,8 @@ def _create_result_object(
"score": 4.5,
"reason": "Good logical flow",
"threshold": 3.0,
"sample": {"input": "...", "output": "..."}
"sample": {"input": "...", "output": "..."},
"properties": {"explanation": "...", "confidence": 0.95}
}
criteria_type = "quality"

@@ -2869,8 +2892,13 @@
"reason": "Good logical flow",
"threshold": 3.0,
"passed": None,
"sample": {"input": "...", "output": "..."}
"sample": {"input": "...", "output": "..."},
"properties": {"explanation": "...", "confidence": 0.95}
}

Note: The ``properties`` field is included only when the evaluator returned a
properties dict. It carries additional output fields beyond the standard
score/label/reason/threshold/passed values.
"""
# Extract values
score = metric_values.get("score")
@@ -2879,6 +2907,7 @@
threshold = metric_values.get("threshold")
passed = metric_values.get("passed")
sample = metric_values.get("sample")
properties = metric_values.get("properties")

# Handle decrease boolean metrics
if is_inverse:
Expand All @@ -2898,6 +2927,8 @@ def _create_result_object(

if sample is not None:
result_obj["sample"] = sample
if properties is not None:
result_obj["properties"] = properties

return result_obj

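For readers skimming the diff, here is a minimal standalone sketch of the same attach-and-validate step, under the assumption that the metrics dict looks like the docstring example above. The helper name and simplified signature are illustrative only; the real logic lives inside `_extract_metric_values`.

```python
import logging

logger = logging.getLogger(__name__)


def attach_properties(metrics: dict, result_per_metric: dict, criteria_name: str) -> None:
    """Sketch: copy a dict-valued 'properties' entry onto every populated
    per-metric result; log and ignore any non-dict value."""
    properties = metrics.get("properties")
    if properties is None:
        return
    if not isinstance(properties, dict):
        logger.info(
            "Evaluator '%s' returned 'properties' as %s instead of dict; ignoring.",
            criteria_name,
            type(properties).__name__,
        )
        return
    for metric_dict in result_per_metric.values():
        if metric_dict:  # skip None or empty entries
            metric_dict["properties"] = properties.copy()
```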
@@ -321,7 +321,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:

# Extract details from scoreProperties
if score_properties:
parsed_result[f"{self._eval_metric. value}_details"] = _prepare_details(score_properties)
parsed_result[f"{self._eval_metric.value}_details"] = _prepare_details(score_properties)

# Extract token counts from metrics
metrics = properties.get("metrics", {})
Expand All @@ -339,7 +339,7 @@ def _parse_eval_result(self, eval_result) -> Dict[str, T]:
total_tokens = ""

# Add token metadata (matching old format)
parsed_result[f"{self._eval_metric. value}_total_tokens"] = total_tokens
parsed_result[f"{self._eval_metric.value}_total_tokens"] = total_tokens
parsed_result[f"{self._eval_metric.value}_prompt_tokens"] = prompt_tokens
parsed_result[f"{self._eval_metric.value}_completion_tokens"] = completion_tokens

@@ -3,4 +3,4 @@
# ---------------------------------------------------------
# represents upcoming version

VERSION = "1.16.3"
VERSION = "1.16.4"
@@ -1353,6 +1353,97 @@ def run_test():
assert len(empty_converted["_evaluation_results_list"]) == 0
assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0

# Test properties passthrough for custom evaluators
property_results = {
"metrics": {},
"rows": [
{
"inputs.query": "test query",
"outputs.friendly_eval.score": 4.5,
"outputs.friendly_eval.score_threshold": 3,
"outputs.friendly_eval.score_result": "Pass",
"outputs.friendly_eval.score_reason": "The response was warm",
"outputs.friendly_eval.properties": {
"explanation": "Detailed reasoning about friendliness",
"tone": "warm",
"confidence": "high",
},
}
],
"studio_url": None,
}

_convert_results_to_aoai_evaluation_results(
results=property_results,
logger=logger,
eval_run_id=eval_run_id,
eval_id=eval_id,
evaluators={"friendly_eval": lambda **kwargs: {"score": 1}},
eval_meta_data={
"testing_criteria": [
{
"name": "friendly_eval",
"type": "quality",
"metrics": ["score"],
}
]
},
)

property_result = property_results["_evaluation_results_list"][0]["results"][0]
assert property_result["score"] == 4.5
assert property_result["label"] == "Pass"
assert property_result["reason"] == "The response was warm"
assert property_result["threshold"] == 3
assert property_result["properties"] == {
"explanation": "Detailed reasoning about friendliness",
"tone": "warm",
"confidence": "high",
}
assert "explanation" not in property_result

# Test that a non-dict properties value logs an info message and is omitted
non_dict_property_results = {
"metrics": {},
"rows": [
{
"inputs.query": "test query",
"outputs.friendly_eval.score": 3.0,
"outputs.friendly_eval.score_threshold": 3,
"outputs.friendly_eval.score_result": "Pass",
"outputs.friendly_eval.score_reason": "Acceptable",
"outputs.friendly_eval.properties": "not_a_dict",
}
],
"studio_url": None,
}

with patch.object(logger, "info") as mock_info:
_convert_results_to_aoai_evaluation_results(
results=non_dict_property_results,
logger=logger,
eval_run_id=eval_run_id,
eval_id=eval_id,
evaluators={"friendly_eval": lambda **kwargs: {"score": 1}},
eval_meta_data={
"testing_criteria": [
{
"name": "friendly_eval",
"type": "quality",
"metrics": ["score"],
}
]
},
)

non_dict_result = non_dict_property_results["_evaluation_results_list"][0]["results"][0]
assert "properties" not in non_dict_result
mock_info.assert_any_call(
"Evaluator '%s' returned 'properties' as %s instead of dict; ignoring.",
"friendly_eval",
"str",
)

@patch(
"azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
return_value={},