diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 2413945889e1..9d0741633f80 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -1,10 +1,8 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -import math import os import logging -import re from typing import Dict, List, Union, TypeVar, Optional from typing_extensions import overload, override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -15,7 +13,7 @@ ErrorTarget, EvaluationException, ) -from ..._common.utils import check_score_is_valid +from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response from azure.ai.evaluation._common._experimental import experimental logger = logging.getLogger(__name__) @@ -222,24 +220,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ - # Import helper functions from base class module - from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( - _is_intermediate_response, - _preprocess_messages, - ) - - # Check for intermediate response - if _is_intermediate_response(eval_input.get("response")): - return self._not_applicable_result( - "Intermediate response. Please provide the agent's final response for evaluation.", - self.threshold, - has_details=True, - ) - - # Preprocess messages if they are lists - if isinstance(eval_input.get("response"), list): - eval_input["response"] = _preprocess_messages(eval_input["response"]) - if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Call Accuracy evaluator."), @@ -249,8 +229,30 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) - if isinstance(eval_input.get("query"), list): - eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Reformat conversation history for cleaner evaluation + eval_input["query"] = reformat_conversation_history( + eval_input["query"], logger, include_system_messages=True, include_tool_messages=True + ) + + # Reformat tool_calls for cleaner evaluation. + # Reconstruct a proper message structure from the already-extracted tool call dicts: + # - one assistant message per tool_call content item + # - one role="tool" message per tool call that has an attached tool_result + # (only present when response was parsed via _parse_tools_from_response) + if isinstance(eval_input.get("tool_calls"), list): + tool_call_items = eval_input["tool_calls"] + messages = [] + for tc in tool_call_items: + messages.append({"role": "assistant", "content": [tc]}) + if "tool_result" in tc: + messages.append( + { + "role": "tool", + "tool_call_id": tc.get("tool_call_id"), + "content": [{"type": "tool_result", "tool_result": tc["tool_result"]}], + } + ) + eval_input["tool_calls"] = reformat_agent_response(messages, include_tool_messages=True) # Single LLM call for all tool calls prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) @@ -307,9 +309,28 @@ async def _real_call(self, **kwargs): :return: The evaluation result :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]] """ + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # Validate input before processing self._validator.validate_eval_input(kwargs) + # Check for intermediate response and preprocess both response and query + # before parsing tool calls, so _convert_kwargs_to_eval_input operates on clean data + response = kwargs.get("response") + if _is_intermediate_response(response): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self.threshold, + has_details=True, + ) + if isinstance(response, list): + kwargs["response"] = _preprocess_messages(response) + if isinstance(kwargs.get("query"), list): + kwargs["query"] = _preprocess_messages(kwargs["query"]) + # Convert inputs into list of evaluable inputs. eval_input = self._convert_kwargs_to_eval_input(**kwargs) if isinstance(eval_input, dict) and eval_input.get("error_message"): diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 8b23a56da3f6..bf6a264bcd83 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -4,48 +4,65 @@ from azure.ai.evaluation import ToolCallAccuracyEvaluator from azure.ai.evaluation._exceptions import EvaluationException - # This mock should return a dictionary that mimics the output of the prompty (the _flow call), # which is then processed by the _do_eval method. +import re as _re + + +def _parse_tool_names_from_string(tool_calls_str): + """Parse tool names from reformatted [TOOL_CALL] lines.""" + pattern = r"\[TOOL_CALL\]\s+(\S+)\(" + return _re.findall(pattern, tool_calls_str) + + +_BUILTIN_TOOL_NAMES = [ + "bing_custom_search", + "bing_grounding", + "file_search", + "azure_ai_search", + "fabric_dataagent", + "code_interpreter", + "sharepoint_grounding", + "openapi", +] + + async def flow_side_effect(timeout, **kwargs): tool_calls = kwargs.get("tool_calls", []) query = kwargs.get("query", "") - # Handle built-in tool calls first - count them as relevant - builtin_calls = 0 - custom_function_calls = [] - - for tc in tool_calls: - tool_type = tc.get("type", "") - tool_name = tc.get("name", "") - - # Only support converter format: {type: "tool_call", name: "tool_name", arguments: {...}} - if tool_type == "tool_call": - if tool_name in [ - "bing_custom_search", - "bing_grounding", - "file_search", - "azure_ai_search", - "fabric_dataagent", - "code_interpreter", - "sharepoint_grounding", - "openapi", - ]: - builtin_calls += 1 - else: - # custom function tool call - custom_function_calls.append(tc) - - # Handle traditional function tool calls with tool_call_id only for non-built-in tools - good_calls = sum(1 for tc in custom_function_calls if "good" in tc.get("tool_call_id", "")) - bad_calls = sum(1 for tc in custom_function_calls if "bad" in tc.get("tool_call_id", "")) - invalid_calls = sum(1 for tc in custom_function_calls if "invalid" in tc.get("tool_call_id", "")) - - total_calls = len(tool_calls) - total_good_calls = good_calls + builtin_calls + # Handle reformatted string tool_calls (after reformat_agent_response) + if isinstance(tool_calls, str): + tool_names = _parse_tool_names_from_string(tool_calls) + builtin_calls = sum(1 for n in tool_names if n in _BUILTIN_TOOL_NAMES) + custom_names = [n for n in tool_names if n not in _BUILTIN_TOOL_NAMES] + good_calls = sum(1 for n in custom_names if "good" in n) + bad_calls = sum(1 for n in custom_names if "bad" in n) + invalid_calls = sum(1 for n in custom_names if "invalid" in n) + total_calls = len(tool_names) + total_good_calls = good_calls + builtin_calls + else: + # Handle dict-based tool_calls (legacy path) + builtin_calls = 0 + custom_function_calls = [] + + for tc in tool_calls: + tool_type = tc.get("type", "") + tool_name = tc.get("name", "") + + if tool_type == "tool_call": + if tool_name in _BUILTIN_TOOL_NAMES: + builtin_calls += 1 + else: + custom_function_calls.append(tc) + + good_calls = sum(1 for tc in custom_function_calls if "good" in tc.get("name", "")) + bad_calls = sum(1 for tc in custom_function_calls if "bad" in tc.get("name", "")) + invalid_calls = sum(1 for tc in custom_function_calls if "invalid" in tc.get("name", "")) + total_calls = len(tool_calls) + total_good_calls = good_calls + builtin_calls if invalid_calls > 0: - # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid return { "llm_output": { "chain_of_thought": "The tool calls were very correct that I returned a huge number!", @@ -85,20 +102,20 @@ def test_evaluate_tools_valid1(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_good", - "name": "fetch_weather", + "tool_call_id": "call_1", + "name": "fetch_weather_good", "arguments": {"location": "Paris"}, }, { "type": "tool_call", - "tool_call_id": "call_bad", - "name": "buy_jacket", + "tool_call_id": "call_2", + "name": "buy_jacket_bad", "arguments": {"type": "raincoat"}, }, ] tool_definitions = [ { - "name": "fetch_weather", + "name": "fetch_weather_good", "type": "function", "description": "Fetches the weather information for the specified location.", "parameters": { @@ -112,7 +129,7 @@ def test_evaluate_tools_valid1(self, mock_model_config): }, }, { - "name": "buy_jacket", + "name": "buy_jacket_bad", "type": "function", "description": "Buy a jacket of the given type.", "parameters": { @@ -147,20 +164,20 @@ def test_evaluate_tools_valid2(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_bad", - "name": "fetch_weather", + "tool_call_id": "call_1", + "name": "fetch_weather_bad", "arguments": {"location": "Tokyo"}, }, { "type": "tool_call", - "tool_call_id": "call_bad", - "name": "buy_jacket", + "tool_call_id": "call_2", + "name": "buy_jacket_bad", "arguments": {"type": "raincoat"}, }, ] tool_definitions = [ { - "name": "fetch_weather", + "name": "fetch_weather_bad", "type": "function", "description": "Fetches the weather information for the specified location.", "parameters": { @@ -174,7 +191,7 @@ def test_evaluate_tools_valid2(self, mock_model_config): }, }, { - "name": "buy_jacket", + "name": "buy_jacket_bad", "type": "function", "description": "Buy a jacket of the given type.", "parameters": { @@ -209,20 +226,20 @@ def test_evaluate_tools_valid3(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_good", - "name": "fetch_weather", + "tool_call_id": "call_1", + "name": "fetch_weather_good", "arguments": {"location": "Paris"}, }, { "type": "tool_call", - "tool_call_id": "call_good", - "name": "buy_jacket", + "tool_call_id": "call_2", + "name": "buy_jacket_good", "arguments": {"type": "jacket"}, }, ] tool_definitions = [ { - "name": "fetch_weather", + "name": "fetch_weather_good", "type": "function", "description": "Fetches the weather information for the specified location.", "parameters": { @@ -236,7 +253,7 @@ def test_evaluate_tools_valid3(self, mock_model_config): }, }, { - "name": "buy_jacket", + "name": "buy_jacket_good", "type": "function", "description": "Buy a jacket of the given type.", "parameters": { @@ -272,14 +289,14 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_invalid", - "name": "fetch_weather", + "tool_call_id": "call_1", + "name": "fetch_weather_invalid", "arguments": {"location": "Tokyo"}, }, ] tool_definitions = [ { - "name": "fetch_weather", + "name": "fetch_weather_invalid", "type": "function", "description": "Fetches the weather information for the specified location.", "parameters": { @@ -306,20 +323,20 @@ def test_evaluate_tools_some_missing_tool_definitions(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_good", - "name": "fetch_weather", + "tool_call_id": "call_1", + "name": "fetch_weather_good", "arguments": {"location": "Tokyo"}, }, { "type": "tool_call", - "tool_call_id": "call_bad", - "name": "buy_jacket", + "tool_call_id": "call_2", + "name": "buy_jacket_bad", "arguments": {"type": "raincoat"}, }, ] tool_definitions = [ { - "name": "fetch_weather", + "name": "fetch_weather_good", "type": "function", "description": "Fetches the weather information for the specified location.", "parameters": { @@ -331,7 +348,7 @@ def test_evaluate_tools_some_missing_tool_definitions(self, mock_model_config): } }, }, - }, # buy_jacket definition is missing + }, # buy_jacket_bad definition is missing ] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) @@ -354,14 +371,14 @@ def test_evaluate_tools_built_in_tool_definition(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_good", - "name": "fetch_weather", + "tool_call_id": "call_1", + "name": "fetch_weather_good", "arguments": {"location": "Tokyo"}, }, ] tool_definitions = [ { - "name": "fetch_weather", + "name": "fetch_weather_good", "type": "some_built_in", # Not a 'function' type but shouldn't be filtered out "description": "Fetches the weather information for the specified location.", "parameters": { @@ -664,8 +681,8 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_builtin_good", - "name": "get_countries_LookupCountryByCurrency", + "tool_call_id": "call_1", + "name": "get_countries_LookupCountryByCurrency_good", "arguments": {"currency": "GBP"}, }, ] @@ -711,7 +728,7 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config): "auth": {"type": "anonymous", "security_scheme": {}}, "functions": [ { - "name": "get_countries_LookupCountryByCurrency", + "name": "get_countries_LookupCountryByCurrency_good", "type": "function", "description": "Search by currency.", "parameters": { @@ -740,14 +757,14 @@ def test_evaluate_missing_query(self, mock_model_config): tool_calls = [ { "type": "tool_call", - "tool_call_id": "call_good", - "name": "get_weather", + "tool_call_id": "call_1", + "name": "get_weather_good", "arguments": {"location": "Paris"}, } ] tool_definitions = [ { - "name": "get_weather", + "name": "get_weather_good", "type": "function", "description": "Get weather information", "parameters": { @@ -769,3 +786,239 @@ def test_evaluate_missing_query(self, mock_model_config): evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Query is a required input" in str(exc_info.value) + + def test_evaluate_with_conversation_history_query(self, mock_model_config): + """Test that query provided as a list of messages (conversation history) is reformatted correctly.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + query = [ + {"role": "system", "content": "You are a helpful weather assistant."}, + {"role": "user", "content": "What's the weather like in Paris?"}, + {"role": "assistant", "content": "Let me check that for you."}, + ] + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_1", + "name": "fetch_weather_good", + "arguments": {"location": "Paris"}, + } + ] + tool_definitions = [ + { + "name": "fetch_weather_good", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + }, + } + ] + + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + + key = ToolCallAccuracyEvaluator._RESULT_KEY + assert result is not None + assert key in result + assert f"{key}_result" in result + assert result[key] == 5.0 + assert result[f"{key}_result"] == "pass" + + def test_evaluate_with_response_list(self, mock_model_config): + """Test that response provided as a list of messages containing tool calls is reformatted correctly.""" + + async def flow_side_effect_string_tool_calls(timeout, **kwargs): + # When tool_calls is a reformatted string, return a fixed passing score + tool_calls = kwargs.get("tool_calls", "") + if isinstance(tool_calls, str) and tool_calls: + return { + "llm_output": { + "chain_of_thought": "Tool calls were reformatted and evaluated.", + "tool_calls_success_level": 5, + "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1}, + } + } + return { + "llm_output": { + "chain_of_thought": "No tool calls found.", + "tool_calls_success_level": 1, + "details": {}, + } + } + + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect_string_tool_calls) + + query = "What's the weather in Paris?" + response = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_123", + "name": "fetch_weather", + "arguments": {"location": "Paris"}, + } + ], + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + }, + } + ] + + result = evaluator(query=query, response=response, tool_definitions=tool_definitions) + + key = ToolCallAccuracyEvaluator._RESULT_KEY + assert result is not None + assert key in result + assert f"{key}_result" in result + assert result[key] == 5.0 + assert result[f"{key}_result"] == "pass" + + def test_tool_calls_reformatting_is_applied(self, mock_model_config): + """Verify that when response is provided as a list, the tool_calls passed to _flow + is the reformatted string produced by reformat_agent_response, not the raw message list.""" + captured_kwargs = {} + + async def capturing_flow(timeout, **kwargs): + captured_kwargs.update(kwargs) + return { + "llm_output": { + "chain_of_thought": "Verified reformatted tool_calls string.", + "tool_calls_success_level": 5, + "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1}, + } + } + + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=capturing_flow) + + query = "What's the weather in Paris?" + response = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_123", + "name": "fetch_weather", + "arguments": {"location": "Paris"}, + } + ], + } + ] + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + }, + } + ] + + evaluator(query=query, response=response, tool_definitions=tool_definitions) + + # tool_calls should be a reformatted string, not a raw list + tool_calls_sent = captured_kwargs.get("tool_calls") + assert isinstance( + tool_calls_sent, str + ), f"Expected tool_calls to be a reformatted string, got {type(tool_calls_sent)}" + assert "[TOOL_CALL]" in tool_calls_sent + assert "fetch_weather" in tool_calls_sent + assert "Paris" in tool_calls_sent + + def test_tool_result_included_in_reformatted_tool_calls(self, mock_model_config): + """Verify that tool results are included in reformatted tool_calls when response + contains a role=tool message, and are absent when tool_calls is passed directly.""" + captured_kwargs = {} + + async def capturing_flow(timeout, **kwargs): + captured_kwargs.update(kwargs) + return { + "llm_output": { + "chain_of_thought": "Evaluated.", + "tool_calls_success_level": 5, + "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1}, + } + } + + tool_definitions = [ + { + "name": "fetch_weather", + "type": "function", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}}, + }, + } + ] + + # Case 1: response includes a tool result message — [TOOL_RESULT] should appear + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=capturing_flow) + + response_with_result = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_123", + "name": "fetch_weather", + "arguments": {"location": "Paris"}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_123", + "content": [{"type": "tool_result", "tool_result": '{"temperature": "15C", "condition": "sunny"}'}], + }, + ] + + evaluator( + query="What's the weather in Paris?", response=response_with_result, tool_definitions=tool_definitions + ) + tool_calls_sent = captured_kwargs.get("tool_calls") + assert isinstance(tool_calls_sent, str) + assert "[TOOL_CALL]" in tool_calls_sent + assert ( + "[TOOL_RESULT]" in tool_calls_sent + ), "Tool result should be included when response contains a tool result message" + assert "15C" in tool_calls_sent + + # Case 2: tool_calls passed directly (no tool_result field) — [TOOL_RESULT] must not appear + captured_kwargs.clear() + evaluator._flow = MagicMock(side_effect=capturing_flow) + + direct_tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_1", + "name": "fetch_weather", + "arguments": {"location": "Paris"}, + } + ] + + evaluator(query="What's the weather in Paris?", tool_calls=direct_tool_calls, tool_definitions=tool_definitions) + tool_calls_sent = captured_kwargs.get("tool_calls") + assert isinstance(tool_calls_sent, str) + assert "[TOOL_CALL]" in tool_calls_sent + assert ( + "[TOOL_RESULT]" not in tool_calls_sent + ), "Tool result must not appear when tool_calls has no attached result"