diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 2413945889e1..9d0741633f80 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -1,10 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import math
 import os
 import logging
-import re
 from typing import Dict, List, Union, TypeVar, Optional
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -15,7 +13,7 @@
     ErrorTarget,
     EvaluationException,
 )
-from ..._common.utils import check_score_is_valid
+from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
 from azure.ai.evaluation._common._experimental import experimental
 
 logger = logging.getLogger(__name__)
@@ -222,24 +220,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
-        # Import helper functions from base class module
-        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
-            _is_intermediate_response,
-            _preprocess_messages,
-        )
-
-        # Check for intermediate response
-        if _is_intermediate_response(eval_input.get("response")):
-            return self._not_applicable_result(
-                "Intermediate response. Please provide the agent's final response for evaluation.",
-                self.threshold,
-                has_details=True,
-            )
-
-        # Preprocess messages if they are lists
-        if isinstance(eval_input.get("response"), list):
-            eval_input["response"] = _preprocess_messages(eval_input["response"])
-
         if eval_input.get("query") is None:
             raise EvaluationException(
                 message=("Query is a required input to the Tool Call Accuracy evaluator."),
@@ -249,8 +229,30 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
             )
 
-        if isinstance(eval_input.get("query"), list):
-            eval_input["query"] = _preprocess_messages(eval_input["query"])
+        # Reformat conversation history for cleaner evaluation
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        )
+
+        # Reformat tool_calls for cleaner evaluation.
+        # Reconstruct a proper message structure from the already-extracted tool call dicts:
+        # - one assistant message per tool_call content item
+        # - one role="tool" message per tool call that has an attached tool_result
+        #   (only present when response was parsed via _parse_tools_from_response)
+        if isinstance(eval_input.get("tool_calls"), list):
+            tool_call_items = eval_input["tool_calls"]
+            messages = []
+            for tc in tool_call_items:
+                messages.append({"role": "assistant", "content": [tc]})
+                if "tool_result" in tc:
+                    messages.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": tc.get("tool_call_id"),
+                            "content": [{"type": "tool_result", "tool_result": tc["tool_result"]}],
+                        }
+                    )
+            eval_input["tool_calls"] = reformat_agent_response(messages, include_tool_messages=True)
 
         # Single LLM call for all tool calls
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
@@ -307,9 +309,28 @@ async def _real_call(self, **kwargs):
         :return: The evaluation result
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         # Validate input before processing
         self._validator.validate_eval_input(kwargs)
 
+        # Check for intermediate response and preprocess both response and query
+        # before parsing tool calls, so _convert_kwargs_to_eval_input operates on clean data
+        response = kwargs.get("response")
+        if _is_intermediate_response(response):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self.threshold,
+                has_details=True,
+            )
+        if isinstance(response, list):
+            kwargs["response"] = _preprocess_messages(response)
+        if isinstance(kwargs.get("query"), list):
+            kwargs["query"] = _preprocess_messages(kwargs["query"])
+
         # Convert inputs into list of evaluable inputs.
         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
         if isinstance(eval_input, dict) and eval_input.get("error_message"):
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
index 8b23a56da3f6..bf6a264bcd83 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
@@ -4,48 +4,65 @@
 from azure.ai.evaluation import ToolCallAccuracyEvaluator
 from azure.ai.evaluation._exceptions import EvaluationException
 
-
 # This mock should return a dictionary that mimics the output of the prompty (the _flow call),
 # which is then processed by the _do_eval method.
+import re as _re
+
+
+def _parse_tool_names_from_string(tool_calls_str):
+    """Return, in order, the tool name of each ``[TOOL_CALL] name(...)`` entry in the reformatted string."""
+    pattern = r"\[TOOL_CALL\]\s+([^\s(]+)\("  # stop at the first "(" so one inside the arguments is not captured
+    return _re.findall(pattern, tool_calls_str)
+
+
+_BUILTIN_TOOL_NAMES = [  # tool names flow_side_effect treats as built-in and always counts as good calls
+    "bing_custom_search",
+    "bing_grounding",
+    "file_search",
+    "azure_ai_search",
+    "fabric_dataagent",
+    "code_interpreter",
+    "sharepoint_grounding",
+    "openapi",
+]
+
+
 async def flow_side_effect(timeout, **kwargs):
     tool_calls = kwargs.get("tool_calls", [])
     query = kwargs.get("query", "")
 
-    # Handle built-in tool calls first - count them as relevant
-    builtin_calls = 0
-    custom_function_calls = []
-
-    for tc in tool_calls:
-        tool_type = tc.get("type", "")
-        tool_name = tc.get("name", "")
-
-        # Only support converter format: {type: "tool_call", name: "tool_name", arguments: {...}}
-        if tool_type == "tool_call":
-            if tool_name in [
-                "bing_custom_search",
-                "bing_grounding",
-                "file_search",
-                "azure_ai_search",
-                "fabric_dataagent",
-                "code_interpreter",
-                "sharepoint_grounding",
-                "openapi",
-            ]:
-                builtin_calls += 1
-            else:
-                # custom function tool call
-                custom_function_calls.append(tc)
-
-    # Handle traditional function tool calls with tool_call_id only for non-built-in tools
-    good_calls = sum(1 for tc in custom_function_calls if "good" in tc.get("tool_call_id", ""))
-    bad_calls = sum(1 for tc in custom_function_calls if "bad" in tc.get("tool_call_id", ""))
-    invalid_calls = sum(1 for tc in custom_function_calls if "invalid" in tc.get("tool_call_id", ""))
-
-    total_calls = len(tool_calls)
-    total_good_calls = good_calls + builtin_calls
+    # Handle reformatted string tool_calls (after reformat_agent_response)
+    if isinstance(tool_calls, str):
+        tool_names = _parse_tool_names_from_string(tool_calls)
+        builtin_calls = sum(1 for n in tool_names if n in _BUILTIN_TOOL_NAMES)
+        custom_names = [n for n in tool_names if n not in _BUILTIN_TOOL_NAMES]
+        good_calls = sum(1 for n in custom_names if "good" in n)
+        bad_calls = sum(1 for n in custom_names if "bad" in n)
+        invalid_calls = sum(1 for n in custom_names if "invalid" in n)
+        total_calls = len(tool_names)
+        total_good_calls = good_calls + builtin_calls
+    else:
+        # Handle dict-based tool_calls (legacy path)
+        builtin_calls = 0
+        custom_function_calls = []
+
+        for tc in tool_calls:
+            tool_type = tc.get("type", "")
+            tool_name = tc.get("name", "")
+
+            if tool_type == "tool_call":
+                if tool_name in _BUILTIN_TOOL_NAMES:
+                    builtin_calls += 1
+                else:
+                    custom_function_calls.append(tc)
+
+        good_calls = sum(1 for tc in custom_function_calls if "good" in tc.get("name", ""))
+        bad_calls = sum(1 for tc in custom_function_calls if "bad" in tc.get("name", ""))
+        invalid_calls = sum(1 for tc in custom_function_calls if "invalid" in tc.get("name", ""))
+        total_calls = len(tool_calls)
+        total_good_calls = good_calls + builtin_calls
 
     if invalid_calls > 0:
-        # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
         return {
             "llm_output": {
                 "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
@@ -85,20 +102,20 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_good",
-                "name": "fetch_weather",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather_good",
                 "arguments": {"location": "Paris"},
             },
             {
                 "type": "tool_call",
-                "tool_call_id": "call_bad",
-                "name": "buy_jacket",
+                "tool_call_id": "call_2",
+                "name": "buy_jacket_bad",
                 "arguments": {"type": "raincoat"},
             },
         ]
         tool_definitions = [
             {
-                "name": "fetch_weather",
+                "name": "fetch_weather_good",
                 "type": "function",
                 "description": "Fetches the weather information for the specified location.",
                 "parameters": {
@@ -112,7 +129,7 @@ def test_evaluate_tools_valid1(self, mock_model_config):
                 },
             },
             {
-                "name": "buy_jacket",
+                "name": "buy_jacket_bad",
                 "type": "function",
                 "description": "Buy a jacket of the given type.",
                 "parameters": {
@@ -147,20 +164,20 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_bad",
-                "name": "fetch_weather",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather_bad",
                 "arguments": {"location": "Tokyo"},
             },
             {
                 "type": "tool_call",
-                "tool_call_id": "call_bad",
-                "name": "buy_jacket",
+                "tool_call_id": "call_2",
+                "name": "buy_jacket_bad",
                 "arguments": {"type": "raincoat"},
             },
         ]
         tool_definitions = [
             {
-                "name": "fetch_weather",
+                "name": "fetch_weather_bad",
                 "type": "function",
                 "description": "Fetches the weather information for the specified location.",
                 "parameters": {
@@ -174,7 +191,7 @@ def test_evaluate_tools_valid2(self, mock_model_config):
                 },
             },
             {
-                "name": "buy_jacket",
+                "name": "buy_jacket_bad",
                 "type": "function",
                 "description": "Buy a jacket of the given type.",
                 "parameters": {
@@ -209,20 +226,20 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_good",
-                "name": "fetch_weather",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather_good",
                 "arguments": {"location": "Paris"},
             },
             {
                 "type": "tool_call",
-                "tool_call_id": "call_good",
-                "name": "buy_jacket",
+                "tool_call_id": "call_2",
+                "name": "buy_jacket_good",
                 "arguments": {"type": "jacket"},
             },
         ]
         tool_definitions = [
             {
-                "name": "fetch_weather",
+                "name": "fetch_weather_good",
                 "type": "function",
                 "description": "Fetches the weather information for the specified location.",
                 "parameters": {
@@ -236,7 +253,7 @@ def test_evaluate_tools_valid3(self, mock_model_config):
                 },
             },
             {
-                "name": "buy_jacket",
+                "name": "buy_jacket_good",
                 "type": "function",
                 "description": "Buy a jacket of the given type.",
                 "parameters": {
@@ -272,14 +289,14 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config):
             tool_calls = [
                 {
                     "type": "tool_call",
-                    "tool_call_id": "call_invalid",
-                    "name": "fetch_weather",
+                    "tool_call_id": "call_1",
+                    "name": "fetch_weather_invalid",
                     "arguments": {"location": "Tokyo"},
                 },
             ]
             tool_definitions = [
                 {
-                    "name": "fetch_weather",
+                    "name": "fetch_weather_invalid",
                     "type": "function",
                     "description": "Fetches the weather information for the specified location.",
                     "parameters": {
@@ -306,20 +323,20 @@ def test_evaluate_tools_some_missing_tool_definitions(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_good",
-                "name": "fetch_weather",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather_good",
                 "arguments": {"location": "Tokyo"},
             },
             {
                 "type": "tool_call",
-                "tool_call_id": "call_bad",
-                "name": "buy_jacket",
+                "tool_call_id": "call_2",
+                "name": "buy_jacket_bad",
                 "arguments": {"type": "raincoat"},
             },
         ]
         tool_definitions = [
             {
-                "name": "fetch_weather",
+                "name": "fetch_weather_good",
                 "type": "function",
                 "description": "Fetches the weather information for the specified location.",
                 "parameters": {
@@ -331,7 +348,7 @@ def test_evaluate_tools_some_missing_tool_definitions(self, mock_model_config):
                         }
                     },
                 },
-            },  # buy_jacket definition is missing
+            },  # buy_jacket_bad definition is missing
         ]
         result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
 
@@ -354,14 +371,14 @@ def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_good",
-                "name": "fetch_weather",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather_good",
                 "arguments": {"location": "Tokyo"},
             },
         ]
         tool_definitions = [
             {
-                "name": "fetch_weather",
+                "name": "fetch_weather_good",
                 "type": "some_built_in",  # Not a 'function' type but shouldn't be filtered out
                 "description": "Fetches the weather information for the specified location.",
                 "parameters": {
@@ -664,8 +681,8 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_builtin_good",
-                "name": "get_countries_LookupCountryByCurrency",
+                "tool_call_id": "call_1",
+                "name": "get_countries_LookupCountryByCurrency_good",
                 "arguments": {"currency": "GBP"},
             },
         ]
@@ -711,7 +728,7 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
                 "auth": {"type": "anonymous", "security_scheme": {}},
                 "functions": [
                     {
-                        "name": "get_countries_LookupCountryByCurrency",
+                        "name": "get_countries_LookupCountryByCurrency_good",
                         "type": "function",
                         "description": "Search by currency.",
                         "parameters": {
@@ -740,14 +757,14 @@ def test_evaluate_missing_query(self, mock_model_config):
         tool_calls = [
             {
                 "type": "tool_call",
-                "tool_call_id": "call_good",
-                "name": "get_weather",
+                "tool_call_id": "call_1",
+                "name": "get_weather_good",
                 "arguments": {"location": "Paris"},
             }
         ]
         tool_definitions = [
             {
-                "name": "get_weather",
+                "name": "get_weather_good",
                 "type": "function",
                 "description": "Get weather information",
                 "parameters": {
@@ -769,3 +786,239 @@ def test_evaluate_missing_query(self, mock_model_config):
             evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
 
         assert "Query is a required input" in str(exc_info.value)
+
+    def test_evaluate_with_conversation_history_query(self, mock_model_config):
+        """Test that a query given as a list of messages (conversation history) is accepted and scores as a pass."""
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect)  # replace the prompty/LLM call with the mock
+
+        query = [
+            {"role": "system", "content": "You are a helpful weather assistant."},
+            {"role": "user", "content": "What's the weather like in Paris?"},
+            {"role": "assistant", "content": "Let me check that for you."},
+        ]
+        tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather_good",  # flow_side_effect counts custom names containing "good" as good calls
+                "arguments": {"location": "Paris"},
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "fetch_weather_good",
+                "type": "function",
+                "description": "Fetches the weather information for the specified location.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}},
+                },
+            }
+        ]
+
+        result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        key = ToolCallAccuracyEvaluator._RESULT_KEY
+        assert result is not None
+        assert key in result
+        assert f"{key}_result" in result
+        assert result[key] == 5.0
+        assert result[f"{key}_result"] == "pass"
+
+    def test_evaluate_with_response_list(self, mock_model_config):
+        """Test that tool calls parsed from a message-list response reach _flow as a non-empty string."""
+
+        async def flow_side_effect_string_tool_calls(timeout, **kwargs):
+            # Score 5 only when tool_calls arrives as a non-empty string; any other shape scores 1
+            tool_calls = kwargs.get("tool_calls", "")
+            if isinstance(tool_calls, str) and tool_calls:
+                return {
+                    "llm_output": {
+                        "chain_of_thought": "Tool calls were reformatted and evaluated.",
+                        "tool_calls_success_level": 5,
+                        "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1},
+                    }
+                }
+            return {
+                "llm_output": {
+                    "chain_of_thought": "No tool calls found.",
+                    "tool_calls_success_level": 1,
+                    "details": {},
+                }
+            }
+
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect_string_tool_calls)  # replace the prompty/LLM call
+
+        query = "What's the weather in Paris?"
+        response = [  # no tool_calls argument is passed below; the tool call is supplied only through this response
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "tool_call",
+                        "tool_call_id": "call_123",
+                        "name": "fetch_weather",
+                        "arguments": {"location": "Paris"},
+                    }
+                ],
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "fetch_weather",
+                "type": "function",
+                "description": "Fetches the weather information for the specified location.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}},
+                },
+            }
+        ]
+
+        result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        key = ToolCallAccuracyEvaluator._RESULT_KEY
+        assert result is not None
+        assert key in result
+        assert f"{key}_result" in result
+        assert result[key] == 5.0
+        assert result[f"{key}_result"] == "pass"
+
+    def test_tool_calls_reformatting_is_applied(self, mock_model_config):
+        """Verify that when response is provided as a list, the tool_calls passed to _flow
+        is a reformatted string containing a [TOOL_CALL] entry, not the raw message list."""
+        captured_kwargs = {}  # filled by capturing_flow with the keyword arguments the evaluator passes to _flow
+
+        async def capturing_flow(timeout, **kwargs):
+            captured_kwargs.update(kwargs)  # record the prompt inputs so the test can inspect them afterwards
+            return {
+                "llm_output": {
+                    "chain_of_thought": "Verified reformatted tool_calls string.",
+                    "tool_calls_success_level": 5,
+                    "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1},
+                }
+            }
+
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=capturing_flow)
+
+        query = "What's the weather in Paris?"
+        response = [
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "tool_call",
+                        "tool_call_id": "call_123",
+                        "name": "fetch_weather",
+                        "arguments": {"location": "Paris"},
+                    }
+                ],
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "fetch_weather",
+                "type": "function",
+                "description": "Fetches the weather information for the specified location.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}},
+                },
+            }
+        ]
+
+        evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        # tool_calls must arrive as one formatted string that names the tool and carries its argument value
+        tool_calls_sent = captured_kwargs.get("tool_calls")
+        assert isinstance(
+            tool_calls_sent, str
+        ), f"Expected tool_calls to be a reformatted string, got {type(tool_calls_sent)}"
+        assert "[TOOL_CALL]" in tool_calls_sent
+        assert "fetch_weather" in tool_calls_sent
+        assert "Paris" in tool_calls_sent
+
+    def test_tool_result_included_in_reformatted_tool_calls(self, mock_model_config):
+        """Verify that tool results are included in reformatted tool_calls when response
+        contains a role=tool message, and are absent when tool_calls is passed directly."""
+        captured_kwargs = {}  # shared by both cases below; cleared between them
+
+        async def capturing_flow(timeout, **kwargs):
+            captured_kwargs.update(kwargs)  # record the prompt inputs so the test can inspect them afterwards
+            return {
+                "llm_output": {
+                    "chain_of_thought": "Evaluated.",
+                    "tool_calls_success_level": 5,
+                    "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1},
+                }
+            }
+
+        tool_definitions = [
+            {
+                "name": "fetch_weather",
+                "type": "function",
+                "description": "Fetches the weather information for the specified location.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}},
+                },
+            }
+        ]
+
+        # Case 1: response includes a tool result message — [TOOL_RESULT] should appear
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=capturing_flow)
+
+        response_with_result = [
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "tool_call",
+                        "tool_call_id": "call_123",
+                        "name": "fetch_weather",
+                        "arguments": {"location": "Paris"},
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_123",  # same id as the assistant tool_call above
+                "content": [{"type": "tool_result", "tool_result": '{"temperature": "15C", "condition": "sunny"}'}],
+            },
+        ]
+
+        evaluator(
+            query="What's the weather in Paris?", response=response_with_result, tool_definitions=tool_definitions
+        )
+        tool_calls_sent = captured_kwargs.get("tool_calls")
+        assert isinstance(tool_calls_sent, str)
+        assert "[TOOL_CALL]" in tool_calls_sent
+        assert (
+            "[TOOL_RESULT]" in tool_calls_sent
+        ), "Tool result should be included when response contains a tool result message"
+        assert "15C" in tool_calls_sent  # the result payload itself, not just the marker, must be present
+
+        # Case 2: tool_calls passed directly (no tool_result field) — [TOOL_RESULT] must not appear
+        captured_kwargs.clear()  # drop case 1's capture so the assertions below see only this call
+        evaluator._flow = MagicMock(side_effect=capturing_flow)
+
+        direct_tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_1",
+                "name": "fetch_weather",
+                "arguments": {"location": "Paris"},
+            }
+        ]
+
+        evaluator(query="What's the weather in Paris?", tool_calls=direct_tool_calls, tool_definitions=tool_definitions)
+        tool_calls_sent = captured_kwargs.get("tool_calls")
+        assert isinstance(tool_calls_sent, str)
+        assert "[TOOL_CALL]" in tool_calls_sent
+        assert (
+            "[TOOL_RESULT]" not in tool_calls_sent
+        ), "Tool result must not appear when tool_calls has no attached result"