// Reconstructed from a whitespace-collapsed `git format-patch` email. Original patch metadata, verbatim:
//
//   From 5794283cea7733a62eea3af7a9dc6e542a1bbf91 Mon Sep 17 00:00:00 2001
//   From: Daniel Cazzulino
//   Date: Thu, 9 Apr 2026 20:00:10 -0300
//   Subject: [PATCH] Add tool instruction following comparison tests across models
//
//   We noticed inconsistencies in how Grok models invoke (or not) tools depending on their
//   description. Right now, the 4.20 model randomly fails whereas 4.1 passes every time
//   (on not-calling scenario). This will serve to monitor future model behavior changes,
//   as well as documenting behavior delta (if any).
//
//   src/xAI.Tests/ToolCallingFollowing.cs | 138 ++++++++++++++++++++++++++
//   src/xAI.Tests/xAI.Tests.csproj        |   5 +-
//   2 files changed, 139 insertions(+), 4 deletions(-)
//   create mode 100644 src/xAI.Tests/ToolCallingFollowing.cs (index 0000000..7fd925e)
//
// The same patch also modified src/xAI.Tests/xAI.Tests.csproj (index c287e4c..38d0bb8, hunk
// @@ -2,11 +2,8 @@). Its XML markup was lost during extraction; the surviving hunk text is:
//
//   net10.0 - enable - enable - false + xAI.Tests MEAI001;xAI001;$(NoWarn) - latest

using System.ComponentModel;
using System.Text.Json;
using Microsoft.Extensions.AI;
using static ConfigurationExtensions;

namespace xAI.Tests;

/// <summary>
/// Live comparison tests that check whether each model in <see cref="Models"/> follows the
/// <c>emergency_alert</c> tool description: distress messages must trigger the tool and
/// routine messages must not.
/// </summary>
/// <param name="output">xUnit output sink; the chat pipeline's logging is routed to it.</param>
public class ToolCallingFollowing(ITestOutputHelper output)
{
    /// <summary>Name the tool is exposed under, and the name looked for in the model's function calls.</summary>
    const string AlertToolName = "emergency_alert";

    /// <summary>Value assigned to <c>MaximumIterationsPerRequest</c> on the function-invocation middleware.</summary>
    const int MaxToolIterations = 3;

    /// <summary>Cached serializer settings used to pretty-print tool calls in failure messages.</summary>
    static readonly JsonSerializerOptions IndentedJson = new(JsonSerializerDefaults.Web)
    {
        WriteIndented = true
    };

    /// <summary>Asserts that a distress message makes the model call the alert tool.</summary>
    /// <param name="model">Model identifier passed to <c>AsIChatClient</c>.</param>
    /// <param name="message">User message sent as the whole prompt.</param>
    [SecretsTheory("XAI_API_KEY")]
    [MemberData(nameof(AllDistressMessages))]
    public async Task InvokesDistress(string model, string message)
    {
        var calls = await GetFunctionCallsAsync(model, message);

        Assert.True(
            InvokedAlert(calls),
            $"[{model}] LLM did not call {AlertToolName} for: \"{message}\". " +
            $"Tools called: [{string.Join(", ", calls.Select(call => call.Name))}]");
    }

    /// <summary>Asserts that a routine message does NOT make the model call the alert tool.</summary>
    /// <param name="model">Model identifier passed to <c>AsIChatClient</c>.</param>
    /// <param name="message">User message sent as the whole prompt.</param>
    [SecretsTheory("XAI_API_KEY")]
    [MemberData(nameof(AllRoutineMessages))]
    public async Task DoesNotInvokeDistress(string model, string message)
    {
        var calls = await GetFunctionCallsAsync(model, message);

        // Dump every call (name + arguments) so a failure shows what the model decided to send.
        var serializedCalls = string.Join(
            Environment.NewLine,
            calls.Select(call => JsonSerializer.Serialize(call, IndentedJson)));

        Assert.False(
            InvokedAlert(calls),
            $"""
            [{model}] LLM did call {AlertToolName} for: "{message}".
            Tool calls:
            {serializedCalls}
            """);
    }

    /// <summary>Every (model, distress phrase) pair, as xUnit theory rows.</summary>
    public static IEnumerable<object[]> AllDistressMessages => CrossWithModels(DistressPhrases);

    /// <summary>Every (model, routine phrase) pair, as xUnit theory rows.</summary>
    public static IEnumerable<object[]> AllRoutineMessages => CrossWithModels(RoutinePhrases);

    static readonly string[] Models = ["grok-4-1-fast-non-reasoning", "grok-4.20-non-reasoning"];

    static readonly string[] RoutinePhrases =
    [
        "¿qué hora es?",
        "¿qué tiempo hace hoy?",
        "poneme música",
        "cuéntame un chiste",
        "buenas tardes",
        "thanks",
        "gracias",
        "what time is it?",
        "tell me a joke",
        "play some music",
        "good morning",
        "what's the weather like?",
        "remind me to take my pills at 8pm",
    ];

    static readonly string[] DistressPhrases =
    [
        "necesito ayuda, me siento muy mal",
        "socorro",
        "me caí",
        "me caí de la cama",
        "me duele el pecho",
        "no puedo levantarme",
        "llamá a mi hijo, me caí",
        "please help me, I am feeling really bad",
        "I fell down",
        "I'm having chest pains",
        "I can't get up",
        "I think I'm having a heart attack",
    ];

    /// <summary>
    /// Tool offered to the model. It performs no real work and always completes with <c>"done"</c>;
    /// the tests only observe whether the model chose to call it.
    /// </summary>
    /// <remarks>
    /// Parameter names are intentionally snake_case: they are part of the tool signature handed to
    /// <c>AIFunctionFactory.Create</c>, so renaming them changes what the model is shown.
    /// </remarks>
    [DisplayName(AlertToolName)]
    [Description(
        """
        Use ONLY when the elder is expressing their own physical distress, inability to move, or calling for help
        right now — examples: a fall ('I fell', 'I can't get up', 'no puedo levantarme'), acute pain, difficulty
        breathing, chest pain, feeling very unwell, or an explicit plea for help ('help me', 'I feel terrible',
        'please help me', 'I am feeling really bad').
        """)]
    public Task<string> SendAlertAsync(
        [Description("The elder's exact words (verbatim transcription). Must be the original user message, not a summary or paraphrase.")]
        string elder_message,
        [Description("Explanation of why the tool is being invoked")]
        string reason,
        [Description("Severity level: 'high' for general distress, 'critical' for falls, chest pain, breathing difficulty")]
        string severity = "high",
        CancellationToken ct = default) => Task.FromResult("done");

    /// <summary>
    /// Sends <paramref name="message"/> to <paramref name="model"/> with <see cref="SendAlertAsync"/> as the
    /// only available tool and returns every function call found in the response messages.
    /// </summary>
    /// <param name="model">Model identifier passed to <c>AsIChatClient</c>.</param>
    /// <param name="message">User message sent as the whole prompt.</param>
    /// <returns>All <see cref="FunctionCallContent"/> items in the response, in message order.</returns>
    async Task<List<FunctionCallContent>> GetFunctionCallsAsync(string model, string message)
    {
        // The null-forgiving operator assumes the key is present; presumably [SecretsTheory] skips
        // the test when XAI_API_KEY is not configured (verify against that attribute).
        using var chat = new GrokClient(Configuration["XAI_API_KEY"]!).AsIChatClient(model)
            .AsBuilder()
            .UseFunctionInvocation(configure: client => client.MaximumIterationsPerRequest = MaxToolIterations)
            .UseLogging(output.AsLoggerFactory())
            .Build();

        var options = new ChatOptions
        {
            Tools = [AIFunctionFactory.Create(SendAlertAsync)],
            // Explicit in both scenarios so the model is always free to call the tool or not.
            ToolMode = ChatToolMode.Auto
        };

        var response = await chat.GetResponseAsync(message, options);

        return response.Messages
            .SelectMany(m => m.Contents.OfType<FunctionCallContent>())
            .ToList();
    }

    /// <summary>Returns whether any call in <paramref name="calls"/> targets the alert tool (case-insensitive).</summary>
    static bool InvokedAlert(IEnumerable<FunctionCallContent> calls) =>
        calls.Any(call => string.Equals(call.Name, AlertToolName, StringComparison.OrdinalIgnoreCase));

    /// <summary>Pairs every entry of <see cref="Models"/> with every phrase, one theory row per pair.</summary>
    static IEnumerable<object[]> CrossWithModels(string[] phrases) =>
        from model in Models
        from message in phrases
        select new object[] { model, message };
}