Support for scoring tool calls in ChatCompletions API (#93)

jwmueller · elisno · web-flow · commit 1b43e21744b0 · 2025-07-14T21:31:05.000Z
Co-authored-by: Elías Snorrason <eliassno@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.1.15] - 2025-07-14
+
+### Changed
+
+- Enabled `TLMChatCompletion.score()` to evaluate tool calls in `ChatCompletion` objects
+
+
 ## [1.1.14] - 2025-07-08
 
 ### Added
@@ -237,7 +244,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Release of the Cleanlab TLM Python client.
 
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.14...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.15...HEAD
+[1.1.15]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.14...v1.1.15
 [1.1.14]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.13...v1.1.14
 [1.1.13]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.12...v1.1.13
 [1.1.12]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.11...v1.1.12
diff --git a/src/cleanlab_tlm/__about__.py b/src/cleanlab_tlm/__about__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.1.14"
+__version__ = "1.1.15"
diff --git a/src/cleanlab_tlm/utils/chat.py b/src/cleanlab_tlm/utils/chat.py
@@ -6,10 +6,10 @@
 
 import json
 import warnings
-from typing import TYPE_CHECKING, Any, Literal, Optional, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast
 
 if TYPE_CHECKING:
-    from openai.types.chat import ChatCompletionMessageParam
+    from openai.types.chat import ChatCompletionMessage, ChatCompletionMessageParam
 
 # Define message prefixes
 _SYSTEM_PREFIX = "System: "
@@ -443,41 +443,44 @@ def form_prompt_string(
     )
 
 
-def form_response_string_chat_completions_api(response: dict[str, Any]) -> str:
+def form_response_string_chat_completions_api(response: Union[dict[str, Any], "ChatCompletionMessage"]) -> str:
     """
     Format an assistant response message dictionary from the Chat Completions API into a single string.
 
-    This function takes a response.choices[0].message.to_dict() from a chat.completions.create()
-    and formats it into a string that includes both content and tool calls (if present).
+    Given a ChatCompletion object `response` from `chat.completions.create()`,
+    this function can take either a ChatCompletionMessage object from `response.choices[0].message`
+    or a dictionary from `response.choices[0].message.to_dict()`.
+
+    All inputs are formatted into a string that includes both content and tool calls (if present).
     Tool calls are formatted using XML tags with JSON content, consistent with the format
     used in `form_prompt_string`.
 
     Args:
-        response (dict[str, Any]): A chat completion response message dictionary, containing:
-            - 'content' (str): The main response content from the LLM
-            - 'tool_calls' (List[Dict], optional): List of tool calls made by the LLM,
-              where each tool call contains function name and arguments
+        response (Union[dict[str, Any], ChatCompletionMessage]): Either:
+            - A ChatCompletionMessage object from the OpenAI response
+            - A chat completion response message dictionary, containing:
+              - 'content' (str): The main response content from the LLM
+              - 'tool_calls' (List[Dict], optional): List of tool calls made by the LLM,
+                where each tool call contains function name and arguments
 
     Returns:
         str: A formatted string containing the response content and any tool calls.
              Tool calls are formatted as XML tags containing JSON with function
              name and arguments.
 
     Raises:
-        TypeError: If response is not a dictionary.
+        TypeError: If response is not a dictionary or ChatCompletionMessage object.
     """
-    if not isinstance(response, dict):
-        raise TypeError(f"Expected response to be a dict, got {type(response).__name__}")
-
-    content = response.get("content") or ""
-
-    if "tool_calls" in response:
+    response_dict = _response_to_dict(response)
+    content = response_dict.get("content") or ""
+    tool_calls = response_dict.get("tool_calls")
+    if tool_calls is not None:
         try:
-            tool_calls = "\n".join(
+            tool_calls_str = "\n".join(
                 f"{_TOOL_CALL_TAG_START}\n{json.dumps({'name': call['function']['name'], 'arguments': json.loads(call['function']['arguments']) if call['function']['arguments'] else {}}, indent=2)}\n{_TOOL_CALL_TAG_END}"
-                for call in response["tool_calls"]
+                for call in tool_calls
             )
-            return f"{content}\n{tool_calls}".strip() if content else tool_calls
+            return f"{content}\n{tool_calls_str}".strip() if content else tool_calls_str
         except (KeyError, TypeError, json.JSONDecodeError) as e:
             # Log the error but continue with just the content
             warnings.warn(
@@ -487,3 +490,24 @@ def form_response_string_chat_completions_api(response: dict[str, Any]) -> str:
             )
 
     return str(content)
+
+
+def _response_to_dict(response: Any) -> dict[str, Any]:
+    # `response` is expected to be a Union[dict[str, Any], ChatCompletionMessage]; it is annotated as Any so that type checkers do not treat the final isinstance check as unreachable
+    if isinstance(response, dict):
+        # Start with this isinstance check first to import `openai` lazily
+        return response
+
+    try:
+        from openai.types.chat import ChatCompletionMessage
+    except ImportError as e:
+        raise ImportError(
+            "OpenAI is required to handle ChatCompletionMessage objects directly. Please install it with `pip install openai`."
+        ) from e
+
+    if not isinstance(response, ChatCompletionMessage):
+        raise TypeError(
+            f"Expected response to be a dict or ChatCompletionMessage object, got {type(response).__name__}"
+        )
+
+    return response.model_dump()
diff --git a/src/cleanlab_tlm/utils/chat_completions.py b/src/cleanlab_tlm/utils/chat_completions.py
@@ -14,10 +14,10 @@
 )
 from cleanlab_tlm.internal.types import TLMQualityPreset
 from cleanlab_tlm.tlm import TLM, TLMOptions, TLMScore
-from cleanlab_tlm.utils.chat import form_prompt_string
+from cleanlab_tlm.utils.chat import _form_prompt_chat_completions_api, form_response_string_chat_completions_api
 
 if TYPE_CHECKING:
-    from openai.types.chat import ChatCompletion
+    from openai.types.chat import ChatCompletion, ChatCompletionMessage
 
 
 class TLMChatCompletion(BaseTLM):
@@ -82,26 +82,31 @@ def score(
         Returns:
             TLMScore: A dict containing the trustworthiness score and optional logs
         """
+        self._validate_chat_completion(response)
         if (messages := openai_kwargs.get("messages")) is None:
             raise ValueError("messages is a required OpenAI input argument.")
         tools = openai_kwargs.get("tools", None)
 
-        prompt_text = form_prompt_string(messages, tools)
-        response_text = _get_string_response(response)
+        prompt_text = _form_prompt_chat_completions_api(messages, tools)
+        response_text = form_response_string_chat_completions_api(response=self._get_response_message(response))
 
         return cast(TLMScore, self._tlm.get_trustworthiness_score(prompt_text, response_text))
 
-
-def _get_string_response(response: "ChatCompletion") -> str:
-    try:
-        from openai.types.chat import ChatCompletion
-    except ImportError:
-        raise ImportError(
-            "OpenAI is required to use the TLMChatCompletion class. Please install it with `pip install openai`."
-        )
-
-    if not isinstance(response, ChatCompletion):
-        raise TypeError("The response is not an OpenAI ChatCompletion object.")
-    if response.choices[0].message.content is None:
-        raise ValueError("The OpenAI ChatCompletion object does not contain a message content.")
-    return str(response.choices[0].message.content)
+    @staticmethod
+    def _get_response_message(response: "ChatCompletion") -> "ChatCompletionMessage":
+        return response.choices[0].message
+
+    def _validate_chat_completion(self, response: Any) -> None:
+        # `response` is expected to be a ChatCompletion; it is annotated as Any so that type checkers do not treat the isinstance check below as unreachable
+        try:
+            from openai.types.chat import ChatCompletion
+        except ImportError as e:
+            raise ImportError(
+                f"OpenAI is required to use the {self.__class__.__name__} class. Please install it with `pip install openai`."
+            ) from e
+        if not isinstance(response, ChatCompletion):
+            raise TypeError("The response is not an OpenAI ChatCompletion object.")
+
+        message = self._get_response_message(response)
+        if message.content is None and message.tool_calls is None:
+            raise ValueError("The OpenAI ChatCompletion object does not contain a message content or tool calls.")
diff --git a/tests/test_chat.py b/tests/test_chat.py
@@ -1,6 +1,8 @@
 from typing import TYPE_CHECKING, Any, cast
 
 import pytest
+from openai.types.chat import ChatCompletionMessage
+from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall, Function
 
 from cleanlab_tlm.utils.chat import (
     _form_prompt_chat_completions_api,
@@ -1366,13 +1368,13 @@ def test_form_response_string_chat_completions_api_empty_arguments() -> None:
 
 def test_form_response_string_chat_completions_api_invalid_input() -> None:
     """Test form_response_string_chat_completions_api raises TypeError for invalid input."""
-    with pytest.raises(TypeError, match="Expected response to be a dict, got str"):
+    with pytest.raises(TypeError, match="Expected response to be a dict or ChatCompletionMessage object, got str"):
         form_response_string_chat_completions_api("not a dict")  # type: ignore[arg-type]
 
-    with pytest.raises(TypeError, match="Expected response to be a dict, got list"):
+    with pytest.raises(TypeError, match="Expected response to be a dict or ChatCompletionMessage object, got list"):
         form_response_string_chat_completions_api([])  # type: ignore[arg-type]
 
-    with pytest.raises(TypeError, match="Expected response to be a dict, got NoneType"):
+    with pytest.raises(TypeError, match="Expected response to be a dict or ChatCompletionMessage object, got NoneType"):
         form_response_string_chat_completions_api(None)  # type: ignore[arg-type]
 
 
@@ -1406,3 +1408,184 @@ def test_form_response_string_chat_completions_api_malformed_tool_calls() -> Non
     with pytest.warns(UserWarning, match="Error formatting tool_calls in response.*Returning content only"):
         result = form_response_string_chat_completions_api(response)
         assert result == "Let me check that."
+
+
+############## ChatCompletionMessage tests ##############
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_just_content() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing just content."""
+
+    content = "Hello, how can I help you today?"
+    message = ChatCompletionMessage(
+        role="assistant",
+        content=content,
+    )
+    result = form_response_string_chat_completions_api(message)
+    assert result == content
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_just_tool_calls() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing just tool calls."""
+    message = ChatCompletionMessage(
+        role="assistant",
+        content=None,
+        tool_calls=[
+            ChatCompletionMessageToolCall(
+                id="call_123",
+                function=Function(
+                    name="search_restaurants",
+                    arguments='{"city": "Tokyo", "cuisine_type": "sushi", "max_price": 150, "dietary_restrictions": ["vegetarian", "gluten-free"], "open_now": true}',
+                ),
+                type="function",
+            )
+        ],
+    )
+    expected = (
+        "<tool_call>\n"
+        "{\n"
+        '  "name": "search_restaurants",\n'
+        '  "arguments": {\n'
+        '    "city": "Tokyo",\n'
+        '    "cuisine_type": "sushi",\n'
+        '    "max_price": 150,\n'
+        '    "dietary_restrictions": [\n'
+        '      "vegetarian",\n'
+        '      "gluten-free"\n'
+        "    ],\n"
+        '    "open_now": true\n'
+        "  }\n"
+        "}\n"
+        "</tool_call>"
+    )
+    result = form_response_string_chat_completions_api(message)
+    assert result == expected
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_content_and_tool_calls() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing both content and tool calls."""
+    message = ChatCompletionMessage(
+        role="assistant",
+        content="I'll check the weather for you.",
+        tool_calls=[
+            ChatCompletionMessageToolCall(
+                id="call_123",
+                function=Function(
+                    name="get_weather",
+                    arguments='{"location": "Paris"}',
+                ),
+                type="function",
+            )
+        ],
+    )
+    expected = (
+        "I'll check the weather for you.\n"
+        "<tool_call>\n"
+        "{\n"
+        '  "name": "get_weather",\n'
+        '  "arguments": {\n'
+        '    "location": "Paris"\n'
+        "  }\n"
+        "}\n"
+        "</tool_call>"
+    )
+    result = form_response_string_chat_completions_api(message)
+    assert result == expected
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_multiple_tool_calls() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing multiple tool calls."""
+    message = ChatCompletionMessage(
+        role="assistant",
+        content="Let me check multiple things for you.",
+        tool_calls=[
+            ChatCompletionMessageToolCall(
+                id="call_123",
+                function=Function(
+                    name="get_weather",
+                    arguments='{"location": "Paris"}',
+                ),
+                type="function",
+            ),
+            ChatCompletionMessageToolCall(
+                id="call_456",
+                function=Function(
+                    name="get_time",
+                    arguments='{"timezone": "UTC"}',
+                ),
+                type="function",
+            ),
+        ],
+    )
+    expected = (
+        "Let me check multiple things for you.\n"
+        "<tool_call>\n"
+        "{\n"
+        '  "name": "get_weather",\n'
+        '  "arguments": {\n'
+        '    "location": "Paris"\n'
+        "  }\n"
+        "}\n"
+        "</tool_call>\n"
+        "<tool_call>\n"
+        "{\n"
+        '  "name": "get_time",\n'
+        '  "arguments": {\n'
+        '    "timezone": "UTC"\n'
+        "  }\n"
+        "}\n"
+        "</tool_call>"
+    )
+    result = form_response_string_chat_completions_api(message)
+    assert result == expected
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_empty_content() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing empty content."""
+    message = ChatCompletionMessage(
+        role="assistant",
+        content="",
+    )
+    expected = ""
+    result = form_response_string_chat_completions_api(message)
+    assert result == expected
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_empty_arguments() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing empty arguments."""
+    message = ChatCompletionMessage(
+        role="assistant",
+        content="Running action",
+        tool_calls=[
+            ChatCompletionMessageToolCall(
+                id="call_123",
+                function=Function(
+                    name="execute_action",
+                    arguments="",
+                ),
+                type="function",
+            )
+        ],
+    )
+    expected = (
+        "Running action\n"
+        "<tool_call>\n"
+        "{\n"
+        '  "name": "execute_action",\n'
+        '  "arguments": {}\n'
+        "}\n"
+        "</tool_call>"
+    )
+    result = form_response_string_chat_completions_api(message)
+    assert result == expected
+
+
+def test_form_response_string_chat_completions_api_chatcompletion_message_none_content() -> None:
+    """Test form_response_string_chat_completions_api with ChatCompletionMessage containing None content."""
+    message = ChatCompletionMessage(
+        role="assistant",
+        content=None,
+    )
+    expected = ""
+    result = form_response_string_chat_completions_api(message)
+    assert result == expected
diff --git a/tests/test_chat_completions.py b/tests/test_chat_completions.py

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`# SPDX-License-Identifier: MIT`
`2`		`-__version__ = "1.1.14"`
	`2`	`+__version__ = "1.1.15"`