
Commit a8c8a2f

Add better structured outputs handling for ChatCompletions (#95)
1 parent fac93e7 commit a8c8a2f

File tree (5 files changed: +166 −6 lines)

  CHANGELOG.md
  src/cleanlab_tlm/__about__.py
  src/cleanlab_tlm/internal/api/api.py
  src/cleanlab_tlm/utils/chat_completions.py
  tests/test_chat_completions.py

CHANGELOG.md

Lines changed: 9 additions & 0 deletions

@@ -7,12 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.1.17] - 2025-07-18
+
+### Changed
+
+- Enabled `TLMChatCompletion.score()` to evaluate structured outputs in `ChatCompletion` objects
+
+
 ## [1.1.16] - 2025-07-15
 
 ### Changed
 
 - Add internal setting to bypass model validation check (for custom/VPC models)
 
+
 ## [1.1.15] - 2025-07-14
 
 ### Changed
@@ -28,6 +36,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Support for `best` and `high` preset in `TrustworthyRAG`
 
 ### Changed
+
 - Deprecate `use_self_reflection`
 - Documentation updates for new default configurations
 
src/cleanlab_tlm/__about__.py

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.1.16"
+__version__ = "1.1.17"

src/cleanlab_tlm/internal/api/api.py

Lines changed: 52 additions & 2 deletions

@@ -45,6 +45,7 @@
 
 if TYPE_CHECKING:
     import requests
+    from openai.types.chat import ChatCompletion
 
     from cleanlab_tlm.internal.concurrency import TlmRateHandler
     from cleanlab_tlm.utils.rag import Eval
@@ -53,6 +54,7 @@
 base_url = os.environ.get("CLEANLAB_API_BASE_URL", "https://api.cleanlab.ai/api")
 tlm_base_url = f"{base_url}/v0/trustworthy_llm"
 tlm_rag_base_url = f"{base_url}/v1/rag_trustworthy_llm"
+tlm_openai_base_url = f"{base_url}/v1/openai_trustworthy_llm"
 
 
 def _construct_headers(api_key: Optional[str], content_type: Optional[str] = "application/json") -> JSONDict:
@@ -115,7 +117,7 @@ def handle_rate_limit_error_from_resp(resp: aiohttp.ClientResponse) -> None:
     )
 
 
-async def handle_tlm_client_error_from_resp(resp: aiohttp.ClientResponse, batch_index: Optional[int]) -> None:
+async def handle_tlm_client_error_from_resp(resp: aiohttp.ClientResponse, batch_index: Optional[int] = None) -> None:
     """Catches 4XX (client error) errors."""
     if 400 <= resp.status < 500:  # noqa: PLR2004
         try:
@@ -133,7 +135,7 @@ async def handle_tlm_client_error_from_resp(resp: aiohttp.ClientResponse, batch_
         raise TlmBadRequestError(error_message, retryable)
 
 
-async def handle_tlm_api_error_from_resp(resp: aiohttp.ClientResponse, batch_index: Optional[int]) -> None:
+async def handle_tlm_api_error_from_resp(resp: aiohttp.ClientResponse, batch_index: Optional[int] = None) -> None:
     """Catches 5XX (server error) errors."""
     if 500 <= resp.status < 600:  # noqa: PLR2004
         try:
@@ -527,3 +529,51 @@ async def tlm_rag_score(
             ordered_res[evaluation.name] = res_json[evaluation.name]
 
     return ordered_res
+
+
+@tlm_retry
+async def tlm_chat_completions_score(
+    api_key: str,
+    response: ChatCompletion,
+    client_session: Optional[aiohttp.ClientSession] = None,
+    **input_kwargs: Any,
+) -> JSONDict:
+    """
+    Score an OpenAI ChatCompletion response using Trustworthy Language Model
+
+    Args:
+        api_key (str): API key for auth
+        response (ChatCompletion): response to be evaluated (OpenAI ChatCompletion object)
+        client_session (aiohttp.ClientSession): client session used to issue TLM request
+        **input_kwargs: additional keyword arguments (openai arguments or TLM options) to pass to the TLM request.
+    Returns:
+        JSONDict: dictionary with trustworthiness score and any evaluation results
+    """
+    local_scoped_client = False
+    if not client_session:
+        client_session = aiohttp.ClientSession()
+        local_scoped_client = True
+
+    try:
+        res = await client_session.post(
+            f"{tlm_openai_base_url}/score",
+            json={
+                "response": response.model_dump(),
+                **input_kwargs,
+            },
+            headers=_construct_headers(api_key),
+        )
+
+        res_json = await res.json()
+
+        await handle_api_key_error_from_resp(res)
+        await handle_http_bad_request_error_from_resp(res)
+        handle_rate_limit_error_from_resp(res)
+        await handle_tlm_client_error_from_resp(res)
+        await handle_tlm_api_error_from_resp(res)
+
+    finally:
+        if local_scoped_client:
+            await client_session.close()
+
+    return cast(JSONDict, res_json)
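For reference, this internal helper posts the serialized `ChatCompletion` plus any forwarded kwargs to the new `/score` route under `tlm_openai_base_url`. Below is a minimal sketch of calling it directly, assuming a valid Cleanlab API key and a previously obtained `ChatCompletion` object (both hypothetical here); when no `client_session` is supplied, the helper opens and closes its own `aiohttp` session, as the `local_scoped_client` branch above shows:

import asyncio

import aiohttp
from openai.types.chat import ChatCompletion

from cleanlab_tlm.internal.api.api import tlm_chat_completions_score


async def score_directly(api_key: str, response: ChatCompletion, messages: list) -> None:
    # Let the helper manage its own aiohttp session.
    result = await tlm_chat_completions_score(
        api_key=api_key,
        response=response,
        messages=messages,  # forwarded to the endpoint via **input_kwargs
        model="gpt-4.1-mini",
    )
    print(result)

    # Or reuse a caller-owned session across several scoring calls.
    async with aiohttp.ClientSession() as session:
        result = await tlm_chat_completions_score(
            api_key=api_key,
            response=response,
            client_session=session,
            messages=messages,
            model="gpt-4.1-mini",
        )
        print(result)


# asyncio.run(score_directly(api_key, response, messages))  # hypothetical inputs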

src/cleanlab_tlm/utils/chat_completions.py

Lines changed: 24 additions & 2 deletions

@@ -5,12 +5,14 @@
 It works for any OpenAI LLM model, as well as the many other non-OpenAI LLMs that are also usable via Chat Completions API (Gemini, DeepSeek, Llama, etc).
 """
 
+import asyncio
 from typing import TYPE_CHECKING, Any, Optional, cast
 
+from cleanlab_tlm.internal.api.api import tlm_chat_completions_score
 from cleanlab_tlm.internal.base import BaseTLM
 from cleanlab_tlm.internal.constants import (
     _DEFAULT_TLM_QUALITY_PRESET,
-    _VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
+    _VALID_TLM_QUALITY_PRESETS,
 )
 from cleanlab_tlm.internal.types import TLMQualityPreset
 from cleanlab_tlm.tlm import TLM, TLMOptions, TLMScore
@@ -52,7 +54,7 @@ def __init__(
         """
         super().__init__(
             quality_preset=quality_preset,
-            valid_quality_presets=_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
+            valid_quality_presets=_VALID_TLM_QUALITY_PRESETS,
             support_custom_eval_criteria=True,
             api_key=api_key,
             options=options,
@@ -85,6 +87,26 @@ def score(
         self._validate_chat_completion(response)
         if (messages := openai_kwargs.get("messages")) is None:
             raise ValueError("messages is a required OpenAI input argument.")
+
+        combined_kwargs = {**openai_kwargs, **self._options}
+
+        # handle structured outputs differently
+        if openai_kwargs.get("response_format"):
+            return cast(
+                TLMScore,
+                self._event_loop.run_until_complete(
+                    asyncio.wait_for(
+                        tlm_chat_completions_score(
+                            api_key=self._api_key,
+                            response=response,
+                            **combined_kwargs,
+                        ),
+                        timeout=self._timeout,
+                    )
+                ),
+            )
+
+        # all other cases
         tools = openai_kwargs.get("tools", None)
 
         prompt_text = _form_prompt_chat_completions_api(messages, tools)
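The structured-output branch reuses the instance's event loop and bounds the request with `asyncio.wait_for`, so the synchronous `score()` call still respects `self._timeout`. A standalone sketch of that pattern, with illustrative names rather than the library's own:

import asyncio


async def slow_request() -> str:
    # Stand-in for the async scoring request.
    await asyncio.sleep(2)
    return "done"


loop = asyncio.new_event_loop()
try:
    # asyncio.wait_for raises asyncio.TimeoutError if the coroutine
    # does not finish within the timeout (here, 5 seconds).
    result = loop.run_until_complete(asyncio.wait_for(slow_request(), timeout=5))
    print(result)
finally:
    loop.close()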

tests/test_chat_completions.py

Lines changed: 80 additions & 1 deletion

@@ -5,6 +5,11 @@
 from openai.types.chat import ChatCompletion, ChatCompletionMessage
 from openai.types.chat.chat_completion import Choice
 from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall, Function
+from openai.types.completion_usage import (
+    CompletionTokensDetails,
+    CompletionUsage,
+    PromptTokensDetails,
+)
 
 from cleanlab_tlm.internal.types import TLMQualityPreset
 from cleanlab_tlm.tlm import TLMScore
@@ -19,7 +24,7 @@
 
 @pytest.mark.parametrize(
     "quality_preset",
-    ["base", "low", "medium"],
+    ["base", "low", "medium", "high", "best"],
 )
 def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None:
     tlm_chat = TLMChatCompletion(quality_preset=quality_preset)
@@ -118,6 +123,80 @@ def test_tlm_chat_completion_score_with_tools() -> None:
     assert is_trustworthiness_score_json_format(score)
 
 
+def test_tlm_chat_completion_score_with_structured_output() -> None:
+    tlm_chat = TLMChatCompletion()
+    openai_kwargs = {
+        "model": "gpt-4.1-mini",
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful math tutor. Guide the user through the solution step by step.",
+            },
+            {"role": "user", "content": "how can I solve 8x + 7 = -23"},
+        ],
+        "response_format": {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "math_reasoning",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "steps": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "explanation": {"type": "string"},
+                                    "output": {"type": "string"},
+                                },
+                                "required": ["explanation", "output"],
+                                "additionalProperties": False,
+                            },
+                        },
+                        "final_answer": {"type": "string"},
+                    },
+                    "required": ["steps", "final_answer"],
+                    "additionalProperties": False,
+                },
+                "strict": True,
+            },
+        },
+    }
+    response = ChatCompletion(
+        id="test",
+        choices=[
+            Choice(
+                index=0,
+                message=ChatCompletionMessage(
+                    role="assistant",
+                    content='{"steps":[{"explanation":"Start with the original equation: 8x + 7 = -23","output":"8x + 7 = -23"},{"explanation":"Subtract 7 from both sides to isolate the term with x on one side. This will give us: 8x = -23 - 7","output":"8x = -30"},{"explanation":"Now simplify the right side: -23 - 7 equals -30, so we have 8x = -30","output":"8x = -30"},{"explanation":"Next, divide both sides by 8 to solve for x. This gives us: x = -30 / 8","output":"x = -3.75"},{"explanation":"We can also simplify -30 / 8 by dividing both the numerator and the denominator by 2. This leads to: x = -15 / 4","output":"x = -15/4 (or -3.75 as a decimal)"}],"final_answer":"x = -15/4 or x = -3.75"}',
+                ),
+                finish_reason="stop",
+            )
+        ],
+        usage=CompletionUsage(
+            completion_tokens=50,
+            completion_tokens_details=CompletionTokensDetails(
+                accepted_prediction_tokens=0,
+                audio_tokens=0,
+                reasoning_tokens=0,
+                rejected_prediction_tokens=0,
+            ),
+            prompt_tokens=50,
+            prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0),
+            total_tokens=100,
+        ),
+        created=1234567890,
+        model="test-model",
+        object="chat.completion",
+    )
+
+    score = tlm_chat.score(response=response, **openai_kwargs)
+
+    assert score is not None
+    assert is_trustworthiness_score_json_format(score)
+
+
 def test_tlm_chat_completion_score_invalid_response() -> None:
     tlm_chat = TLMChatCompletion()
     openai_kwargs = {
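The widened parametrization above reflects that `TLMChatCompletion` now accepts the full set of quality presets rather than only `base`, `low`, and `medium`. A minimal sketch, with the scoring call left as a comment since it needs a real `ChatCompletion` and configured API keys:

from cleanlab_tlm.utils.chat_completions import TLMChatCompletion

# "high" and "best" are now valid quality presets for TLMChatCompletion.
tlm_chat = TLMChatCompletion(quality_preset="best")

# score = tlm_chat.score(response=response, **openai_kwargs)  # hypothetical inputs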
