Commit f8d50a1

[TLMChatCompletion] Add functionality for structured outputs per-field scoring (#120)

1 parent 64eb488 commit f8d50a1

File tree: 7 files changed, +217 -9 lines changed


CHANGELOG.md

Lines changed: 8 additions & 1 deletion
```diff
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.1.32] - 2025-09-22
+
+### Added
+
+- Add per-field scoring functionality for structured outputs responses in `TLMChatCompletion`
+
 ## [1.1.31] - 2025-09-18
 
 ### Added
@@ -343,7 +349,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Release of the Cleanlab TLM Python client.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.31...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.32...HEAD
+[1.1.32]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.31...v1.1.32
 [1.1.31]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.30...v1.1.31
 [1.1.30]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.29...v1.1.30
 [1.1.29]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.28...v1.1.29
```
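
Taken together with the code changes below, the new entry amounts to a workflow like the following. This is a minimal usage sketch, not taken from the repo's docs: it assumes OpenAI and Cleanlab TLM credentials are already configured in the environment, and the model name and JSON schema are illustrative values borrowed from the test added in this commit.

```python
# Minimal sketch of the new per-field scoring workflow (illustrative only).
from openai import OpenAI
from cleanlab_tlm.utils.chat_completions import TLMChatCompletion

openai_kwargs = {
    "model": "gpt-4.1-mini",  # any chat model; value taken from the test below
    "messages": [{"role": "user", "content": "how can I solve 8x + 7 = -23"}],
    "response_format": {  # structured-output schema; a small illustrative example
        "type": "json_schema",
        "json_schema": {
            "name": "answer",
            "schema": {
                "type": "object",
                "properties": {"final_answer": {"type": "string"}},
                "required": ["final_answer"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    },
}

# Get a structured-output completion from OpenAI, then score it with TLM.
response = OpenAI().chat.completions.create(**openai_kwargs)

tlm = TLMChatCompletion(options={"log": ["per_field_score"]})
score = tlm.score(response=response, **openai_kwargs)  # TLMScore dict with a per_field_score log

# List the response fields whose per-field trustworthiness falls below the threshold.
untrustworthy = tlm.get_untrustworthy_fields(response=response, tlm_result=score, threshold=0.8)
```

With `display_details` left at its default of `True`, `get_untrustworthy_fields()` also prints the flagged fields' values, scores, and explanations.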

src/cleanlab_tlm/__about__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.1.31"
+__version__ = "1.1.32"
```

src/cleanlab_tlm/internal/api/api.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -41,6 +41,7 @@
     _TLM_TRUSTWORTHINESS_KEY,
     _TLM_USER_ID_KEY,
 )
+from cleanlab_tlm.internal.exception_handling import handle_tlm_exceptions
 from cleanlab_tlm.internal.types import JSONDict
 
 if TYPE_CHECKING:
@@ -533,6 +534,7 @@ async def tlm_rag_score(
 
 
 @tlm_retry
+@handle_tlm_exceptions(response_type="TLMScore")
 async def tlm_chat_completions_score(
     api_key: str,
     response: ChatCompletion,
@@ -577,7 +579,14 @@ async def tlm_chat_completions_score(
     if local_scoped_client:
         await client_session.close()
 
-    return cast(JSONDict, res_json)
+    tlm_result = {
+        "trustworthiness_score": res_json["trustworthiness_score"],
+    }
+
+    if "log" in input_kwargs:
+        tlm_result["log"] = res_json["log"]
+
+    return tlm_result
 
 
 @tlm_retry
```
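
For context, the filtering above means callers now get back only the documented `TLMScore` keys rather than the raw backend JSON. A rough sketch of the resulting shape, with made-up values (the backend payload here is hypothetical):

```python
# Made-up backend payload, only to illustrate the filtering added in this commit.
res_json = {"trustworthiness_score": 0.42, "log": {"per_field_score": {}}}
input_kwargs = {"log": ["per_field_score"]}  # what the caller requested

tlm_result = {"trustworthiness_score": res_json["trustworthiness_score"]}
if "log" in input_kwargs:
    tlm_result["log"] = res_json["log"]

print(tlm_result)  # {'trustworthiness_score': 0.42, 'log': {'per_field_score': {}}}
```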

src/cleanlab_tlm/internal/constants.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -66,7 +66,7 @@
     "discrepancy",
 }
 TLM_REASONING_EFFORT_VALUES: set[str] = {"none", "low", "medium", "high"}
-TLM_VALID_LOG_OPTIONS: set[str] = {"perplexity", "explanation"}
+TLM_VALID_LOG_OPTIONS: set[str] = {"perplexity", "explanation", "per_field_score"}
 TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS: set[str] = {
     "perplexity",
     _TLM_CONSTRAIN_OUTPUTS_KEY,
```

src/cleanlab_tlm/internal/exception_handling.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -58,7 +58,7 @@ def decorator(
         async def wrapper(*args: Any, **kwargs: Any) -> ResponseT:
             capture_exceptions = kwargs.get("capture_exceptions", False)
             batch_index = kwargs.get("batch_index")
-            evals = getattr(args[0], "_evals", [])
+            evals = getattr(args[0], "_evals", []) if args else []
             try:
                 return await func(*args, **kwargs)
             except asyncio.TimeoutError:
```
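
The `if args else []` guard matters because the decorator is now also applied to a module-level coroutine (`tlm_chat_completions_score`), which may be invoked with keyword arguments only, leaving `args` empty. A standalone sketch of the failure mode, not taken from the repo:

```python
# Illustrative only: the old expression assumed at least one positional argument.
class FakeTLM:
    _evals = ["custom_eval"]

def old_lookup(*args):
    return getattr(args[0], "_evals", [])  # raises IndexError when args == ()

def new_lookup(*args):
    return getattr(args[0], "_evals", []) if args else []  # guarded, returns []

print(new_lookup(FakeTLM()))  # ['custom_eval']
print(new_lookup())           # []
```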

src/cleanlab_tlm/utils/chat_completions.py

Lines changed: 108 additions & 2 deletions
```diff
@@ -6,6 +6,7 @@
 """
 
 import asyncio
+import json
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
 
 from cleanlab_tlm.internal.api.api import tlm_chat_completions_score
@@ -84,6 +85,13 @@ def score(
         Returns:
             TLMScore: A dict containing the trustworthiness score and optional logs
         """
+        try:
+            from openai.lib._parsing._completions import type_to_response_format_param
+        except ImportError as e:
+            raise ImportError(
+                f"OpenAI is required to use the {self.__class__.__name__} class. Please install it with `pip install openai`."
+            ) from e
+
         self._validate_chat_completion(response)
         if (messages := openai_kwargs.get("messages")) is None:
             raise ValueError("messages is a required OpenAI input argument.")
@@ -95,7 +103,14 @@ def score(
         }
 
         # handle structured outputs differently
-        if openai_kwargs.get("response_format"):
+        if combined_kwargs.get("response_format"):
+            if "log" in combined_kwargs and "explanation" in combined_kwargs["log"]:
+                raise ValueError(
+                    "`explanation` is not supported when `response_format` is specified, "
+                    "use `per_field_score` instead to get detailed explanations for each field"
+                )
+
+            combined_kwargs["response_format"] = type_to_response_format_param(combined_kwargs["response_format"])
             return cast(
                 TLMScore,
                 self._event_loop.run_until_complete(
@@ -111,7 +126,7 @@ def score(
             )
 
         # all other cases
-        tools = openai_kwargs.get("tools", None)
+        tools = combined_kwargs.get("tools")
 
         prompt_text = _form_prompt_chat_completions_api(messages, tools)
         response_text = form_response_string_chat_completions(response=response)
@@ -195,6 +210,97 @@ def get_explanation(
 
         raise TypeError("tlm_result must be a TLMScore or ChatCompletion object.")
 
+    def get_untrustworthy_fields(
+        self,
+        *,
+        response: Optional["ChatCompletion"] = None,
+        tlm_result: Union[TLMScore, "ChatCompletion"],
+        threshold: float = 0.8,
+        display_details: bool = True,
+    ) -> list[str]:
+        """Gets the fields of a structured output response that are considered untrustworthy by TLM.
+        Only works for responses that are valid JSON objects (uses `response_format` to specify the output format).
+        Prints detailed information about the untrustworthy fields if `display_details` is True.
+
+        Args:
+            response (ChatCompletion): The OpenAI ChatCompletion response object to evaluate
+            tlm_result (TLMScore | ChatCompletion): The result object from a previous TLM call
+            threshold (float): The threshold for considering a field untrustworthy
+            display_details (bool): Whether to display detailed information about the untrustworthy fields
+
+        Returns:
+            list[str]: The fields of the response that are considered untrustworthy by TLM
+        """
+        try:
+            from openai.types.chat import ChatCompletion
+        except ImportError as e:
+            raise ImportError(
+                f"OpenAI is required to use the {self.__class__.__name__} class. Please install it with `pip install openai`."
+            ) from e
+
+        if isinstance(tlm_result, dict):
+            if response is None:
+                raise ValueError("'response' is required when tlm_result is a TLMScore object")
+
+            tlm_metadata = tlm_result
+            response_text = response.choices[0].message.content or "{}"
+
+        elif isinstance(tlm_result, ChatCompletion):
+            if getattr(tlm_result, "tlm_metadata", None) is None:
+                raise ValueError("tlm_result must contain tlm_metadata.")
+
+            tlm_metadata = tlm_result.tlm_metadata  # type: ignore
+            response_text = tlm_result.choices[0].message.content or "{}"
+
+        else:
+            raise TypeError("tlm_result must be a TLMScore or ChatCompletion object.")
+
+        if "per_field_score" not in tlm_metadata.get("log", {}):
+            raise ValueError(
+                "`per_field_score` is not present in the log.\n"
+                "`get_untrustworthy_fields()` can only be called scoring structured outputs responses and specifying "
+                "`per_field_score` in the `log` option for TLM."
+            )
+
+        try:
+            so_response = json.loads(response_text)
+        except Exception:
+            raise ValueError(
+                "The LLM response must be a valid JSON output (use `response_format` to specify the output format)"
+            )
+
+        per_field_score = tlm_metadata["log"]["per_field_score"]
+        per_score_details = []
+
+        for key, value in per_field_score.items():
+            score = value["score"]
+            if float(score) < threshold:
+                key_details = {
+                    "response": so_response[key],
+                    "score": score,
+                    "explanation": value["explanation"],
+                }
+                per_score_details.append({key: key_details})
+
+        per_score_details.sort(key=lambda x: next(iter(x.values()))["score"])
+        untrustworthy_fields = [next(iter(item.keys())) for item in per_score_details]
+
+        if display_details:
+            if len(untrustworthy_fields) == 0:
+                print("No untrustworthy fields found")
+
+            else:
+                print(f"Untrustworthy fields: {untrustworthy_fields}\n")
+                for item in per_score_details:
+                    print(f"Field: {next(iter(item.keys()))}")
+                    details = next(iter(item.values()))
+                    print(f"Response: {details['response']}")
+                    print(f"Score: {details['score']}")
+                    print(f"Explanation: {details['explanation']}")
+                    print()
+
+        return untrustworthy_fields
+
     @staticmethod
     def _get_response_message(response: "ChatCompletion") -> "ChatCompletionMessage":
         return response.choices[0].message
```
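
To make the selection logic above concrete, here is a standalone re-implementation of the core of `get_untrustworthy_fields()` with made-up scores; in real use the `per_field_score` entries come back from the TLM backend in the result's `log`.

```python
# Standalone illustration of the field-selection logic (scores are invented).
per_field_score = {
    "steps": {"score": 0.93, "explanation": "Derivation is internally consistent."},
    "final_answer": {"score": 0.31, "explanation": "Final answer does not match the worked steps."},
}
threshold = 0.8

# Keep only fields scoring below the threshold, sorted from least to most trustworthy.
flagged = [
    {field: details}
    for field, details in per_field_score.items()
    if float(details["score"]) < threshold
]
flagged.sort(key=lambda item: next(iter(item.values()))["score"])

untrustworthy_fields = [next(iter(item)) for item in flagged]
print(untrustworthy_fields)  # ['final_answer']
```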

tests/test_chat_completions.py

Lines changed: 88 additions & 2 deletions
```diff
@@ -206,6 +206,86 @@ def test_tlm_chat_completion_score_with_structured_output() -> None:
     assert is_trustworthiness_score_json_format(score)
 
 
+def test_tlm_chat_completion_structured_output_per_field_scoring() -> None:
+    tlm_chat = TLMChatCompletion(options={"log": ["per_field_score"]})
+
+    openai_kwargs = {
+        "model": "gpt-4.1-mini",
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful math tutor. Guide the user through the solution step by step.",
+            },
+            {"role": "user", "content": "how can I solve 8x + 7 = -23"},
+        ],
+        "response_format": {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "math_reasoning",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "steps": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "explanation": {"type": "string"},
+                                    "output": {"type": "string"},
+                                },
+                                "required": ["explanation", "output"],
+                                "additionalProperties": False,
+                            },
+                        },
+                        "final_answer": {"type": "string"},
+                    },
+                    "required": ["steps", "final_answer"],
+                    "additionalProperties": False,
+                },
+                "strict": True,
+            },
+        },
+    }
+    response = ChatCompletion(
+        id="test",
+        choices=[
+            Choice(
+                index=0,
+                message=ChatCompletionMessage(
+                    role="assistant",
+                    content='{"steps":[{"explanation":"Start with the original equation: 8x + 7 = -23","output":"8x + 7 = -23"},{"explanation":"Subtract 7 from both sides to isolate the term with x on one side. This will give us: 8x = -23 - 7","output":"8x = -30"},{"explanation":"Now simplify the right side: -23 - 7 equals -30, so we have 8x = -30","output":"8x = -30"},{"explanation":"Next, divide both sides by 8 to solve for x. This gives us: x = -30 / 8","output":"x = -3.75"},{"explanation":"We can also simplify -30 / 8 by dividing both the numerator and the denominator by 2. This leads to: x = -15 / 4","output":"x = -15/4 (or -3.75 as a decimal)"}],"final_answer":"x = -17/4"}',
+                ),
+                finish_reason="stop",
+            )
+        ],
+        usage=CompletionUsage(
+            completion_tokens=50,
+            completion_tokens_details=CompletionTokensDetails(
+                accepted_prediction_tokens=0,
+                audio_tokens=0,
+                reasoning_tokens=0,
+                rejected_prediction_tokens=0,
+            ),
+            prompt_tokens=50,
+            prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0),
+            total_tokens=100,
+        ),
+        created=1234567890,
+        model="test-model",
+        object="chat.completion",
+    )
+
+    score = tlm_chat.score(response=response, **openai_kwargs)
+
+    assert score is not None
+    assert is_trustworthiness_score_json_format(score)
+
+    # test per_field_score
+    assert len(score["log"]["per_field_score"]) == 2  # noqa: PLR2004
+    assert {"steps", "final_answer"} == set(score["log"]["per_field_score"].keys())
+    assert tlm_chat.get_untrustworthy_fields(response=response, tlm_result=score) == ["final_answer"]
+
+
 def test_tlm_chat_completion_score_invalid_response() -> None:
     tlm_chat = TLMChatCompletion()
     openai_kwargs = {
@@ -248,8 +328,14 @@ def test_tlm_chat_completion_score_missing_messages() -> None:
 @pytest.mark.parametrize(
     "arguments, condition",  # noqa: PT006
     [
-        (json.dumps({"query": "Capital of Germany"}), lambda score: score["trustworthiness_score"] < 0.5),  # noqa: PLR2004
-        (json.dumps({"query": "Capital of France"}), lambda score: score["trustworthiness_score"] >= 0.8),  # noqa: PLR2004
+        (
+            json.dumps({"query": "Capital of Germany"}),
+            lambda score: score["trustworthiness_score"] < 0.5,  # noqa: PLR2004
+        ),
+        (
+            json.dumps({"query": "Capital of France"}),
+            lambda score: score["trustworthiness_score"] >= 0.8,  # noqa: PLR2004
+        ),
     ],
     ids=["bad_arguments", "good_arguments"],
 )
```
