
Commit 5d812ab

added score_async (#125)
* added score_async
* fmt
* update changelog
1 parent 96fa4cc commit 5d812ab

File tree

3 files changed: +68 -23 lines


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add `score_async` API for TLMChatCompletion
+
 ## [1.1.34] - 2025-09-24
 
 ### Added

src/cleanlab_tlm/utils/chat_completions.py

Lines changed: 30 additions & 10 deletions
@@ -78,6 +78,28 @@ def score(
     ) -> TLMScore:
         """Score the trustworthiness of an OpenAI ChatCompletion response.
 
+        Args:
+            response (ChatCompletion): The OpenAI ChatCompletion response object to evaluate
+            **openai_kwargs (Any): The original kwargs passed to OpenAI's create() method, must include 'messages'
+
+        Returns:
+            TLMScore: A dict containing the trustworthiness score and optional logs
+        """
+        return self._event_loop.run_until_complete(self.score_async(response=response, **openai_kwargs))
+
+    async def score_async(
+        self,
+        *,
+        response: "ChatCompletion",
+        **openai_kwargs: Any,
+    ) -> TLMScore:
+        """Asynchronously score the trustworthiness of an OpenAI ChatCompletion response.
+        This method is similar to the [`score()`](#method-score) method but operates asynchronously,
+        allowing for non-blocking concurrent operations.
+
+        Use this method if you want to score multiple ChatCompletion responses concurrently
+        without blocking the execution of other operations.
+
         Args:
             response (ChatCompletion): The OpenAI ChatCompletion response object to evaluate
             **openai_kwargs (Any): The original kwargs passed to OpenAI's create() method, must include 'messages'

@@ -113,15 +135,13 @@ def score(
             combined_kwargs["response_format"] = type_to_response_format_param(combined_kwargs["response_format"])
         return cast(
             TLMScore,
-            self._event_loop.run_until_complete(
-                asyncio.wait_for(
-                    tlm_chat_completions_score(
-                        api_key=self._api_key,
-                        response=response,
-                        **combined_kwargs,
-                    ),
-                    timeout=self._timeout,
-                )
+            await asyncio.wait_for(
+                tlm_chat_completions_score(
+                    api_key=self._api_key,
+                    response=response,
+                    **combined_kwargs,
+                ),
+                timeout=self._timeout,
             ),
         )

@@ -131,7 +151,7 @@ def score(
         prompt_text = _form_prompt_chat_completions_api(messages, tools)
         response_text = form_response_string_chat_completions(response=response)
 
-        return cast(TLMScore, self._tlm.get_trustworthiness_score(prompt_text, response_text))
+        return cast(TLMScore, await self._tlm.get_trustworthiness_score_async(prompt_text, response_text))
 
     def get_explanation(
         self,
tests/test_chat_completions.py

Lines changed: 34 additions & 13 deletions
@@ -1,5 +1,6 @@
+import asyncio
 import json
-from typing import Callable
+from typing import Any, Callable
 
 import pytest
 from openai.types.chat import ChatCompletion, ChatCompletionMessage

@@ -23,6 +24,18 @@
 test_response = make_text_unique(TEST_RESPONSE)
 
 
+def _run_score_sync_or_async(
+    tlm_chat: TLMChatCompletion,
+    response: ChatCompletion,
+    is_async: bool,
+    **openai_kwargs: Any,
+) -> TLMScore:
+    """Runs either sync or async score method based on is_async parameter."""
+    if is_async:
+        return asyncio.run(tlm_chat.score_async(response=response, **openai_kwargs))
+    return tlm_chat.score(response=response, **openai_kwargs)
+
+
 def test_get_model_name() -> None:
     tlm = TLMChatCompletion()
     model_name = tlm.get_model_name()

@@ -35,7 +48,8 @@ def test_get_model_name() -> None:
     "quality_preset",
     ["base", "low", "medium", "high", "best"],
 )
-def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None:
+@pytest.mark.parametrize("is_async", [False, True], ids=["sync", "async"])
+def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset, is_async: bool) -> None:
     tlm_chat = TLMChatCompletion(quality_preset=quality_preset)
     openai_kwargs = {
         "model": "gpt-4.1-mini",

@@ -55,13 +69,14 @@ def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None:
         object="chat.completion",
     )
 
-    score = tlm_chat.score(response=response, **openai_kwargs)
+    score = _run_score_sync_or_async(tlm_chat, response, is_async, **openai_kwargs)
 
     assert score is not None
     assert is_trustworthiness_score_json_format(score)
 
 
-def test_tlm_chat_completion_score_with_options() -> None:
+@pytest.mark.parametrize("is_async", [False, True], ids=["sync", "async"])
+def test_tlm_chat_completion_score_with_options(is_async: bool) -> None:
     tlm_chat = TLMChatCompletion(options={"log": ["explanation", "perplexity"]})
     openai_kwargs = {
         "model": "gpt-4.1-mini",

@@ -81,13 +96,14 @@ def test_tlm_chat_completion_score_with_options() -> None:
         object="chat.completion",
     )
 
-    score = tlm_chat.score(response=response, **openai_kwargs)
+    score = _run_score_sync_or_async(tlm_chat, response, is_async, **openai_kwargs)
 
     assert score is not None
     assert is_trustworthiness_score_json_format(score)
 
 
-def test_tlm_chat_completion_score_with_tools() -> None:
+@pytest.mark.parametrize("is_async", [False, True], ids=["sync", "async"])
+def test_tlm_chat_completion_score_with_tools(is_async: bool) -> None:
     tlm_chat = TLMChatCompletion()
     openai_kwargs = {
         "model": "gpt-4.1-mini",

@@ -126,13 +142,14 @@ def test_tlm_chat_completion_score_with_tools() -> None:
         object="chat.completion",
     )
 
-    score = tlm_chat.score(response=response, **openai_kwargs)
+    score = _run_score_sync_or_async(tlm_chat, response, is_async, **openai_kwargs)
 
     assert score is not None
     assert is_trustworthiness_score_json_format(score)
 
 
-def test_tlm_chat_completion_score_with_structured_output() -> None:
+@pytest.mark.parametrize("is_async", [False, True], ids=["sync", "async"])
+def test_tlm_chat_completion_score_with_structured_output(is_async: bool) -> None:
     tlm_chat = TLMChatCompletion()
     openai_kwargs = {
         "model": "gpt-4.1-mini",

@@ -200,13 +217,14 @@ def test_tlm_chat_completion_score_with_structured_output() -> None:
         object="chat.completion",
     )
 
-    score = tlm_chat.score(response=response, **openai_kwargs)
+    score = _run_score_sync_or_async(tlm_chat, response, is_async, **openai_kwargs)
 
     assert score is not None
     assert is_trustworthiness_score_json_format(score)
 
 
-def test_tlm_chat_completion_structured_output_per_field_scoring() -> None:
+@pytest.mark.parametrize("is_async", [False, True], ids=["sync", "async"])
+def test_tlm_chat_completion_structured_output_per_field_scoring(is_async: bool) -> None:
     tlm_chat = TLMChatCompletion(options={"log": ["per_field_score"]})
 
     openai_kwargs = {

@@ -275,7 +293,7 @@ def test_tlm_chat_completion_structured_output_per_field_scoring() -> None:
         object="chat.completion",
     )
 
-    score = tlm_chat.score(response=response, **openai_kwargs)
+    score = _run_score_sync_or_async(tlm_chat, response, is_async, **openai_kwargs)
 
     assert score is not None
     assert is_trustworthiness_score_json_format(score)

@@ -339,7 +357,10 @@ def test_tlm_chat_completion_score_missing_messages() -> None:
     ],
     ids=["bad_arguments", "good_arguments"],
 )
-def test_tlm_chat_completion_score_tool_calls(arguments: str, condition: Callable[[TLMScore], bool]) -> None:
+@pytest.mark.parametrize("is_async", [False, True], ids=["sync", "async"])
+def test_tlm_chat_completion_score_tool_calls(
+    arguments: str, condition: Callable[[TLMScore], bool], is_async: bool
+) -> None:
     tlm_chat = TLMChatCompletion()
 
     openai_kwargs = {

@@ -390,7 +411,7 @@ def test_tlm_chat_completion_score_tool_calls(arguments: str, condition: Callabl
         object="chat.completion",
     )
 
-    score = tlm_chat.score(response=response, **openai_kwargs)
+    score = _run_score_sync_or_async(tlm_chat, response, is_async, **openai_kwargs)
 
     assert score is not None
     assert condition(score)
