
Commit 94b7eec

Update default model to gpt-4.1-mini (#66)
1 parent a479e32 commit 94b7eec

File tree: 8 files changed (+34 −36 lines)

CHANGELOG.md

Lines changed: 8 additions & 1 deletion

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.1.4] - 2025-05-30
+
+### Changed
+
+- Update default model to `gpt-4.1-mini`
+
 ## [1.1.3] - 2025-05-13
 
 ### Changed
@@ -156,7 +162,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Release of the Cleanlab TLM Python client.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.3...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.4...HEAD
+[1.1.4]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.3...v1.1.4
 [1.1.3]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.2...v1.1.3
 [1.1.2]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.1...v1.1.2
 [1.1.1]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.0...v1.1.1

README.md

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ To get started, copy the code below to try your own prompt or score existing pro
 
 ```python
 from cleanlab_tlm import TLM
-tlm = TLM(options={"log": ["explanation"], "model": "gpt-4o-mini"}) # GPT, Claude, etc.
+tlm = TLM(options={"log": ["explanation"], "model": "gpt-4.1-mini"}) # GPT, Claude, etc.
 out = tlm.prompt("What's the third month of the year alphabetically?")
 print(out)
 ```
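Since `gpt-4.1-mini` is now the package default (see the `_TLM_DEFAULT_MODEL` change below), the explicit `model` option in this README snippet is illustrative rather than required. A minimal sketch of relying on the new default instead, assuming the rest of the public API is unchanged:

```python
from cleanlab_tlm import TLM

# "model" omitted: TLM should fall back to the new default, gpt-4.1-mini
# (per _TLM_DEFAULT_MODEL in src/cleanlab_tlm/internal/constants.py).
tlm = TLM(options={"log": ["explanation"]})
out = tlm.prompt("What's the third month of the year alphabetically?")
print(out)
```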

src/cleanlab_tlm/__about__.py

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.1.3"
+__version__ = "1.1.4"

src/cleanlab_tlm/internal/api/api.py

Lines changed: 2 additions & 2 deletions

@@ -436,7 +436,7 @@ async def tlm_rag_generate(
         if evaluation.name not in [_TLM_RESPONSE_KEY, _TLM_TRUSTWORTHINESS_KEY]:
             ordered_res[evaluation.name] = res_json[evaluation.name]
 
-    return cast(JSONDict, ordered_res)
+    return ordered_res
 
 
 @tlm_retry
@@ -526,4 +526,4 @@ async def tlm_rag_score(
         if evaluation.name not in [_TLM_RESPONSE_KEY, _TLM_TRUSTWORTHINESS_KEY]:
             ordered_res[evaluation.name] = res_json[evaluation.name]
 
-    return cast(JSONDict, ordered_res)
+    return ordered_res
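Dropping `cast(JSONDict, ordered_res)` suggests `ordered_res` is already annotated as `JSONDict` where it is built, so the cast was a no-op for the type checker. A self-contained sketch of that pattern (the helper name and signature here are hypothetical, not the library's actual code):

```python
from typing import Any

JSONDict = dict[str, Any]

def order_evaluations(res_json: JSONDict, eval_names: list[str]) -> JSONDict:
    # Annotating the dict at creation gives the type checker its type
    # up front, so a trailing cast(JSONDict, ...) on the return adds nothing.
    ordered_res: JSONDict = {}
    for name in eval_names:
        if name in res_json:
            ordered_res[name] = res_json[name]
    return ordered_res  # no cast needed
```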

src/cleanlab_tlm/internal/constants.py

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@
     "nova-lite",
     "nova-pro",
 ]
-_TLM_DEFAULT_MODEL: str = "gpt-4o-mini"
+_TLM_DEFAULT_MODEL: str = "gpt-4.1-mini"
 _TLM_DEFAULT_CONTEXT_LIMIT: int = 70000
 _VALID_TLM_TASKS: set[str] = {task.value for task in Task}
 TLM_TASK_SUPPORTING_CONSTRAIN_OUTPUTS: set[Task] = {
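For context, a default-model constant like this is typically consulted only when the caller leaves `model` unset. A minimal sketch of that lookup pattern, assumed for illustration (`resolve_model` is not a function in this codebase):

```python
from typing import Optional

_TLM_DEFAULT_MODEL: str = "gpt-4.1-mini"

def resolve_model(options: Optional[dict]) -> str:
    # Fall back to the package default only when "model" is absent.
    return (options or {}).get("model", _TLM_DEFAULT_MODEL)

assert resolve_model(None) == "gpt-4.1-mini"
assert resolve_model({"model": "gpt-4o"}) == "gpt-4o"
```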

src/cleanlab_tlm/tlm.py

Lines changed: 20 additions & 28 deletions

@@ -255,16 +255,13 @@ def prompt(
         validate_tlm_prompt(prompt)
         tlm_prompt_process_and_validate_kwargs(prompt, self._task, kwargs)
         if isinstance(prompt, str):
-            return cast(
-                TLMResponse,
-                self._event_loop.run_until_complete(
-                    self._prompt_async(
-                        prompt,
-                        timeout=self._timeout,
-                        capture_exceptions=False,
-                        constrain_outputs=kwargs.get(_TLM_CONSTRAIN_OUTPUTS_KEY),
-                    ),
-                ),
+            return self._event_loop.run_until_complete(
+                self._prompt_async(
+                    prompt,
+                    timeout=self._timeout,
+                    capture_exceptions=False,
+                    constrain_outputs=kwargs.get(_TLM_CONSTRAIN_OUTPUTS_KEY),
+                )
             )
 
         return self._event_loop.run_until_complete(
@@ -324,14 +321,13 @@ async def prompt_async(
 
         async with aiohttp.ClientSession() as session:
             if isinstance(prompt, str):
-                tlm_response = await self._prompt_async(
+                return await self._prompt_async(
                     prompt,
                     session,
                     timeout=self._timeout,
                     capture_exceptions=False,
                     constrain_outputs=kwargs.get(_TLM_CONSTRAIN_OUTPUTS_KEY),
                 )
-                return cast(TLMResponse, tlm_response)
 
             return await self._batch_prompt(
                 prompt,
@@ -417,16 +413,13 @@ def get_trustworthiness_score(
         processed_response = tlm_score_process_response_and_kwargs(prompt, response, self._task, kwargs)
 
         if isinstance(prompt, str) and isinstance(processed_response, dict):
-            return cast(
-                TLMScore,
-                self._event_loop.run_until_complete(
-                    self._get_trustworthiness_score_async(
-                        prompt,
-                        processed_response,
-                        timeout=self._timeout,
-                        capture_exceptions=False,
-                    )
-                ),
+            return self._event_loop.run_until_complete(
+                self._get_trustworthiness_score_async(
+                    prompt,
+                    processed_response,
+                    timeout=self._timeout,
+                    capture_exceptions=False,
+                )
             )
 
         assert isinstance(prompt, Sequence)
@@ -483,14 +476,13 @@ async def get_trustworthiness_score_async(
 
         async with aiohttp.ClientSession() as session:
             if isinstance(prompt, str) and isinstance(processed_response, dict):
-                trustworthiness_score = await self._get_trustworthiness_score_async(
+                return await self._get_trustworthiness_score_async(
                     prompt,
                     processed_response,
                     session,
                     timeout=self._timeout,
                     capture_exceptions=False,
                 )
-                return cast(TLMScore, trustworthiness_score)
 
             assert isinstance(prompt, Sequence)
             assert isinstance(processed_response, Sequence)
@@ -598,16 +590,16 @@ class TLMOptions(TypedDict):
         - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, `use_self_reflection` = False.
         When using `get_trustworthiness_score()` on "base" preset, a cheaper self-reflection will be used to compute the trustworthiness score.
 
-    By default, TLM uses the: "medium" `quality_preset`, "gpt-4o-mini" base `model`, and `max_tokens` is set to 512.
+    By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512.
     You can set custom values for these arguments regardless of the quality preset specified.
 
     Args:
         model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", \
         "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", \
-        "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): \
+        "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): \
         Underlying base LLM to use (better models yield better results, faster models yield faster/cheaper results).
-        - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4.5-preview", \
-        "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro".
+        - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", \
+        "claude-3.7-sonnet", "claude-3.5-haiku".
         - Recommended models for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", "claude-3.5-sonnet-v2".
         - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro".
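These hunks drop the `cast(TLMResponse, ...)` and `cast(TLMScore, ...)` wrappers and return the awaited result directly. That type-checks cleanly because `run_until_complete` is generic in typeshed: when the coroutine's return type is annotated precisely, the result type flows through without a cast. A self-contained sketch with a stand-in coroutine (not the library's actual `_prompt_async`):

```python
import asyncio

async def fetch_answer(prompt: str) -> str:
    # Stand-in for an annotated coroutine like TLM._prompt_async; the
    # precise return annotation is what lets callers drop cast(...).
    await asyncio.sleep(0)
    return f"answer to: {prompt!r}"

loop = asyncio.new_event_loop()
try:
    # Inferred as str by the type checker, so no cast(...) is required.
    result: str = loop.run_until_complete(fetch_answer("hello"))
    print(result)
finally:
    loop.close()
```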

tests/constants.py

Lines changed: 0 additions & 1 deletion

@@ -37,7 +37,6 @@
     "nova-pro",
     "gpt-4",
     "gpt-4.1",
-    "gpt-4.1-mini",
     "gpt-4.5-preview",
 ]
 VALID_TLM_MODELS: list[str] = [model for model in _VALID_TLM_MODELS if model not in excluded_tlm_models]
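Removing `"gpt-4.1-mini"` from `excluded_tlm_models` means the new default is no longer filtered out of `VALID_TLM_MODELS`, so the test suite now exercises it. A toy illustration of the comprehension's effect (list contents abbreviated here, not the real constants):

```python
_VALID_TLM_MODELS = ["gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.5-preview"]
excluded_tlm_models = ["gpt-4.1", "gpt-4.5-preview"]

# gpt-4.1-mini survives the filter now that it is no longer excluded.
VALID_TLM_MODELS = [m for m in _VALID_TLM_MODELS if m not in excluded_tlm_models]
print(VALID_TLM_MODELS)  # ['gpt-4o-mini', 'gpt-4.1-mini']
```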

tests/test_validation.py

Lines changed: 1 addition & 1 deletion

@@ -644,7 +644,7 @@ def test_custom_eval_criteria_validation(tlm_api_key: str) -> None:
     # Invalid: extra keys
     with pytest.raises(
         ValidationError,
-        match="^Invalid keys {'extra'} found in custom_eval_criteria item 0. Supported keys are: {'name', 'criteria'}.$",
+        match="^Invalid keys {'extra'} found in custom_eval_criteria item 0. Supported keys are: ({'name', 'criteria'}|{'criteria', 'name'}).$",
     ):
         TLM(
             api_key=tlm_api_key,
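The widened regex accounts for the fact that the string form of a Python set of strings is not order-stable across interpreter runs (string hashing is randomized), so the error message may list the supported keys in either order. A quick demonstration:

```python
import re

pattern = r"Supported keys are: ({'name', 'criteria'}|{'criteria', 'name'})\.$"

# Both orderings of the set repr now satisfy the test's match pattern.
assert re.search(pattern, "Supported keys are: {'name', 'criteria'}.")
assert re.search(pattern, "Supported keys are: {'criteria', 'name'}.")
```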
