
Commit 94b7eec

Update default model to gpt-4.1-mini (#66)
1 parent a479e32 commit 94b7eec

File tree: 8 files changed (+34 −36 lines)

CHANGELOG.md

Lines changed: 8 additions & 1 deletion

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.1.4] - 2025-05-30
+
+### Changed
+
+- Update default model to `gpt-4.1-mini`
+
 ## [1.1.3] - 2025-05-13
 
 ### Changed
@@ -156,7 +162,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Release of the Cleanlab TLM Python client.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.3...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.4...HEAD
+[1.1.4]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.3...v1.1.4
 [1.1.3]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.2...v1.1.3
 [1.1.2]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.1...v1.1.2
 [1.1.1]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.0...v1.1.1

README.md

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ To get started, copy the code below to try your own prompt or score existing pro
 
 ```python
 from cleanlab_tlm import TLM
-tlm = TLM(options={"log": ["explanation"], "model": "gpt-4o-mini"}) # GPT, Claude, etc.
+tlm = TLM(options={"log": ["explanation"], "model": "gpt-4.1-mini"}) # GPT, Claude, etc.
 out = tlm.prompt("What's the third month of the year alphabetically?")
 print(out)
 ```
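Since `gpt-4.1-mini` is now the package default (see the `_TLM_DEFAULT_MODEL` change below), the explicit `model` option in this README snippet is illustrative rather than required. A minimal sketch of relying on the new default instead, assuming the rest of the public API is unchanged:

```python
from cleanlab_tlm import TLM

# "model" omitted: TLM should fall back to the new default, gpt-4.1-mini
# (per _TLM_DEFAULT_MODEL in src/cleanlab_tlm/internal/constants.py).
tlm = TLM(options={"log": ["explanation"]})
out = tlm.prompt("What's the third month of the year alphabetically?")
print(out)
```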

src/cleanlab_tlm/__about__.py

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.1.3"
+__version__ = "1.1.4"

src/cleanlab_tlm/internal/api/api.py

Lines changed: 2 additions & 2 deletions

@@ -436,7 +436,7 @@ async def tlm_rag_generate(
         if evaluation.name not in [_TLM_RESPONSE_KEY, _TLM_TRUSTWORTHINESS_KEY]:
             ordered_res[evaluation.name] = res_json[evaluation.name]
 
-    return cast(JSONDict, ordered_res)
+    return ordered_res
 
 
 @tlm_retry
@@ -526,4 +526,4 @@ async def tlm_rag_score(
         if evaluation.name not in [_TLM_RESPONSE_KEY, _TLM_TRUSTWORTHINESS_KEY]:
             ordered_res[evaluation.name] = res_json[evaluation.name]
 
-    return cast(JSONDict, ordered_res)
+    return ordered_res
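Dropping `cast(JSONDict, ordered_res)` suggests `ordered_res` is already annotated as `JSONDict` where it is built, so the cast was a no-op for the type checker. A self-contained sketch of that pattern (the helper name and signature here are hypothetical, not the library's actual code):

```python
from typing import Any

JSONDict = dict[str, Any]

def order_evaluations(res_json: JSONDict, eval_names: list[str]) -> JSONDict:
    # Annotating the dict at creation gives the type checker its type
    # up front, so a trailing cast(JSONDict, ...) on the return adds nothing.
    ordered_res: JSONDict = {}
    for name in eval_names:
        if name in res_json:
            ordered_res[name] = res_json[name]
    return ordered_res  # no cast needed
```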

src/cleanlab_tlm/internal/constants.py

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@
     "nova-lite",
     "nova-pro",
 ]
-_TLM_DEFAULT_MODEL: str = "gpt-4o-mini"
+_TLM_DEFAULT_MODEL: str = "gpt-4.1-mini"
 _TLM_DEFAULT_CONTEXT_LIMIT: int = 70000
 _VALID_TLM_TASKS: set[str] = {task.value for task in Task}
 TLM_TASK_SUPPORTING_CONSTRAIN_OUTPUTS: set[Task] = {
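For context, a default-model constant like this is typically consulted only when the caller leaves `model` unset. A minimal sketch of that lookup pattern, assumed for illustration (`resolve_model` is not a function in this codebase):

```python
from typing import Optional

_TLM_DEFAULT_MODEL: str = "gpt-4.1-mini"

def resolve_model(options: Optional[dict]) -> str:
    # Fall back to the package default only when "model" is absent.
    return (options or {}).get("model", _TLM_DEFAULT_MODEL)

assert resolve_model(None) == "gpt-4.1-mini"
assert resolve_model({"model": "gpt-4o"}) == "gpt-4o"
```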

src/cleanlab_tlm/tlm.py

Lines changed: 20 additions & 28 deletions

@@ -255,16 +255,13 @@ def prompt(
         validate_tlm_prompt(prompt)
         tlm_prompt_process_and_validate_kwargs(prompt, self._task, kwargs)
         if isinstance(prompt, str):
-            return cast(
-                TLMResponse,
-                self._event_loop.run_until_complete(
-                    self._prompt_async(
-                        prompt,
-                        timeout=self._timeout,
-                        capture_exceptions=False,
-                        constrain_outputs=kwargs.get(_TLM_CONSTRAIN_OUTPUTS_KEY),
-                    ),
-                ),
+            return self._event_loop.run_until_complete(
+                self._prompt_async(
+                    prompt,
+                    timeout=self._timeout,
+                    capture_exceptions=False,
+                    constrain_outputs=kwargs.get(_TLM_CONSTRAIN_OUTPUTS_KEY),
+                )
             )
 
         return self._event_loop.run_until_complete(
@@ -324,14 +321,13 @@ async def prompt_async(
 
         async with aiohttp.ClientSession() as session:
             if isinstance(prompt, str):
-                tlm_response = await self._prompt_async(
+                return await self._prompt_async(
                     prompt,
                     session,
                     timeout=self._timeout,
                     capture_exceptions=False,
                     constrain_outputs=kwargs.get(_TLM_CONSTRAIN_OUTPUTS_KEY),
                 )
-                return cast(TLMResponse, tlm_response)
 
             return await self._batch_prompt(
                 prompt,
@@ -417,16 +413,13 @@ def get_trustworthiness_score(
         processed_response = tlm_score_process_response_and_kwargs(prompt, response, self._task, kwargs)
 
         if isinstance(prompt, str) and isinstance(processed_response, dict):
-            return cast(
-                TLMScore,
-                self._event_loop.run_until_complete(
-                    self._get_trustworthiness_score_async(
-                        prompt,
-                        processed_response,
-                        timeout=self._timeout,
-                        capture_exceptions=False,
-                    )
-                ),
+            return self._event_loop.run_until_complete(
+                self._get_trustworthiness_score_async(
+                    prompt,
+                    processed_response,
+                    timeout=self._timeout,
+                    capture_exceptions=False,
+                )
             )
 
         assert isinstance(prompt, Sequence)
@@ -483,14 +476,13 @@ async def get_trustworthiness_score_async(
 
         async with aiohttp.ClientSession() as session:
             if isinstance(prompt, str) and isinstance(processed_response, dict):
-                trustworthiness_score = await self._get_trustworthiness_score_async(
+                return await self._get_trustworthiness_score_async(
                     prompt,
                     processed_response,
                     session,
                     timeout=self._timeout,
                     capture_exceptions=False,
                 )
-                return cast(TLMScore, trustworthiness_score)
 
             assert isinstance(prompt, Sequence)
             assert isinstance(processed_response, Sequence)
@@ -598,16 +590,16 @@ class TLMOptions(TypedDict):
         - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, `use_self_reflection` = False.
         When using `get_trustworthiness_score()` on "base" preset, a cheaper self-reflection will be used to compute the trustworthiness score.
 
-    By default, TLM uses the: "medium" `quality_preset`, "gpt-4o-mini" base `model`, and `max_tokens` is set to 512.
+    By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512.
     You can set custom values for these arguments regardless of the quality preset specified.
 
     Args:
         model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", \
         "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", \
-        "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): \
+        "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): \
         Underlying base LLM to use (better models yield better results, faster models yield faster/cheaper results).
-        - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4.5-preview", \
-        "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro".
+        - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", \
+        "claude-3.7-sonnet", "claude-3.5-haiku".
         - Recommended models for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", "claude-3.5-sonnet-v2".
         - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro".
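These hunks drop the `cast(TLMResponse, ...)` and `cast(TLMScore, ...)` wrappers and return the awaited result directly. That type-checks cleanly because `run_until_complete` is generic in typeshed: when the coroutine's return type is annotated precisely, the result type flows through without a cast. A self-contained sketch with a stand-in coroutine (not the library's actual `_prompt_async`):

```python
import asyncio

async def fetch_answer(prompt: str) -> str:
    # Stand-in for an annotated coroutine like TLM._prompt_async; the
    # precise return annotation is what lets callers drop cast(...).
    await asyncio.sleep(0)
    return f"answer to: {prompt!r}"

loop = asyncio.new_event_loop()
try:
    # Inferred as str by the type checker, so no cast(...) is required.
    result: str = loop.run_until_complete(fetch_answer("hello"))
    print(result)
finally:
    loop.close()
```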

tests/constants.py

Lines changed: 0 additions & 1 deletion

@@ -37,7 +37,6 @@
     "nova-pro",
     "gpt-4",
     "gpt-4.1",
-    "gpt-4.1-mini",
     "gpt-4.5-preview",
 ]
 VALID_TLM_MODELS: list[str] = [model for model in _VALID_TLM_MODELS if model not in excluded_tlm_models]
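Removing `"gpt-4.1-mini"` from `excluded_tlm_models` means the new default is no longer filtered out of `VALID_TLM_MODELS`, so the test suite now exercises it. A toy illustration of the comprehension's effect (list contents abbreviated here, not the real constants):

```python
_VALID_TLM_MODELS = ["gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.5-preview"]
excluded_tlm_models = ["gpt-4.1", "gpt-4.5-preview"]

# gpt-4.1-mini survives the filter now that it is no longer excluded.
VALID_TLM_MODELS = [m for m in _VALID_TLM_MODELS if m not in excluded_tlm_models]
print(VALID_TLM_MODELS)  # ['gpt-4o-mini', 'gpt-4.1-mini']
```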

tests/test_validation.py

Lines changed: 1 addition & 1 deletion

@@ -644,7 +644,7 @@ def test_custom_eval_criteria_validation(tlm_api_key: str) -> None:
     # Invalid: extra keys
     with pytest.raises(
         ValidationError,
-        match="^Invalid keys {'extra'} found in custom_eval_criteria item 0. Supported keys are: {'name', 'criteria'}.$",
+        match="^Invalid keys {'extra'} found in custom_eval_criteria item 0. Supported keys are: ({'name', 'criteria'}|{'criteria', 'name'}).$",
     ):
         TLM(
             api_key=tlm_api_key,
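The widened regex accounts for the fact that the string form of a Python set of strings is not order-stable across interpreter runs (string hashing is randomized), so the error message may list the supported keys in either order. A quick demonstration:

```python
import re

pattern = r"Supported keys are: ({'name', 'criteria'}|{'criteria', 'name'})\.$"

# Both orderings of the set repr now satisfy the test's match pattern.
assert re.search(pattern, "Supported keys are: {'name', 'criteria'}.")
assert re.search(pattern, "Supported keys are: {'criteria', 'name'}.")
```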
