|
5 | 5 | from openai.types.chat import ChatCompletion, ChatCompletionMessage |
6 | 6 | from openai.types.chat.chat_completion import Choice |
7 | 7 | from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall, Function |
| 8 | +from openai.types.completion_usage import ( |
| 9 | + CompletionTokensDetails, |
| 10 | + CompletionUsage, |
| 11 | + PromptTokensDetails, |
| 12 | +) |
8 | 13 |
|
9 | 14 | from cleanlab_tlm.internal.types import TLMQualityPreset |
10 | 15 | from cleanlab_tlm.tlm import TLMScore |
|
19 | 24 |
|
20 | 25 | @pytest.mark.parametrize( |
21 | 26 | "quality_preset", |
22 | | - ["base", "low", "medium"], |
| 27 | + ["base", "low", "medium", "high", "best"], |
23 | 28 | ) |
24 | 29 | def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None: |
25 | 30 | tlm_chat = TLMChatCompletion(quality_preset=quality_preset) |
@@ -118,6 +123,80 @@ def test_tlm_chat_completion_score_with_tools() -> None: |
118 | 123 | assert is_trustworthiness_score_json_format(score) |
119 | 124 |
|
120 | 125 |
|
def test_tlm_chat_completion_score_with_structured_output() -> None:
    """Score a structured-output (JSON-schema) chat completion and verify the score format."""
    tlm_chat = TLMChatCompletion()

    # Schema for a single solution step: an explanation paired with its output.
    step_schema = {
        "type": "object",
        "properties": {
            "explanation": {"type": "string"},
            "output": {"type": "string"},
        },
        "required": ["explanation", "output"],
        "additionalProperties": False,
    }
    # Top-level schema: an ordered list of steps plus the final answer.
    math_reasoning_schema = {
        "type": "object",
        "properties": {
            "steps": {"type": "array", "items": step_schema},
            "final_answer": {"type": "string"},
        },
        "required": ["steps", "final_answer"],
        "additionalProperties": False,
    }
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful math tutor. Guide the user through the solution step by step.",
            },
            {"role": "user", "content": "how can I solve 8x + 7 = -23"},
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "math_reasoning",
                "schema": math_reasoning_schema,
                "strict": True,
            },
        },
    }

    # Assistant reply whose content is a JSON document conforming to the schema above.
    assistant_message = ChatCompletionMessage(
        role="assistant",
        content='{"steps":[{"explanation":"Start with the original equation: 8x + 7 = -23","output":"8x + 7 = -23"},{"explanation":"Subtract 7 from both sides to isolate the term with x on one side. This will give us: 8x = -23 - 7","output":"8x = -30"},{"explanation":"Now simplify the right side: -23 - 7 equals -30, so we have 8x = -30","output":"8x = -30"},{"explanation":"Next, divide both sides by 8 to solve for x. This gives us: x = -30 / 8","output":"x = -3.75"},{"explanation":"We can also simplify -30 / 8 by dividing both the numerator and the denominator by 2. This leads to: x = -15 / 4","output":"x = -15/4 (or -3.75 as a decimal)"}],"final_answer":"x = -15/4 or x = -3.75"}',
    )
    # Synthetic token accounting so the response resembles a real API payload.
    token_usage = CompletionUsage(
        completion_tokens=50,
        completion_tokens_details=CompletionTokensDetails(
            accepted_prediction_tokens=0,
            audio_tokens=0,
            reasoning_tokens=0,
            rejected_prediction_tokens=0,
        ),
        prompt_tokens=50,
        prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0),
        total_tokens=100,
    )
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(index=0, message=assistant_message, finish_reason="stop")
        ],
        usage=token_usage,
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)
| 198 | + |
| 199 | + |
121 | 200 | def test_tlm_chat_completion_score_invalid_response() -> None: |
122 | 201 | tlm_chat = TLMChatCompletion() |
123 | 202 | openai_kwargs = { |
|
0 commit comments