|
3 | 3 | import os |
4 | 4 | import warnings |
5 | 5 | from collections.abc import Sequence |
6 | | -from typing import TYPE_CHECKING, Any, Callable, Optional, Union |
| 6 | +from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast |
7 | 7 |
|
8 | 8 | from cleanlab_tlm.errors import ValidationError |
9 | 9 | from cleanlab_tlm.internal.constants import ( |
|
26 | 26 | from cleanlab_tlm.internal.types import Task |
27 | 27 |
|
28 | 28 | if TYPE_CHECKING: |
29 | | - from cleanlab_tlm.tlm import TLMOptions |
30 | | - from cleanlab_tlm.utils.rag import Eval |
| 29 | + from cleanlab_tlm.tlm import TLMOptions, TLMResponse, TLMScore |
| 30 | + from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGResponse, TrustworthyRAGScore |
31 | 31 |
|
# Evaluated once at import time: True only when the CLEANLAB_TLM_SKIP_VALIDATE_TLM_OPTIONS
# environment variable equals "true" (compared case-insensitively); unset or any other value gives False.
# NOTE(review): presumably consulted to bypass TLMOptions validation -- the usage is outside this view; confirm.
32 | 32 | SKIP_VALIDATE_TLM_OPTIONS: bool = os.environ.get("CLEANLAB_TLM_SKIP_VALIDATE_TLM_OPTIONS", "false").lower() == "true" |
33 | 33 |
|
@@ -366,6 +366,164 @@ def tlm_score_process_response_and_kwargs( |
366 | 366 | return [dict(zip(combined_response_keys, values)) for values in combined_response_values_transposed] |
367 | 367 |
|
368 | 368 |
|
def tlm_explanation_format_tlm_result(
    tlm_result: Union[TLMResponse, Sequence[TLMResponse], TLMScore, Sequence[TLMScore]],
    response: Optional[Union[str, Sequence[str]]] = None,
) -> Union[dict[str, Any], list[dict[str, Any]]]:
    """Validate a TLM result (single or batch) and return it with the response included.

    Two input shapes are accepted, each either as one dict or as a sequence of dicts:

    * results that already contain a ``"response"`` key (the ``.prompt()`` case); ``response``
      must then be left as ``None``;
    * results without a ``"response"`` key (the ``.get_trustworthiness_score()`` case); the
      response text must then be supplied through ``response``.

    Args:
        tlm_result: A dict containing ``"trustworthiness_score"``, or a sequence of such dicts.
        response: The response text (a ``str`` for a single result, a non-string sequence of
            ``str`` of equal length for a batch) when it is not already part of ``tlm_result``.

    Returns:
        A dict for a single result, or a list of dicts for a batch. Each dict holds the
        ``"response"`` together with every key of the corresponding input result.

    Raises:
        ValidationError: If ``tlm_result`` is not a dict or sequence of dicts, lacks
            ``"trustworthiness_score"``, or if ``response`` is missing, redundant, of the wrong
            type, or of a different length than the batch.
    """
    # str/bytes are Sequences as well; without this guard an empty string would pass every
    # all(...) check below vacuously and be returned as if it were a valid result.
    if isinstance(tlm_result, (str, bytes)):
        raise ValidationError("tlm_result must be a dict or a sequence of dicts")

    if isinstance(tlm_result, Sequence):
        if not all(isinstance(item, dict) for item in tlm_result):
            raise ValidationError("all items in the tlm_result sequence must be dicts")

        if not all("trustworthiness_score" in item for item in tlm_result):
            raise ValidationError("all items in the tlm_result sequence must contain a 'trustworthiness_score' key")

        # for .get_trustworthiness_score() cases, the response is passed in as a separate argument
        if not all("response" in item for item in tlm_result):
            if response is None:
                raise ValidationError(
                    "'response' is required if not provided in tlm_result, pass it in using the 'response' argument"
                )
            if isinstance(response, str) or not isinstance(response, Sequence):
                raise ValidationError("response must be a sequence when tlm_result is a sequence")
            if len(response) != len(tlm_result):
                raise ValidationError("response and score sequences must have the same length")
            if not all(isinstance(text, str) for text in response):
                raise ValidationError("all items in the response sequence must be strings")

            # `**item` is unpacked last, so an item that already carries its own "response"
            # keeps that value instead of the positional one.
            return [{"response": text, **item} for text, item in zip(response, tlm_result)]

        # for .prompt() cases, the response is provided in the tlm_result dict
        if response is not None:
            raise ValidationError(
                "response should only be provided once, either using the 'response' argument or in 'tlm_result'"
            )

        # Materialize a real list so the declared return type also holds for tuple inputs.
        return cast(list[dict[str, Any]], list(tlm_result))

    if not isinstance(tlm_result, dict):
        raise ValidationError("tlm_result must be a dict or a sequence of dicts")

    if "trustworthiness_score" not in tlm_result:
        raise ValidationError("tlm_result must contain a 'trustworthiness_score' key")

    # the .get_trustworthiness_score() case
    if "response" not in tlm_result:
        if response is None:
            raise ValidationError(
                "'response' is required if not provided in tlm_result, pass it in using the 'response' argument"
            )
        if not isinstance(response, str):
            raise ValidationError("response must be a string when tlm_result is a dict")
        return {"response": response, **tlm_result}

    # the .prompt() case
    if response is not None:
        raise ValidationError(
            "response should only be provided once, either using the 'response' argument or in 'tlm_result'"
        )
    return cast(dict[str, Any], tlm_result)
| 425 | + |
| 426 | + |
def _has_trustworthy_rag_score(result: Any) -> bool:
    """Return True when ``result["trustworthiness"]`` is a dict holding a non-None ``"score"``."""
    trustworthiness = result.get("trustworthiness")
    return isinstance(trustworthiness, dict) and trustworthiness.get("score") is not None


def _flatten_trustworthy_rag_result(response: Any, result: Any) -> dict[str, Any]:
    """Build the flat dict: response, ``trustworthiness_score`` and the other ``trustworthiness`` entries."""
    trustworthiness = result["trustworthiness"]
    return {
        "response": response,
        "trustworthiness_score": trustworthiness["score"],
        # every remaining key of the nested dict is hoisted to the top level unchanged
        **{key: value for key, value in trustworthiness.items() if key != "score"},
    }


def tlm_explanation_format_trustworthy_rag_result(
    tlm_result: Union[
        TrustworthyRAGResponse,
        Sequence[TrustworthyRAGResponse],
        TrustworthyRAGScore,
        Sequence[TrustworthyRAGScore],
    ],
    response: Optional[Union[str, Sequence[str]]] = None,
) -> Union[dict[str, Any], list[dict[str, Any]]]:
    """Validate a TrustworthyRAG result (single or batch) and flatten it to response + trustworthiness.

    Each input dict must contain a ``"trustworthiness"`` dict with a non-None ``"score"``.
    The output dict keeps only the response and the contents of that nested dict: ``"score"``
    is renamed to ``"trustworthiness_score"`` and the nested dict's other keys are copied to
    the top level. Other top-level keys of the input are not carried over.

    Two input shapes are accepted, each either as one dict or as a sequence of dicts:

    * results that already contain a ``"response"`` key (the ``.generate()`` case); ``response``
      must then be left as ``None``;
    * results without a ``"response"`` key (the ``.score()`` case); the response text must then
      be supplied through ``response``.

    Args:
        tlm_result: A TrustworthyRAG result dict, or a sequence of such dicts.
        response: The response text (a ``str`` for a single result, a non-string sequence of
            ``str`` of equal length for a batch) when it is not already part of ``tlm_result``.

    Returns:
        A flattened dict for a single result, or a list of flattened dicts for a batch.

    Raises:
        ValidationError: If ``tlm_result`` is not a dict or sequence of dicts, lacks a usable
            trustworthiness score, or if ``response`` is missing, redundant, of the wrong type,
            or of a different length than the batch.
    """
    # str/bytes are Sequences as well; without this guard an empty string would pass every
    # all(...) check below vacuously and silently produce an empty list.
    if isinstance(tlm_result, (str, bytes)):
        raise ValidationError("tlm_result must be a dict or a sequence of dicts")

    if isinstance(tlm_result, Sequence):
        if not all(isinstance(item, dict) for item in tlm_result):
            raise ValidationError("all items in the tlm_result sequence must be dicts")

        if not all(_has_trustworthy_rag_score(item) for item in tlm_result):
            raise ValidationError(
                "all items in the tlm_result sequence must contain a 'trustworthiness' dict with a non-None 'score' key"
            )

        # for .score() cases, the response is passed in as a separate argument
        if not all("response" in item for item in tlm_result):
            if response is None:
                raise ValidationError(
                    "'response' is required if not provided in tlm_result, pass it in using the 'response' argument"
                )
            if isinstance(response, str) or not isinstance(response, Sequence):
                raise ValidationError("response must be a sequence when tlm_result is a sequence")
            if len(response) != len(tlm_result):
                raise ValidationError("response and score sequences must have the same length")
            if not all(isinstance(text, str) for text in response):
                raise ValidationError("all items in the response sequence must be strings")

            return [_flatten_trustworthy_rag_result(text, item) for text, item in zip(response, tlm_result)]

        # for .generate() cases, the response is provided in the tlm_result dict
        if response is not None:
            raise ValidationError(
                "response should only be provided once, either using the 'response' argument or in 'tlm_result'"
            )

        return [_flatten_trustworthy_rag_result(item["response"], item) for item in tlm_result]

    if not isinstance(tlm_result, dict):
        raise ValidationError("tlm_result must be a dict or a sequence of dicts")

    if not _has_trustworthy_rag_score(tlm_result):
        raise ValidationError("tlm_result must contain a 'trustworthiness' dict with a non-None 'score' key")

    # the .score() case
    if "response" not in tlm_result:
        if response is None:
            raise ValidationError(
                "'response' is required if not provided in tlm_result, pass it in using the 'response' argument"
            )
        if not isinstance(response, str):
            raise ValidationError("response must be a string when tlm_result is a dict")

        return _flatten_trustworthy_rag_result(response, tlm_result)

    # the .generate() case
    if response is not None:
        raise ValidationError(
            "response should only be provided once, either using the 'response' argument or in 'tlm_result'"
        )

    return _flatten_trustworthy_rag_result(tlm_result["response"], tlm_result)
| 525 | + |
| 526 | + |
369 | 527 | def validate_tlm_lite_score_options(score_options: Any) -> None: |
370 | 528 | invalid_score_keys = set(score_options.keys()).intersection(INVALID_SCORE_OPTIONS) |
371 | 529 | if invalid_score_keys: |
|
0 commit comments