docs: add LLM-as-a-Judge example (#804)

allglc · web-flow · commit 8fd26417e0d2 · 2025-11-24T20:17:41.000+01:00
* download and process HaluEval of the data

* Show how to use a table based predictor

* Add visualizations

* Add naive threshold (also added to quickstart)

*update history
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -4,6 +4,8 @@ History
 
 1.x.x (2025-xx-xx)
 ------------------
+* Add an example of risk control with LLM as a judge
+* Add comparison with naive threshold in risk control quick start example
 
 1.2.0 (2025-11-17)
 ------------------
diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py
@@ -7,6 +7,8 @@
 
 """
 
+# sphinx_gallery_thumbnail_number = 2
+
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.datasets import make_circles
@@ -18,7 +20,7 @@
 from mapie.risk_control import BinaryClassificationController
 from mapie.utils import train_conformalize_test_split
 
-RANDOM_STATE = 1
+RANDOM_STATE = 42
 
 ##############################################################################
 # First, load the dataset and then split it into training, calibration
@@ -28,9 +30,9 @@
 (X_train, X_calib, X_test, y_train, y_calib, y_test) = train_conformalize_test_split(
     X,
     y,
-    train_size=0.8,
+    train_size=0.7,
     conformalize_size=0.1,
-    test_size=0.1,
+    test_size=0.2,
     random_state=RANDOM_STATE,
 )
 
@@ -112,7 +114,7 @@
     f"{len(bcc.valid_predict_params)} thresholds found that guarantee a precision of "
     f"at least {target_precision} with a confidence of {confidence_level}.\n"
     "Among those, the one that maximizes the secondary objective (recall here) is: "
-    f"{bcc.best_predict_param:.3f}."
+    f"{bcc.best_predict_param:.2f}."
 )
 
 
@@ -128,6 +130,10 @@
     y_pred = (proba_positive_class >= threshold).astype(int)
     precisions[i] = precision_score(y_calib, y_pred)
 
+naive_threshold_index = np.argmin(
+    np.where(precisions >= target_precision, precisions - target_precision, np.inf)
+)
+
 valid_thresholds_indices = np.array(
     [t in bcc.valid_predict_params for t in tested_thresholds]
 )
@@ -155,6 +161,15 @@
     edgecolors="k",
     s=300,
 )
+plt.scatter(
+    tested_thresholds[naive_threshold_index],
+    precisions[naive_threshold_index],
+    c="tab:red",
+    label="Naive threshold",
+    marker="*",
+    edgecolors="k",
+    s=300,
+)
 plt.axhline(target_precision, color="tab:gray", linestyle="--")
 plt.text(
     0.7,
@@ -168,9 +183,28 @@
 plt.legend()
 plt.show()
 
+proba_positive_class_test = clf.predict_proba(X_test)[:, 1]
+y_pred_naive = (
+    proba_positive_class_test >= tested_thresholds[naive_threshold_index]
+).astype(int)
+print(
+    "With the naive threshold, the precision is:\n "
+    f"- {precisions[naive_threshold_index]:.3f} on the calibration set\n "
+    f"- {precision_score(y_test, y_pred_naive):.3f} on the test set."
+)
+
+print(
+    "\n\n With risk control, the precision is:\n"
+    f"- {precisions[best_threshold_index]:.3f} on the calibration set\n "
+    f"- {precision_score(y_test, bcc.predict(X_test)):.3f} on the test set."
+)
+
 ##############################################################################
 # Contrary to the naive way of computing a threshold to satisfy a precision target on
 # calibration data, risk control provides statistical guarantees on unseen data.
+# In this example, the naive threshold results in a precision on the test set that is
+# lower than the target precision while risk control takes a margin to guarantee
+# the target precision on unseen data with high probability.
 # In the plot above, we can see that not all thresholds corresponding to a precision
 # higher than the target are valid. This is due to the uncertainty inherent to the
 # finite size of the calibration set, which risk control takes into account.
@@ -179,12 +213,14 @@
 # small number of observations used to compute the precision, following the Learn Then
 # Test procedure. In the most extreme case, no observation is available, which causes
 # the precision value to be ill-defined and set to 0.
-
+#
 # Besides computing a set of valid thresholds,
 # :class:`~mapie.risk_control.BinaryClassificationController` also outputs the "best"
 # one, which is the valid threshold that maximizes a secondary objective
 # (recall here).
-#
+
+
+##############################################################################
 # After obtaining the best threshold, we can use the ``predict`` function of
 # :class:`~mapie.risk_control.BinaryClassificationController` for future predictions,
 # or use scikit-learn's ``FixedThresholdClassifier`` as a wrapper to benefit
@@ -206,15 +242,15 @@
     X_test[y_test == 0, 1],
     edgecolors="k",
     c="tab:blue",
-    alpha=0.5,
+    alpha=0.3,
     label='"negative" class',
 )
 plt.scatter(
     X_test[y_test == 1, 0],
     X_test[y_test == 1, 1],
     edgecolors="k",
     c="tab:red",
-    alpha=0.5,
+    alpha=0.3,
     label='"positive" class',
 )
 plt.title("Decision Boundary of FixedThresholdClassifier")
diff --git a/examples/risk_control/2-advanced-analysis/plot_risk_control_llm_as_a_judge.py b/examples/risk_control/2-advanced-analysis/plot_risk_control_llm_as_a_judge.py
@@ -0,0 +1,232 @@
+"""
+Risk Control for LLM as a Judge
+===============================
+
+This example demonstrates how to use risk control methods for Large Language Models (LLMs) acting as judges.
+We simulate a scenario where an LLM evaluates answers, and we want to control the risk of hallucination detection.
+"""
+
+# sphinx_gallery_thumbnail_number = 2
+
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.metrics import precision_score
+from sklearn.model_selection import train_test_split
+
+from mapie.risk_control import BinaryClassificationController
+
+np.random.seed(0)
+
+##############################################################################
+# First, we load HaluEval Question-Answering Data, an open-source dataset for evaluating hallucination in LLMs.
+# Then, we preprocess the data to create a suitable format for our analysis.
+url = "https://raw.githubusercontent.com/RUCAIBox/HaluEval/main/data/qa_data.json"
+df = pd.read_json(url, lines=True)
+print("Sample of the original dataset:\n\n", df.iloc[0])
+
+# Melt the dataframe to combine right_answer and hallucinated_answer into a single column
+df = df.melt(
+    id_vars=["knowledge", "question"],
+    value_vars=["right_answer", "hallucinated_answer"],
+    var_name="answer_type",
+    value_name="answer",
+    ignore_index=False,  # Keep the original index to allow sorting back to pairs
+)
+
+# Sort by index to keep the pairs together (right_answer and hallucinated_answer for
+# the same question)
+df = df.sort_index()
+
+# Create the 'hallucinated' flag based on the original column name and drop the helper
+# column 'answer_type'
+df["hallucinated"] = df["answer_type"] == "hallucinated_answer"
+df = df.drop(columns=["answer_type"])
+df = df.reset_index(drop=True)
+
+# Create judge input prompts
+df["judge_input"] = df.apply(
+    lambda row: f"""
+    You are a judge evaluating whether an answer to a question is faithful to the 
+    provided knowledge snippet.
+
+    Knowledge: {row["knowledge"]}
+    Question: {row["question"]}
+    Answer: {row["answer"]}
+
+    Does the answer contain information that is NOT supported by the knowledge?
+
+    Provide a score between 0.0 and 1.0 indicating the probability that the answer is a
+    hallucination.
+""",
+    axis=1,
+)
+
+print("Sample of the processed dataset:\n\n", df.iloc[0])
+
+
+##############################################################################
+# For demonstration purposes, we simulate the LLM judge's behavior using a simple table-based predictor.
+# In practice, you would replace this with actual LLM API calls to get judge scores or read from a file
+# of judge scores obtained from a complex LangChain pipeline for instance.
+
+
+class TableBasePredictor:
+    def __init__(self, df):
+        df["judge_score"] = df["hallucinated"].apply(self.generate_biased_score)
+        self.df = df[["judge_input", "judge_score"]]
+        self.df = self.df.set_index("judge_input")
+
+    def predict_proba(self, X):
+        score_positive = self.df.loc[X]["judge_score"].values
+        score_negative = 1 - score_positive
+        return np.vstack([score_negative, score_positive]).T
+
+    @staticmethod
+    def generate_biased_score(is_hallucinated):
+        """Generate a biased score based on whether the answer is hallucinated."""
+        if is_hallucinated:
+            return np.random.beta(a=3, b=1)
+        else:
+            return np.random.beta(a=1, b=3)
+
+
+llm_judge = TableBasePredictor(df)
+
+plt.figure()
+plt.hist(
+    df[df["hallucinated"]]["judge_score"],
+    bins=30,
+    alpha=0.8,
+    label="Hallucinated answer",
+    density=True,
+)
+plt.hist(
+    df[~df["hallucinated"]]["judge_score"],
+    bins=30,
+    alpha=0.8,
+    label="Correct answer",
+    density=True,
+)
+plt.xlabel("Judge Score (Probability of Hallucination)")
+plt.ylabel("Density")
+plt.title("Distribution of Judge Scores")
+plt.legend()
+plt.show()
+
+##############################################################################
+# Next, we split the data into calibration and test sets. We then initialize a
+# :class:`~mapie.risk_control.BinaryClassificationController` using the LLM judge's
+# probability estimation function, a risk metric (here, "precision"), a target risk level,
+# and a confidence level. We use the calibration data to compute statistically guaranteed thresholds.
+
+X = df["judge_input"].to_numpy()
+y = df["hallucinated"].astype(int)
+
+X_calib, X_test, y_calib, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
+target_precision = 0.9
+confidence_level = 0.9
+
+bcc = BinaryClassificationController(
+    predict_function=llm_judge.predict_proba,
+    risk="precision",
+    target_level=target_precision,
+    confidence_level=confidence_level,
+    best_predict_param_choice="recall",
+)
+bcc.calibrate(X_calib, y_calib)
+
+print(f"The best threshold is: {bcc.best_predict_param}")
+
+y_calib_pred_controlled = bcc.predict(X_calib)
+precision_calib = precision_score(y_calib, y_calib_pred_controlled)
+
+y_test_pred_controlled = bcc.predict(X_test)
+precision_test = precision_score(y_test, y_test_pred_controlled)
+
+print(
+    "With risk control, the precision is:\n"
+    f"- {precision_calib:.3f} on the calibration set \n"
+    f"- {precision_test:.3f} on the test set."
+)
+
+##############################################################################
+# Finally, let us visualize the precision achieved on the calibration set for
+# the tested thresholds, highlighting the valid thresholds and the best one
+# (which maximizes recall).
+
+proba_positive_class = llm_judge.predict_proba(X_calib)[:, 1]
+
+tested_thresholds = bcc._predict_params
+precisions = np.full(len(tested_thresholds), np.inf)
+for i, threshold in enumerate(tested_thresholds):
+    y_pred = (proba_positive_class >= threshold).astype(int)
+    precisions[i] = precision_score(y_calib, y_pred)
+
+naive_threshold_index = np.argmin(
+    np.where(precisions >= target_precision, precisions - target_precision, np.inf)
+)
+naive_threshold = tested_thresholds[naive_threshold_index]
+
+valid_thresholds_indices = np.array(
+    [t in bcc.valid_predict_params for t in tested_thresholds]
+)
+best_threshold_index = np.where(tested_thresholds == bcc.best_predict_param)[0][0]
+
+plt.figure()
+plt.scatter(
+    tested_thresholds[valid_thresholds_indices],
+    precisions[valid_thresholds_indices],
+    c="tab:green",
+    label="Valid thresholds",
+)
+plt.scatter(
+    tested_thresholds[~valid_thresholds_indices],
+    precisions[~valid_thresholds_indices],
+    c="tab:red",
+    label="Invalid thresholds",
+)
+plt.scatter(
+    tested_thresholds[best_threshold_index],
+    precisions[best_threshold_index],
+    c="tab:green",
+    label="Best threshold",
+    marker="*",
+    edgecolors="k",
+    s=300,
+)
+plt.scatter(
+    tested_thresholds[naive_threshold_index],
+    precisions[naive_threshold_index],
+    c="tab:red",
+    label="Naive threshold",
+    marker="*",
+    edgecolors="k",
+    s=300,
+)
+plt.axhline(target_precision, color="tab:gray", linestyle="--")
+plt.text(
+    0.7,
+    target_precision + 0.02,
+    "Target precision",
+    color="tab:gray",
+    fontstyle="italic",
+)
+plt.xlabel("Threshold")
+plt.ylabel("Precision")
+plt.legend()
+plt.show()
+
+proba_positive_class_test = llm_judge.predict_proba(X_test)[:, 1]
+y_pred_naive = (proba_positive_class_test >= naive_threshold).astype(int)
+
+print(
+    "With the naive threshold, the precision is:\n"
+    f"- {precisions[naive_threshold_index]:.3f} on the calibration set\n"
+    f"- {precision_score(y_test, y_pred_naive):.3f} on the test set."
+)
+
+##############################################################################
+# While the naive threshold achieves the target precision on the calibration set,
+# it fails to do so on the test set. This highlights the importance of using
+# risk control methods to ensure that performance guarantees hold on unseen data.