From 9ec2d65dd72480ef3baff2e7dad9296ef5b9cecc Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Tue, 16 Sep 2025 14:56:42 +0200 Subject: [PATCH 01/14] better doc organisation: rename multilabel_classification as risk_control and move tuto to all examples --- .gitignore | 2 +- doc/Makefile | 2 +- doc/conf.py | 4 ++-- doc/index.rst | 2 +- examples/multilabel_classification/README.rst | 4 ---- .../1-quickstart/README.rst | 4 ++-- examples/risk_control/2-advanced-analysis/README.rst | 6 ++++++ .../2-advanced-analysis}/plot_tutorial_risk_control.py | 0 examples/risk_control/README.rst | 6 ++++++ 9 files changed, 19 insertions(+), 11 deletions(-) delete mode 100644 examples/multilabel_classification/README.rst rename examples/{multilabel_classification => risk_control}/1-quickstart/README.rst (56%) create mode 100644 examples/risk_control/2-advanced-analysis/README.rst rename examples/{multilabel_classification/1-quickstart => risk_control/2-advanced-analysis}/plot_tutorial_risk_control.py (100%) create mode 100644 examples/risk_control/README.rst diff --git a/.gitignore b/.gitignore index f787972b1..4d3384e64 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ doc/_build/ doc/examples_classification/ doc/examples_regression/ doc/examples_calibration/ -doc/examples_multilabel_classification/ +doc/examples_risk_control/ doc/examples_mondrian/ doc/auto_examples/ doc/modules/generated/ diff --git a/doc/Makefile b/doc/Makefile index 841011bd2..ba1723db0 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -50,7 +50,7 @@ clean: -rm -rf $(BUILDDIR)/* -rm -rf examples_regression/ -rm -rf examples_classification/ - -rm -rf examples_multilabel_classification/ + -rm -rf examples_risk_control/ -rm -rf examples_calibration/ -rm -rf examples_mondrian/ -rm -rf generated/* diff --git a/doc/conf.py b/doc/conf.py index 78cee8a31..eacd46e6e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -321,14 +321,14 @@ "examples_dirs": [ "../examples/regression", "../examples/classification", - "../examples/multilabel_classification", + "../examples/risk_control", "../examples/calibration", "../examples/mondrian", ], "gallery_dirs": [ "examples_regression", "examples_classification", - "examples_multilabel_classification", + "examples_risk_control", "examples_calibration", "examples_mondrian", ], diff --git a/doc/index.rst b/doc/index.rst index 2807c04bd..808257330 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -24,7 +24,7 @@ :caption: Control prediction errors theoretical_description_risk_control - examples_multilabel_classification/1-quickstart/plot_tutorial_risk_control + examples_risk_control/index external_risk_control_package .. toctree:: diff --git a/examples/multilabel_classification/README.rst b/examples/multilabel_classification/README.rst deleted file mode 100644 index 1f8a7b3fc..000000000 --- a/examples/multilabel_classification/README.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. _general_examples: - -General examples -================ \ No newline at end of file diff --git a/examples/multilabel_classification/1-quickstart/README.rst b/examples/risk_control/1-quickstart/README.rst similarity index 56% rename from examples/multilabel_classification/1-quickstart/README.rst rename to examples/risk_control/1-quickstart/README.rst index 65aaf6366..2970a4ef1 100644 --- a/examples/multilabel_classification/1-quickstart/README.rst +++ b/examples/risk_control/1-quickstart/README.rst @@ -1,6 +1,6 @@ -.. _multilabel_classification_examples_1: +.. _risk_control_examples_1: 1. 
Quickstart examples ---------------------- -The following examples present the main functionalities of MAPIE through basic quickstart regression problems. \ No newline at end of file +The following examples present the main functionalities of MAPIE through basic quickstart risk control problems. \ No newline at end of file diff --git a/examples/risk_control/2-advanced-analysis/README.rst b/examples/risk_control/2-advanced-analysis/README.rst new file mode 100644 index 000000000..2179cbdbd --- /dev/null +++ b/examples/risk_control/2-advanced-analysis/README.rst @@ -0,0 +1,6 @@ +.. _risk_control_examples_2: + +2. Advanced analysis +-------------------- + +The following examples use MAPIE for discussing more complex risk control problems. \ No newline at end of file diff --git a/examples/multilabel_classification/1-quickstart/plot_tutorial_risk_control.py b/examples/risk_control/2-advanced-analysis/plot_tutorial_risk_control.py similarity index 100% rename from examples/multilabel_classification/1-quickstart/plot_tutorial_risk_control.py rename to examples/risk_control/2-advanced-analysis/plot_tutorial_risk_control.py diff --git a/examples/risk_control/README.rst b/examples/risk_control/README.rst new file mode 100644 index 000000000..f5f00e9f5 --- /dev/null +++ b/examples/risk_control/README.rst @@ -0,0 +1,6 @@ +.. _risk_control_examples: + +All risk control examples +========================= + +Following is a collection of notebooks demonstrating how to use MAPIE for risk control. \ No newline at end of file From d8057c6952f6d07c4b8f4e48925f4394d36f923a Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Tue, 16 Sep 2025 17:35:03 +0200 Subject: [PATCH 02/14] make clarifications, improve the overview table, and fix typos --- doc/theoretical_description_risk_control.rst | 43 ++++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/doc/theoretical_description_risk_control.rst b/doc/theoretical_description_risk_control.rst index 76629d311..c99204e06 100644 --- a/doc/theoretical_description_risk_control.rst +++ b/doc/theoretical_description_risk_control.rst @@ -13,26 +13,43 @@ Getting started with risk control in MAPIE Overview ======== +This section provides an overview of risk control in MAPIE. For those unfamiliar with the concept of risk control, the next section provides an introduction to the topic. + Three methods of risk control have been implemented in MAPIE so far : **Risk-Controlling Prediction Sets** (RCPS) [1], **Conformal Risk Control** (CRC) [2] and **Learn Then Test** (LTT) [3]. -The difference between these methods is the way the conformity scores are computed. -As of now, MAPIE supports risk control for two machine learning tasks: **binary classification**, as well as **multi-label classification** (including applications like image segmentation). +As of now, MAPIE supports risk control for two machine learning tasks: **binary classification**, as well as **multi-label classification** (in particular applications like image segmentation). The table below details the available methods for each task: +.. |br| raw:: html + +
+ .. list-table:: Available risk control methods in MAPIE for each ML task :header-rows: 1 - * - Risk control method - - Binary classification - - Multi-label classification (image segmentation) + * - Risk control |br| method + - Type of |br| control + - Assumption |br| on the data + - Non-monotonic |br| risks + - Binary |br| classification + - Multi-label |br| classification * - RCPS + - Probability + - i.i.d. + - ❌ - ❌ - ✅ * - CRC + - Expectation + - Exchangeable + - ❌ - ❌ - ✅ * - LTT + - Probability + - i.i.d + - ✅ - ✅ - ✅ @@ -41,7 +58,7 @@ In MAPIE for multi-label classification, CRC and RCPS are used for recall contro 1. What is risk control? ======================== -Before diving into risk control, let's take the simple example of a binary classification model, which separates the incoming data into the two classes thanks to its threshold: predictions above it are classified as 1, and those below as 0. Suppose we want to find a threshold that guarantees that our model achieves a certain level of precision. A naive, yet straightforward approach to do this is to evaluate how precision varies with different threshold values on a validation dataset. By plotting this relationship (see plot below), we can identify the range of thresholds that meet our desired precision requirement (green zone on the graph). +Before diving into risk control, let's take the simple example of a binary classification model, which separates the incoming data into two classes. Predicted probabilities above a given threshold (e.g., 0.5) correspond to predicting the "positive" class and probabilities below correspond to the "negative" class. Suppose we want to find a threshold that guarantees that our model achieves a certain level of precision. A naive, yet straightforward approach to do this is to evaluate how precision varies with different threshold values on a validation dataset. By plotting this relationship (see plot below), we can identify the range of thresholds that meet our desired precision requirement (green zone on the graph). .. image:: images/example_without_risk_control.png :width: 600 @@ -54,7 +71,7 @@ So far, so good. But here is the catch: while the chosen threshold effectively k Risk control is the science of adjusting a model's parameter, typically denoted :math:`\lambda`, so that a given risk stays below a desired level with high probability on unseen data. Note that here, the term *risk* is used to describe an undesirable outcome of the model (e.g., type I error): therefore, it is a value we want to minimize, and in our case, keep under a certain level. Also note that risk control can easily be applied to metrics we want to maximize (e.g., precision), simply by controlling the complement (e.g., 1-precision). -The strength of risk control lies in the statistical guarantees it provides on unseen data. Unlike the naive method presented earlier, it determines a value of :math:`\lambda` that ensures the risk is controlled *beyond* the training data. +The strength of risk control lies in the statistical guarantees it provides on unseen data. Unlike the naive method presented earlier, it determines a value of :math:`\lambda` that ensures the risk is controlled *beyond* the validation data. Applying risk control to the previous example would allow us to get a new — albeit narrower — range of thresholds (blue zone on the graph) that are **statistically guaranteed**. 
@@ -66,7 +83,7 @@ This guarantee is critical in a wide range of use cases (especially in high-stak — -To express risk control in mathematical terms, we denote by R the risk we want to control, and introduce the following two parameters: +To express risk control in mathematical terms, we denote by :math:`R` the risk we want to control, and introduce the following two parameters: - :math:`\alpha`: the target level below which we want the risk to remain, as shown in the figure below; @@ -76,13 +93,13 @@ To express risk control in mathematical terms, we denote by R the risk we want t - :math:`\delta`: the confidence level associated with the risk control. -In other words, the risk is said to be controlled if :math:`R \leq \alpha` with probability at least :math:`1 - \delta`. +In other words, the risk is said to be controlled if :math:`R \leq \alpha` with probability at least :math:`1 - \delta`, where the probability is over the randomness in the sampling of the dataset. The three risk control methods implemented in MAPIE — RCPS, CRC and LTT — rely on different assumptions, and offer slightly different guarantees: - **CRC** requires the data to be **exchangeable**, and gives a guarantee on the **expectation of the risk**: :math:`\mathbb{E}(R) \leq \alpha`; -- **RCPS** and **LTT** both impose stricter assumptions, requiring the data to be **independent and identically distributed** (i.i.d.), which implies exchangeability. The guarantee they provide is on the **probability that the risk does not exceed :math:`\alpha`**: :math:`\mathbb{P}(R \leq \alpha) \geq 1 - \delta`. +- **RCPS** and **LTT** both impose stricter assumptions, requiring the data to be **independent and identically distributed** (i.i.d.), which implies exchangeability. The guarantee they provide is on the **probability that the risk does not exceed** :math:`\boldsymbol{\alpha}`: :math:`\mathbb{P}(R \leq \alpha) \geq 1 - \delta`. .. image:: images/risk_distribution.png :width: 600 @@ -94,7 +111,7 @@ The plot above gives a visual representation of the difference between the two t - The risk is controlled in probability (RCPS/LTT) if at least :math:`1 - \delta` percent of its distribution over unseen data is below :math:`\alpha`. -Note that at the opposite of the other two methods, LTT allows to control any non-monotonic risk. +Note that contrary to the other two methods, LTT allows to control any non-monotonic risk. The following section provides a detailed overview of each method. @@ -234,7 +251,7 @@ We are going to present the Learn Then Test framework that allows the user to co This method has been introduced in article [3]. The settings here are the same as RCPS and CRC, we just need to introduce some new parameters: -- Let :math:`\Lambda` be a discretized for our :math:`\lambda`, meaning that :math:`\Lambda = \{\lambda_1, ..., \lambda_n\}`. +- Let :math:`\Lambda` be a discretized set for our :math:`\lambda`, meaning that :math:`\Lambda = \{\lambda_1, ..., \lambda_n\}`. - Let :math:`p_\lambda` be a valid p-value for the null hypothesis :math:`\mathbb{H}_j: R(\lambda_j)>\alpha`. @@ -250,7 +267,7 @@ In order to find all the parameters :math:`\lambda` that satisfy the above condi :math:`\{(x_1, y_1), \dots, (x_n, y_n)\}`. 
- For each :math:`\lambda_j` in a discrete set :math:`\Lambda = \{\lambda_1, \lambda_2,\dots, \lambda_n\}`, we associate the null hypothesis - :math:`\mathcal{H}_j: R(\lambda_j) > \alpha`, as rejecting the hypothesis corresponds to selecting :math:`\lambda_j` as a point where risk the risk + :math:`\mathcal{H}_j: R(\lambda_j) > \alpha`, as rejecting the hypothesis corresponds to selecting :math:`\lambda_j` as a point where the risk is controlled. - For each null hypothesis, we compute a valid p-value using a concentration inequality :math:`p_{\lambda_j}`. Here we choose to compute the Hoeffding-Bentkus p-value From e088e854868fb75ec7cee245ab00beda2b2c417b Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Thu, 18 Sep 2025 14:06:30 +0200 Subject: [PATCH 03/14] add quick start risk control --- doc/index.rst | 1 + doc/quick_start.rst | 8 +- ...plot_risk_control_binary_classification.py | 126 ++++++++++++++++++ 3 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py diff --git a/doc/index.rst b/doc/index.rst index 808257330..1d2881cb0 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -24,6 +24,7 @@ :caption: Control prediction errors theoretical_description_risk_control + examples_risk_control/1-quickstart/plot_risk_control_binary_classification examples_risk_control/index external_risk_control_package diff --git a/doc/quick_start.rst b/doc/quick_start.rst index 9794a4000..995d68157 100644 --- a/doc/quick_start.rst +++ b/doc/quick_start.rst @@ -40,4 +40,10 @@ Here, we generate one-dimensional noisy data that we fit with a MLPRegressor: `U 3. Classification ======================= -Similarly, it's possible to do the same for a basic classification problem: `Use MAPIE to plot prediction sets `_ \ No newline at end of file +Similarly, it's possible to do the same for a basic classification problem: `Use MAPIE to plot prediction sets `_ + + +4. Risk Control +======================= + +MAPIE implements risk control methods for multilabel classification (in particular, image segmentation) and binary classification: `Use MAPIE to control risk for a binary classifier `_ \ No newline at end of file diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py new file mode 100644 index 000000000..51398188c --- /dev/null +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -0,0 +1,126 @@ +""" +================================================= +Use MAPIE to control risk for a binary classifier +================================================= + +In this example, we explain how to do risk control for binary classification with MAPIE. + +""" + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.datasets import make_circles +from sklearn.svm import SVC +from sklearn.model_selection import FixedThresholdClassifier +from sklearn.metrics import precision_score +from sklearn.inspection import DecisionBoundaryDisplay + +from mapie.risk_control import BinaryClassificationController, precision +from mapie.utils import train_conformalize_test_split + +RANDOM_STATE = 1 + +############################################################################## +# Let us first load the dataset and fit an SVC on the training data. 
+ +X, y = make_circles(n_samples=3000, noise=0.3, + factor=0.3, random_state=RANDOM_STATE) +(X_train, X_calib, X_test, + y_train, y_calib, y_test) = train_conformalize_test_split( + X, y, train_size=0.8, conformalize_size=0.1, test_size=0.1, + random_state=RANDOM_STATE) + +clf = SVC(probability=True, random_state=RANDOM_STATE) +clf.fit(X_train, y_train) + +############################################################################## +# Next, we initialize a :class:`~mapie.risk_control.BinaryClassificationController` +# using the probability estimation function from the fitted estimator: +# ``clf.predict_proba``, a risk function (here the precision), a target risk level, and +# a confidence level. Then we use the calibration data to compute statistically +# guaranteed thresholds using a risk control method. + +target_precision = 0.8 +bcc = BinaryClassificationController( + clf.predict_proba, precision, target_level=target_precision, confidence_level=0.9) +bcc.calibrate(X_calib, y_calib) + +print(f'{len(bcc.valid_predict_params)} valid thresholds found. ' + f'The best one is {bcc.best_predict_param:.3f}.') + + +############################################################################## +# In the plot below, we visualize how the threshold values impact precision, and what +# thresholds have been computed as statistically guaranteed. + +proba_positive_class = clf.predict_proba(X_calib)[:, 1] + +tested_thresholds = bcc._predict_params +precisions = np.full(len(tested_thresholds), np.inf) +for i, threshold in enumerate(tested_thresholds): + y_pred = (proba_positive_class >= threshold).astype(int) + precisions[i] = precision_score(y_calib, y_pred) + +valid_thresholds_indices = np.array( + [t in bcc.valid_predict_params for t in tested_thresholds]) +best_threshold_index = np.where( + tested_thresholds == bcc.best_predict_param)[0][0] + +plt.figure() +plt.scatter(tested_thresholds[valid_thresholds_indices], + precisions[valid_thresholds_indices], c='tab:green', + label='Valid thresholds') +plt.scatter(tested_thresholds[~valid_thresholds_indices], + precisions[~valid_thresholds_indices], c='tab:red', + label='Invalid thresholds') +plt.scatter(tested_thresholds[best_threshold_index], precisions[best_threshold_index], + c='tab:green', label='Best threshold', marker='*', edgecolors='k', s=300) +plt.axhline(target_precision, color='tab:gray', linestyle='--') +plt.text(0, target_precision+0.02, 'Target precision', + color='tab:gray', fontstyle='italic') +plt.xlabel('Threshold', labelpad=15) +plt.ylabel('Precision') +plt.legend() +plt.show() + +############################################################################## +# Contrary to the naive way of computing a threshold to satisfy a precision target on +# calibration data, risk control provides statistical guarantees on unseen data. +# Besides computing a set of valid thresholds, +# :class:`~mapie.risk_control.BinaryClassificationController` also outputs the best +# one, which in the case of precision is the threshold that, among all valid ones, +# maximizes recall. +# +# In the figure above, the highest threshold values are considered invalid due to the +# small number of observations used to compute the precision, following the Learn then +# Test procedure. In the most extreme case, no observation is available, which causes +# the precision value to be ill-defined and set to 0. 
+# +# After obtaining the best threshold, we can use the ``predict`` function of +# :class:`~mapie.risk_control.BinaryClassificationController` for future predictions, +# or use scikit-learn's ``FixedThresholdClassifier`` as a wrapper to benefit +# from functionalities like easily plotting the decision boundary as seen below. + +y_pred = bcc.predict(X_test) + +clf_threshold = FixedThresholdClassifier(clf, threshold=bcc.best_predict_param) +# necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator +clf_threshold.fit(X_train, y_train) + +disp = DecisionBoundaryDisplay.from_estimator( + clf_threshold, X_test, response_method="predict", cmap=plt.cm.coolwarm) + +plt.scatter(X_test[y_test == 0, 0], X_test[y_test == 0, 1], + edgecolors='k', c='tab:blue', alpha=0.5, label='"negative" class') +plt.scatter(X_test[y_test == 1, 0], X_test[y_test == 1, 1], + edgecolors='k', c='tab:red', alpha=0.5, label='"positive" class') +plt.title("Decision Boundary of FixedThresholdClassifier") +plt.xlabel("Feature 1") +plt.ylabel("Feature 2") +plt.legend() +plt.show() + +############################################################################## +# Different risk functions have been implemented, such as precision and recall, but you +# can also implement your own custom function using +# :class:`~mapie.risk_control.BinaryClassificationRisk`. From d1636b8a99610af7190322bab2c1427775dea690 Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Thu, 18 Sep 2025 15:41:06 +0200 Subject: [PATCH 04/14] Revert incorrect renaming of calibration to conformalization in risk_control.py --- HISTORY.rst | 1 + doc/v1_release_notes.rst | 2 -- mapie/risk_control.py | 28 ++++++++++++++-------------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 29408cc31..eb3409a93 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -16,6 +16,7 @@ History * MAPIE now supports Python versions up to the latest release (currently 3.13) * Change `prefit` default value to `True` in split methods' docstrings to remain consistent with the implementation * Fix issue 699 to replace `TimeSeriesRegressor.partial_fit` with `TimeSeriesRegressor.update` +* Revert incorrect renaming of calibration to conformalization in risk_control.py 1.0.1 (2025-05-22) ------------------ diff --git a/doc/v1_release_notes.rst b/doc/v1_release_notes.rst index 41ae6aa08..0a946a1e3 100644 --- a/doc/v1_release_notes.rst +++ b/doc/v1_release_notes.rst @@ -263,8 +263,6 @@ Risk control The ``MapieMultiLabelClassifier`` class has been renamed ``PrecisionRecallController``. -The parameter ``calib_size`` from the ``fit`` method has been renamed ``conformalize_size``. 
- Calibration ^^^^^^^^^^^^^ diff --git a/mapie/risk_control.py b/mapie/risk_control.py index f5a57c2c4..e9a2a7c8b 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -362,7 +362,7 @@ def _check_estimator( Warning If estimator is then to warn about the split of the - data between train and conformalization + data between train and calibration """ if (estimator is None) and (not _refit): raise ValueError( @@ -374,19 +374,19 @@ def _check_estimator( estimator = MultiOutputClassifier( LogisticRegression() ) - X_train, X_conf, y_train, y_conf = train_test_split( - X, - y, - test_size=self.conformalize_size, - random_state=self.random_state, + X_train, X_calib, y_train, y_calib = train_test_split( + X, + y, + test_size=self.calib_size, + random_state=self.random_state, ) estimator.fit(X_train, y_train) warnings.warn( "WARNING: To avoid overfitting, X has been split" - + "into X_train and X_conf. The conformalization will only" - + "be done on X_conf" + + "into X_train and X_calib. The calibration will only" + + "be done on X_calib" ) - return estimator, X_conf, y_conf + return estimator, X_calib, y_calib if isinstance(estimator, Pipeline): est = estimator[-1] @@ -589,7 +589,7 @@ def fit( self, X: ArrayLike, y: ArrayLike, - conformalize_size: Optional[float] = .3 + calib_size: Optional[float] = .3 ) -> PrecisionRecallController: """ Fit the base estimator or use the fitted base estimator. @@ -602,8 +602,8 @@ def fit( y: NDArray of shape (n_samples, n_classes) Training labels. - conformalize_size: Optional[float] - Size of the conformalization dataset with respect to X if the + calib_size: Optional[float] + Size of the calibration dataset with respect to X if the given model is ``None`` need to fit a LogisticRegression. By default .3 @@ -613,7 +613,7 @@ def fit( PrecisionRecallController The model itself. """ - self.conformalize_size = conformalize_size + self.calib_size = calib_size return self.partial_fit(X, y, _refit=True) def predict( @@ -696,7 +696,7 @@ def predict( ) self._check_valid_index(alpha_np) self.lambdas_star, self.r_star = find_lambda_control_star( - self.r_hat, self.valid_index, self.lambdas + self.r_hat, self.valid_index, self.lambdas ) y_pred_proba_array = ( y_pred_proba_array > From fcbff66a57f73028bbd2c67da25936c23222bcde Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Thu, 18 Sep 2025 16:09:34 +0200 Subject: [PATCH 05/14] add link to notebook theoretical validity risk control --- doc/theoretical_description_risk_control.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/theoretical_description_risk_control.rst b/doc/theoretical_description_risk_control.rst index c99204e06..5bb6e73e8 100644 --- a/doc/theoretical_description_risk_control.rst +++ b/doc/theoretical_description_risk_control.rst @@ -117,6 +117,8 @@ The following section provides a detailed overview of each method. 2. Theoretical description ========================== +Note that a notebook testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data is available here: `theoretical_validity_tests.ipynb `__. 
+ 2.1 Risk-Controlling Prediction Sets ------------------------------------ 2.1.1 General settings From 0610bb183ce035117c96048f1a57fe7ef932066d Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 18 Sep 2025 16:56:47 +0200 Subject: [PATCH 06/14] Update examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py Change title --- .../1-quickstart/plot_risk_control_binary_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py index 51398188c..1244712ef 100644 --- a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -1,7 +1,7 @@ """ -================================================= -Use MAPIE to control risk for a binary classifier -================================================= +======================================================== +Use MAPIE to control the precision of a binary classifier +======================================================== In this example, we explain how to do risk control for binary classification with MAPIE. From 8ae2c47d740c3d1e352734051b0e9f362467cc95 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 17 Sep 2025 16:44:35 +0200 Subject: [PATCH 07/14] DOC - BinaryClassificationController docstrings --- mapie/risk_control.py | 170 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 5 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index e9a2a7c8b..1f54f8715 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -134,7 +134,7 @@ class PrecisionRecallController(BaseEstimator, ClassifierMixin): References ---------- - [1] Lihua Lei Jitendra Malik Stephen Bates, Anastasios Angelopoulos + [1] Lihua Lei Jitendra Malik Stephen Bates, Anastasios Angelopoulos, and Michael I. Jordan. Distribution-free, risk-controlling prediction sets. CoRR, abs/2101.02703, 2021. URL https://arxiv.org/abs/2101.02703 @@ -197,7 +197,7 @@ def __init__( def _check_parameters(self) -> None: """ - Check n_jobs, verbose and random_states. + Check n_jobs, verbose, and random_states. Raises ------ @@ -719,6 +719,22 @@ def predict( class BinaryClassificationRisk: + """ + Parameters + ---------- + risk_occurrence : Callable[[int, int], bool] + risk_condition : Callable[[int, int], bool] + higher_is_better : bool + + Attributes + ---------- + risk_occurrence : Callable[[int, int], bool] + risk_condition : Callable[[int, int], bool] + higher_is_better : bool + + Examples + -------- + """ # Any risk that can be defined in the following way will work using the binary # Hoeffding-Bentkus p-values used in MAPIE # Take the example of precision in the docstring to explain how the class works. 
@@ -738,6 +754,16 @@ def get_value_and_effective_sample_size( y_true: NDArray, # shape (n_samples,), values in {0, 1} y_pred: NDArray, # shape (n_samples,), values in {0, 1} ) -> Tuple[float, int]: + """ + Parameters + ---------- + y_true : NDArray + y_pred : NDArray + + Returns + ------- + Tuple[float, int] + """ # float between 0 and 1, int between 0 and len(y_true) # Returns 1-risk_occurrence if higher_is_better is True # returns (1, -1) when the risk is not defined (condition never met) @@ -790,6 +816,108 @@ def get_value_and_effective_sample_size( class BinaryClassificationController: + """ + Controls the risk or performance of a binary classifier. + + BinaryClassificationController finds the decision thresholds of a binary classifier + that statistically guarantee a risk to be below a target level + (the risk is "controlled"). + It can be used to control a performance metric as well, such as the precision. + In that case, the thresholds guarantee that the performance is above a target level. + + Usage: + + 1. Instantiate a BinaryClassificationController, providing the predict_proba method + of your binary classifier + 2. Call the calibrate method to find the thresholds + 3. Use the predict method to predict using the best threshold + + Note: for a given model, calibration dataset, target level, and confidence level, + there may not be any thresholds controlling the risk. + + Parameters + ---------- + predict_function : Callable[[ArrayLike], NDArray] + predict_proba method of a fitted binary classifier. + Its output signature must be of shape (len(X), 2) + + risk : BinaryClassificationRisk + The risk or performance metric to control. + Valid options: + + - An existing risk defined in `mapie.risk_control` (e.g. precision, recall, + accuracy, false_positive_rate) + - A custom instance of BinaryClassificationRisk object + + target_level : float + The maximum risk level (or minimum performance level). Must be between 0 and 1. + + confidence_level : float, default=0.9 + The confidence level with which the risk (or performance) is controlled. + See the documentation for detailed explanations. + + best_predict_param_choice : Union["auto", BinaryClassificationRisk], default="auto" + How to select the best threshold from the valid thresholds that control the risk + (or performance). The BinaryClassificationController will try to minimize + (or maximize) a secondary objective. + Valid options: + + - "auto" (default) + - An existing risk defined in `mapie.risk_control` (e.g. precision, recall, + accuracy, false_positive_rate) + - A custom instance of BinaryClassificationRisk object + + Attributes + ---------- + valid_predict_params : NDArray + The valid thresholds that control the risk (or performance). + Use the calibrate method to compute these. + + best_predict_param : Optional[float] + The best thresholds that control the risk (or performance). + Use the calibrate method to compute it. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from mapie.risk_control import BinaryClassificationController, precision + + >>> X, y = make_classification( + ... n_features=2, + ... n_redundant=0, + ... n_informative=2, + ... n_clusters_per_class=1, + ... n_classes=2, + ... random_state=42, + ... class_sep=2.0 + ... ) + >>> X_train, X_temp, y_train, y_temp = train_test_split( + ... X, y, test_size=0.4, random_state=42 + ... 
) + >>> X_calib, X_test, y_calib, y_test = train_test_split( + ... X_temp, y_temp, test_size=0.1, random_state=42 + ... ) + + >>> clf = LogisticRegression().fit(X_train, y_train) + + >>> controller = BinaryClassificationController( + ... predict_function=clf.predict_proba, + ... risk=precision, + ... target_level=0.6 + ... ) + + >>> controller.calibrate(X_calib, y_calib) + >>> predictions = controller.predict(X_test) # doctest: +SKIP + + References + ---------- + Angelopoulos, Anastasios N., Stephen, Bates, Emmanuel J. Candès, et al. + "Learn Then Test: Calibrating Predictive Algorithms to Achieve Risk Control." (2022) + + """ _best_predict_param_choice_map = { precision: recall, recall: precision, @@ -799,10 +927,8 @@ class BinaryClassificationController: def __init__( self, - # X -> y_proba of shape (n_samples, 2) predict_function: Callable[[ArrayLike], NDArray], - risk: BinaryClassificationRisk, # to import from mapie.risk_control - # above or below depending if risk is higher_is_better or not + risk: BinaryClassificationRisk, target_level: float, confidence_level: float = 0.9, best_predict_param_choice: Union[ @@ -833,6 +959,23 @@ def calibrate( # pragma: no cover X_calibrate: ArrayLike, y_calibrate: ArrayLike ) -> None: + """ + Calibrate the BinaryClassificationController. + Sets attributes valid_predict_params and best_predict_param (if the risk + or performance can be controlled at the target level). + + Parameters + ---------- + X_calibrate : ArrayLike + Features of the calibration set. + + y_calibrate : ArrayLike + Binary labels of the calibration set. + + Returns + ------- + None + """ y_calibrate_ = np.asarray(y_calibrate, dtype=int) predictions_per_param = self._get_predictions_per_param( @@ -869,6 +1012,23 @@ def calibrate( # pragma: no cover ) def predict(self, X_test: ArrayLike) -> NDArray: + """ + Predict using predict_function at the best threshold. + + Parameters + ---------- + X_test : ArrayLike + Features + + Returns + ------- + NDArray of shape (n_samples,) + + Raises + ------ + ValueError if the method .calibrate was not called, + or if no valid thresholds were found during calibration. + """ if self.best_predict_param is None: raise ValueError( "Cannot predict. " From 709d4e29b85285718d98cd664c99e950d2a2b3f2 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 18 Sep 2025 16:48:25 +0200 Subject: [PATCH 08/14] DOC - BinaryClassificationRisk docstring, + make some attributes private --- mapie/risk_control.py | 88 ++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 1f54f8715..b4ee727c4 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -720,62 +720,104 @@ def predict( class BinaryClassificationRisk: """ + Define a risk (or a performance metric) to be used with the + BinaryClassificationController. Predefined instances are implemented, + see :func:`mapie.risk_control.precision`, :func:`mapie.risk_control.recall`, + :func:`mapie.risk_control.accuracy` and + :func:`mapie.risk_control.false_positive_rate`. + + Here, a binary classification risk (or performance) is defined by an occurrence and + a condition. Let's take the example of precision. Precision is the sum of true + positives over the total number of positives. In other words, precision is + the average of correct predictions (occurrence) given that those predictions + are positive (condition). Programmatically, + ``precision = (sum(y_pred == y_true) if y_pred == 1)/sum(y_pred == 1)``. 
+ Because precision is a performance metric rather than a risk, `higher_is_better` + must be set to `True`. See the implementation of `precision` in mapie.risk_control. + + Note: any risk or performance metric that can be defined as + ``sum(occurrence if condition) / sum(occurrence)`` can be theoretically controlled + with the BinaryClassificationController, thanks to the LearnThenTest framework [1] + and the binary Hoeffding-Bentkus p-values implemented in MAPIE. + + Note: by definition, the value of the risk (or performance metric) here is always + between 0 and 1. + Parameters ---------- risk_occurrence : Callable[[int, int], bool] + A function defining the occurrence of the risk for a given sample. + Must take y_true and y_pred as input and return a boolean. + risk_condition : Callable[[int, int], bool] + A function defining the condition of the risk for a given sample, + Must take y_true and y_pred as input and return a boolean. + higher_is_better : bool + Whether this BinaryClassificationRisk instance is a risk + (higher_is_better=False) or a performance metric (higher_is_better=True). Attributes ---------- - risk_occurrence : Callable[[int, int], bool] - risk_condition : Callable[[int, int], bool] higher_is_better : bool + See above. - Examples - -------- + References + ---------- + [1] Angelopoulos, Anastasios N., Stephen, Bates, Emmanuel J. Candès, et al. + "Learn Then Test: Calibrating Predictive Algorithms to Achieve Risk Control." (2022) """ - # Any risk that can be defined in the following way will work using the binary - # Hoeffding-Bentkus p-values used in MAPIE - # Take the example of precision in the docstring to explain how the class works. - # Explain that it works by computing sum(risk_occurence[risk_cond]) + def __init__( self, risk_occurrence: Callable[[int, int], bool], risk_condition: Callable[[int, int], bool], higher_is_better: bool, ): - self.risk_occurrence = risk_occurrence - self.risk_condition = risk_condition + self._risk_occurrence = risk_occurrence + self._risk_condition = risk_condition self.higher_is_better = higher_is_better def get_value_and_effective_sample_size( self, - y_true: NDArray, # shape (n_samples,), values in {0, 1} - y_pred: NDArray, # shape (n_samples,), values in {0, 1} + y_true: NDArray, + y_pred: NDArray, ) -> Tuple[float, int]: """ + Computes the value of a risk given an array of ground + truth labels and the corresponding predictions. Also returns the number of + samples used to compute that value. + + That number can be different from the total number of samples. For example, in + the case of precision, only the samples with positive predictions are used. + + In the case of a performance metric, this function returns 1 - perf_value. + Parameters ---------- y_true : NDArray + NDArray of ground truth labels, of shape (n_samples,), with values in {0, 1} + y_pred : NDArray + NDArray of predictions, of shape (n_samples,), with values in {0, 1} Returns ------- - Tuple[float, int] + A tuple containing the value of the risk between 0 and 1, + and the number of effective samples used to compute that value + (between 1 and n_samples). + + In the case of a performance metric, this function returns 1 - perf_value. + + If the risk is not defined (condition never met), the value is set to 1, + and the number of effective samples is set to -1. 
""" - # float between 0 and 1, int between 0 and len(y_true) - # Returns 1-risk_occurrence if higher_is_better is True - # returns (1, -1) when the risk is not defined (condition never met) - # In this case, the corresponding lambda shouldn't be considered valid. - # In the current LTT implementation, providing n_obs=-1 will result - # in an infinite p_value, effectively invaliding the lambda risk_occurrences = np.array([ - self.risk_occurrence(y_true_i, y_pred_i) + self._risk_occurrence(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ]) risk_conditions = np.array([ - self.risk_condition(y_true_i, y_pred_i) + self._risk_condition(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) @@ -787,6 +829,9 @@ def get_value_and_effective_sample_size( if self.higher_is_better: risk_value = 1 - risk_value return risk_value, effective_sample_size_int + # In this case, the corresponding lambda shouldn't be considered valid. + # In the current LTT implementation, providing n_obs=-1 will result + # in an infinite p_value, effectively invaliding the lambda return 1, -1 @@ -916,7 +961,6 @@ class BinaryClassificationController: ---------- Angelopoulos, Anastasios N., Stephen, Bates, Emmanuel J. Candès, et al. "Learn Then Test: Calibrating Predictive Algorithms to Achieve Risk Control." (2022) - """ _best_predict_param_choice_map = { precision: recall, From ca8f1780801381246070b8b9860c2b9f6d78e979 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 18 Sep 2025 17:10:38 +0200 Subject: [PATCH 09/14] DOC - Fix docstrings formatting, add classes to the API page in ReadTheDoc --- doc/api.rst | 2 ++ mapie/risk_control.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index dbfbaaa8c..b043beb1a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -102,6 +102,8 @@ Risk Control :template: class.rst mapie.risk_control.PrecisionRecallController + mapie.risk_control.BinaryClassificationController + mapie.risk_control.BinaryClassificationRisk Calibration =========== diff --git a/mapie/risk_control.py b/mapie/risk_control.py index b4ee727c4..8ef9fd4ed 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -760,7 +760,7 @@ class BinaryClassificationRisk: Attributes ---------- higher_is_better : bool - See above. + See params. References ---------- @@ -803,14 +803,15 @@ def get_value_and_effective_sample_size( Returns ------- - A tuple containing the value of the risk between 0 and 1, - and the number of effective samples used to compute that value - (between 1 and n_samples). + Tuple[float, int] + A tuple containing the value of the risk between 0 and 1, + and the number of effective samples used to compute that value + (between 1 and n_samples). - In the case of a performance metric, this function returns 1 - perf_value. + In the case of a performance metric, this function returns 1 - perf_value. - If the risk is not defined (condition never met), the value is set to 1, - and the number of effective samples is set to -1. + If the risk is not defined (condition never met), the value is set to 1, + and the number of effective samples is set to -1. 
""" risk_occurrences = np.array([ self._risk_occurrence(y_true_i, y_pred_i) @@ -1066,12 +1067,14 @@ def predict(self, X_test: ArrayLike) -> NDArray: Returns ------- - NDArray of shape (n_samples,) + NDArray + NDArray of shape (n_samples,) Raises ------ - ValueError if the method .calibrate was not called, - or if no valid thresholds were found during calibration. + ValueError + If the method .calibrate was not called, + or if no valid thresholds were found during calibration. """ if self.best_predict_param is None: raise ValueError( From 3026f13a56cd5c072f3fdef95c3446e90512d97a Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Fri, 19 Sep 2025 14:45:22 +0200 Subject: [PATCH 10/14] clarifications of explanations and formatting --- ...plot_risk_control_binary_classification.py | 91 ++++++++++++------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py index 1244712ef..ed8b6b277 100644 --- a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -1,7 +1,7 @@ """ -======================================================== +========================================================= Use MAPIE to control the precision of a binary classifier -======================================================== +========================================================= In this example, we explain how to do risk control for binary classification with MAPIE. @@ -23,12 +23,13 @@ ############################################################################## # Let us first load the dataset and fit an SVC on the training data. -X, y = make_circles(n_samples=3000, noise=0.3, - factor=0.3, random_state=RANDOM_STATE) +X, y = make_circles(n_samples=3000, noise=0.3, factor=0.3, random_state=RANDOM_STATE) (X_train, X_calib, X_test, y_train, y_calib, y_test) = train_conformalize_test_split( - X, y, train_size=0.8, conformalize_size=0.1, test_size=0.1, - random_state=RANDOM_STATE) + X, y, + train_size=0.8, conformalize_size=0.1, test_size=0.1, + random_state=RANDOM_STATE + ) clf = SVC(probability=True, random_state=RANDOM_STATE) clf.fit(X_train, y_train) @@ -41,12 +42,18 @@ # guaranteed thresholds using a risk control method. target_precision = 0.8 +confidence_level = 0.9 bcc = BinaryClassificationController( - clf.predict_proba, precision, target_level=target_precision, confidence_level=0.9) + clf.predict_proba, + precision, target_level=target_precision, + confidence_level=confidence_level + ) bcc.calibrate(X_calib, y_calib) -print(f'{len(bcc.valid_predict_params)} valid thresholds found. 
' - f'The best one is {bcc.best_predict_param:.3f}.') +print(f'{len(bcc.valid_predict_params)} thresholds found that guarantee a precision of ' + f'at least {target_precision} with a confidence of {confidence_level}.\n' + 'Among those, the one that maximizes the secondary objective (recall here) is: ' + f'{bcc.best_predict_param:.3f}.') ############################################################################## @@ -67,18 +74,23 @@ tested_thresholds == bcc.best_predict_param)[0][0] plt.figure() -plt.scatter(tested_thresholds[valid_thresholds_indices], - precisions[valid_thresholds_indices], c='tab:green', - label='Valid thresholds') -plt.scatter(tested_thresholds[~valid_thresholds_indices], - precisions[~valid_thresholds_indices], c='tab:red', - label='Invalid thresholds') -plt.scatter(tested_thresholds[best_threshold_index], precisions[best_threshold_index], - c='tab:green', label='Best threshold', marker='*', edgecolors='k', s=300) +plt.scatter( + tested_thresholds[valid_thresholds_indices], precisions[valid_thresholds_indices], + c='tab:green', label='Valid thresholds' + ) +plt.scatter( + tested_thresholds[~valid_thresholds_indices], precisions[~valid_thresholds_indices], + c='tab:red', label='Invalid thresholds' + ) +plt.scatter( + tested_thresholds[best_threshold_index], precisions[best_threshold_index], + c='tab:green', label='Best threshold', marker='*', edgecolors='k', s=300 + ) plt.axhline(target_precision, color='tab:gray', linestyle='--') -plt.text(0, target_precision+0.02, 'Target precision', - color='tab:gray', fontstyle='italic') -plt.xlabel('Threshold', labelpad=15) +plt.text( + 0.7, target_precision+0.02, 'Target precision', color='tab:gray', fontstyle='italic' +) +plt.xlabel('Threshold') plt.ylabel('Precision') plt.legend() plt.show() @@ -86,15 +98,19 @@ ############################################################################## # Contrary to the naive way of computing a threshold to satisfy a precision target on # calibration data, risk control provides statistical guarantees on unseen data. -# Besides computing a set of valid thresholds, -# :class:`~mapie.risk_control.BinaryClassificationController` also outputs the best -# one, which in the case of precision is the threshold that, among all valid ones, -# maximizes recall. +# In the plot above, we can see that not all thresholds corresponding to a precision +# higher that the target are valid. This is due to the uncertainty inherent to the +# finite size of the calibration set, which risk control takes into account. # -# In the figure above, the highest threshold values are considered invalid due to the +# In particular, the highest threshold values are considered invalid due to the # small number of observations used to compute the precision, following the Learn then # Test procedure. In the most extreme case, no observation is available, which causes # the precision value to be ill-defined and set to 0. + +# Besides computing a set of valid thresholds, +# :class:`~mapie.risk_control.BinaryClassificationController` also outputs the "best" +# one, which is the valid threshold that maximizes a secondary objective +# (recall here). 
# # After obtaining the best threshold, we can use the ``predict`` function of # :class:`~mapie.risk_control.BinaryClassificationController` for future predictions, @@ -104,16 +120,22 @@ y_pred = bcc.predict(X_test) clf_threshold = FixedThresholdClassifier(clf, threshold=bcc.best_predict_param) -# necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator -clf_threshold.fit(X_train, y_train) +clf_threshold.fit(X_train, y_train) +# .fit necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator -disp = DecisionBoundaryDisplay.from_estimator( - clf_threshold, X_test, response_method="predict", cmap=plt.cm.coolwarm) -plt.scatter(X_test[y_test == 0, 0], X_test[y_test == 0, 1], - edgecolors='k', c='tab:blue', alpha=0.5, label='"negative" class') -plt.scatter(X_test[y_test == 1, 0], X_test[y_test == 1, 1], - edgecolors='k', c='tab:red', alpha=0.5, label='"positive" class') +disp = DecisionBoundaryDisplay.from_estimator( + clf_threshold, X_test, response_method="predict", cmap=plt.cm.coolwarm + ) + +plt.scatter( + X_test[y_test == 0, 0], X_test[y_test == 0, 1], + edgecolors='k', c='tab:blue', alpha=0.5, label='"negative" class' + ) +plt.scatter( + X_test[y_test == 1, 0], X_test[y_test == 1, 1], + edgecolors='k', c='tab:red', alpha=0.5, label='"positive" class' + ) plt.title("Decision Boundary of FixedThresholdClassifier") plt.xlabel("Feature 1") plt.ylabel("Feature 2") @@ -123,4 +145,5 @@ ############################################################################## # Different risk functions have been implemented, such as precision and recall, but you # can also implement your own custom function using -# :class:`~mapie.risk_control.BinaryClassificationRisk`. +# :class:`~mapie.risk_control.BinaryClassificationRisk` and choose your own +# secondary objective. 
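As a concrete illustration of the custom-risk hook mentioned at the end of this example, a minimal sketch is given below. It reuses ``clf``, ``X_calib``, ``X_test`` and ``y_calib`` from the example above; the ``npv`` metric (negative predictive value) and the choice of ``recall`` as secondary objective are illustrative assumptions, not something the example itself defines::

    from mapie.risk_control import (
        BinaryClassificationController, BinaryClassificationRisk, recall)

    # Hypothetical performance metric: negative predictive value, i.e. the share of
    # correct predictions (occurrence) among negative predictions (condition).
    npv = BinaryClassificationRisk(
        risk_occurrence=lambda y_true, y_pred: y_true == y_pred,
        risk_condition=lambda y_true, y_pred: y_pred == 0,
        higher_is_better=True,  # a performance metric, not a risk
    )

    bcc_npv = BinaryClassificationController(
        clf.predict_proba, npv, target_level=0.8, confidence_level=0.9,
        best_predict_param_choice=recall,  # secondary objective, picked arbitrarily here
    )
    bcc_npv.calibrate(X_calib, y_calib)
    # predict raises a ValueError if no valid threshold was found during calibration
    y_pred_npv = bcc_npv.predict(X_test)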
From 28dbc3cf81815c19680fc90cfc183a05f77e6f22 Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Fri, 19 Sep 2025 14:49:53 +0200 Subject: [PATCH 11/14] fix trailing whitespace --- .../plot_risk_control_binary_classification.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py index ed8b6b277..216594119 100644 --- a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -26,7 +26,7 @@ X, y = make_circles(n_samples=3000, noise=0.3, factor=0.3, random_state=RANDOM_STATE) (X_train, X_calib, X_test, y_train, y_calib, y_test) = train_conformalize_test_split( - X, y, + X, y, train_size=0.8, conformalize_size=0.1, test_size=0.1, random_state=RANDOM_STATE ) @@ -44,8 +44,8 @@ target_precision = 0.8 confidence_level = 0.9 bcc = BinaryClassificationController( - clf.predict_proba, - precision, target_level=target_precision, + clf.predict_proba, + precision, target_level=target_precision, confidence_level=confidence_level ) bcc.calibrate(X_calib, y_calib) @@ -75,7 +75,7 @@ plt.figure() plt.scatter( - tested_thresholds[valid_thresholds_indices], precisions[valid_thresholds_indices], + tested_thresholds[valid_thresholds_indices], precisions[valid_thresholds_indices], c='tab:green', label='Valid thresholds' ) plt.scatter( @@ -98,8 +98,8 @@ ############################################################################## # Contrary to the naive way of computing a threshold to satisfy a precision target on # calibration data, risk control provides statistical guarantees on unseen data. -# In the plot above, we can see that not all thresholds corresponding to a precision -# higher that the target are valid. This is due to the uncertainty inherent to the +# In the plot above, we can see that not all thresholds corresponding to a precision +# higher that the target are valid. This is due to the uncertainty inherent to the # finite size of the calibration set, which risk control takes into account. # # In particular, the highest threshold values are considered invalid due to the @@ -109,7 +109,7 @@ # Besides computing a set of valid thresholds, # :class:`~mapie.risk_control.BinaryClassificationController` also outputs the "best" -# one, which is the valid threshold that maximizes a secondary objective +# one, which is the valid threshold that maximizes a secondary objective # (recall here). # # After obtaining the best threshold, we can use the ``predict`` function of @@ -120,7 +120,7 @@ y_pred = bcc.predict(X_test) clf_threshold = FixedThresholdClassifier(clf, threshold=bcc.best_predict_param) -clf_threshold.fit(X_train, y_train) +clf_threshold.fit(X_train, y_train) # .fit necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator @@ -145,5 +145,5 @@ ############################################################################## # Different risk functions have been implemented, such as precision and recall, but you # can also implement your own custom function using -# :class:`~mapie.risk_control.BinaryClassificationRisk` and choose your own +# :class:`~mapie.risk_control.BinaryClassificationRisk` and choose your own # secondary objective. 
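A quick way to sanity-check the threshold calibrated in this example is to measure the controlled metric on the held-out test split. The snippet below is an illustrative check reusing the names from the example (``bcc``, ``X_test``, ``y_test``, ``target_precision``); it is not part of the tutorial itself::

    from sklearn.metrics import precision_score

    # With confidence_level=0.9, the test precision should reach the target in
    # roughly 90% or more of calibration runs, not necessarily on every split.
    test_precision = precision_score(y_test, bcc.predict(X_test))
    print(f"Test precision: {test_precision:.3f} (target: {target_precision})")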
From 373f25fed2a6902c3668ed527b4e0ef18e6ba96d Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Mon, 22 Sep 2025 10:59:34 +0200 Subject: [PATCH 12/14] change position of notebook link --- doc/theoretical_description_risk_control.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/theoretical_description_risk_control.rst b/doc/theoretical_description_risk_control.rst index 5bb6e73e8..6c8cbe4c4 100644 --- a/doc/theoretical_description_risk_control.rst +++ b/doc/theoretical_description_risk_control.rst @@ -117,7 +117,6 @@ The following section provides a detailed overview of each method. 2. Theoretical description ========================== -Note that a notebook testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data is available here: `theoretical_validity_tests.ipynb `__. 2.1 Risk-Controlling Prediction Sets ------------------------------------ @@ -278,6 +277,7 @@ In order to find all the parameters :math:`\lambda` that satisfy the above condi - Return :math:`\hat{\Lambda} = \mathcal{A}(\{p_j\}_{j\in\{1,\dots,\lvert \Lambda \rvert})`, where :math:`\mathcal{A}`, is an algorithm that controls the family-wise error rate (FWER), for example, Bonferonni correction. +Note that a notebook testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data is available here: `theoretical_validity_tests.ipynb `__. References ========== From ea6d0e3772a783175f72026b9b11666404afdfae Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 19 Sep 2025 18:23:26 +0200 Subject: [PATCH 13/14] DOC & MTN - Fix docstrings, add an exception handling if users passes wrong predict function --- mapie/risk_control.py | 27 +++++++++++++------ mapie/tests/test_risk_control.py | 45 +++++++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 8ef9fd4ed..055b5f566 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -722,13 +722,13 @@ class BinaryClassificationRisk: """ Define a risk (or a performance metric) to be used with the BinaryClassificationController. Predefined instances are implemented, - see :func:`mapie.risk_control.precision`, :func:`mapie.risk_control.recall`, - :func:`mapie.risk_control.accuracy` and - :func:`mapie.risk_control.false_positive_rate`. + see :data:`mapie.risk_control.precision`, :data:`mapie.risk_control.recall`, + :data:`mapie.risk_control.accuracy` and + :data:`mapie.risk_control.false_positive_rate`. Here, a binary classification risk (or performance) is defined by an occurrence and a condition. Let's take the example of precision. Precision is the sum of true - positives over the total number of positives. In other words, precision is + positives over the total number of predicted positives. In other words, precision is the average of correct predictions (occurrence) given that those predictions are positive (condition). Programmatically, ``precision = (sum(y_pred == y_true) if y_pred == 1)/sum(y_pred == 1)``. @@ -736,7 +736,7 @@ class BinaryClassificationRisk: must be set to `True`. See the implementation of `precision` in mapie.risk_control. 
Note: any risk or performance metric that can be defined as - ``sum(occurrence if condition) / sum(occurrence)`` can be theoretically controlled + ``sum(occurrence if condition) / sum(condition)`` can be theoretically controlled with the BinaryClassificationController, thanks to the LearnThenTest framework [1] and the binary Hoeffding-Bentkus p-values implemented in MAPIE. @@ -879,7 +879,7 @@ class BinaryClassificationController: 3. Use the predict method to predict using the best threshold Note: for a given model, calibration dataset, target level, and confidence level, - there may not be any thresholds controlling the risk. + there may not be any threshold controlling the risk. Parameters ---------- @@ -920,7 +920,7 @@ class BinaryClassificationController: Use the calibrate method to compute these. best_predict_param : Optional[float] - The best thresholds that control the risk (or performance). + The best threshold that control the risk (or performance). Use the calibrate method to compute it. Examples @@ -956,7 +956,7 @@ class BinaryClassificationController: ... ) >>> controller.calibrate(X_calib, y_calib) - >>> predictions = controller.predict(X_test) # doctest: +SKIP + >>> predictions = controller.predict(X_test) References ---------- @@ -1157,4 +1157,15 @@ def _get_predictions_per_param(self, X: ArrayLike, params: NDArray) -> NDArray: ) from e else: raise + except IndexError as e: + if "array is 1-dimensional, but 2 were indexed" in str(e): + raise IndexError( + "Error when calling the predict_function. " + "Maybe the predict function you provided returns only the " + "probability of the positive class. " + "You should provide a predict function that returns the " + "probabilities of both classes, like scikit-learn estimators." + ) from e + else: + raise return (predictions_proba[:, np.newaxis] >= params).T.astype(int) diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index 1ee072b37..8a015397c 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -1123,20 +1123,53 @@ def test_error_passing_classifier(self): risk=precision, target_level=0.9 ) - X_test = [] + X_test = [[0]] params = np.array([0.5]) with pytest.raises( TypeError, - match=r"Error when calling the predict_function" + match=r"Maybe you provided a binary classifier" ): bcc._get_predictions_per_param(X_test, params) - def test_other_error(self): + def test_error_incorrect_predict_shape(self): + """ + Test when the user provides a predict function that outputs only + the positive class. + """ + clf = LogisticRegression().fit([[0], [1]], [0, 1]) + + def pred_func(X): + return clf.predict_proba(X)[:, 0] + + bcc = BinaryClassificationController( + predict_function=pred_func, + risk=precision, + target_level=0.9 + ) + X_test = [[0]] + params = np.array([0.5]) + + with pytest.raises( + IndexError, + match=r"Maybe the predict function you provided returns only the " + r"probability of the positive class." 
+ ): + bcc._get_predictions_per_param(X_test, params) + + @pytest.mark.parametrize( + "error,expected_error_type,expected_error_message", + [ + (ValueError("Hey"), ValueError, "Hey"), + (IndexError("Gloups"), IndexError, "Gloups"), + (TypeError("I'm hungry"), TypeError, "I'm hungry"), + ], + ) + def test_other_error(self, error, expected_error_type, expected_error_message): """Test that other errors are re-raised without modification""" def failing_predict_function(X): - raise TypeError("Some other error message") + raise error bcc = BinaryClassificationController( predict_function=failing_predict_function, @@ -1144,10 +1177,10 @@ def failing_predict_function(X): target_level=0.9 ) - X_test = [] + X_test = [[0]] params = np.array([0.5]) - with pytest.raises(TypeError, match="Some other error message"): + with pytest.raises(expected_error_type, match=expected_error_message): bcc._get_predictions_per_param(X_test, params) From 08c8bf40346974f29282efe0989900a89553df0c Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Mon, 22 Sep 2025 09:32:38 +0200 Subject: [PATCH 14/14] FIX - Fix wrong risk value with higher_is_better risks when undefined --- mapie/risk_control.py | 15 ++++++++------- mapie/tests/test_risk_control.py | 22 ++++++++++++---------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 055b5f566..9018da8b6 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -827,13 +827,14 @@ def get_value_and_effective_sample_size( if effective_sample_size_int != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size_int - if self.higher_is_better: - risk_value = 1 - risk_value - return risk_value, effective_sample_size_int - # In this case, the corresponding lambda shouldn't be considered valid. - # In the current LTT implementation, providing n_obs=-1 will result - # in an infinite p_value, effectively invaliding the lambda - return 1, -1 + else: + # In this case, the corresponding lambda shouldn't be considered valid. 
+ # In the current LTT implementation, providing n_obs=-1 will result + # in an infinite p_value, effectively invaliding the lambda + risk_value, effective_sample_size_int = 1, -1 + if self.higher_is_better: + risk_value = 1 - risk_value + return risk_value, effective_sample_size_int precision = BinaryClassificationRisk( diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index 8a015397c..611437d08 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -851,17 +851,19 @@ def test_binary_classification_risk( y_true, y_pred ): - result = risk_instance.get_value_and_effective_sample_size(y_true, y_pred) - if effective_sample_func(y_true, y_pred) == 0: - assert result == (1, -1) - else: - value, n = result + value, n = risk_instance.get_value_and_effective_sample_size(y_true, y_pred) + effective_sample_size = effective_sample_func(y_true, y_pred) + + if effective_sample_size != 0: expected_value = metric_func(y_true, y_pred) - if risk_instance.higher_is_better: - expected_value = 1 - expected_value - expected_n = effective_sample_func(y_true, y_pred) - assert np.isclose(value, expected_value) - assert n == expected_n + expected_n = effective_sample_size + else: + expected_value = 1 + expected_n = -1 + if risk_instance.higher_is_better: + expected_value = 1 - expected_value + assert np.isclose(value, expected_value) + assert n == expected_n class TestBinaryClassificationControllerBestPredictParamChoice:
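To illustrate the corner case addressed by this last patch, here is a minimal, illustrative sketch of the fixed behaviour for a higher-is-better metric such as ``precision`` when its condition is never met::

    import numpy as np
    from mapie.risk_control import precision

    y_true = np.array([0, 1, 0, 1])
    y_pred = np.zeros(4, dtype=int)  # no positive prediction: precision is undefined

    # The undefined risk is reported as value 1 with an effective sample size of -1;
    # because precision has higher_is_better=True, the returned value becomes 1 - 1 = 0.
    # Downstream, n_eff == -1 yields an infinite p-value in the Learn Then Test
    # procedure, so the corresponding threshold is never considered valid.
    value, n_eff = precision.get_value_and_effective_sample_size(y_true, y_pred)
    assert (value, n_eff) == (0, -1)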