From 9ec2d65dd72480ef3baff2e7dad9296ef5b9cecc Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Tue, 16 Sep 2025 14:56:42 +0200 Subject: [PATCH 01/14] better doc organisation: rename multilabel_classification as risk_control and move tuto to all examples --- .gitignore | 2 +- doc/Makefile | 2 +- doc/conf.py | 4 ++-- doc/index.rst | 2 +- examples/multilabel_classification/README.rst | 4 ---- .../1-quickstart/README.rst | 4 ++-- examples/risk_control/2-advanced-analysis/README.rst | 6 ++++++ .../2-advanced-analysis}/plot_tutorial_risk_control.py | 0 examples/risk_control/README.rst | 6 ++++++ 9 files changed, 19 insertions(+), 11 deletions(-) delete mode 100644 examples/multilabel_classification/README.rst rename examples/{multilabel_classification => risk_control}/1-quickstart/README.rst (56%) create mode 100644 examples/risk_control/2-advanced-analysis/README.rst rename examples/{multilabel_classification/1-quickstart => risk_control/2-advanced-analysis}/plot_tutorial_risk_control.py (100%) create mode 100644 examples/risk_control/README.rst diff --git a/.gitignore b/.gitignore index f787972b1..4d3384e64 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ doc/_build/ doc/examples_classification/ doc/examples_regression/ doc/examples_calibration/ -doc/examples_multilabel_classification/ +doc/examples_risk_control/ doc/examples_mondrian/ doc/auto_examples/ doc/modules/generated/ diff --git a/doc/Makefile b/doc/Makefile index 841011bd2..ba1723db0 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -50,7 +50,7 @@ clean: -rm -rf $(BUILDDIR)/* -rm -rf examples_regression/ -rm -rf examples_classification/ - -rm -rf examples_multilabel_classification/ + -rm -rf examples_risk_control/ -rm -rf examples_calibration/ -rm -rf examples_mondrian/ -rm -rf generated/* diff --git a/doc/conf.py b/doc/conf.py index 78cee8a31..eacd46e6e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -321,14 +321,14 @@ "examples_dirs": [ "../examples/regression", "../examples/classification", - "../examples/multilabel_classification", + "../examples/risk_control", "../examples/calibration", "../examples/mondrian", ], "gallery_dirs": [ "examples_regression", "examples_classification", - "examples_multilabel_classification", + "examples_risk_control", "examples_calibration", "examples_mondrian", ], diff --git a/doc/index.rst b/doc/index.rst index 2807c04bd..808257330 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -24,7 +24,7 @@ :caption: Control prediction errors theoretical_description_risk_control - examples_multilabel_classification/1-quickstart/plot_tutorial_risk_control + examples_risk_control/index external_risk_control_package .. toctree:: diff --git a/examples/multilabel_classification/README.rst b/examples/multilabel_classification/README.rst deleted file mode 100644 index 1f8a7b3fc..000000000 --- a/examples/multilabel_classification/README.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. _general_examples: - -General examples -================ \ No newline at end of file diff --git a/examples/multilabel_classification/1-quickstart/README.rst b/examples/risk_control/1-quickstart/README.rst similarity index 56% rename from examples/multilabel_classification/1-quickstart/README.rst rename to examples/risk_control/1-quickstart/README.rst index 65aaf6366..2970a4ef1 100644 --- a/examples/multilabel_classification/1-quickstart/README.rst +++ b/examples/risk_control/1-quickstart/README.rst @@ -1,6 +1,6 @@ -.. _multilabel_classification_examples_1: +.. _risk_control_examples_1: 1. 
Quickstart examples ---------------------- -The following examples present the main functionalities of MAPIE through basic quickstart regression problems. \ No newline at end of file +The following examples present the main functionalities of MAPIE through basic quickstart risk control problems. \ No newline at end of file diff --git a/examples/risk_control/2-advanced-analysis/README.rst b/examples/risk_control/2-advanced-analysis/README.rst new file mode 100644 index 000000000..2179cbdbd --- /dev/null +++ b/examples/risk_control/2-advanced-analysis/README.rst @@ -0,0 +1,6 @@ +.. _risk_control_examples_2: + +2. Advanced analysis +-------------------- + +The following examples use MAPIE for discussing more complex risk control problems. \ No newline at end of file diff --git a/examples/multilabel_classification/1-quickstart/plot_tutorial_risk_control.py b/examples/risk_control/2-advanced-analysis/plot_tutorial_risk_control.py similarity index 100% rename from examples/multilabel_classification/1-quickstart/plot_tutorial_risk_control.py rename to examples/risk_control/2-advanced-analysis/plot_tutorial_risk_control.py diff --git a/examples/risk_control/README.rst b/examples/risk_control/README.rst new file mode 100644 index 000000000..f5f00e9f5 --- /dev/null +++ b/examples/risk_control/README.rst @@ -0,0 +1,6 @@ +.. _risk_control_examples: + +All risk control examples +========================= + +Following is a collection of notebooks demonstrating how to use MAPIE for risk control. \ No newline at end of file From d8057c6952f6d07c4b8f4e48925f4394d36f923a Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Tue, 16 Sep 2025 17:35:03 +0200 Subject: [PATCH 02/14] make clarifications, improve the overview table, and fix typos --- doc/theoretical_description_risk_control.rst | 43 ++++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/doc/theoretical_description_risk_control.rst b/doc/theoretical_description_risk_control.rst index 76629d311..c99204e06 100644 --- a/doc/theoretical_description_risk_control.rst +++ b/doc/theoretical_description_risk_control.rst @@ -13,26 +13,43 @@ Getting started with risk control in MAPIE Overview ======== +This section provides an overview of risk control in MAPIE. For those unfamiliar with the concept of risk control, the next section provides an introduction to the topic. + Three methods of risk control have been implemented in MAPIE so far : **Risk-Controlling Prediction Sets** (RCPS) [1], **Conformal Risk Control** (CRC) [2] and **Learn Then Test** (LTT) [3]. -The difference between these methods is the way the conformity scores are computed. -As of now, MAPIE supports risk control for two machine learning tasks: **binary classification**, as well as **multi-label classification** (including applications like image segmentation). +As of now, MAPIE supports risk control for two machine learning tasks: **binary classification**, as well as **multi-label classification** (in particular applications like image segmentation). The table below details the available methods for each task: +.. |br| raw:: html + +
+ .. list-table:: Available risk control methods in MAPIE for each ML task :header-rows: 1 - * - Risk control method - - Binary classification - - Multi-label classification (image segmentation) + * - Risk control |br| method + - Type of |br| control + - Assumption |br| on the data + - Non-monotonic |br| risks + - Binary |br| classification + - Multi-label |br| classification * - RCPS + - Probability + - i.i.d. + - ❌ - ❌ - ✅ * - CRC + - Expectation + - Exchangeable + - ❌ - ❌ - ✅ * - LTT + - Probability + - i.i.d + - ✅ - ✅ - ✅ @@ -41,7 +58,7 @@ In MAPIE for multi-label classification, CRC and RCPS are used for recall contro 1. What is risk control? ======================== -Before diving into risk control, let's take the simple example of a binary classification model, which separates the incoming data into the two classes thanks to its threshold: predictions above it are classified as 1, and those below as 0. Suppose we want to find a threshold that guarantees that our model achieves a certain level of precision. A naive, yet straightforward approach to do this is to evaluate how precision varies with different threshold values on a validation dataset. By plotting this relationship (see plot below), we can identify the range of thresholds that meet our desired precision requirement (green zone on the graph). +Before diving into risk control, let's take the simple example of a binary classification model, which separates the incoming data into two classes. Predicted probabilities above a given threshold (e.g., 0.5) correspond to predicting the "positive" class and probabilities below correspond to the "negative" class. Suppose we want to find a threshold that guarantees that our model achieves a certain level of precision. A naive, yet straightforward approach to do this is to evaluate how precision varies with different threshold values on a validation dataset. By plotting this relationship (see plot below), we can identify the range of thresholds that meet our desired precision requirement (green zone on the graph). .. image:: images/example_without_risk_control.png :width: 600 @@ -54,7 +71,7 @@ So far, so good. But here is the catch: while the chosen threshold effectively k Risk control is the science of adjusting a model's parameter, typically denoted :math:`\lambda`, so that a given risk stays below a desired level with high probability on unseen data. Note that here, the term *risk* is used to describe an undesirable outcome of the model (e.g., type I error): therefore, it is a value we want to minimize, and in our case, keep under a certain level. Also note that risk control can easily be applied to metrics we want to maximize (e.g., precision), simply by controlling the complement (e.g., 1-precision). -The strength of risk control lies in the statistical guarantees it provides on unseen data. Unlike the naive method presented earlier, it determines a value of :math:`\lambda` that ensures the risk is controlled *beyond* the training data. +The strength of risk control lies in the statistical guarantees it provides on unseen data. Unlike the naive method presented earlier, it determines a value of :math:`\lambda` that ensures the risk is controlled *beyond* the validation data. Applying risk control to the previous example would allow us to get a new — albeit narrower — range of thresholds (blue zone on the graph) that are **statistically guaranteed**. 
@@ -66,7 +83,7 @@ This guarantee is critical in a wide range of use cases (especially in high-stak — -To express risk control in mathematical terms, we denote by R the risk we want to control, and introduce the following two parameters: +To express risk control in mathematical terms, we denote by :math:`R` the risk we want to control, and introduce the following two parameters: - :math:`\alpha`: the target level below which we want the risk to remain, as shown in the figure below; @@ -76,13 +93,13 @@ To express risk control in mathematical terms, we denote by R the risk we want t - :math:`\delta`: the confidence level associated with the risk control. -In other words, the risk is said to be controlled if :math:`R \leq \alpha` with probability at least :math:`1 - \delta`. +In other words, the risk is said to be controlled if :math:`R \leq \alpha` with probability at least :math:`1 - \delta`, where the probability is over the randomness in the sampling of the dataset. The three risk control methods implemented in MAPIE — RCPS, CRC and LTT — rely on different assumptions, and offer slightly different guarantees: - **CRC** requires the data to be **exchangeable**, and gives a guarantee on the **expectation of the risk**: :math:`\mathbb{E}(R) \leq \alpha`; -- **RCPS** and **LTT** both impose stricter assumptions, requiring the data to be **independent and identically distributed** (i.i.d.), which implies exchangeability. The guarantee they provide is on the **probability that the risk does not exceed :math:`\alpha`**: :math:`\mathbb{P}(R \leq \alpha) \geq 1 - \delta`. +- **RCPS** and **LTT** both impose stricter assumptions, requiring the data to be **independent and identically distributed** (i.i.d.), which implies exchangeability. The guarantee they provide is on the **probability that the risk does not exceed** :math:`\boldsymbol{\alpha}`: :math:`\mathbb{P}(R \leq \alpha) \geq 1 - \delta`. .. image:: images/risk_distribution.png :width: 600 @@ -94,7 +111,7 @@ The plot above gives a visual representation of the difference between the two t - The risk is controlled in probability (RCPS/LTT) if at least :math:`1 - \delta` percent of its distribution over unseen data is below :math:`\alpha`. -Note that at the opposite of the other two methods, LTT allows to control any non-monotonic risk. +Note that contrary to the other two methods, LTT allows to control any non-monotonic risk. The following section provides a detailed overview of each method. @@ -234,7 +251,7 @@ We are going to present the Learn Then Test framework that allows the user to co This method has been introduced in article [3]. The settings here are the same as RCPS and CRC, we just need to introduce some new parameters: -- Let :math:`\Lambda` be a discretized for our :math:`\lambda`, meaning that :math:`\Lambda = \{\lambda_1, ..., \lambda_n\}`. +- Let :math:`\Lambda` be a discretized set for our :math:`\lambda`, meaning that :math:`\Lambda = \{\lambda_1, ..., \lambda_n\}`. - Let :math:`p_\lambda` be a valid p-value for the null hypothesis :math:`\mathbb{H}_j: R(\lambda_j)>\alpha`. @@ -250,7 +267,7 @@ In order to find all the parameters :math:`\lambda` that satisfy the above condi :math:`\{(x_1, y_1), \dots, (x_n, y_n)\}`. 
- For each :math:`\lambda_j` in a discrete set :math:`\Lambda = \{\lambda_1, \lambda_2,\dots, \lambda_n\}`, we associate the null hypothesis - :math:`\mathcal{H}_j: R(\lambda_j) > \alpha`, as rejecting the hypothesis corresponds to selecting :math:`\lambda_j` as a point where risk the risk + :math:`\mathcal{H}_j: R(\lambda_j) > \alpha`, as rejecting the hypothesis corresponds to selecting :math:`\lambda_j` as a point where the risk is controlled. - For each null hypothesis, we compute a valid p-value using a concentration inequality :math:`p_{\lambda_j}`. Here we choose to compute the Hoeffding-Bentkus p-value From e088e854868fb75ec7cee245ab00beda2b2c417b Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Thu, 18 Sep 2025 14:06:30 +0200 Subject: [PATCH 03/14] add quick start risk control --- doc/index.rst | 1 + doc/quick_start.rst | 8 +- ...plot_risk_control_binary_classification.py | 126 ++++++++++++++++++ 3 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py diff --git a/doc/index.rst b/doc/index.rst index 808257330..1d2881cb0 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -24,6 +24,7 @@ :caption: Control prediction errors theoretical_description_risk_control + examples_risk_control/1-quickstart/plot_risk_control_binary_classification examples_risk_control/index external_risk_control_package diff --git a/doc/quick_start.rst b/doc/quick_start.rst index 9794a4000..995d68157 100644 --- a/doc/quick_start.rst +++ b/doc/quick_start.rst @@ -40,4 +40,10 @@ Here, we generate one-dimensional noisy data that we fit with a MLPRegressor: `U 3. Classification ======================= -Similarly, it's possible to do the same for a basic classification problem: `Use MAPIE to plot prediction sets `_ \ No newline at end of file +Similarly, it's possible to do the same for a basic classification problem: `Use MAPIE to plot prediction sets `_ + + +4. Risk Control +======================= + +MAPIE implements risk control methods for multilabel classification (in particular, image segmentation) and binary classification: `Use MAPIE to control risk for a binary classifier `_ \ No newline at end of file diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py new file mode 100644 index 000000000..51398188c --- /dev/null +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -0,0 +1,126 @@ +""" +================================================= +Use MAPIE to control risk for a binary classifier +================================================= + +In this example, we explain how to do risk control for binary classification with MAPIE. + +""" + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.datasets import make_circles +from sklearn.svm import SVC +from sklearn.model_selection import FixedThresholdClassifier +from sklearn.metrics import precision_score +from sklearn.inspection import DecisionBoundaryDisplay + +from mapie.risk_control import BinaryClassificationController, precision +from mapie.utils import train_conformalize_test_split + +RANDOM_STATE = 1 + +############################################################################## +# Let us first load the dataset and fit an SVC on the training data. 
+ +X, y = make_circles(n_samples=3000, noise=0.3, + factor=0.3, random_state=RANDOM_STATE) +(X_train, X_calib, X_test, + y_train, y_calib, y_test) = train_conformalize_test_split( + X, y, train_size=0.8, conformalize_size=0.1, test_size=0.1, + random_state=RANDOM_STATE) + +clf = SVC(probability=True, random_state=RANDOM_STATE) +clf.fit(X_train, y_train) + +############################################################################## +# Next, we initialize a :class:`~mapie.risk_control.BinaryClassificationController` +# using the probability estimation function from the fitted estimator: +# ``clf.predict_proba``, a risk function (here the precision), a target risk level, and +# a confidence level. Then we use the calibration data to compute statistically +# guaranteed thresholds using a risk control method. + +target_precision = 0.8 +bcc = BinaryClassificationController( + clf.predict_proba, precision, target_level=target_precision, confidence_level=0.9) +bcc.calibrate(X_calib, y_calib) + +print(f'{len(bcc.valid_predict_params)} valid thresholds found. ' + f'The best one is {bcc.best_predict_param:.3f}.') + + +############################################################################## +# In the plot below, we visualize how the threshold values impact precision, and what +# thresholds have been computed as statistically guaranteed. + +proba_positive_class = clf.predict_proba(X_calib)[:, 1] + +tested_thresholds = bcc._predict_params +precisions = np.full(len(tested_thresholds), np.inf) +for i, threshold in enumerate(tested_thresholds): + y_pred = (proba_positive_class >= threshold).astype(int) + precisions[i] = precision_score(y_calib, y_pred) + +valid_thresholds_indices = np.array( + [t in bcc.valid_predict_params for t in tested_thresholds]) +best_threshold_index = np.where( + tested_thresholds == bcc.best_predict_param)[0][0] + +plt.figure() +plt.scatter(tested_thresholds[valid_thresholds_indices], + precisions[valid_thresholds_indices], c='tab:green', + label='Valid thresholds') +plt.scatter(tested_thresholds[~valid_thresholds_indices], + precisions[~valid_thresholds_indices], c='tab:red', + label='Invalid thresholds') +plt.scatter(tested_thresholds[best_threshold_index], precisions[best_threshold_index], + c='tab:green', label='Best threshold', marker='*', edgecolors='k', s=300) +plt.axhline(target_precision, color='tab:gray', linestyle='--') +plt.text(0, target_precision+0.02, 'Target precision', + color='tab:gray', fontstyle='italic') +plt.xlabel('Threshold', labelpad=15) +plt.ylabel('Precision') +plt.legend() +plt.show() + +############################################################################## +# Contrary to the naive way of computing a threshold to satisfy a precision target on +# calibration data, risk control provides statistical guarantees on unseen data. +# Besides computing a set of valid thresholds, +# :class:`~mapie.risk_control.BinaryClassificationController` also outputs the best +# one, which in the case of precision is the threshold that, among all valid ones, +# maximizes recall. +# +# In the figure above, the highest threshold values are considered invalid due to the +# small number of observations used to compute the precision, following the Learn then +# Test procedure. In the most extreme case, no observation is available, which causes +# the precision value to be ill-defined and set to 0. 
+# +# After obtaining the best threshold, we can use the ``predict`` function of +# :class:`~mapie.risk_control.BinaryClassificationController` for future predictions, +# or use scikit-learn's ``FixedThresholdClassifier`` as a wrapper to benefit +# from functionalities like easily plotting the decision boundary as seen below. + +y_pred = bcc.predict(X_test) + +clf_threshold = FixedThresholdClassifier(clf, threshold=bcc.best_predict_param) +# necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator +clf_threshold.fit(X_train, y_train) + +disp = DecisionBoundaryDisplay.from_estimator( + clf_threshold, X_test, response_method="predict", cmap=plt.cm.coolwarm) + +plt.scatter(X_test[y_test == 0, 0], X_test[y_test == 0, 1], + edgecolors='k', c='tab:blue', alpha=0.5, label='"negative" class') +plt.scatter(X_test[y_test == 1, 0], X_test[y_test == 1, 1], + edgecolors='k', c='tab:red', alpha=0.5, label='"positive" class') +plt.title("Decision Boundary of FixedThresholdClassifier") +plt.xlabel("Feature 1") +plt.ylabel("Feature 2") +plt.legend() +plt.show() + +############################################################################## +# Different risk functions have been implemented, such as precision and recall, but you +# can also implement your own custom function using +# :class:`~mapie.risk_control.BinaryClassificationRisk`. From d1636b8a99610af7190322bab2c1427775dea690 Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Thu, 18 Sep 2025 15:41:06 +0200 Subject: [PATCH 04/14] Revert incorrect renaming of calibration to conformalization in risk_control.py --- HISTORY.rst | 1 + doc/v1_release_notes.rst | 2 -- mapie/risk_control.py | 28 ++++++++++++++-------------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 29408cc31..eb3409a93 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -16,6 +16,7 @@ History * MAPIE now supports Python versions up to the latest release (currently 3.13) * Change `prefit` default value to `True` in split methods' docstrings to remain consistent with the implementation * Fix issue 699 to replace `TimeSeriesRegressor.partial_fit` with `TimeSeriesRegressor.update` +* Revert incorrect renaming of calibration to conformalization in risk_control.py 1.0.1 (2025-05-22) ------------------ diff --git a/doc/v1_release_notes.rst b/doc/v1_release_notes.rst index 41ae6aa08..0a946a1e3 100644 --- a/doc/v1_release_notes.rst +++ b/doc/v1_release_notes.rst @@ -263,8 +263,6 @@ Risk control The ``MapieMultiLabelClassifier`` class has been renamed ``PrecisionRecallController``. -The parameter ``calib_size`` from the ``fit`` method has been renamed ``conformalize_size``. 
- Calibration ^^^^^^^^^^^^^ diff --git a/mapie/risk_control.py b/mapie/risk_control.py index f5a57c2c4..e9a2a7c8b 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -362,7 +362,7 @@ def _check_estimator( Warning If estimator is then to warn about the split of the - data between train and conformalization + data between train and calibration """ if (estimator is None) and (not _refit): raise ValueError( @@ -374,19 +374,19 @@ def _check_estimator( estimator = MultiOutputClassifier( LogisticRegression() ) - X_train, X_conf, y_train, y_conf = train_test_split( - X, - y, - test_size=self.conformalize_size, - random_state=self.random_state, + X_train, X_calib, y_train, y_calib = train_test_split( + X, + y, + test_size=self.calib_size, + random_state=self.random_state, ) estimator.fit(X_train, y_train) warnings.warn( "WARNING: To avoid overfitting, X has been split" - + "into X_train and X_conf. The conformalization will only" - + "be done on X_conf" + + "into X_train and X_calib. The calibration will only" + + "be done on X_calib" ) - return estimator, X_conf, y_conf + return estimator, X_calib, y_calib if isinstance(estimator, Pipeline): est = estimator[-1] @@ -589,7 +589,7 @@ def fit( self, X: ArrayLike, y: ArrayLike, - conformalize_size: Optional[float] = .3 + calib_size: Optional[float] = .3 ) -> PrecisionRecallController: """ Fit the base estimator or use the fitted base estimator. @@ -602,8 +602,8 @@ def fit( y: NDArray of shape (n_samples, n_classes) Training labels. - conformalize_size: Optional[float] - Size of the conformalization dataset with respect to X if the + calib_size: Optional[float] + Size of the calibration dataset with respect to X if the given model is ``None`` need to fit a LogisticRegression. By default .3 @@ -613,7 +613,7 @@ def fit( PrecisionRecallController The model itself. """ - self.conformalize_size = conformalize_size + self.calib_size = calib_size return self.partial_fit(X, y, _refit=True) def predict( @@ -696,7 +696,7 @@ def predict( ) self._check_valid_index(alpha_np) self.lambdas_star, self.r_star = find_lambda_control_star( - self.r_hat, self.valid_index, self.lambdas + self.r_hat, self.valid_index, self.lambdas ) y_pred_proba_array = ( y_pred_proba_array > From fcbff66a57f73028bbd2c67da25936c23222bcde Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Thu, 18 Sep 2025 16:09:34 +0200 Subject: [PATCH 05/14] add link to notebook theoretical validity risk control --- doc/theoretical_description_risk_control.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/theoretical_description_risk_control.rst b/doc/theoretical_description_risk_control.rst index c99204e06..5bb6e73e8 100644 --- a/doc/theoretical_description_risk_control.rst +++ b/doc/theoretical_description_risk_control.rst @@ -117,6 +117,8 @@ The following section provides a detailed overview of each method. 2. Theoretical description ========================== +Note that a notebook testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data is available here: `theoretical_validity_tests.ipynb `__. 
+ 2.1 Risk-Controlling Prediction Sets ------------------------------------ 2.1.1 General settings From 0610bb183ce035117c96048f1a57fe7ef932066d Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 18 Sep 2025 16:56:47 +0200 Subject: [PATCH 06/14] Update examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py Change title --- .../1-quickstart/plot_risk_control_binary_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py index 51398188c..1244712ef 100644 --- a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -1,7 +1,7 @@ """ -================================================= -Use MAPIE to control risk for a binary classifier -================================================= +======================================================== +Use MAPIE to control the precision of a binary classifier +======================================================== In this example, we explain how to do risk control for binary classification with MAPIE. From 8ae2c47d740c3d1e352734051b0e9f362467cc95 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Wed, 17 Sep 2025 16:44:35 +0200 Subject: [PATCH 07/14] DOC - BinaryClassificationController docstrings --- mapie/risk_control.py | 170 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 5 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index e9a2a7c8b..1f54f8715 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -134,7 +134,7 @@ class PrecisionRecallController(BaseEstimator, ClassifierMixin): References ---------- - [1] Lihua Lei Jitendra Malik Stephen Bates, Anastasios Angelopoulos + [1] Lihua Lei Jitendra Malik Stephen Bates, Anastasios Angelopoulos, and Michael I. Jordan. Distribution-free, risk-controlling prediction sets. CoRR, abs/2101.02703, 2021. URL https://arxiv.org/abs/2101.02703 @@ -197,7 +197,7 @@ def __init__( def _check_parameters(self) -> None: """ - Check n_jobs, verbose and random_states. + Check n_jobs, verbose, and random_states. Raises ------ @@ -719,6 +719,22 @@ def predict( class BinaryClassificationRisk: + """ + Parameters + ---------- + risk_occurrence : Callable[[int, int], bool] + risk_condition : Callable[[int, int], bool] + higher_is_better : bool + + Attributes + ---------- + risk_occurrence : Callable[[int, int], bool] + risk_condition : Callable[[int, int], bool] + higher_is_better : bool + + Examples + -------- + """ # Any risk that can be defined in the following way will work using the binary # Hoeffding-Bentkus p-values used in MAPIE # Take the example of precision in the docstring to explain how the class works. 
@@ -738,6 +754,16 @@ def get_value_and_effective_sample_size( y_true: NDArray, # shape (n_samples,), values in {0, 1} y_pred: NDArray, # shape (n_samples,), values in {0, 1} ) -> Tuple[float, int]: + """ + Parameters + ---------- + y_true : NDArray + y_pred : NDArray + + Returns + ------- + Tuple[float, int] + """ # float between 0 and 1, int between 0 and len(y_true) # Returns 1-risk_occurrence if higher_is_better is True # returns (1, -1) when the risk is not defined (condition never met) @@ -790,6 +816,108 @@ def get_value_and_effective_sample_size( class BinaryClassificationController: + """ + Controls the risk or performance of a binary classifier. + + BinaryClassificationController finds the decision thresholds of a binary classifier + that statistically guarantee a risk to be below a target level + (the risk is "controlled"). + It can be used to control a performance metric as well, such as the precision. + In that case, the thresholds guarantee that the performance is above a target level. + + Usage: + + 1. Instantiate a BinaryClassificationController, providing the predict_proba method + of your binary classifier + 2. Call the calibrate method to find the thresholds + 3. Use the predict method to predict using the best threshold + + Note: for a given model, calibration dataset, target level, and confidence level, + there may not be any thresholds controlling the risk. + + Parameters + ---------- + predict_function : Callable[[ArrayLike], NDArray] + predict_proba method of a fitted binary classifier. + Its output signature must be of shape (len(X), 2) + + risk : BinaryClassificationRisk + The risk or performance metric to control. + Valid options: + + - An existing risk defined in `mapie.risk_control` (e.g. precision, recall, + accuracy, false_positive_rate) + - A custom instance of BinaryClassificationRisk object + + target_level : float + The maximum risk level (or minimum performance level). Must be between 0 and 1. + + confidence_level : float, default=0.9 + The confidence level with which the risk (or performance) is controlled. + See the documentation for detailed explanations. + + best_predict_param_choice : Union["auto", BinaryClassificationRisk], default="auto" + How to select the best threshold from the valid thresholds that control the risk + (or performance). The BinaryClassificationController will try to minimize + (or maximize) a secondary objective. + Valid options: + + - "auto" (default) + - An existing risk defined in `mapie.risk_control` (e.g. precision, recall, + accuracy, false_positive_rate) + - A custom instance of BinaryClassificationRisk object + + Attributes + ---------- + valid_predict_params : NDArray + The valid thresholds that control the risk (or performance). + Use the calibrate method to compute these. + + best_predict_param : Optional[float] + The best thresholds that control the risk (or performance). + Use the calibrate method to compute it. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from mapie.risk_control import BinaryClassificationController, precision + + >>> X, y = make_classification( + ... n_features=2, + ... n_redundant=0, + ... n_informative=2, + ... n_clusters_per_class=1, + ... n_classes=2, + ... random_state=42, + ... class_sep=2.0 + ... ) + >>> X_train, X_temp, y_train, y_temp = train_test_split( + ... X, y, test_size=0.4, random_state=42 + ... 
) + >>> X_calib, X_test, y_calib, y_test = train_test_split( + ... X_temp, y_temp, test_size=0.1, random_state=42 + ... ) + + >>> clf = LogisticRegression().fit(X_train, y_train) + + >>> controller = BinaryClassificationController( + ... predict_function=clf.predict_proba, + ... risk=precision, + ... target_level=0.6 + ... ) + + >>> controller.calibrate(X_calib, y_calib) + >>> predictions = controller.predict(X_test) # doctest: +SKIP + + References + ---------- + Angelopoulos, Anastasios N., Stephen, Bates, Emmanuel J. Candès, et al. + "Learn Then Test: Calibrating Predictive Algorithms to Achieve Risk Control." (2022) + + """ _best_predict_param_choice_map = { precision: recall, recall: precision, @@ -799,10 +927,8 @@ class BinaryClassificationController: def __init__( self, - # X -> y_proba of shape (n_samples, 2) predict_function: Callable[[ArrayLike], NDArray], - risk: BinaryClassificationRisk, # to import from mapie.risk_control - # above or below depending if risk is higher_is_better or not + risk: BinaryClassificationRisk, target_level: float, confidence_level: float = 0.9, best_predict_param_choice: Union[ @@ -833,6 +959,23 @@ def calibrate( # pragma: no cover X_calibrate: ArrayLike, y_calibrate: ArrayLike ) -> None: + """ + Calibrate the BinaryClassificationController. + Sets attributes valid_predict_params and best_predict_param (if the risk + or performance can be controlled at the target level). + + Parameters + ---------- + X_calibrate : ArrayLike + Features of the calibration set. + + y_calibrate : ArrayLike + Binary labels of the calibration set. + + Returns + ------- + None + """ y_calibrate_ = np.asarray(y_calibrate, dtype=int) predictions_per_param = self._get_predictions_per_param( @@ -869,6 +1012,23 @@ def calibrate( # pragma: no cover ) def predict(self, X_test: ArrayLike) -> NDArray: + """ + Predict using predict_function at the best threshold. + + Parameters + ---------- + X_test : ArrayLike + Features + + Returns + ------- + NDArray of shape (n_samples,) + + Raises + ------ + ValueError if the method .calibrate was not called, + or if no valid thresholds were found during calibration. + """ if self.best_predict_param is None: raise ValueError( "Cannot predict. " From 709d4e29b85285718d98cd664c99e950d2a2b3f2 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 18 Sep 2025 16:48:25 +0200 Subject: [PATCH 08/14] DOC - BinaryClassificationRisk docstring, + make some attributes private --- mapie/risk_control.py | 88 ++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 1f54f8715..b4ee727c4 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -720,62 +720,104 @@ def predict( class BinaryClassificationRisk: """ + Define a risk (or a performance metric) to be used with the + BinaryClassificationController. Predefined instances are implemented, + see :func:`mapie.risk_control.precision`, :func:`mapie.risk_control.recall`, + :func:`mapie.risk_control.accuracy` and + :func:`mapie.risk_control.false_positive_rate`. + + Here, a binary classification risk (or performance) is defined by an occurrence and + a condition. Let's take the example of precision. Precision is the sum of true + positives over the total number of positives. In other words, precision is + the average of correct predictions (occurrence) given that those predictions + are positive (condition). Programmatically, + ``precision = (sum(y_pred == y_true) if y_pred == 1)/sum(y_pred == 1)``. 
+ Because precision is a performance metric rather than a risk, `higher_is_better` + must be set to `True`. See the implementation of `precision` in mapie.risk_control. + + Note: any risk or performance metric that can be defined as + ``sum(occurrence if condition) / sum(occurrence)`` can be theoretically controlled + with the BinaryClassificationController, thanks to the LearnThenTest framework [1] + and the binary Hoeffding-Bentkus p-values implemented in MAPIE. + + Note: by definition, the value of the risk (or performance metric) here is always + between 0 and 1. + Parameters ---------- risk_occurrence : Callable[[int, int], bool] + A function defining the occurrence of the risk for a given sample. + Must take y_true and y_pred as input and return a boolean. + risk_condition : Callable[[int, int], bool] + A function defining the condition of the risk for a given sample, + Must take y_true and y_pred as input and return a boolean. + higher_is_better : bool + Whether this BinaryClassificationRisk instance is a risk + (higher_is_better=False) or a performance metric (higher_is_better=True). Attributes ---------- - risk_occurrence : Callable[[int, int], bool] - risk_condition : Callable[[int, int], bool] higher_is_better : bool + See above. - Examples - -------- + References + ---------- + [1] Angelopoulos, Anastasios N., Stephen, Bates, Emmanuel J. Candès, et al. + "Learn Then Test: Calibrating Predictive Algorithms to Achieve Risk Control." (2022) """ - # Any risk that can be defined in the following way will work using the binary - # Hoeffding-Bentkus p-values used in MAPIE - # Take the example of precision in the docstring to explain how the class works. - # Explain that it works by computing sum(risk_occurence[risk_cond]) + def __init__( self, risk_occurrence: Callable[[int, int], bool], risk_condition: Callable[[int, int], bool], higher_is_better: bool, ): - self.risk_occurrence = risk_occurrence - self.risk_condition = risk_condition + self._risk_occurrence = risk_occurrence + self._risk_condition = risk_condition self.higher_is_better = higher_is_better def get_value_and_effective_sample_size( self, - y_true: NDArray, # shape (n_samples,), values in {0, 1} - y_pred: NDArray, # shape (n_samples,), values in {0, 1} + y_true: NDArray, + y_pred: NDArray, ) -> Tuple[float, int]: """ + Computes the value of a risk given an array of ground + truth labels and the corresponding predictions. Also returns the number of + samples used to compute that value. + + That number can be different from the total number of samples. For example, in + the case of precision, only the samples with positive predictions are used. + + In the case of a performance metric, this function returns 1 - perf_value. + Parameters ---------- y_true : NDArray + NDArray of ground truth labels, of shape (n_samples,), with values in {0, 1} + y_pred : NDArray + NDArray of predictions, of shape (n_samples,), with values in {0, 1} Returns ------- - Tuple[float, int] + A tuple containing the value of the risk between 0 and 1, + and the number of effective samples used to compute that value + (between 1 and n_samples). + + In the case of a performance metric, this function returns 1 - perf_value. + + If the risk is not defined (condition never met), the value is set to 1, + and the number of effective samples is set to -1. 
""" - # float between 0 and 1, int between 0 and len(y_true) - # Returns 1-risk_occurrence if higher_is_better is True - # returns (1, -1) when the risk is not defined (condition never met) - # In this case, the corresponding lambda shouldn't be considered valid. - # In the current LTT implementation, providing n_obs=-1 will result - # in an infinite p_value, effectively invaliding the lambda risk_occurrences = np.array([ - self.risk_occurrence(y_true_i, y_pred_i) + self._risk_occurrence(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ]) risk_conditions = np.array([ - self.risk_condition(y_true_i, y_pred_i) + self._risk_condition(y_true_i, y_pred_i) for y_true_i, y_pred_i in zip(y_true, y_pred) ]) effective_sample_size = len(y_true) - np.sum(~risk_conditions) @@ -787,6 +829,9 @@ def get_value_and_effective_sample_size( if self.higher_is_better: risk_value = 1 - risk_value return risk_value, effective_sample_size_int + # In this case, the corresponding lambda shouldn't be considered valid. + # In the current LTT implementation, providing n_obs=-1 will result + # in an infinite p_value, effectively invaliding the lambda return 1, -1 @@ -916,7 +961,6 @@ class BinaryClassificationController: ---------- Angelopoulos, Anastasios N., Stephen, Bates, Emmanuel J. Candès, et al. "Learn Then Test: Calibrating Predictive Algorithms to Achieve Risk Control." (2022) - """ _best_predict_param_choice_map = { precision: recall, From ca8f1780801381246070b8b9860c2b9f6d78e979 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Thu, 18 Sep 2025 17:10:38 +0200 Subject: [PATCH 09/14] DOC - Fix docstrings formatting, add classes to the API page in ReadTheDoc --- doc/api.rst | 2 ++ mapie/risk_control.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index dbfbaaa8c..b043beb1a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -102,6 +102,8 @@ Risk Control :template: class.rst mapie.risk_control.PrecisionRecallController + mapie.risk_control.BinaryClassificationController + mapie.risk_control.BinaryClassificationRisk Calibration =========== diff --git a/mapie/risk_control.py b/mapie/risk_control.py index b4ee727c4..8ef9fd4ed 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -760,7 +760,7 @@ class BinaryClassificationRisk: Attributes ---------- higher_is_better : bool - See above. + See params. References ---------- @@ -803,14 +803,15 @@ def get_value_and_effective_sample_size( Returns ------- - A tuple containing the value of the risk between 0 and 1, - and the number of effective samples used to compute that value - (between 1 and n_samples). + Tuple[float, int] + A tuple containing the value of the risk between 0 and 1, + and the number of effective samples used to compute that value + (between 1 and n_samples). - In the case of a performance metric, this function returns 1 - perf_value. + In the case of a performance metric, this function returns 1 - perf_value. - If the risk is not defined (condition never met), the value is set to 1, - and the number of effective samples is set to -1. + If the risk is not defined (condition never met), the value is set to 1, + and the number of effective samples is set to -1. 
""" risk_occurrences = np.array([ self._risk_occurrence(y_true_i, y_pred_i) @@ -1066,12 +1067,14 @@ def predict(self, X_test: ArrayLike) -> NDArray: Returns ------- - NDArray of shape (n_samples,) + NDArray + NDArray of shape (n_samples,) Raises ------ - ValueError if the method .calibrate was not called, - or if no valid thresholds were found during calibration. + ValueError + If the method .calibrate was not called, + or if no valid thresholds were found during calibration. """ if self.best_predict_param is None: raise ValueError( From 3026f13a56cd5c072f3fdef95c3446e90512d97a Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Fri, 19 Sep 2025 14:45:22 +0200 Subject: [PATCH 10/14] clarifications of explanations and formatting --- ...plot_risk_control_binary_classification.py | 91 ++++++++++++------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py index 1244712ef..ed8b6b277 100644 --- a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -1,7 +1,7 @@ """ -======================================================== +========================================================= Use MAPIE to control the precision of a binary classifier -======================================================== +========================================================= In this example, we explain how to do risk control for binary classification with MAPIE. @@ -23,12 +23,13 @@ ############################################################################## # Let us first load the dataset and fit an SVC on the training data. -X, y = make_circles(n_samples=3000, noise=0.3, - factor=0.3, random_state=RANDOM_STATE) +X, y = make_circles(n_samples=3000, noise=0.3, factor=0.3, random_state=RANDOM_STATE) (X_train, X_calib, X_test, y_train, y_calib, y_test) = train_conformalize_test_split( - X, y, train_size=0.8, conformalize_size=0.1, test_size=0.1, - random_state=RANDOM_STATE) + X, y, + train_size=0.8, conformalize_size=0.1, test_size=0.1, + random_state=RANDOM_STATE + ) clf = SVC(probability=True, random_state=RANDOM_STATE) clf.fit(X_train, y_train) @@ -41,12 +42,18 @@ # guaranteed thresholds using a risk control method. target_precision = 0.8 +confidence_level = 0.9 bcc = BinaryClassificationController( - clf.predict_proba, precision, target_level=target_precision, confidence_level=0.9) + clf.predict_proba, + precision, target_level=target_precision, + confidence_level=confidence_level + ) bcc.calibrate(X_calib, y_calib) -print(f'{len(bcc.valid_predict_params)} valid thresholds found. 
' - f'The best one is {bcc.best_predict_param:.3f}.') +print(f'{len(bcc.valid_predict_params)} thresholds found that guarantee a precision of ' + f'at least {target_precision} with a confidence of {confidence_level}.\n' + 'Among those, the one that maximizes the secondary objective (recall here) is: ' + f'{bcc.best_predict_param:.3f}.') ############################################################################## @@ -67,18 +74,23 @@ tested_thresholds == bcc.best_predict_param)[0][0] plt.figure() -plt.scatter(tested_thresholds[valid_thresholds_indices], - precisions[valid_thresholds_indices], c='tab:green', - label='Valid thresholds') -plt.scatter(tested_thresholds[~valid_thresholds_indices], - precisions[~valid_thresholds_indices], c='tab:red', - label='Invalid thresholds') -plt.scatter(tested_thresholds[best_threshold_index], precisions[best_threshold_index], - c='tab:green', label='Best threshold', marker='*', edgecolors='k', s=300) +plt.scatter( + tested_thresholds[valid_thresholds_indices], precisions[valid_thresholds_indices], + c='tab:green', label='Valid thresholds' + ) +plt.scatter( + tested_thresholds[~valid_thresholds_indices], precisions[~valid_thresholds_indices], + c='tab:red', label='Invalid thresholds' + ) +plt.scatter( + tested_thresholds[best_threshold_index], precisions[best_threshold_index], + c='tab:green', label='Best threshold', marker='*', edgecolors='k', s=300 + ) plt.axhline(target_precision, color='tab:gray', linestyle='--') -plt.text(0, target_precision+0.02, 'Target precision', - color='tab:gray', fontstyle='italic') -plt.xlabel('Threshold', labelpad=15) +plt.text( + 0.7, target_precision+0.02, 'Target precision', color='tab:gray', fontstyle='italic' +) +plt.xlabel('Threshold') plt.ylabel('Precision') plt.legend() plt.show() @@ -86,15 +98,19 @@ ############################################################################## # Contrary to the naive way of computing a threshold to satisfy a precision target on # calibration data, risk control provides statistical guarantees on unseen data. -# Besides computing a set of valid thresholds, -# :class:`~mapie.risk_control.BinaryClassificationController` also outputs the best -# one, which in the case of precision is the threshold that, among all valid ones, -# maximizes recall. +# In the plot above, we can see that not all thresholds corresponding to a precision +# higher that the target are valid. This is due to the uncertainty inherent to the +# finite size of the calibration set, which risk control takes into account. # -# In the figure above, the highest threshold values are considered invalid due to the +# In particular, the highest threshold values are considered invalid due to the # small number of observations used to compute the precision, following the Learn then # Test procedure. In the most extreme case, no observation is available, which causes # the precision value to be ill-defined and set to 0. + +# Besides computing a set of valid thresholds, +# :class:`~mapie.risk_control.BinaryClassificationController` also outputs the "best" +# one, which is the valid threshold that maximizes a secondary objective +# (recall here). 
# # After obtaining the best threshold, we can use the ``predict`` function of # :class:`~mapie.risk_control.BinaryClassificationController` for future predictions, @@ -104,16 +120,22 @@ y_pred = bcc.predict(X_test) clf_threshold = FixedThresholdClassifier(clf, threshold=bcc.best_predict_param) -# necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator -clf_threshold.fit(X_train, y_train) +clf_threshold.fit(X_train, y_train) +# .fit necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator -disp = DecisionBoundaryDisplay.from_estimator( - clf_threshold, X_test, response_method="predict", cmap=plt.cm.coolwarm) -plt.scatter(X_test[y_test == 0, 0], X_test[y_test == 0, 1], - edgecolors='k', c='tab:blue', alpha=0.5, label='"negative" class') -plt.scatter(X_test[y_test == 1, 0], X_test[y_test == 1, 1], - edgecolors='k', c='tab:red', alpha=0.5, label='"positive" class') +disp = DecisionBoundaryDisplay.from_estimator( + clf_threshold, X_test, response_method="predict", cmap=plt.cm.coolwarm + ) + +plt.scatter( + X_test[y_test == 0, 0], X_test[y_test == 0, 1], + edgecolors='k', c='tab:blue', alpha=0.5, label='"negative" class' + ) +plt.scatter( + X_test[y_test == 1, 0], X_test[y_test == 1, 1], + edgecolors='k', c='tab:red', alpha=0.5, label='"positive" class' + ) plt.title("Decision Boundary of FixedThresholdClassifier") plt.xlabel("Feature 1") plt.ylabel("Feature 2") @@ -123,4 +145,5 @@ ############################################################################## # Different risk functions have been implemented, such as precision and recall, but you # can also implement your own custom function using -# :class:`~mapie.risk_control.BinaryClassificationRisk`. +# :class:`~mapie.risk_control.BinaryClassificationRisk` and choose your own +# secondary objective. 
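As a concrete illustration of the custom-risk hook mentioned at the end of this example, a minimal sketch is given below. It reuses ``clf``, ``X_calib``, ``X_test`` and ``y_calib`` from the example above; the ``npv`` metric (negative predictive value) and the choice of ``recall`` as secondary objective are illustrative assumptions, not something the example itself defines::

    from mapie.risk_control import (
        BinaryClassificationController, BinaryClassificationRisk, recall)

    # Hypothetical performance metric: negative predictive value, i.e. the share of
    # correct predictions (occurrence) among negative predictions (condition).
    npv = BinaryClassificationRisk(
        risk_occurrence=lambda y_true, y_pred: y_true == y_pred,
        risk_condition=lambda y_true, y_pred: y_pred == 0,
        higher_is_better=True,  # a performance metric, not a risk
    )

    bcc_npv = BinaryClassificationController(
        clf.predict_proba, npv, target_level=0.8, confidence_level=0.9,
        best_predict_param_choice=recall,  # secondary objective, picked arbitrarily here
    )
    bcc_npv.calibrate(X_calib, y_calib)
    # predict raises a ValueError if no valid threshold was found during calibration
    y_pred_npv = bcc_npv.predict(X_test)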
From 28dbc3cf81815c19680fc90cfc183a05f77e6f22 Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Fri, 19 Sep 2025 14:49:53 +0200 Subject: [PATCH 11/14] fix trailing whitespace --- .../plot_risk_control_binary_classification.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py index ed8b6b277..216594119 100644 --- a/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py +++ b/examples/risk_control/1-quickstart/plot_risk_control_binary_classification.py @@ -26,7 +26,7 @@ X, y = make_circles(n_samples=3000, noise=0.3, factor=0.3, random_state=RANDOM_STATE) (X_train, X_calib, X_test, y_train, y_calib, y_test) = train_conformalize_test_split( - X, y, + X, y, train_size=0.8, conformalize_size=0.1, test_size=0.1, random_state=RANDOM_STATE ) @@ -44,8 +44,8 @@ target_precision = 0.8 confidence_level = 0.9 bcc = BinaryClassificationController( - clf.predict_proba, - precision, target_level=target_precision, + clf.predict_proba, + precision, target_level=target_precision, confidence_level=confidence_level ) bcc.calibrate(X_calib, y_calib) @@ -75,7 +75,7 @@ plt.figure() plt.scatter( - tested_thresholds[valid_thresholds_indices], precisions[valid_thresholds_indices], + tested_thresholds[valid_thresholds_indices], precisions[valid_thresholds_indices], c='tab:green', label='Valid thresholds' ) plt.scatter( @@ -98,8 +98,8 @@ ############################################################################## # Contrary to the naive way of computing a threshold to satisfy a precision target on # calibration data, risk control provides statistical guarantees on unseen data. -# In the plot above, we can see that not all thresholds corresponding to a precision -# higher that the target are valid. This is due to the uncertainty inherent to the +# In the plot above, we can see that not all thresholds corresponding to a precision +# higher that the target are valid. This is due to the uncertainty inherent to the # finite size of the calibration set, which risk control takes into account. # # In particular, the highest threshold values are considered invalid due to the @@ -109,7 +109,7 @@ # Besides computing a set of valid thresholds, # :class:`~mapie.risk_control.BinaryClassificationController` also outputs the "best" -# one, which is the valid threshold that maximizes a secondary objective +# one, which is the valid threshold that maximizes a secondary objective # (recall here). # # After obtaining the best threshold, we can use the ``predict`` function of @@ -120,7 +120,7 @@ y_pred = bcc.predict(X_test) clf_threshold = FixedThresholdClassifier(clf, threshold=bcc.best_predict_param) -clf_threshold.fit(X_train, y_train) +clf_threshold.fit(X_train, y_train) # .fit necessary for plotting, alternatively you can use sklearn.frozen.FrozenEstimator @@ -145,5 +145,5 @@ ############################################################################## # Different risk functions have been implemented, such as precision and recall, but you # can also implement your own custom function using -# :class:`~mapie.risk_control.BinaryClassificationRisk` and choose your own +# :class:`~mapie.risk_control.BinaryClassificationRisk` and choose your own # secondary objective. 
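A quick way to sanity-check the threshold calibrated in this example is to measure the controlled metric on the held-out test split. The snippet below is an illustrative check reusing the names from the example (``bcc``, ``X_test``, ``y_test``, ``target_precision``); it is not part of the tutorial itself::

    from sklearn.metrics import precision_score

    # With confidence_level=0.9, the test precision should reach the target in
    # roughly 90% or more of calibration runs, not necessarily on every split.
    test_precision = precision_score(y_test, bcc.predict(X_test))
    print(f"Test precision: {test_precision:.3f} (target: {target_precision})")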
From 373f25fed2a6902c3668ed527b4e0ef18e6ba96d Mon Sep 17 00:00:00 2001 From: Adrien Le Coz Date: Mon, 22 Sep 2025 10:59:34 +0200 Subject: [PATCH 12/14] change position of notebook link --- doc/theoretical_description_risk_control.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/theoretical_description_risk_control.rst b/doc/theoretical_description_risk_control.rst index 5bb6e73e8..6c8cbe4c4 100644 --- a/doc/theoretical_description_risk_control.rst +++ b/doc/theoretical_description_risk_control.rst @@ -117,7 +117,6 @@ The following section provides a detailed overview of each method. 2. Theoretical description ========================== -Note that a notebook testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data is available here: `theoretical_validity_tests.ipynb `__. 2.1 Risk-Controlling Prediction Sets ------------------------------------ @@ -278,6 +277,7 @@ In order to find all the parameters :math:`\lambda` that satisfy the above condi - Return :math:`\hat{\Lambda} = \mathcal{A}(\{p_j\}_{j\in\{1,\dots,\lvert \Lambda \rvert})`, where :math:`\mathcal{A}`, is an algorithm that controls the family-wise error rate (FWER), for example, Bonferonni correction. +Note that a notebook testing theoretical guarantees of risk control in binary classification using a random classifier and synthetic data is available here: `theoretical_validity_tests.ipynb `__. References ========== From ea6d0e3772a783175f72026b9b11666404afdfae Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Fri, 19 Sep 2025 18:23:26 +0200 Subject: [PATCH 13/14] DOC & MTN - Fix docstrings, add an exception handling if users passes wrong predict function --- mapie/risk_control.py | 27 +++++++++++++------ mapie/tests/test_risk_control.py | 45 +++++++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 8ef9fd4ed..055b5f566 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -722,13 +722,13 @@ class BinaryClassificationRisk: """ Define a risk (or a performance metric) to be used with the BinaryClassificationController. Predefined instances are implemented, - see :func:`mapie.risk_control.precision`, :func:`mapie.risk_control.recall`, - :func:`mapie.risk_control.accuracy` and - :func:`mapie.risk_control.false_positive_rate`. + see :data:`mapie.risk_control.precision`, :data:`mapie.risk_control.recall`, + :data:`mapie.risk_control.accuracy` and + :data:`mapie.risk_control.false_positive_rate`. Here, a binary classification risk (or performance) is defined by an occurrence and a condition. Let's take the example of precision. Precision is the sum of true - positives over the total number of positives. In other words, precision is + positives over the total number of predicted positives. In other words, precision is the average of correct predictions (occurrence) given that those predictions are positive (condition). Programmatically, ``precision = (sum(y_pred == y_true) if y_pred == 1)/sum(y_pred == 1)``. @@ -736,7 +736,7 @@ class BinaryClassificationRisk: must be set to `True`. See the implementation of `precision` in mapie.risk_control. 
Note: any risk or performance metric that can be defined as - ``sum(occurrence if condition) / sum(occurrence)`` can be theoretically controlled + ``sum(occurrence if condition) / sum(condition)`` can be theoretically controlled with the BinaryClassificationController, thanks to the LearnThenTest framework [1] and the binary Hoeffding-Bentkus p-values implemented in MAPIE. @@ -879,7 +879,7 @@ class BinaryClassificationController: 3. Use the predict method to predict using the best threshold Note: for a given model, calibration dataset, target level, and confidence level, - there may not be any thresholds controlling the risk. + there may not be any threshold controlling the risk. Parameters ---------- @@ -920,7 +920,7 @@ class BinaryClassificationController: Use the calibrate method to compute these. best_predict_param : Optional[float] - The best thresholds that control the risk (or performance). + The best threshold that control the risk (or performance). Use the calibrate method to compute it. Examples @@ -956,7 +956,7 @@ class BinaryClassificationController: ... ) >>> controller.calibrate(X_calib, y_calib) - >>> predictions = controller.predict(X_test) # doctest: +SKIP + >>> predictions = controller.predict(X_test) References ---------- @@ -1157,4 +1157,15 @@ def _get_predictions_per_param(self, X: ArrayLike, params: NDArray) -> NDArray: ) from e else: raise + except IndexError as e: + if "array is 1-dimensional, but 2 were indexed" in str(e): + raise IndexError( + "Error when calling the predict_function. " + "Maybe the predict function you provided returns only the " + "probability of the positive class. " + "You should provide a predict function that returns the " + "probabilities of both classes, like scikit-learn estimators." + ) from e + else: + raise return (predictions_proba[:, np.newaxis] >= params).T.astype(int) diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index 1ee072b37..8a015397c 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -1123,20 +1123,53 @@ def test_error_passing_classifier(self): risk=precision, target_level=0.9 ) - X_test = [] + X_test = [[0]] params = np.array([0.5]) with pytest.raises( TypeError, - match=r"Error when calling the predict_function" + match=r"Maybe you provided a binary classifier" ): bcc._get_predictions_per_param(X_test, params) - def test_other_error(self): + def test_error_incorrect_predict_shape(self): + """ + Test when the user provides a predict function that outputs only + the positive class. + """ + clf = LogisticRegression().fit([[0], [1]], [0, 1]) + + def pred_func(X): + return clf.predict_proba(X)[:, 0] + + bcc = BinaryClassificationController( + predict_function=pred_func, + risk=precision, + target_level=0.9 + ) + X_test = [[0]] + params = np.array([0.5]) + + with pytest.raises( + IndexError, + match=r"Maybe the predict function you provided returns only the " + r"probability of the positive class." 
+ ): + bcc._get_predictions_per_param(X_test, params) + + @pytest.mark.parametrize( + "error,expected_error_type,expected_error_message", + [ + (ValueError("Hey"), ValueError, "Hey"), + (IndexError("Gloups"), IndexError, "Gloups"), + (TypeError("I'm hungry"), TypeError, "I'm hungry"), + ], + ) + def test_other_error(self, error, expected_error_type, expected_error_message): """Test that other errors are re-raised without modification""" def failing_predict_function(X): - raise TypeError("Some other error message") + raise error bcc = BinaryClassificationController( predict_function=failing_predict_function, @@ -1144,10 +1177,10 @@ def failing_predict_function(X): target_level=0.9 ) - X_test = [] + X_test = [[0]] params = np.array([0.5]) - with pytest.raises(TypeError, match="Some other error message"): + with pytest.raises(expected_error_type, match=expected_error_message): bcc._get_predictions_per_param(X_test, params) From 08c8bf40346974f29282efe0989900a89553df0c Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Mon, 22 Sep 2025 09:32:38 +0200 Subject: [PATCH 14/14] FIX - Fix wrong risk value with higher_is_better risks when undefined --- mapie/risk_control.py | 15 ++++++++------- mapie/tests/test_risk_control.py | 22 ++++++++++++---------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/mapie/risk_control.py b/mapie/risk_control.py index 055b5f566..9018da8b6 100644 --- a/mapie/risk_control.py +++ b/mapie/risk_control.py @@ -827,13 +827,14 @@ def get_value_and_effective_sample_size( if effective_sample_size_int != 0: risk_sum: int = np.sum(risk_occurrences[risk_conditions]) risk_value = risk_sum / effective_sample_size_int - if self.higher_is_better: - risk_value = 1 - risk_value - return risk_value, effective_sample_size_int - # In this case, the corresponding lambda shouldn't be considered valid. - # In the current LTT implementation, providing n_obs=-1 will result - # in an infinite p_value, effectively invaliding the lambda - return 1, -1 + else: + # In this case, the corresponding lambda shouldn't be considered valid. 
+ # In the current LTT implementation, providing n_obs=-1 will result + # in an infinite p_value, effectively invaliding the lambda + risk_value, effective_sample_size_int = 1, -1 + if self.higher_is_better: + risk_value = 1 - risk_value + return risk_value, effective_sample_size_int precision = BinaryClassificationRisk( diff --git a/mapie/tests/test_risk_control.py b/mapie/tests/test_risk_control.py index 8a015397c..611437d08 100644 --- a/mapie/tests/test_risk_control.py +++ b/mapie/tests/test_risk_control.py @@ -851,17 +851,19 @@ def test_binary_classification_risk( y_true, y_pred ): - result = risk_instance.get_value_and_effective_sample_size(y_true, y_pred) - if effective_sample_func(y_true, y_pred) == 0: - assert result == (1, -1) - else: - value, n = result + value, n = risk_instance.get_value_and_effective_sample_size(y_true, y_pred) + effective_sample_size = effective_sample_func(y_true, y_pred) + + if effective_sample_size != 0: expected_value = metric_func(y_true, y_pred) - if risk_instance.higher_is_better: - expected_value = 1 - expected_value - expected_n = effective_sample_func(y_true, y_pred) - assert np.isclose(value, expected_value) - assert n == expected_n + expected_n = effective_sample_size + else: + expected_value = 1 + expected_n = -1 + if risk_instance.higher_is_better: + expected_value = 1 - expected_value + assert np.isclose(value, expected_value) + assert n == expected_n class TestBinaryClassificationControllerBestPredictParamChoice:
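To illustrate the corner case addressed by this last patch, here is a minimal, illustrative sketch of the fixed behaviour for a higher-is-better metric such as ``precision`` when its condition is never met::

    import numpy as np
    from mapie.risk_control import precision

    y_true = np.array([0, 1, 0, 1])
    y_pred = np.zeros(4, dtype=int)  # no positive prediction: precision is undefined

    # The undefined risk is reported as value 1 with an effective sample size of -1;
    # because precision has higher_is_better=True, the returned value becomes 1 - 1 = 0.
    # Downstream, n_eff == -1 yields an infinite p-value in the Learn Then Test
    # procedure, so the corresponding threshold is never considered valid.
    value, n_eff = precision.get_value_and_effective_sample_size(y_true, y_pred)
    assert (value, n_eff) == (0, -1)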