From c39ecc511cc8a717d012a471cb844aa5b33785dd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 29 Oct 2024 08:58:59 +0100 Subject: [PATCH 1/3] API deprecate CalibratedClassifierCV(..., cv=prefit) for FrozenEstimator --- doc/modules/calibration.rst | 9 +-- .../plot_calibration_multiclass.py | 3 +- sklearn/calibration.py | 57 ++++++++++++------- sklearn/tests/test_calibration.py | 38 +++++++++---- 4 files changed, 68 insertions(+), 39 deletions(-) diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst index ad183aa79c6c4..0527dcdb81c81 100644 --- a/doc/modules/calibration.rst +++ b/doc/modules/calibration.rst @@ -193,10 +193,11 @@ The main advantage of using `ensemble=False` is computational: it reduces the overall fit time by training only a single base classifier and calibrator pair, decreases the final model size and increases prediction speed. -Alternatively an already fitted classifier can be calibrated by setting -`cv="prefit"`. In this case, the data is not split and all of it is used to -fit the regressor. It is up to the user to -make sure that the data used for fitting the classifier is disjoint from the +Alternatively an already fitted classifier can be calibrated by using a +:class:`~sklearn.frozen.FrozenEstimator` as +``CalibratedClassifierCV(estimator=FrozenEstimator(estimator))``. +It is up to the user to make sure that the data used for fitting the classifier +is disjoint from the data used for fitting the regressor. :class:`CalibratedClassifierCV` supports the use of two regression techniques diff --git a/examples/calibration/plot_calibration_multiclass.py b/examples/calibration/plot_calibration_multiclass.py index 8525c76221a8f..2208292d1ccc9 100644 --- a/examples/calibration/plot_calibration_multiclass.py +++ b/examples/calibration/plot_calibration_multiclass.py @@ -64,10 +64,11 @@ class of an instance (red: class 1, green: class 2, blue: class 3). 
# using the valid data subset (400 samples) in a 2-stage process. from sklearn.calibration import CalibratedClassifierCV +from sklearn.frozen import FrozenEstimator clf = RandomForestClassifier(n_estimators=25) clf.fit(X_train, y_train) -cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit") +cal_clf = CalibratedClassifierCV(FrozenEstimator(clf), method="sigmoid") cal_clf.fit(X_valid, y_valid) # %% diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 93035fef52b45..0f43d3556abe6 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -23,6 +23,7 @@ _fit_context, clone, ) +from .frozen import FrozenEstimator from .isotonic import IsotonicRegression from .model_selection import LeaveOneOut, check_cv, cross_val_predict from .preprocessing import LabelEncoder, label_binarize @@ -75,8 +76,8 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) `probabilities=True` for :class:`~sklearn.svm.SVC` and :class:`~sklearn.svm.NuSVC` estimators (see :ref:`User Guide ` for details). - Already fitted classifiers can be calibrated via the parameter - `cv="prefit"`. In this case, no cross-validation is used and all provided + Already fitted classifiers can be calibrated wrapping the modl in a + :class:`~sklearn.frozen.FrozenEstimator`. In this case all provided data is used for calibration. The user has to take care manually that data for model fitting and calibration are disjoint. @@ -106,8 +107,7 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) use isotonic calibration with too few calibration samples ``(<<1000)`` since it tends to overfit. - cv : int, cross-validation generator, iterable or "prefit", \ - default=None + cv : int, cross-validation generator, or iterable, default=None Determines the cross-validation splitting strategy. 
Possible inputs for cv are: @@ -124,12 +124,13 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) Refer to the :ref:`User Guide ` for the various cross-validation strategies that can be used here. - If "prefit" is passed, it is assumed that `estimator` has been - fitted already and all data is used for calibration. - .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. + .. versionchanged:: 1.6 + `"prefit"` is deprecated. Use :class:`~sklearn.frozen.FrozenEstimator` + instead. + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -142,9 +143,11 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) .. versionadded:: 0.24 - ensemble : bool, default=True - Determines how the calibrator is fitted when `cv` is not `'prefit'`. - Ignored if `cv='prefit'`. + ensemble : bool, or "auto", default="auto" + Determines how the calibrator is fitted. + + "auto" will use `False` if the `estimator` is a + :class:`~sklearn.frozen.FrozenEstimator`, and `True` otherwise. If `True`, the `estimator` is fitted using training data, and calibrated using testing data, for each `cv` fold. The final estimator @@ -161,6 +164,9 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) .. versionadded:: 0.24 + .. versionchanged:: 1.6 + `"auto"` option is added and is the default. + Attributes ---------- classes_ : ndarray of shape (n_classes,) @@ -178,17 +184,13 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) .. versionadded:: 1.0 - calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \ - or `ensemble=False`) + calibrated_classifiers_ : list (len() equal to cv or 1 if `ensemble=False`) The list of classifier and calibrator pairs. 
- - When `cv="prefit"`, the fitted `estimator` and fitted + - When `ensemble=True`, `n_cv` fitted `estimator` and calibrator pairs. + `n_cv` is the number of cross-validation folds. + - When `ensemble=False`, the `estimator`, fitted on all the data, and fitted calibrator. - - When `cv` is not "prefit" and `ensemble=True`, `n_cv` fitted - `estimator` and calibrator pairs. `n_cv` is the number of - cross-validation folds. - - When `cv` is not "prefit" and `ensemble=False`, the `estimator`, - fitted on all the data, and fitted calibrator. .. versionchanged:: 0.24 Single calibrated classifier case when `ensemble=False`. @@ -240,7 +242,8 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) >>> base_clf = GaussianNB() >>> base_clf.fit(X_train, y_train) GaussianNB() - >>> calibrated_clf = CalibratedClassifierCV(base_clf, cv="prefit") + >>> from sklearn.frozen import FrozenEstimator + >>> calibrated_clf = CalibratedClassifierCV(FrozenEstimator(base_clf)) >>> calibrated_clf.fit(X_calib, y_calib) CalibratedClassifierCV(...) 
>>> len(calibrated_clf.calibrated_classifiers_) @@ -258,7 +261,7 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) "method": [StrOptions({"isotonic", "sigmoid"})], "cv": ["cv_object", StrOptions({"prefit"})], "n_jobs": [Integral, None], - "ensemble": ["boolean"], + "ensemble": ["boolean", StrOptions({"auto"})], } def __init__( @@ -268,7 +271,7 @@ def __init__( method="sigmoid", cv=None, n_jobs=None, - ensemble=True, + ensemble="auto", ): self.estimator = estimator self.method = method @@ -323,8 +326,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): estimator = self._get_estimator() + _ensemble = self.ensemble + if _ensemble == "auto": + _ensemble = not isinstance(estimator, FrozenEstimator) + self.calibrated_classifiers_ = [] if self.cv == "prefit": + # TODO(1.8): Remove this code branch and cv='prefit' + warnings.warn( + "The `cv='prefit'` option is deprecated in 1.6 and will be removed in" + " 1.8. You can use CalibratedClassifierCV(FrozenEstimator(estimator))" + " instead." 
+ ) # `classes_` should be consistent with that of estimator check_is_fitted(self.estimator, attributes=["classes_"]) self.classes_ = self.estimator.classes_ @@ -404,7 +417,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): ) cv = check_cv(self.cv, y, classifier=True) - if self.ensemble: + if _ensemble: parallel = Parallel(n_jobs=self.n_jobs) self.calibrated_classifiers_ = parallel( delayed(_fit_classifier_calibrator_pair)( diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 0f23bb7463126..d80c7094525f9 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -22,6 +22,7 @@ ) from sklearn.exceptions import NotFittedError from sklearn.feature_extraction import DictVectorizer +from sklearn.frozen import FrozenEstimator from sklearn.impute import SimpleImputer from sklearn.isotonic import IsotonicRegression from sklearn.linear_model import LogisticRegression, SGDClassifier @@ -45,6 +46,7 @@ assert_almost_equal, assert_array_almost_equal, assert_array_equal, + ignore_warnings, ) from sklearn.utils.extmath import softmax from sklearn.utils.fixes import CSR_CONTAINERS @@ -299,9 +301,11 @@ def predict(self, X): assert_allclose(probas, 1.0 / clf.n_classes_) +@ignore_warnings(category=FutureWarning) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_calibration_prefit(csr_container): """Test calibration for prefitted classifiers""" + # TODO(1.8): Remove cv="prefit" options here and the @ignore_warnings of the test n_samples = 50 X, y = make_classification(n_samples=3 * n_samples, n_features=6, random_state=42) sample_weight = np.random.RandomState(seed=42).uniform(size=y.size) @@ -333,17 +337,25 @@ def test_calibration_prefit(csr_container): (csr_container(X_calib), csr_container(X_test)), ]: for method in ["isotonic", "sigmoid"]: - cal_clf = CalibratedClassifierCV(clf, method=method, cv="prefit") + cal_clf_prefit = CalibratedClassifierCV(clf, method=method, cv="prefit") + 
cal_clf_frozen = CalibratedClassifierCV(FrozenEstimator(clf), method=method) for sw in [sw_calib, None]: - cal_clf.fit(this_X_calib, y_calib, sample_weight=sw) - y_prob = cal_clf.predict_proba(this_X_test) - y_pred = cal_clf.predict(this_X_test) - prob_pos_cal_clf = y_prob[:, 1] - assert_array_equal(y_pred, np.array([0, 1])[np.argmax(y_prob, axis=1)]) - + cal_clf_prefit.fit(this_X_calib, y_calib, sample_weight=sw) + cal_clf_frozen.fit(this_X_calib, y_calib, sample_weight=sw) + + y_prob_prefit = cal_clf_prefit.predict_proba(this_X_test) + y_prob_frozen = cal_clf_frozen.predict_proba(this_X_test) + y_pred_prefit = cal_clf_prefit.predict(this_X_test) + y_pred_frozen = cal_clf_frozen.predict(this_X_test) + prob_pos_cal_clf_prefit = y_prob_prefit[:, 1] + prob_pos_cal_clf_frozen = y_prob_frozen[:, 1] + assert_array_equal(y_pred_prefit, y_pred_frozen) + assert_array_equal( + y_pred_prefit, np.array([0, 1])[np.argmax(y_prob_prefit, axis=1)] + ) assert brier_score_loss(y_test, prob_pos_clf) > brier_score_loss( - y_test, prob_pos_cal_clf + y_test, prob_pos_cal_clf_frozen ) @@ -515,8 +527,10 @@ def dict_data(): {"state": "NY", "age": "adult"}, {"state": "TX", "age": "adult"}, {"state": "VT", "age": "child"}, + {"state": "CT", "age": "adult"}, + {"state": "BR", "age": "child"}, ] - text_labels = [1, 0, 1] + text_labels = [1, 0, 1, 1, 0] return dict_data, text_labels @@ -540,7 +554,7 @@ def test_calibration_dict_pipeline(dict_data, dict_data_pipeline): """ X, y = dict_data clf = dict_data_pipeline - calib_clf = CalibratedClassifierCV(clf, cv="prefit") + calib_clf = CalibratedClassifierCV(FrozenEstimator(clf), cv=2) calib_clf.fit(X, y) # Check attributes are obtained from fitted estimator assert_array_equal(calib_clf.classes_, clf.classes_) @@ -584,7 +598,7 @@ def test_calibration_inconsistent_prefit_n_features_in(): # is consistent with training set X, y = make_classification(n_samples=10, n_features=5, n_classes=2, random_state=7) clf = LinearSVC(C=1).fit(X, y) - calib_clf = 
CalibratedClassifierCV(clf, cv="prefit") + calib_clf = CalibratedClassifierCV(FrozenEstimator(clf)) msg = "X has 3 features, but LinearSVC is expecting 5 features as input." with pytest.raises(ValueError, match=msg): @@ -602,7 +616,7 @@ def test_calibration_votingclassifier(): ) vote.fit(X, y) - calib_clf = CalibratedClassifierCV(estimator=vote, cv="prefit") + calib_clf = CalibratedClassifierCV(estimator=FrozenEstimator(vote)) # smoke test: should not raise an error calib_clf.fit(X, y) From 7a1f0209b58d071459836bd6e78431312f9c1dbc Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 29 Oct 2024 09:04:15 +0100 Subject: [PATCH 2/3] changelog --- .../upcoming_changes/sklearn.calibration/30171.api.rst | 4 ++++ sklearn/calibration.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 doc/whats_new/upcoming_changes/sklearn.calibration/30171.api.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.calibration/30171.api.rst b/doc/whats_new/upcoming_changes/sklearn.calibration/30171.api.rst new file mode 100644 index 0000000000000..4d550af598278 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.calibration/30171.api.rst @@ -0,0 +1,4 @@ +- `cv="prefit"` is deprecated for :class:`~sklearn.calibration.CalibratedClassifierCV`. + Use :class:`~sklearn.frozen.FrozenEstimator` instead, as + `CalibratedClassifierCV(FrozenEstimator(estimator))`. + By `Adrin Jalali`_. 
diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 0f43d3556abe6..d737e83840afd 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -35,6 +35,7 @@ ) from .utils._param_validation import ( HasMethods, + Hidden, Interval, StrOptions, validate_params, @@ -259,7 +260,7 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) None, ], "method": [StrOptions({"isotonic", "sigmoid"})], - "cv": ["cv_object", StrOptions({"prefit"})], + "cv": ["cv_object", Hidden(StrOptions({"prefit"}))], "n_jobs": [Integral, None], "ensemble": ["boolean", StrOptions({"auto"})], } From 4c17d22ce296bad3d3aff498098fc52182a54d2e Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 29 Oct 2024 16:49:28 +0100 Subject: [PATCH 3/3] Update sklearn/calibration.py Co-authored-by: Adam Li --- sklearn/calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index d737e83840afd..b4023172bb20c 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -77,7 +77,7 @@ class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator) `probabilities=True` for :class:`~sklearn.svm.SVC` and :class:`~sklearn.svm.NuSVC` estimators (see :ref:`User Guide ` for details). - Already fitted classifiers can be calibrated wrapping the modl in a + Already fitted classifiers can be calibrated by wrapping the model in a :class:`~sklearn.frozen.FrozenEstimator`. In this case all provided data is used for calibration. The user has to take care manually that data for model fitting and calibration are disjoint.