From cf1fdceb4f154954b229e62422675f2066fd8e8b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 28 Sep 2021 11:48:56 +0200 Subject: [PATCH 01/10] TST check equivalence sample_weight in CalibratedClassifierCV --- sklearn/calibration.py | 18 ++++++------ sklearn/tests/test_calibration.py | 46 ++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 9ede41a775c3e..2f986837eae09 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -267,6 +267,8 @@ def fit(self, X, y, sample_weight=None): """ check_classification_targets(y) X, y = indexable(X, y) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) if self.base_estimator is None: # we want all classifiers that don't expose a random_state @@ -303,15 +305,13 @@ def fit(self, X, y, sample_weight=None): # sample_weight checks fit_parameters = signature(base_estimator.fit).parameters supports_sw = "sample_weight" in fit_parameters - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - if not supports_sw: - estimator_name = type(base_estimator).__name__ - warnings.warn( - f"Since {estimator_name} does not support " - "sample_weights, sample weights will only be" - " used for the calibration itself." - ) + if sample_weight is not None and not supports_sw: + estimator_name = type(base_estimator).__name__ + warnings.warn( + f"Since {estimator_name} does not support " + "sample_weights, sample weights will only be" + " used for the calibration itself." + ) # Check that each cross-validation fold can have at least one # example per class diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 040571df4681b..d59836dd68229 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -6,7 +6,7 @@ from numpy.testing import assert_allclose from scipy import sparse -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, clone from sklearn.dummy import DummyClassifier from sklearn.model_selection import LeaveOneOut, train_test_split @@ -784,3 +784,47 @@ def test_calibration_display_ref_line(pyplot, iris_data_binary): labels = viz2.ax_.get_legend_handles_labels()[1] assert labels.count("Perfectly calibrated") == 1 + + +@pytest.mark.parametrize("method", ["sigmoid", "isotonic"]) +@pytest.mark.parametrize("ensemble", [True, False]) +def test_calibrated_classifier_cv_sample_weights_equivalence(data, method, ensemble): + from sklearn.datasets import load_iris + + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] + sample_weight = np.ones_like(y) * 2 + + X_twice = np.zeros((X.shape[0] * 2, X.shape[1]), dtype=X.dtype) + X_twice[::2, :] = X + X_twice[1::2, :] = X + y_twice = np.zeros(y.shape[0] * 2, dtype=y.dtype) + y_twice[::2] = y + y_twice[1::2] = y + + base_estimator = LogisticRegression() + calibrated_clf_without_weights = CalibratedClassifierCV( + base_estimator, + method=method, + ensemble=ensemble, + cv=2, + ) + calibrated_clf_with_weights = clone(calibrated_clf_without_weights) + + calibrated_clf_with_weights.fit(X, y, sample_weight=sample_weight) + calibrated_clf_without_weights.fit(X_twice, y_twice) + + # Check that the underlying fitted estimators have the same coefficients + for est_with_weights, est_without_weights in zip( + calibrated_clf_with_weights.calibrated_classifiers_, + calibrated_clf_without_weights.calibrated_classifiers_, + ): + assert_allclose( + 
+            est_with_weights.base_estimator.coef_,
+            est_without_weights.base_estimator.coef_,
+        )
+
+    y_pred_with_weights = calibrated_clf_with_weights.predict_proba(X)
+    y_pred_without_weights = calibrated_clf_without_weights.predict_proba(X)
+
+    assert_allclose(y_pred_with_weights, y_pred_without_weights)

From aaa1bde048f57c112e1408bcc4d6299cc2ffd609 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 28 Sep 2021 14:16:49 +0200
Subject: [PATCH 02/10] iter

---
 sklearn/calibration.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 2f986837eae09..41b8fd3e973a3 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -351,6 +351,11 @@ def fit(self, X, y, sample_weight=None):
             else:
                 this_estimator = clone(base_estimator)
                 _, method_name = _get_prediction_method(this_estimator)
+                fit_params = (
+                    {"sample_weight": sample_weight}
+                    if sample_weight is not None and supports_sw
+                    else None
+                )
                 pred_method = partial(
                     cross_val_predict,
                     estimator=this_estimator,
@@ -359,6 +364,7 @@ def fit(self, X, y, sample_weight=None):
                     cv=cv,
                     method=method_name,
                     n_jobs=self.n_jobs,
+                    fit_params=fit_params,
                 )
                 predictions = _compute_predictions(
                     pred_method, method_name, X, n_classes
@@ -760,10 +766,17 @@ def _sigmoid_calibration(predictions, y, sample_weight=None):
 
     F = predictions  # F follows Platt's notations
 
-    # Bayesian priors (see Platt end of section 2.2)
-    prior0 = float(np.sum(y <= 0))
-    prior1 = y.shape[0] - prior0
-    T = np.zeros(y.shape)
+    # Bayesian priors (see Platt end of section 2.2):
+    # They correspond to the number of samples, taking into account the
+    # `sample_weight`.
+    mask_negative_samples = y <= 0
+    if sample_weight is not None:
+        prior0 = (sample_weight[mask_negative_samples]).sum()
+        prior1 = (sample_weight[~mask_negative_samples]).sum()
+    else:
+        prior0 = float(np.sum(mask_negative_samples))
+        prior1 = y.shape[0] - prior0
+    T = np.zeros_like(y)
     T[y > 0] = (prior1 + 1.0) / (prior1 + 2.0)
     T[y <= 0] = 1.0 / (prior0 + 2.0)
     T1 = 1.0 - T

From 017bfe28ff25d310be4d406604fed28a92eb48d9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 28 Sep 2021 14:39:11 +0200
Subject: [PATCH 03/10] iter

---
 sklearn/tests/test_calibration.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index d59836dd68229..24a6008857508 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -788,13 +788,18 @@ def test_calibration_display_ref_line(pyplot, iris_data_binary):
 
 @pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
 @pytest.mark.parametrize("ensemble", [True, False])
-def test_calibrated_classifier_cv_sample_weights_equivalence(data, method, ensemble):
-    from sklearn.datasets import load_iris
-
+def test_calibrated_classifier_cv_sample_weights_equivalence(method, ensemble):
+    """Check that repeating the dataset `X` twice is equivalent to passing
+    a `sample_weight` with a factor of 2."""
     X, y = load_iris(return_X_y=True)
+    # Scale the data to avoid any convergence issue
+    X = StandardScaler().fit_transform(X)
+    # Only use 2 classes
     X, y = X[:100], y[:100]
     sample_weight = np.ones_like(y) * 2
 
+    # Interleave the data such that a 2-fold cross-validation will be
+    # equivalent to using the original dataset with sample weights of 2
     X_twice = np.zeros((X.shape[0] * 2, X.shape[1]), dtype=X.dtype)
     X_twice[::2, :] = X
     X_twice[1::2, :] = X
@@ -824,6 +829,7 @@ def test_calibrated_classifier_cv_sample_weights_equivalence(data, method, ensem
             est_without_weights.base_estimator.coef_,
         )
 
+    # Check that the predictions are the same
     y_pred_with_weights = calibrated_clf_with_weights.predict_proba(X)
     y_pred_without_weights = calibrated_clf_without_weights.predict_proba(X)
 
     assert_allclose(y_pred_with_weights, y_pred_without_weights)

From caed3bb302f1df240e02d2522ca5da99ee9dc90f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 28 Sep 2021 14:44:26 +0200
Subject: [PATCH 04/10] iter

---
 sklearn/calibration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 41b8fd3e973a3..d376af61c1f3d 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -776,7 +776,7 @@ def _sigmoid_calibration(predictions, y, sample_weight=None):
     else:
         prior0 = float(np.sum(mask_negative_samples))
         prior1 = y.shape[0] - prior0
-    T = np.zeros_like(y)
+    T = np.zeros_like(y, dtype=np.float64)
     T[y > 0] = (prior1 + 1.0) / (prior1 + 2.0)
     T[y <= 0] = 1.0 / (prior0 + 2.0)
     T1 = 1.0 - T

From 2c844b311a88fe151f48998eca78fead922c1b66 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 28 Sep 2021 15:01:01 +0200
Subject: [PATCH 05/10] iter

---
 sklearn/calibration.py            |  4 ++-
 sklearn/tests/test_calibration.py | 46 ++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index d376af61c1f3d..a68bc6cdaf1c5 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -442,7 +442,9 @@ def _more_tags(self):
         return {
             "_xfail_checks": {
                 "check_sample_weights_invariance": (
-                    "zero sample_weight is not equivalent to removing samples"
+                    "Due to the cross-validation and sample ordering, removing a sample"
+                    " is not strictly equal to putting its weight to zero. Specific unit"
+                    " tests are added for CalibratedClassifierCV."
                 ),
             }
         }
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 24a6008857508..7b8d656bef939 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -788,7 +788,7 @@ def test_calibration_display_ref_line(pyplot, iris_data_binary):
 
 @pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
 @pytest.mark.parametrize("ensemble", [True, False])
-def test_calibrated_classifier_cv_sample_weights_equivalence(method, ensemble):
+def test_calibrated_classifier_cv_double_sample_weights_equivalence(method, ensemble):
     """Check that repeating the dataset `X` twice is equivalent to passing
     a `sample_weight` with a factor of 2."""
     X, y = load_iris(return_X_y=True)
@@ -834,3 +834,47 @@ def test_calibrated_classifier_cv_sample_weights_equivalence(method, ensemble):
     y_pred_without_weights = calibrated_clf_without_weights.predict_proba(X)
 
     assert_allclose(y_pred_with_weights, y_pred_without_weights)
+
+
+@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
+@pytest.mark.parametrize("ensemble", [True, False])
+def test_calibrated_classifier_cv_zeros_sample_weights_equivalence(method, ensemble):
+    """Check that removing some samples from the dataset `X` is equivalent
+    to passing a `sample_weight` of 0 for those samples."""
+    X, y = load_iris(return_X_y=True)
+    # Scale the data to avoid any convergence issue
+    X = StandardScaler().fit_transform(X)
+    # Only use 2 classes and select samples such that a 2-fold
+    # cross-validation split will be equivalent to using a `sample_weight` of 0
+    X = np.vstack((X[:40], X[50:90]))
+    y = np.hstack((y[:40], y[50:90]))
+    sample_weight = np.zeros_like(y)
+    sample_weight[::2] = 1
+
+    base_estimator = LogisticRegression()
+    calibrated_clf_without_weights = CalibratedClassifierCV(
+        base_estimator,
+        method=method,
+        ensemble=ensemble,
+        cv=2,
+    )
+    calibrated_clf_with_weights = clone(calibrated_clf_without_weights)
+
+    calibrated_clf_with_weights.fit(X, y, sample_weight=sample_weight)
+    calibrated_clf_without_weights.fit(X[::2], y[::2])
+
+    # Check that the underlying fitted estimators have the same coefficients
+    for est_with_weights, est_without_weights in zip(
+        calibrated_clf_with_weights.calibrated_classifiers_,
+        calibrated_clf_without_weights.calibrated_classifiers_,
+    ):
+        assert_allclose(
+            est_with_weights.base_estimator.coef_,
+            est_without_weights.base_estimator.coef_,
+        )
+
+    # Check that the predictions are the same
+    y_pred_with_weights = calibrated_clf_with_weights.predict_proba(X)
+    y_pred_without_weights = calibrated_clf_without_weights.predict_proba(X)
+
+    assert_allclose(y_pred_with_weights, y_pred_without_weights)

From bec6ed424ca21b18d5b5fbbc410f5f813882369e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 28 Sep 2021 15:07:46 +0200
Subject: [PATCH 06/10] iter

---
 doc/whats_new/v1.0.rst | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index d8776653cd9e8..4db2b097d2e75 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -2,6 +2,29 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_1_0_1:
+
+Version 1.0.1
+=============
+
+**In Development**
+
+Changelog
+---------
+
+:mod:`sklearn.calibration`
+..........................
+
+- |Fix| Fixed :class:`calibration.CalibratedClassifierCV` to take into account
+  `sample_weight` when computing the base estimator prediction when
+  `ensemble=False`.
+  :pr:`20638` by :user:`Julien Bohné <JulienB-78>`.
+
+- |Fix| Fixed a bug in :class:`calibration.CalibratedClassifierCV` with
+  `method="sigmoid"` that was ignoring the `sample_weight` when computing
+  the Bayesian priors.
+  :pr:`21179` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 .. _changes_1_0:
 
 Version 1.0.0

From 2a71374b5dd7a25f2d1d4add8ffb6620fd87314b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 29 Sep 2021 18:24:27 +0200
Subject: [PATCH 07/10] Grant co-authorship to Julien

Co-authored-by: JulienB-78

From e5a406fab02d1e1a05c01c4306ac7edc2cef3126 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 29 Sep 2021 18:42:38 +0200
Subject: [PATCH 08/10] iter

---
 sklearn/calibration.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index a68bc6cdaf1c5..ce4ef83637b4b 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -308,9 +308,12 @@ def fit(self, X, y, sample_weight=None):
         if sample_weight is not None and not supports_sw:
             estimator_name = type(base_estimator).__name__
             warnings.warn(
-                f"Since {estimator_name} does not support "
-                "sample_weights, sample weights will only be"
-                " used for the calibration itself."
+                f"Since {estimator_name} does not appear to accept sample_weight, "
+                "sample weights will only be used for the calibration itself. This "
+                "can be caused by a limitation of the current scikit-learn API. "
+                "See the following issue for more details: "
+                "https://github.com/scikit-learn/scikit-learn/issues/21134. Be "
+                "warn that the result of the calibration is likely to be incorrect."
             )
 
         # Check that each cross-validation fold can have at least one

From c1a18be0e0a04e4384aa03a1c487589e201b01ab Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 30 Sep 2021 09:55:29 +0200
Subject: [PATCH 09/10] Update sklearn/calibration.py

Co-authored-by: Olivier Grisel
---
 sklearn/calibration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index ce4ef83637b4b..b51862fe70518 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -313,7 +313,7 @@ def fit(self, X, y, sample_weight=None):
                 "can be caused by a limitation of the current scikit-learn API. "
                 "See the following issue for more details: "
                 "https://github.com/scikit-learn/scikit-learn/issues/21134. Be "
-                "warn that the result of the calibration is likely to be incorrect."
+                "warned that the result of the calibration is likely to be incorrect."

From f7ed498e61adde0e490a8cd01889b6b5a2d9b521 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 30 Sep 2021 10:08:56 +0200
Subject: [PATCH 10/10] pep8

---
 sklearn/calibration.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index b51862fe70518..0785938135513 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -313,7 +313,8 @@ def fit(self, X, y, sample_weight=None):
                 "can be caused by a limitation of the current scikit-learn API. "
                 "See the following issue for more details: "
                 "https://github.com/scikit-learn/scikit-learn/issues/21134. Be "
-                "warned that the result of the calibration is likely to be incorrect."
+                "warned that the result of the calibration is likely to be "
+                "incorrect."
             )
 
         # Check that each cross-validation fold can have at least one
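
Note: the crux of PATCH 02 is that Platt's Bayesian priors become weighted
counts once `sample_weight` is taken into account. Below is a minimal,
self-contained sketch of that computation (`platt_priors` is a hypothetical
helper mirroring the `_sigmoid_calibration` hunk, not a scikit-learn API). It
shows why doubling every weight is equivalent to repeating every sample twice,
which is what the tests added in PATCH 01, 03 and 05 assert end to end:

    import numpy as np

    def platt_priors(y, sample_weight=None):
        """Platt's (prior0, prior1): (weighted) counts of neg/pos samples."""
        mask_negative = y <= 0
        if sample_weight is not None:
            prior0 = sample_weight[mask_negative].sum()
            prior1 = sample_weight[~mask_negative].sum()
        else:
            prior0 = float(np.sum(mask_negative))
            prior1 = y.shape[0] - prior0
        return prior0, prior1

    y = np.array([0, 0, 1, 1, 1])
    weights = np.full(y.shape, 2.0)
    y_twice = np.repeat(y, 2)

    # Doubled weights and duplicated samples yield identical priors, hence an
    # identical sigmoid calibration fit.
    assert platt_priors(y, weights) == platt_priors(y_twice)

The same reasoning covers the zero-weight test: samples with a weight of 0
contribute nothing to either prior, just as if they had been removed.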