From 8486ccbf0b0d4e0e4b84c98dfd50d74d80b35124 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Mar 2019 18:08:14 -0400 Subject: [PATCH 1/7] GBDT init now supports pipelines (incompatible with sample weight) --- sklearn/ensemble/gradient_boosting.py | 8 +++++++- sklearn/ensemble/tests/test_gradient_boosting.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index ba089fc30cc3b..158e8da73184c 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1445,6 +1445,12 @@ def fit(self, X, y, sample_weight=None, monitor=None): if sample_weight_is_none: sample_weight = np.ones(n_samples, dtype=np.float32) else: + from sklearn.pipeline import Pipeline + if isinstance(self.init, Pipeline): + raise ValueError( + 'The init estimator is a pipeline, ' + 'pipelines do not support sample_weight.' + ) sample_weight = column_or_1d(sample_weight, warn=True) sample_weight_is_none = False @@ -1484,7 +1490,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): else: try: self.init_.fit(X, y, sample_weight=sample_weight) - except TypeError: + except (TypeError, ValueError): if sample_weight_is_none: self.init_.fit(X, y) else: diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index dc9ec0c2338d0..d054d2c797b17 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -39,6 +39,8 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import NotFittedError from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import LinearRegression GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, @@ -1366,6 +1368,20 @@ def test_gradient_boosting_with_init(gb, dataset_maker, init_estimator): gb(init=init_est).fit(X, y, sample_weight=sample_weight) +def test_gradient_boosting_with_init_pipeline(): + # Check that the init estimator can be a pipeline (see issue #13466) + + X, y = make_regression(random_state=0) + init = make_pipeline(LinearRegression()) + gb = GradientBoostingRegressor(init=init) + gb.fit(X, y) + + with pytest.raises( + ValueError, + match='The init estimator is a pipeline, pipelines do not support'): + gb.fit(X, y, sample_weight=np.ones(X.shape[0])) + + @pytest.mark.parametrize('estimator, missing_method', [ (GradientBoostingClassifier(init=LinearSVC()), 'predict_proba'), (GradientBoostingRegressor(init=OneHotEncoder()), 'predict') From e4e6de735ff869c6b60d39e97593f347af53202c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Mar 2019 18:12:42 -0400 Subject: [PATCH 2/7] better handling --- sklearn/ensemble/gradient_boosting.py | 6 ------ sklearn/ensemble/tests/test_gradient_boosting.py | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 158e8da73184c..edc1ad3ef4c12 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1445,12 +1445,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): if sample_weight_is_none: sample_weight = np.ones(n_samples, dtype=np.float32) else: - from sklearn.pipeline import Pipeline - if isinstance(self.init, Pipeline): - raise ValueError( - 'The init estimator is a pipeline, ' - 'pipelines do not support sample_weight.' 
- ) sample_weight = column_or_1d(sample_weight, warn=True) sample_weight_is_none = False diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index d054d2c797b17..389e45fec6b72 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1377,8 +1377,9 @@ def test_gradient_boosting_with_init_pipeline(): gb.fit(X, y) with pytest.raises( - ValueError, - match='The init estimator is a pipeline, pipelines do not support'): + ValueError, + match='The initial estimator Pipeline does not support sample ' + 'weights'): gb.fit(X, y, sample_weight=np.ones(X.shape[0])) From 050366f952ee97313f19f836b52b407a637bab71 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 21 Mar 2019 21:04:54 -0400 Subject: [PATCH 3/7] pass through exception from failed input checking of init estimator --- sklearn/ensemble/gradient_boosting.py | 24 +++++++++++-------- .../ensemble/tests/test_gradient_boosting.py | 12 ++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index edc1ad3ef4c12..172e898332ed6 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1482,20 +1482,24 @@ def fit(self, X, y, sample_weight=None, monitor=None): raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64) else: - try: - self.init_.fit(X, y, sample_weight=sample_weight) - except (TypeError, ValueError): - if sample_weight_is_none: - self.init_.fit(X, y) - else: - raise ValueError( - "The initial estimator {} does not support sample " - "weights.".format(self.init_.__class__.__name__)) + if sample_weight_is_none: + self.init_.fit(X, y) + else: + msg = ("The initial estimator {} does not support sample " + "weights.".format(self.init_.__class__.__name__)) + try: + self.init_.fit(X, y, sample_weight=sample_weight) + except TypeError: # regular estimator without SW support + raise ValueError(msg) + except ValueError as e: + if 'not enough values to unpack': # pipeline + raise ValueError(msg) + else: # regular estimator whose input checking failed + raise e raw_predictions = \ self.loss_.get_init_raw_predictions(X, self.init_) - begin_at_stage = 0 # The rng state must be preserved if warm_start is True diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 389e45fec6b72..e6b0138564b63 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -41,6 +41,7 @@ from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.pipeline import make_pipeline from sklearn.linear_model import LinearRegression +from sklearn.svm import NuSVR GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, @@ -1382,6 +1383,17 @@ def test_gradient_boosting_with_init_pipeline(): 'weights'): gb.fit(X, y, sample_weight=np.ones(X.shape[0])) + # Passing sample_weight to a pipeline raises a ValueError. This test makes + # sure we make the distinction between ValueError raised by a pipeline that + # was passes sample_weight, or by a regular estimator whose input checking + # failed. 
+ with pytest.raises( + ValueError, + match='nu <= 0 or nu > 1'): + # Note that NuSVR properly supports sample_weight + est = NuSVR(gamma='auto', nu=1.5) + est.fit(X, y, sample_weight=np.ones(X.shape[0])) + @pytest.mark.parametrize('estimator, missing_method', [ (GradientBoostingClassifier(init=LinearSVC()), 'predict_proba'), From 3801a1e41ad966275993b94b5d19324fe973ebaf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 21 Mar 2019 21:27:47 -0400 Subject: [PATCH 4/7] actually correct fix --- sklearn/ensemble/gradient_boosting.py | 2 +- sklearn/ensemble/tests/test_gradient_boosting.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 172e898332ed6..eef8dc53e9e41 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1492,7 +1492,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): except TypeError: # regular estimator without SW support raise ValueError(msg) except ValueError as e: - if 'not enough values to unpack': # pipeline + if 'not enough values to unpack' in str(e): # pipeline raise ValueError(msg) else: # regular estimator whose input checking failed raise e diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index e6b0138564b63..19e9a2239b1c9 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1375,7 +1375,7 @@ def test_gradient_boosting_with_init_pipeline(): X, y = make_regression(random_state=0) init = make_pipeline(LinearRegression()) gb = GradientBoostingRegressor(init=init) - gb.fit(X, y) + gb.fit(X, y) # pipeline without sample_weight works fine with pytest.raises( ValueError, @@ -1391,8 +1391,9 @@ def test_gradient_boosting_with_init_pipeline(): ValueError, match='nu <= 0 or nu > 1'): # Note that NuSVR properly supports sample_weight - est = NuSVR(gamma='auto', nu=1.5) - est.fit(X, y, sample_weight=np.ones(X.shape[0])) + init = NuSVR(gamma='auto', nu=1.5) + gb = GradientBoostingRegressor(init=init) + gb.fit(X, y, sample_weight=np.ones(X.shape[0])) @pytest.mark.parametrize('estimator, missing_method', [ From 2cb1bb028b1e4eaec582a80a8f0cd8c648029f40 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 21 Mar 2019 21:28:57 -0400 Subject: [PATCH 5/7] typos --- sklearn/ensemble/tests/test_gradient_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 19e9a2239b1c9..622172be313f2 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1385,8 +1385,8 @@ def test_gradient_boosting_with_init_pipeline(): # Passing sample_weight to a pipeline raises a ValueError. This test makes # sure we make the distinction between ValueError raised by a pipeline that - # was passes sample_weight, or by a regular estimator whose input checking - # failed. + # was passed sample_weight, and a ValueError raised by a regular estimator + # whose input checking failed. 
with pytest.raises( ValueError, match='nu <= 0 or nu > 1'): From b7f7f9a199d5f49a4d6b62a7be7ef2b154a6f83a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 22 Mar 2019 10:54:20 -0400 Subject: [PATCH 6/7] Used raise from and remove raise e --- sklearn/ensemble/gradient_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index eef8dc53e9e41..b0a3da23020dd 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1493,9 +1493,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): raise ValueError(msg) except ValueError as e: if 'not enough values to unpack' in str(e): # pipeline - raise ValueError(msg) + raise ValueError(msg) from e else: # regular estimator whose input checking failed - raise e + raise raw_predictions = \ self.loss_.get_init_raw_predictions(X, self.init_) From d1ffe54db1f398874611dc8ae10c18d224b8bbfd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 13:32:59 -0400 Subject: [PATCH 7/7] Added XXX comment --- sklearn/ensemble/gradient_boosting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index b0a3da23020dd..13c2b60272703 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1482,6 +1482,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64) else: + # XXX clean this once we have a support_sample_weight tag if sample_weight_is_none: self.init_.fit(X, y) else: