From 36eb7b938e458d556743b0d9a809730e0429cb1f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 09:47:25 -0500 Subject: [PATCH 1/9] Used stratified splits for early stopping in GBDT and MLP --- doc/whats_new/v0.21.rst | 12 ++++++++++-- sklearn/ensemble/gradient_boosting.py | 5 ++++- sklearn/ensemble/tests/test_gradient_boosting.py | 16 ++++++++++++++-- sklearn/neural_network/multilayer_perceptron.py | 4 +++- sklearn/neural_network/tests/test_mlp.py | 13 +++++++++++++ 5 files changed, 44 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 7355f75b83d4e..751f489d296c6 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -22,8 +22,8 @@ random sampling procedures. `max_leaf_nodes` are set. |Fix| - :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` with 'saga' solver. |Fix| -- :class:`ensemble.GradientBoostingClassifier` for multiclass - classification. |Fix| +- :class:`ensemble.GradientBoostingClassifier` |Fix| +- :class:`neural_network.MLPClassifier` |Fix| Details are listed in the changelog below. @@ -109,6 +109,10 @@ Support for Python 3.4 and below has been officially dropped. the gradients would be incorrectly computed in multiclass classification problems. :issue:`12715` by :user:`Nicolas Hug`. +- |Fix| Early stopping is now checked on a stratified split for + :class:`ensemble.GradientBoostingClassifier`. + :issue:`TODO` by :user:`Nicolas Hug`. + - |Fix| Fixed a bug in :mod:`ensemble` where the ``predict`` method would error for multiclass multioutput forests models if any targets were strings. :issue:`12834` by :user:`Elizabeth Sander `. @@ -237,6 +241,10 @@ Support for Python 3.4 and below has been officially dropped. :class:`neural_network.MLPRegressor` where the option :code:`shuffle=False` was being ignored. :issue:`12582` by :user:`Sam Waterbury `. 
+- |Fix| Early stopping is now checked on a stratified split for + :class:`neural_network.MLPClassifier`. + :issue:`TODO` by :user:`Nicolas Hug`. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 413cc8a5ad3fd..be963fcc663f9 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -26,6 +26,7 @@ from .base import BaseEnsemble from ..base import ClassifierMixin from ..base import RegressorMixin +from ..base import is_classifier from ._gradient_boosting import predict_stages from ._gradient_boosting import predict_stage @@ -1406,10 +1407,12 @@ def fit(self, X, y, sample_weight=None, monitor=None): y = self._validate_y(y, sample_weight) if self.n_iter_no_change is not None: + stratify = y if is_classifier(self) else None X, X_val, y, y_val, sample_weight, sample_weight_val = ( train_test_split(X, y, sample_weight, random_state=self.random_state, - test_size=self.validation_fraction)) + test_size=self.validation_fraction, + stratify=stratify)) else: X_val = y_val = sample_weight_val = None diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 0bcb036610f5f..f95721f7d4de3 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1270,8 +1270,8 @@ def test_gradient_boosting_early_stopping(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # Check if early_stopping works as expected - for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13), - (gbc, 1e-3, 36), + for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 28), (gbr, 1e-1, 13), + (gbc, 1e-3, 70), (gbr, 1e-3, 28)): est.set_params(tol=tol) est.fit(X_train, y_train) @@ -1324,3 +1324,15 @@ def test_gradient_boosting_validation_fraction(): gbr3.fit(X_train, y_train) assert gbr.n_estimators_ < gbr3.n_estimators_ 
assert gbc.n_estimators_ < gbc3.n_estimators_ + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + gbc = GradientBoostingClassifier(n_iter_no_change=5) + with pytest.raises( + ValueError, + match='The least populated class in y has only 1 member'): + gbc.fit(X, y) diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index f567b3a487775..a4906a8b57757 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -484,9 +484,11 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, # early_stopping in partial_fit doesn't make sense early_stopping = self.early_stopping and not incremental if early_stopping: + stratify = y if is_classifier(self) else None X, X_val, y, y_val = train_test_split( X, y, random_state=self._random_state, - test_size=self.validation_fraction) + test_size=self.validation_fraction, + stratify=stratify) if is_classifier(self): y_val = self._label_binarizer.inverse_transform(y_val) else: diff --git a/sklearn/neural_network/tests/test_mlp.py b/sklearn/neural_network/tests/test_mlp.py index 36d2bc5db3077..4a3647cf0d795 100644 --- a/sklearn/neural_network/tests/test_mlp.py +++ b/sklearn/neural_network/tests/test_mlp.py @@ -9,6 +9,7 @@ import warnings import numpy as np +import pytest from numpy.testing import assert_almost_equal, assert_array_equal @@ -661,3 +662,15 @@ def test_n_iter_no_change_inf(): # validate _update_no_improvement_count() was always triggered assert_equal(clf._no_improvement_count, clf.n_iter_ - 1) + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + gbc = MLPClassifier(early_stopping=True) + with pytest.raises( + ValueError, + match='The least populated class in y has only 1 
member'): + gbc.fit(X, y) From 966c6bcc90d9bc89684d594023229c7bcb27b4d5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 09:54:18 -0500 Subject: [PATCH 2/9] Added PR number in whatsnew --- doc/whats_new/v0.21.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 751f489d296c6..d1956760332c8 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -111,7 +111,7 @@ Support for Python 3.4 and below has been officially dropped. - |Fix| Early stopping is now checked on a stratified split for :class:`ensemble.GradientBoostingClassifier`. - :issue:`TODO` by :user:`Nicolas Hug`. + :issue:`13164` by :user:`Nicolas Hug`. - |Fix| Fixed a bug in :mod:`ensemble` where the ``predict`` method would error for multiclass multioutput forests models if any targets were strings. @@ -243,7 +243,7 @@ Support for Python 3.4 and below has been officially dropped. - |Fix| Early stopping is now checked on a stratified split for :class:`neural_network.MLPClassifier`. - :issue:`TODO` by :user:`Nicolas Hug`. + :issue:`13164` by :user:`Nicolas Hug`. :mod:`sklearn.pipeline` ....................... 
From 6caae0f10b2942a00807ae28431cf28c1033a072 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 10:51:31 -0500 Subject: [PATCH 3/9] fix ident issue git pus --- sklearn/ensemble/tests/test_gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index f95721f7d4de3..dd8d43ab5987a 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1335,4 +1335,4 @@ def test_early_stopping_stratified(): with pytest.raises( ValueError, match='The least populated class in y has only 1 member'): - gbc.fit(X, y) + gbc.fit(X, y) From 8037436e98cc30b2bf878cdd30053429aa2fef54 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Feb 2019 07:19:44 -0500 Subject: [PATCH 4/9] Don't stratify multilabel MLP --- sklearn/neural_network/multilayer_perceptron.py | 4 +++- sklearn/neural_network/tests/test_mlp.py | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index a4906a8b57757..34bfa6146f9f9 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -484,7 +484,9 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, # early_stopping in partial_fit doesn't make sense early_stopping = self.early_stopping and not incremental if early_stopping: - stratify = y if is_classifier(self) else None + # don't stratify in multilabel classification + should_stratify = is_classifier(self) and self.n_outputs_ == 1 + stratify = y if should_stratify else None X, X_val, y, y_val = train_test_split( X, y, random_state=self._random_state, test_size=self.validation_fraction, diff --git a/sklearn/neural_network/tests/test_mlp.py b/sklearn/neural_network/tests/test_mlp.py index 4a3647cf0d795..2b685d2fd0e59 
100644 --- a/sklearn/neural_network/tests/test_mlp.py +++ b/sklearn/neural_network/tests/test_mlp.py @@ -308,6 +308,11 @@ def test_multilabel_classification(): mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4]) assert_greater(mlp.score(X, y), 0.9) + # Make sure early stopping still works now that splitting is stratified by + # default (it is disabled for multilabel classification) + mlp = MLPClassifier(early_stopping=True) + mlp.fit(X, y).predict(X) + def test_multioutput_regression(): # Test that multi-output regression works as expected @@ -669,8 +674,8 @@ def test_early_stopping_stratified(): X = [[1, 2], [2, 3], [3, 4], [4, 5]] y = [0, 0, 0, 1] - gbc = MLPClassifier(early_stopping=True) + mlp = MLPClassifier(early_stopping=True) with pytest.raises( ValueError, match='The least populated class in y has only 1 member'): - gbc.fit(X, y) + mlp.fit(X, y) From a5bade02a906ec3c4b74a06b84bfa62c1ec4ebc4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Feb 2019 07:21:19 -0500 Subject: [PATCH 5/9] updated whatsnew --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index d1956760332c8..4a3fbf80c271f 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -242,7 +242,7 @@ Support for Python 3.4 and below has been officially dropped. was being ignored. :issue:`12582` by :user:`Sam Waterbury `. - |Fix| Early stopping is now checked on a stratified split for - :class:`neural_network.MLPClassifier`. + :class:`neural_network.MLPClassifier` (except in the multilabel case). :issue:`13164` by :user:`Nicolas Hug`.
:mod:`sklearn.pipeline` From 61d025553a482deffda7e591bd564bef3e6f481e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 08:44:45 -0400 Subject: [PATCH 6/9] should fix test --- sklearn/ensemble/tests/test_gradient_boosting.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 3d9fb16a2de23..5f6a8cee037dc 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1393,19 +1393,20 @@ def test_gradient_boosting_init_wrong_methods(estimator, missing_method): def test_early_stopping_n_classes(): - # when doing early stopping (_, y_train, _, _ = train_test_split(X, y)) + # when doing early stopping (_, _, y_train, _ = train_test_split(X, y)) # there might be classes in y that are missing in y_train. As the init # estimator will be trained on y_train, we need to raise an error if this # happens.
- X = [[1, 2], [2, 3], [3, 4], [4, 5]] - y = [0, 1, 1, 1] - gb = GradientBoostingClassifier(n_iter_no_change=5, random_state=4) + X = [[1]] * 10 + y = [0, 0] + [1] * 8 # only 2 negative samples out of 10 + gb = GradientBoostingClassifier(n_iter_no_change=5, random_state=0, + validation_fraction=8) with pytest.raises( ValueError, match='The training data after the early stopping split'): gb.fit(X, y) - # No error with another random seed - gb = GradientBoostingClassifier(n_iter_no_change=5, random_state=0) - gb.fit(X, y) + # No error if we let training data be big enough + gb = GradientBoostingClassifier(n_iter_no_change=5, random_state=0, + validation_fraction=4) From 5a97311dc8f6a07522a51cc0fffdf953a2056cb2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 08:48:39 -0400 Subject: [PATCH 7/9] Updated whatnew according to comments --- doc/whats_new/v0.21.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index a0832df625231..bb09f2caa4d65 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -183,8 +183,8 @@ Support for Python 3.4 and below has been officially dropped. the gradients would be incorrectly computed in multiclass classification problems. :issue:`12715` by :user:`Nicolas Hug`. -- |Fix| Early stopping is now checked on a stratified split for - :class:`ensemble.GradientBoostingClassifier`. +- |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where + validation sets for early stopping were not sampled with stratification. :issue:`13164` by :user:`Nicolas Hug`. - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where @@ -426,8 +426,9 @@ Support for Python 3.4 and below has been officially dropped. :class:`neural_network.MLPRegressor` where the option :code:`shuffle=False` was being ignored. :issue:`12582` by :user:`Sam Waterbury `.
-- |Fix| Early stopping is now checked on a stratified split for - :class:`neural_network.MLPClassifier` (except in the multilabel case). +- |Fix| Fixed a bug in :class:`neural_network.MLPClassifier` where + validation sets for early stopping were not sampled with stratification. In + the multilabel case, however, splits are still not stratified. :issue:`13164` by :user:`Nicolas Hug`. :mod:`sklearn.pipeline` From a698ade7beb2280eb56e33722ec379ef6476ec9c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 08:54:04 -0400 Subject: [PATCH 8/9] Update early_stopping (or n_iter_no_change) doc to mention stratification --- sklearn/ensemble/gradient_boosting.py | 2 +- sklearn/linear_model/passive_aggressive.py | 8 ++++---- sklearn/linear_model/perceptron.py | 4 ++-- sklearn/linear_model/stochastic_gradient.py | 8 ++++---- sklearn/neural_network/multilayer_perceptron.py | 3 ++- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index d83bc018d28fb..c64c2a3273b9f 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1935,7 +1935,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): number, it will set aside ``validation_fraction`` size of the training data as validation and terminate training when validation score is not improving in all of the previous ``n_iter_no_change`` numbers of - iterations. + iterations. The split is stratified. .. versionadded:: 0.20 diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py index f475f0da98a6f..877080cab6561 100644 --- a/sklearn/linear_model/passive_aggressive.py +++ b/sklearn/linear_model/passive_aggressive.py @@ -37,8 +37,8 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): early_stopping : bool, default=False Whether to use early stopping to terminate training when validation. score is not improving.
If set to True, it will automatically set aside - a fraction of training data as validation and terminate training when - validation score is not improving by at least tol for + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 @@ -282,8 +282,8 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): early_stopping : bool, default=False Whether to use early stopping to terminate training when validation. score is not improving. If set to True, it will automatically set aside - a fraction of training data as validation and terminate training when - validation score is not improving by at least tol for + a fraction of training data as validation and terminate + training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 diff --git a/sklearn/linear_model/perceptron.py b/sklearn/linear_model/perceptron.py index 7c6c6bf6a268d..2bf7899069864 100644 --- a/sklearn/linear_model/perceptron.py +++ b/sklearn/linear_model/perceptron.py @@ -62,8 +62,8 @@ class Perceptron(BaseSGDClassifier): early_stopping : bool, default=False Whether to use early stopping to terminate training when validation. score is not improving. If set to True, it will automatically set aside - a fraction of training data as validation and terminate training when - validation score is not improving by at least tol for + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs. .. 
versionadded:: 0.20 diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index 36d2226fb0f6d..3e33e59588117 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -828,8 +828,8 @@ class SGDClassifier(BaseSGDClassifier): early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set aside - a fraction of training data as validation and terminate training when - validation score is not improving by at least tol for + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 @@ -1433,8 +1433,8 @@ class SGDRegressor(BaseSGDRegressor): early_stopping : bool, default=False Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set aside - a fraction of training data as validation and terminate training when - validation score is not improving by at least tol for + a fraction of training data as validation and terminate + training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index 34bfa6146f9f9..8a5469df54897 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -807,7 +807,8 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin): score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for - ``n_iter_no_change`` consecutive epochs. 
+ ``n_iter_no_change`` consecutive epochs. The split is stratified, + except in a multilabel setting. Only effective when solver='sgd' or 'adam' validation_fraction : float, optional, default 0.1 From 9e7304da9823332612c9f0e1bf952215b37c1466 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 08:55:09 -0400 Subject: [PATCH 9/9] removed double import --- sklearn/neural_network/tests/test_mlp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/neural_network/tests/test_mlp.py b/sklearn/neural_network/tests/test_mlp.py index 58f718ad1ef3c..036329139e58c 100644 --- a/sklearn/neural_network/tests/test_mlp.py +++ b/sklearn/neural_network/tests/test_mlp.py @@ -10,7 +10,6 @@ import warnings import numpy as np -import pytest from numpy.testing import assert_almost_equal, assert_array_equal