From e49b3aa04f9ada174f2718e9610dd464d988c148 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Fri, 11 Feb 2022 14:35:03 -0600 Subject: [PATCH 01/18] meat finished but not checked --- sklearn/linear_model/_glm/glm.py | 42 +++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d7af8ae60d8b6..f56381e6db980 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -13,7 +13,7 @@ from ...base import BaseEstimator, RegressorMixin from ...utils.optimize import _check_optimize_result -from ...utils import check_scalar +from ...utils import check_scalar, check_array from ...utils.validation import check_is_fitted, _check_sample_weight from ..._loss.glm_distribution import ( ExponentialDispersionModel, @@ -68,12 +68,16 @@ class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Parameters ---------- - alpha : float, default=1 - Constant that multiplies the penalty term and thus determines the + alpha : {float, iterable}, default=1 + Constant(s) that multiplies the penalty term and thus determines the regularization strength. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). Values must be in the range `[0.0, inf)`. + If alpha is a scalar then the value is applied to all non-intercept terms + If alpha is an iterable then each value must be in the range `[0.0, inf)` + and the size of the iterable must be equal to the input design matrix. + If alpha is greater than 1 dimension it will be converted to 1 dimension. fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be @@ -214,13 +218,27 @@ def fit(self, X, y, sample_weight=None): "got (link={0})".format(self.link) ) - check_scalar( - self.alpha, - name="alpha", - target_type=numbers.Real, - min_val=0.0, - include_boundaries="left", - ) + if hasattr(self.alpha, '__iter__'): + self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() + if self.alpha.size != X.shape[1]: + raise ValueError(f'Alpha must have length equivalent to input array. Input array is shape {X.shape[1]}' + f'while alpha is of length {self.alpha.size}') + for i, val in enumerate(self.alpha): + check_scalar(val + , name=f'alpha index [{i}]' + , target_type=numbers.Real + , min_val=0. + , include_boundaries='left') + else: + check_scalar( + self.alpha, + name="alpha", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + self.alpha = np.full(X.shape[1], self.alpha) + if not isinstance(self.fit_intercept, bool): raise ValueError( "The argument fit_intercept must be bool; got {0}".format( @@ -279,7 +297,9 @@ def fit(self, X, y, sample_weight=None): family.__class__.__name__ ) ) - # TODO: if alpha=0 check that X is not rank deficient + # check that matrix is full rank when alpha = 0. + if np.max(self.alpha) == 0. and np.linalg.matrix_rank(X) < X.shape[1]: + raise ValueError('Design matrix must be full rank when alpha = 0.') # rescaling of sample_weight # From 6a0f4d4831ccfe0d6aa8b10da13cf9413222244f Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Mon, 14 Feb 2022 11:24:38 -0600 Subject: [PATCH 02/18] unit tests fixed and added --- sklearn/linear_model/_glm/glm.py | 44 ++++++++++----------- sklearn/linear_model/_glm/tests/test_glm.py | 28 ++++++++++++- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f56381e6db980..f4d8408f14b4a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -217,28 +217,6 @@ def fit(self, X, y, sample_weight=None): "an element of ['auto', 'identity', 'log']; " "got (link={0})".format(self.link) ) - - if hasattr(self.alpha, '__iter__'): - self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() - if self.alpha.size != X.shape[1]: - raise ValueError(f'Alpha must have length equivalent to input array. Input array is shape {X.shape[1]}' - f'while alpha is of length {self.alpha.size}') - for i, val in enumerate(self.alpha): - check_scalar(val - , name=f'alpha index [{i}]' - , target_type=numbers.Real - , min_val=0. - , include_boundaries='left') - else: - check_scalar( - self.alpha, - name="alpha", - target_type=numbers.Real, - min_val=0.0, - include_boundaries="left", - ) - self.alpha = np.full(X.shape[1], self.alpha) - if not isinstance(self.fit_intercept, bool): raise ValueError( "The argument fit_intercept must be bool; got {0}".format( @@ -286,7 +264,25 @@ def fit(self, X, y, sample_weight=None): y_numeric=True, multi_output=False, ) - + if hasattr(self.alpha, '__iter__') and not isinstance(self.alpha, str): + for i, val in enumerate(self.alpha): + check_scalar(val + , name=f'alpha at index {i}' + , target_type=numbers.Real + , min_val=0. + , include_boundaries='left') + self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() + if self.alpha.size != X.shape[1]: + raise ValueError(f'X width is {X.shape[1]} while alpha is of length {self.alpha.size}') + else: + check_scalar( + self.alpha, + name="alpha", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + self.alpha = np.full(X.shape[1], self.alpha) weights = _check_sample_weight(sample_weight, X) _, n_features = X.shape @@ -299,7 +295,7 @@ def fit(self, X, y, sample_weight=None): ) # check that matrix is full rank when alpha = 0. if np.max(self.alpha) == 0. and np.linalg.matrix_rank(X) < X.shape[1]: - raise ValueError('Design matrix must be full rank when alpha = 0.') + raise ValueError('X must be full rank when alpha = 0.') # rescaling of sample_weight # diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 87fe2b51f4d28..991d82e63b715 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -195,6 +195,32 @@ def test_glm_scalar_argument(Estimator, params, err_type, err_msg): glm.fit(X, y) +@pytest.mark.parametrize( + "Estimator", + [GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor], +) +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"alpha": [1, '2']}, TypeError, "alpha at index 1 must be an instance of float, not str."), + ({"alpha": [1, 2, 3], "fit_intercept": True}, ValueError + , 'X width is 2 while alpha is of length 3'), + ({"alpha": [1, 2, 3], "fit_intercept": False}, ValueError + , 'X width is 2 while alpha is of length 3'), + ({"alpha": [-2, 2]}, ValueError, "alpha at index 0 == -2, must be >= 0.0"), + ({"alpha": [0., 0.]}, ValueError, "X must be full rank when alpha = 0."), + ] +) +def test_glm_alpha_array(Estimator, params, err_type, err_msg): + """Test GLM for invalid alpha input when alpha is an iterable""" + X = [[1,2], [2,4]] + y = [1, 2] + glm = Estimator(**params) + with pytest.raises(err_type, match=err_msg): + glm.fit(X, y) + + + @pytest.mark.parametrize("warm_start", ["not bool", 1, 0, [True]]) def test_glm_warm_start_argument(warm_start): """Test GLM for invalid warm_start argument.""" @@ -494,4 +520,4 @@ def test_tweedie_regression_family(regression_data): ], ) def test_tags(estimator, value): - assert estimator._get_tags()["requires_positive_y"] is value + assert estimator._get_tags()["requires_positive_y"] is value \ No newline at end of file From 83dde0e13bcd697e1dac3e24fe7042a89109571f Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Fri, 11 Feb 2022 14:35:03 -0600 Subject: [PATCH 03/18] meat finished but not checked --- sklearn/linear_model/_glm/glm.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f4d8408f14b4a..c34378d78320a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -217,6 +217,28 @@ def fit(self, X, y, sample_weight=None): "an element of ['auto', 'identity', 'log']; " "got (link={0})".format(self.link) ) + + if hasattr(self.alpha, '__iter__'): + self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() + if self.alpha.size != X.shape[1]: + raise ValueError(f'Alpha must have length equivalent to input array. Input array is shape {X.shape[1]}' + f'while alpha is of length {self.alpha.size}') + for i, val in enumerate(self.alpha): + check_scalar(val + , name=f'alpha index [{i}]' + , target_type=numbers.Real + , min_val=0. + , include_boundaries='left') + else: + check_scalar( + self.alpha, + name="alpha", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + self.alpha = np.full(X.shape[1], self.alpha) + if not isinstance(self.fit_intercept, bool): raise ValueError( "The argument fit_intercept must be bool; got {0}".format( @@ -295,7 +317,11 @@ def fit(self, X, y, sample_weight=None): ) # check that matrix is full rank when alpha = 0. if np.max(self.alpha) == 0. and np.linalg.matrix_rank(X) < X.shape[1]: +<<<<<<< HEAD raise ValueError('X must be full rank when alpha = 0.') +======= + raise ValueError('Design matrix must be full rank when alpha = 0.') +>>>>>>> 39b122d50 (meat finished but not checked) # rescaling of sample_weight # From 314105bef90e5a71005d9f92e4df929da4fdaef7 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Mon, 14 Feb 2022 11:24:38 -0600 Subject: [PATCH 04/18] unit tests fixed and added --- sklearn/linear_model/_glm/glm.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c34378d78320a..f4d8408f14b4a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -217,28 +217,6 @@ def fit(self, X, y, sample_weight=None): "an element of ['auto', 'identity', 'log']; " "got (link={0})".format(self.link) ) - - if hasattr(self.alpha, '__iter__'): - self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() - if self.alpha.size != X.shape[1]: - raise ValueError(f'Alpha must have length equivalent to input array. Input array is shape {X.shape[1]}' - f'while alpha is of length {self.alpha.size}') - for i, val in enumerate(self.alpha): - check_scalar(val - , name=f'alpha index [{i}]' - , target_type=numbers.Real - , min_val=0. - , include_boundaries='left') - else: - check_scalar( - self.alpha, - name="alpha", - target_type=numbers.Real, - min_val=0.0, - include_boundaries="left", - ) - self.alpha = np.full(X.shape[1], self.alpha) - if not isinstance(self.fit_intercept, bool): raise ValueError( "The argument fit_intercept must be bool; got {0}".format( @@ -317,11 +295,7 @@ def fit(self, X, y, sample_weight=None): ) # check that matrix is full rank when alpha = 0. if np.max(self.alpha) == 0. and np.linalg.matrix_rank(X) < X.shape[1]: -<<<<<<< HEAD raise ValueError('X must be full rank when alpha = 0.') -======= - raise ValueError('Design matrix must be full rank when alpha = 0.') ->>>>>>> 39b122d50 (meat finished but not checked) # rescaling of sample_weight # From df064c7f2a67110639dfbe61f43cc7d888372366 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Mon, 14 Feb 2022 11:40:44 -0600 Subject: [PATCH 05/18] pep8 and black --- sklearn/linear_model/_glm/glm.py | 23 +++++++++------- sklearn/linear_model/_glm/tests/test_glm.py | 29 ++++++++++++++------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f4d8408f14b4a..451d2a045d647 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -264,16 +264,21 @@ def fit(self, X, y, sample_weight=None): y_numeric=True, multi_output=False, ) - if hasattr(self.alpha, '__iter__') and not isinstance(self.alpha, str): + if hasattr(self.alpha, "__iter__") and not isinstance(self.alpha, str): for i, val in enumerate(self.alpha): - check_scalar(val - , name=f'alpha at index {i}' - , target_type=numbers.Real - , min_val=0. - , include_boundaries='left') + check_scalar( + val, + name=f"alpha at index {i}", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() if self.alpha.size != X.shape[1]: - raise ValueError(f'X width is {X.shape[1]} while alpha is of length {self.alpha.size}') + raise ValueError( + f"X width is {X.shape[1]} while alpha is of length" + f" {self.alpha.size}" + ) else: check_scalar( self.alpha, @@ -294,8 +299,8 @@ def fit(self, X, y, sample_weight=None): ) ) # check that matrix is full rank when alpha = 0. - if np.max(self.alpha) == 0. and np.linalg.matrix_rank(X) < X.shape[1]: - raise ValueError('X must be full rank when alpha = 0.') + if np.max(self.alpha) == 0.0 and np.linalg.matrix_rank(X) < X.shape[1]: + raise ValueError("X must be full rank when alpha = 0.") # rescaling of sample_weight # diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 991d82e63b715..f87d545a8e4e7 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -202,25 +202,34 @@ def test_glm_scalar_argument(Estimator, params, err_type, err_msg): @pytest.mark.parametrize( "params, err_type, err_msg", [ - ({"alpha": [1, '2']}, TypeError, "alpha at index 1 must be an instance of float, not str."), - ({"alpha": [1, 2, 3], "fit_intercept": True}, ValueError - , 'X width is 2 while alpha is of length 3'), - ({"alpha": [1, 2, 3], "fit_intercept": False}, ValueError - , 'X width is 2 while alpha is of length 3'), + ( + {"alpha": [1, "2"]}, + TypeError, + "alpha at index 1 must be an instance of float, not str.", + ), + ( + {"alpha": [1, 2, 3], "fit_intercept": True}, + ValueError, + "X width is 2 while alpha is of length 3", + ), + ( + {"alpha": [1, 2, 3], "fit_intercept": False}, + ValueError, + "X width is 2 while alpha is of length 3", + ), ({"alpha": [-2, 2]}, ValueError, "alpha at index 0 == -2, must be >= 0.0"), - ({"alpha": [0., 0.]}, ValueError, "X must be full rank when alpha = 0."), - ] + ({"alpha": [0.0, 0.0]}, ValueError, "X must be full rank when alpha = 0."), + ], ) def test_glm_alpha_array(Estimator, params, err_type, err_msg): """Test GLM for invalid alpha input when alpha is an iterable""" - X = [[1,2], [2,4]] + X = [[1, 2], [2, 4]] y = [1, 2] glm = Estimator(**params) with pytest.raises(err_type, match=err_msg): glm.fit(X, y) - @pytest.mark.parametrize("warm_start", ["not bool", 1, 0, [True]]) def test_glm_warm_start_argument(warm_start): """Test GLM for invalid warm_start argument.""" @@ -520,4 +529,4 @@ def test_tweedie_regression_family(regression_data): ], ) def test_tags(estimator, value): - assert estimator._get_tags()["requires_positive_y"] is value \ No newline at end of file + assert estimator._get_tags()["requires_positive_y"] is value From 223e9bb8e0322fcedecb3af99f0ae5b65386d756 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 15 Feb 2022 08:54:06 -0600 Subject: [PATCH 06/18] circleCI test and undid transformation of scalar alpha to ndarray --- sklearn/linear_model/_glm/glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 451d2a045d647..171ffb31555b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -13,7 +13,7 @@ from ...base import BaseEstimator, RegressorMixin from ...utils.optimize import _check_optimize_result -from ...utils import check_scalar, check_array +from ...utils import check_scalar from ...utils.validation import check_is_fitted, _check_sample_weight from ..._loss.glm_distribution import ( ExponentialDispersionModel, @@ -287,7 +287,6 @@ def fit(self, X, y, sample_weight=None): min_val=0.0, include_boundaries="left", ) - self.alpha = np.full(X.shape[1], self.alpha) weights = _check_sample_weight(sample_weight, X) _, n_features = X.shape From 42903bafcfe5fb198e63375aee438294cb04104b Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 15 Feb 2022 15:47:19 -0600 Subject: [PATCH 07/18] updated doc whats_new --- doc/whats_new/_contributors.rst | 4 +++- doc/whats_new/v1.1.rst | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index ca0f8ede93afa..f7b44760f5ab9 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -176,4 +176,6 @@ .. _Nicolas Hug: https://github.com/NicolasHug -.. _Guillaume Lemaitre: https://github.com/glemaitre \ No newline at end of file +.. _Guillaume Lemaitre: https://github.com/glemaitre + +.. _Xiao Wei: https://github.com/xiaowei1234 \ No newline at end of file diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index f9ffc5d329b51..985b0aae7b526 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -436,6 +436,43 @@ Changelog `alpha`, `max_iter` and `tol`. :pr:`22240` by :user:`Arturo Amor `. +- |Fix| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` now set the `n_iter_` attribute + with a shape that respects the docstring and that is consistent with the shape + obtained when using the other solvers in the one-vs-rest setting. Previously, + it would record only the maximum of the number of iterations for each binary + sub-problem while now all of them are recorded. :pr:`21998` by + :user:`Olivier Grisel `. + +- |Enhancement| :class:`linear_model.BayesianRidge` and + :class:`linear_model.ARDRegression` now preserve float32 dtype. :pr:`9087` by + :user:`Arthur Imbert ` and :pr:`22525` by :user:`Meekail Zain `. + +:mod:`sklearn.manifold` +....................... + +- |Feature| :class:`sklearn.manifold.Isomap` now supports radius-based + neighbors via the `radius` argument. + :pr:`19794` by :user:`Zhehao Liu `. + +- |Enhancement| :func:`manifold.spectral_embedding` and + :class:`manifold.SpectralEmbedding` supports `np.float32` dtype and will + preserve this dtype. + :pr:`21534` by :user:`Andrew Knyazev `. + +- |Enhancement| Adds `get_feature_names_out` to :class:`manifold.Isomap` + and :class:`manifold.LocallyLinearEmbedding`. :pr:`22254` by `Thomas Fan`_. + +- |Fix| :func:`manifold.spectral_embedding` now uses Gaussian instead of + the previous uniform on [0, 1] random initial approximations to eigenvectors + in eigen_solvers `lobpcg` and `amg` to improve their numerical stability. + :pr:`21565` by :user:`Andrew Knyazev `. +- |Enhancement| :class:`linear_model.GeneralizedLinearRegressor` fit method + will now allow penalty term `alpha` to be an iterable of same length as number + of features in X in addition to a scalar value. Will also check that X matrix + is full rank if penalty term `alpha` is set to 0.0 or array of 0.0s. + :pr:`11566` by :user:`Xiao Wei `. + :mod:`sklearn.metrics` ...................... From 7062825096bb268349d77c17158c2a3ef76474e0 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 15 Feb 2022 15:52:07 -0600 Subject: [PATCH 08/18] wrong PR # --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 985b0aae7b526..d6f97b3f0377a 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -471,7 +471,7 @@ Changelog will now allow penalty term `alpha` to be an iterable of same length as number of features in X in addition to a scalar value. Will also check that X matrix is full rank if penalty term `alpha` is set to 0.0 or array of 0.0s. - :pr:`11566` by :user:`Xiao Wei `. + :pr:`22485` by :user:`Xiao Wei `. :mod:`sklearn.metrics` ...................... From 01141d8da1f228086319838e93639c6e16d20d3b Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 22 Feb 2022 08:34:14 -0600 Subject: [PATCH 09/18] updated docstring --- sklearn/linear_model/_glm/glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 171ffb31555b6..d8fb67156814c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -68,16 +68,16 @@ class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Parameters ---------- - alpha : {float, iterable}, default=1 + alpha : {float, array-like} if array like then of size n_features, default=1 Constant(s) that multiplies the penalty term and thus determines the regularization strength. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). Values must be in the range `[0.0, inf)`. If alpha is a scalar then the value is applied to all non-intercept terms - If alpha is an iterable then each value must be in the range `[0.0, inf)` - and the size of the iterable must be equal to the input design matrix. - If alpha is greater than 1 dimension it will be converted to 1 dimension. + If alpha is an array-like then each value must be in the range `[0.0, inf)` + and the length must equal to n_features. + If alpha is greater than 1 dimension it will be converted to 1 dimension. fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be From 079cfad97c077a35b40f318dd238be4112ae9ce0 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 1 Mar 2022 08:56:27 -0600 Subject: [PATCH 10/18] Update doc/whats_new/v1.1.rst Co-authored-by: Alexandre Gramfort --- doc/whats_new/v1.1.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index d6f97b3f0377a..99936a8620d64 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -436,6 +436,13 @@ Changelog `alpha`, `max_iter` and `tol`. :pr:`22240` by :user:`Arturo Amor `. + +- |Enhancement| :class:`linear_model.GeneralizedLinearRegressor` fit method + will now allow penalty term `alpha` to be an iterable of same length as the number + of features in X in addition to a scalar value. Will also check that X matrix + is full rank if penalty term `alpha` is set to 0.0 or array of 0.0s. + :pr:`22485` by :user:`Xiao Wei `. + - |Fix| :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` now set the `n_iter_` attribute with a shape that respects the docstring and that is consistent with the shape From 038fbd8c171fb85ea968f822ababef04aa3a9302 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 1 Mar 2022 08:56:39 -0600 Subject: [PATCH 11/18] Update sklearn/linear_model/_glm/glm.py Co-authored-by: Alexandre Gramfort --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d8fb67156814c..bec08ae1c4bdd 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -68,7 +68,7 @@ class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Parameters ---------- - alpha : {float, array-like} if array like then of size n_features, default=1 + alpha : {float, array-like}, default=1 Constant(s) that multiplies the penalty term and thus determines the regularization strength. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix `X` must have full column rank From c0e0a37cef93845ad5a6c8a28eff945a1ebb99dd Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 1 Mar 2022 08:56:46 -0600 Subject: [PATCH 12/18] Update sklearn/linear_model/_glm/glm.py Co-authored-by: Alexandre Gramfort --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index bec08ae1c4bdd..de4eeb1f8059c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -76,7 +76,7 @@ class GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Values must be in the range `[0.0, inf)`. If alpha is a scalar then the value is applied to all non-intercept terms If alpha is an array-like then each value must be in the range `[0.0, inf)` - and the length must equal to n_features. + and the length must equal to n_features. If alpha is greater than 1 dimension it will be converted to 1 dimension. fit_intercept : bool, default=True From a62c3bf1f6247d2c85f3e60b4eadf4fa10f48b7b Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 1 Mar 2022 09:51:01 -0600 Subject: [PATCH 13/18] self.alpha check revision --- sklearn/linear_model/_glm/glm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index de4eeb1f8059c..0fdbed9612e62 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -7,6 +7,7 @@ # License: BSD 3 clause import numbers +from collections.abc import Iterable import numpy as np import scipy.optimize @@ -264,7 +265,7 @@ def fit(self, X, y, sample_weight=None): y_numeric=True, multi_output=False, ) - if hasattr(self.alpha, "__iter__") and not isinstance(self.alpha, str): + if isinstance(self.alpha, Iterable): for i, val in enumerate(self.alpha): check_scalar( val, @@ -273,7 +274,7 @@ def fit(self, X, y, sample_weight=None): min_val=0.0, include_boundaries="left", ) - self.alpha = np.asarray(self.alpha, dtype=np.float_).ravel() + self.alpha = np.asarray(self.alpha, dtype=np.float64).ravel() if self.alpha.size != X.shape[1]: raise ValueError( f"X width is {X.shape[1]} while alpha is of length" From 801166ca8288929580fd42f54091986bb349e846 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Tue, 1 Mar 2022 10:27:54 -0600 Subject: [PATCH 14/18] rebuilt and fixed unit tests --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0fdbed9612e62..88b74493b857c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -265,7 +265,7 @@ def fit(self, X, y, sample_weight=None): y_numeric=True, multi_output=False, ) - if isinstance(self.alpha, Iterable): + if isinstance(self.alpha, Iterable) and not isinstance(self.alpha, str): for i, val in enumerate(self.alpha): check_scalar( val, From 03f1bc3f14dee9767fcd3bd68bf1d7e02f965645 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Sun, 6 Mar 2022 16:24:02 -0600 Subject: [PATCH 15/18] ENH unit test for glm alpha array --- sklearn/linear_model/_glm/tests/test_glm.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index f87d545a8e4e7..327d4a50e76c0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -230,6 +230,24 @@ def test_glm_alpha_array(Estimator, params, err_type, err_msg): glm.fit(X, y) +@pytest.mark.parametrize( + "Estimator", + [GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor], +) +def test_glm_alpha_array_reg(Estimator): + """Test GLM regression when alpha is an array and 2nd column + has different alpha than 1st column + """ + X = np.asarray([[1, 2], [1, 3], [1, 4], [1, 3]]) + y = np.asarray([2, 2, 3, 2]) + scalar_coefs = Estimator(alpha=1., fit_intercept=False).fit(X, y).coef_ + X_scaled = X.copy() + X_scaled[:, 1] = X_scaled[:, 1] * 2. + array_coefs = Estimator(alpha=[1., 4.], fit_intercept=False).fit(X_scaled, y).coef_ + array_coefs[1] *= 2 + assert_allclose(scalar_coefs, array_coefs, atol=1e-4) + + @pytest.mark.parametrize("warm_start", ["not bool", 1, 0, [True]]) def test_glm_warm_start_argument(warm_start): """Test GLM for invalid warm_start argument.""" From 95dad3ff63361ee20737c7a323677487ba39d336 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Mon, 7 Mar 2022 09:36:15 -0600 Subject: [PATCH 16/18] ENH reverted matrix rank check and test --- sklearn/linear_model/_glm/glm.py | 4 +--- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 88b74493b857c..1bc327b4cde42 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -298,9 +298,7 @@ def fit(self, X, y, sample_weight=None): family.__class__.__name__ ) ) - # check that matrix is full rank when alpha = 0. - if np.max(self.alpha) == 0.0 and np.linalg.matrix_rank(X) < X.shape[1]: - raise ValueError("X must be full rank when alpha = 0.") + # TODO: if alpha=0 check that X is not rank deficient # rescaling of sample_weight # diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 327d4a50e76c0..c300aead14893 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -218,7 +218,6 @@ def test_glm_scalar_argument(Estimator, params, err_type, err_msg): "X width is 2 while alpha is of length 3", ), ({"alpha": [-2, 2]}, ValueError, "alpha at index 0 == -2, must be >= 0.0"), - ({"alpha": [0.0, 0.0]}, ValueError, "X must be full rank when alpha = 0."), ], ) def test_glm_alpha_array(Estimator, params, err_type, err_msg): From d06826ab953f592447d823205bdfc3d0c774f252 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Mon, 7 Mar 2022 09:40:21 -0600 Subject: [PATCH 17/18] ENH doc cleanup for #22485 --- doc/whats_new/v1.1.rst | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index cbd5894a4ea42..65bf1d01ef1a0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -537,10 +537,10 @@ Changelog `alpha`, `max_iter` and `tol`. :pr:`22240` by :user:`Arturo Amor `. -- |Enhancement| :class:`linear_model.GeneralizedLinearRegressor` fit method - will now allow penalty term `alpha` to be an iterable of same length as the number - of features in X in addition to a scalar value. Will also check that X matrix - is full rank if penalty term `alpha` is set to 0.0 or array of 0.0s. +- |Enhancement| :class:`linear_model.PoissonRegressor`, `linear_model.GammaRegressor`, + and `linear_model.TweedieRegressor` fit methods will now allow penalty term `alpha` + to be an iterable of same length as the number of features in X in addition to a + scalar value. :pr:`22485` by :user:`Xiao Wei `. - |Fix| :class:`linear_model.LogisticRegression` and @@ -575,12 +575,6 @@ Changelog in eigen_solvers `lobpcg` and `amg` to improve their numerical stability. :pr:`21565` by :user:`Andrew Knyazev `. -- |Enhancement| :class:`linear_model.GeneralizedLinearRegressor` fit method - will now allow penalty term `alpha` to be an iterable of same length as number - of features in X in addition to a scalar value. Will also check that X matrix - is full rank if penalty term `alpha` is set to 0.0 or array of 0.0s. - :pr:`22485` by :user:`Xiao Wei `. - - |Enhancement| added `metric_params` to :class:`manifold.TSNE` constructor for additional parameters of distance metric to use in optimization. :pr:`21805` by :user:`Jeanne Dionisi ` and :pr:`22685` by From 98f820e7574584b45084e772cca0084db0f61231 Mon Sep 17 00:00:00 2001 From: Xiao Wei Date: Mon, 7 Mar 2022 09:56:57 -0600 Subject: [PATCH 18/18] ENH #22485 black --- sklearn/linear_model/_glm/tests/test_glm.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c300aead14893..8ae4f5e629d84 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -239,11 +239,13 @@ def test_glm_alpha_array_reg(Estimator): """ X = np.asarray([[1, 2], [1, 3], [1, 4], [1, 3]]) y = np.asarray([2, 2, 3, 2]) - scalar_coefs = Estimator(alpha=1., fit_intercept=False).fit(X, y).coef_ + scalar_coefs = Estimator(alpha=1.0, fit_intercept=False).fit(X, y).coef_ X_scaled = X.copy() - X_scaled[:, 1] = X_scaled[:, 1] * 2. - array_coefs = Estimator(alpha=[1., 4.], fit_intercept=False).fit(X_scaled, y).coef_ - array_coefs[1] *= 2 + X_scaled[:, 1] = X_scaled[:, 1] * 2.0 + array_coefs = ( + Estimator(alpha=[1.0, 4.0], fit_intercept=False).fit(X_scaled, y).coef_ + ) + array_coefs[1] *= 2.0 assert_allclose(scalar_coefs, array_coefs, atol=1e-4)