From 0baaf67e587d6c5ae7c40050c4a610cefcc35e3b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 18 Mar 2022 13:12:21 +0100 Subject: [PATCH 1/6] scale X_offset when X sparse --- sklearn/linear_model/_base.py | 22 ++++----- sklearn/linear_model/tests/test_base.py | 62 +++++++++++++------------ 2 files changed, 44 insertions(+), 40 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index f0322c3924426..2ca19d09121d9 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -325,7 +325,7 @@ def _preprocess_data( # sample_weight makes the refactoring tricky. -def _rescale_data(X, y, sample_weight): +def _rescale_data(X, y, sample_weight, square_root=True): """Rescale data sample-wise by square root of sample_weight. For many linear models, this enables easy support for sample_weight. @@ -340,7 +340,8 @@ def _rescale_data(X, y, sample_weight): sample_weight = np.asarray(sample_weight) if sample_weight.ndim == 0: sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype) - sample_weight = np.sqrt(sample_weight) + if square_root: + sample_weight = np.sqrt(sample_weight) sw_matrix = sparse.dia_matrix((sample_weight, 0), shape=(n_samples, n_samples)) X = safe_sparse_dot(sw_matrix, X) y = safe_sparse_dot(sw_matrix, y) @@ -676,10 +677,9 @@ def fit(self, X, y, sample_weight=None): X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True ) - if sample_weight is not None: - sample_weight = _check_sample_weight( - sample_weight, X, dtype=X.dtype, only_non_negative=True - ) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, only_non_negative=True + ) X, y, X_offset, y_offset, X_scale = _preprocess_data( X, @@ -691,9 +691,9 @@ def fit(self, X, y, sample_weight=None): return_mean=True, ) - if sample_weight is not None: - # Sample weight can be implemented via a simple rescaling. - X, y = _rescale_data(X, y, sample_weight) + # Sample weight can be implemented via a simple rescaling. + sample_weight = np.sqrt(sample_weight) + X, y = _rescale_data(X, y, sample_weight, square_root=False) if self.positive: if y.ndim < 2: @@ -708,10 +708,10 @@ def fit(self, X, y, sample_weight=None): X_offset_scale = X_offset / X_scale def matvec(b): - return X.dot(b) - b.dot(X_offset_scale) + return X.dot(b) - sample_weight * b.dot(X_offset_scale) def rmatvec(b): - return X.T.dot(b) - X_offset_scale * np.sum(b) + return X.T.dot(b) - X_offset_scale * b.dot(sample_weight) X_centered = sparse.linalg.LinearOperator( shape=X.shape, matvec=matvec, rmatvec=rmatvec diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 6b89b7e25e899..fac7011893baa 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -4,6 +4,8 @@ # # License: BSD 3 clause +from os import confstr_names +from random import sample import pytest import warnings @@ -55,45 +57,47 @@ def test_linear_regression(): assert_array_almost_equal(reg.predict(X), [0]) -def test_linear_regression_sample_weights(): - # TODO: loop over sparse data as well - +@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_linear_regression_sample_weights(array_constr, fit_intercept): rng = np.random.RandomState(0) # It would not work with under-determined systems - for n_samples, n_features in ((6, 5),): + n_samples, n_features = 6, 5 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - sample_weight = 1.0 + rng.rand(n_samples) + X = array_constr(rng.normal(size=(n_samples, n_features))) + y = rng.normal(size=n_samples) - for intercept in (True, False): + sample_weight = 1.0 + rng.uniform(size=n_samples) - # LinearRegression with explicit sample_weight - reg = LinearRegression(fit_intercept=intercept) - reg.fit(X, y, sample_weight=sample_weight) - coefs1 = reg.coef_ - inter1 = reg.intercept_ + # LinearRegression with explicit sample_weight + reg = LinearRegression(fit_intercept=fit_intercept) + reg.fit(X, y, sample_weight=sample_weight) + coefs1 = reg.coef_ + inter1 = reg.intercept_ - assert reg.coef_.shape == (X.shape[1],) # sanity checks - assert reg.score(X, y) > 0.5 + assert reg.coef_.shape == (X.shape[1],) # sanity checks + assert reg.score(X, y) > 0.5 - # Closed form of the weighted least square - # theta = (X^T W X)^(-1) * X^T W y - W = np.diag(sample_weight) - if intercept is False: - X_aug = X - else: - dummy_column = np.ones(shape=(n_samples, 1)) - X_aug = np.concatenate((dummy_column, X), axis=1) + # Closed form of the weighted least square + # theta = (X^T W X)^(-1) @ X^T W y + W = np.diag(sample_weight) + if not fit_intercept: + X_aug = X + else: + hstack = sparse.hstack if sparse.issparse(X) else np.hstack + dummy_column = np.ones(shape=(n_samples, 1)) + X_aug = hstack((dummy_column, X)) - coefs2 = linalg.solve(X_aug.T.dot(W).dot(X_aug), X_aug.T.dot(W).dot(y)) + Xw = X_aug.T @ W @ X_aug + yw = X_aug.T @ W @ y + coefs2 = linalg.solve(Xw, yw) - if intercept is False: - assert_array_almost_equal(coefs1, coefs2) - else: - assert_array_almost_equal(coefs1, coefs2[1:]) - assert_almost_equal(inter1, coefs2[0]) + if not fit_intercept: + assert_allclose(coefs1, coefs2) + else: + assert_allclose(coefs1, coefs2[1:]) + assert_allclose(inter1, coefs2[0]) def test_raises_value_error_if_positive_and_sparse(): From b59aa974eb1e4bba00c9866df9ba4878a73abe52 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 18 Mar 2022 13:26:29 +0100 Subject: [PATCH 2/6] wtf --- sklearn/linear_model/tests/test_base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index fac7011893baa..2e2e3c8ba6f35 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -4,8 +4,6 @@ # # License: BSD 3 clause -from os import confstr_names -from random import sample import pytest import warnings From 92546725a86ec95ecd3cb42bf78c470b01bc9401 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 18 Mar 2022 13:30:00 +0100 Subject: [PATCH 3/6] what's new --- doc/whats_new/v1.1.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index f64a6bda6ea95..7786870dc523f 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -594,6 +594,10 @@ Changelog :class:`linear_model.ARDRegression` now preserve float32 dtype. :pr:`9087` by :user:`Arthur Imbert ` and :pr:`22525` by :user:`Meekail Zain `. +- |Fix| The `intercept_` attribute of :class:`LinearRegression` is now correctly + computed in the presence of sample weights when the input is sparse. + :pr:`22891` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.manifold` ....................... From e8720ec3e82604714598bc24fb0aa6e7cccbc77f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 18 Mar 2022 13:34:06 +0100 Subject: [PATCH 4/6] lint --- sklearn/linear_model/tests/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 2e2e3c8ba6f35..9659b5d0e0679 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -13,7 +13,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils import check_random_state From ba1a5e43f1781facc25c34eb00c95a005d73ee00 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 18 Mar 2022 15:27:49 +0100 Subject: [PATCH 5/6] address comments --- sklearn/linear_model/_base.py | 3 +++ sklearn/linear_model/tests/test_base.py | 8 ++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 2ca19d09121d9..b0c708d3516c3 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -330,6 +330,9 @@ def _rescale_data(X, y, sample_weight, square_root=True): For many linear models, this enables easy support for sample_weight. + Set square_root=False if the square root of the sample weights has already been + done prior to calling this function. + Returns ------- X_rescaled : {array-like, sparse matrix} diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 9659b5d0e0679..f59a5f6f08c5c 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -25,6 +25,7 @@ from sklearn.datasets import make_regression from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import add_dummy_feature rng = np.random.RandomState(0) rtol = 1e-6 @@ -79,12 +80,7 @@ def test_linear_regression_sample_weights(array_constr, fit_intercept): # Closed form of the weighted least square # theta = (X^T W X)^(-1) @ X^T W y W = np.diag(sample_weight) - if not fit_intercept: - X_aug = X - else: - hstack = sparse.hstack if sparse.issparse(X) else np.hstack - dummy_column = np.ones(shape=(n_samples, 1)) - X_aug = hstack((dummy_column, X)) + X_aug = X if not fit_intercept else add_dummy_feature(X) Xw = X_aug.T @ W @ X_aug yw = X_aug.T @ W @ y From 49e2bcea28d3310701bebe1047b795345846cb45 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 18 Mar 2022 16:22:28 +0100 Subject: [PATCH 6/6] address comments --- sklearn/linear_model/_base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index b0c708d3516c3..65cdedafe1821 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -325,13 +325,13 @@ def _preprocess_data( # sample_weight makes the refactoring tricky. -def _rescale_data(X, y, sample_weight, square_root=True): +def _rescale_data(X, y, sample_weight, sqrt_sample_weight=True): """Rescale data sample-wise by square root of sample_weight. For many linear models, this enables easy support for sample_weight. - Set square_root=False if the square root of the sample weights has already been - done prior to calling this function. + Set sqrt_sample_weight=False if the square root of the sample weights has already + been done prior to calling this function. Returns ------- @@ -343,7 +343,7 @@ def _rescale_data(X, y, sample_weight, square_root=True): sample_weight = np.asarray(sample_weight) if sample_weight.ndim == 0: sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype) - if square_root: + if sqrt_sample_weight: sample_weight = np.sqrt(sample_weight) sw_matrix = sparse.dia_matrix((sample_weight, 0), shape=(n_samples, n_samples)) X = safe_sparse_dot(sw_matrix, X) @@ -695,8 +695,8 @@ def fit(self, X, y, sample_weight=None): ) # Sample weight can be implemented via a simple rescaling. - sample_weight = np.sqrt(sample_weight) - X, y = _rescale_data(X, y, sample_weight, square_root=False) + sample_weight_sqrt = np.sqrt(sample_weight) + X, y = _rescale_data(X, y, sample_weight_sqrt, sqrt_sample_weight=False) if self.positive: if y.ndim < 2: @@ -711,10 +711,10 @@ def fit(self, X, y, sample_weight=None): X_offset_scale = X_offset / X_scale def matvec(b): - return X.dot(b) - sample_weight * b.dot(X_offset_scale) + return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale) def rmatvec(b): - return X.T.dot(b) - X_offset_scale * b.dot(sample_weight) + return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt) X_centered = sparse.linalg.LinearOperator( shape=X.shape, matvec=matvec, rmatvec=rmatvec