diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 66272c97d7a16..25e0b369bebd3 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -135,6 +135,12 @@ Changelog
   :pr:`17743` by :user:`Maria Telenczuk <maikia>` and
   :user:`Alexandre Gramfort <agramfort>`.
 
+- |Fix| `sample_weight` is now fully taken into account in linear models
+  when `normalize=True` for both feature centering and feature
+  scaling.
+  :pr:`19426` by :user:`Alexandre Gramfort <agramfort>` and
+  :user:`Maria Telenczuk <maikia>`.
+
 :mod:`sklearn.manifold`
 .......................
 
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index f84d4234c193c..61005cb4b5d4a 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -33,6 +33,7 @@
 from ..utils.validation import _deprecate_positional_args
 from ..utils import check_random_state
 from ..utils.extmath import safe_sparse_dot
+from ..utils.extmath import _incremental_mean_and_var
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
 from ..utils.fixes import sparse_lsqr
 from ..utils._seq_dataset import ArrayDataset32, CSRDataset32
@@ -40,7 +41,6 @@
 from ..utils.validation import check_is_fitted, _check_sample_weight
 from ..utils.fixes import delayed
 
-from ..preprocessing import normalize as f_normalize
 
 # TODO: bayesian_ridge_regression and bayesian_regression_ard
 # should be squashed into its respective objects.
@@ -229,33 +229,33 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
     if fit_intercept:
         if sp.issparse(X):
-            X_offset, X_var = mean_variance_axis(X, axis=0)
+            X_offset, X_var = mean_variance_axis(
+                X, axis=0, weights=sample_weight
+            )
             if not return_mean:
                 X_offset[:] = X.dtype.type(0)
+        else:
+            X_offset, X_var, _ = _incremental_mean_and_var(
+                X, last_mean=0., last_variance=0., last_sample_count=0.,
+                sample_weight=sample_weight
+            )
 
-            if normalize:
+            X_offset = X_offset.astype(X.dtype)
+            X -= X_offset
 
-                # TODO: f_normalize could be used here as well but the function
-                # inplace_csr_row_normalize_l2 must be changed such that it
-                # can return also the norms computed internally
+        X_var = X_var.astype(X.dtype, copy=False)
 
-                # transform variance to norm in-place
-                X_var *= X.shape[0]
-                X_scale = np.sqrt(X_var, X_var)
-                del X_var
-                X_scale[X_scale == 0] = 1
+        if normalize:
+            X_var *= X.shape[0]
+            X_scale = np.sqrt(X_var, out=X_var)
+            X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1.
+            if sp.issparse(X):
                 inplace_column_scale(X, 1. / X_scale)
             else:
-                X_scale = np.ones(X.shape[1], dtype=X.dtype)
-
+                X /= X_scale
         else:
-            X_offset = np.average(X, axis=0, weights=sample_weight)
-            X -= X_offset
-            if normalize:
-                X, X_scale = f_normalize(X, axis=0, copy=False,
-                                         return_norm=True)
-            else:
-                X_scale = np.ones(X.shape[1], dtype=X.dtype)
+            X_scale = np.ones(X.shape[1], dtype=X.dtype)
+
         y_offset = np.average(y, axis=0, weights=sample_weight)
         y = y - y_offset
     else:
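
Note on the new dense path above: for a single batch, `_incremental_mean_and_var` reduces to a weighted column mean and a weighted column variance, and `normalize=True` then divides each centered feature by the square root of its weighted variance times `n_samples`. A standalone NumPy sketch of that intended behaviour (illustrative only, not the library code itself):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
sample_weight = rng.rand(20)

# Weighted column means, used for centering when fit_intercept=True.
X_offset = np.average(X, axis=0, weights=sample_weight)

# Weighted column variances around the weighted means; this should match
# what _incremental_mean_and_var returns for a single batch.
X_var = np.average((X - X_offset) ** 2, axis=0, weights=sample_weight)

# normalize=True rescales by sqrt(weighted variance * n_samples); scales
# that are numerically zero are set to 1 so constant features survive.
X_scale = np.sqrt(X_var * X.shape[0])
X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1.0

Xt = (X - X_offset) / X_scale
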
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index 75cc9dd5fd8f1..56ee18f5f0d06 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -1,5 +1,6 @@
 # Author: Alexandre Gramfort
 #         Fabian Pedregosa
+#         Maria Telenczuk
 #
 # License: BSD 3 clause
@@ -24,6 +25,7 @@
 from sklearn.datasets import make_sparse_uncorrelated
 from sklearn.datasets import make_regression
 from sklearn.datasets import load_iris
+from sklearn.preprocessing import StandardScaler
 
 rng = np.random.RandomState(0)
 rtol = 1e-6
@@ -407,31 +409,31 @@ def test_preprocess_data():
     X = rng.rand(n_samples, n_features)
     y = rng.rand(n_samples)
     expected_X_mean = np.mean(X, axis=0)
-    expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0])
+    expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0])
     expected_y_mean = np.mean(y, axis=0)
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=False, normalize=False)
     assert_array_almost_equal(X_mean, np.zeros(n_features))
     assert_array_almost_equal(y_mean, 0)
-    assert_array_almost_equal(X_norm, np.ones(n_features))
+    assert_array_almost_equal(X_scale, np.ones(n_features))
     assert_array_almost_equal(Xt, X)
     assert_array_almost_equal(yt, y)
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=True, normalize=False)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_norm, np.ones(n_features))
+    assert_array_almost_equal(X_scale, np.ones(n_features))
     assert_array_almost_equal(Xt, X - expected_X_mean)
     assert_array_almost_equal(yt, y - expected_y_mean)
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=True, normalize=True)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_norm, expected_X_norm)
-    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
+    assert_array_almost_equal(X_scale, expected_X_scale)
+    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)
     assert_array_almost_equal(yt, y - expected_y_mean)
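
A note on the `X_norm` to `X_scale` rename above: the expected value itself is unchanged because, for centered data, `np.std(X, axis=0) * np.sqrt(n_samples)` is exactly the column-wise Euclidean norm that the old `f_normalize`-based code divided by. A quick standalone check:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
Xc = X - X.mean(axis=0)

# np.std uses ddof=0, so std * sqrt(n) == sqrt(sum((x - mean) ** 2)),
# i.e. the L2 norm of each centered column.
np.testing.assert_allclose(
    np.std(X, axis=0) * np.sqrt(X.shape[0]),
    np.linalg.norm(Xc, axis=0),
)
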
@@ -461,36 +463,94 @@ def test_preprocess_data_multioutput():
         assert_array_almost_equal(yt, y - y_mean)
 
 
-def test_preprocess_data_weighted():
+@pytest.mark.parametrize("is_sparse", [False, True])
+def test_preprocess_data_weighted(is_sparse):
     n_samples = 200
-    n_features = 2
+    n_features = 4
+
+    # Generate random data with 50% of zero values to make sure
+    # that the sparse variant of this test is actually sparse. This also
+    # shifts the mean value of each column in X further away from
+    # zero.
     X = rng.rand(n_samples, n_features)
+    X[X < 0.5] = 0.
+
+    # Scale the first feature of X to be 10 times larger than the others
+    # to better check the impact of feature scaling.
+    X[:, 0] *= 10
+
+    # Constant non-zero feature: this edge case is currently not handled
+    # correctly for sparse data, see:
+    # https://github.com/scikit-learn/scikit-learn/issues/19450
+    # X[:, 2] = 1.
+
+    # Constant zero feature (non-materialized in the sparse case)
+    X[:, 3] = 0.
     y = rng.rand(n_samples)
+
     sample_weight = rng.rand(n_samples)
     expected_X_mean = np.average(X, axis=0, weights=sample_weight)
     expected_y_mean = np.average(y, axis=0, weights=sample_weight)
 
-    # XXX: if normalize=True, should we expect a weighted standard deviation?
-    #      Currently not weighted, but calculated with respect to weighted mean
-    expected_X_norm = (np.sqrt(X.shape[0]) *
-                       np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)
+    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
+    X_sample_weight_var = np.average((X - X_sample_weight_avg)**2,
+                                     weights=sample_weight,
+                                     axis=0)
+    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)
+
+    # Near-constant features should not be scaled
+    expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1
+
+    if is_sparse:
+        X = sparse.csr_matrix(X)
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    # normalize is False
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=True, normalize=False,
-                         sample_weight=sample_weight)
+                         sample_weight=sample_weight, return_mean=True)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_norm, np.ones(n_features))
-    assert_array_almost_equal(Xt, X - expected_X_mean)
+    assert_array_almost_equal(X_scale, np.ones(n_features))
+    if is_sparse:
+        assert_array_almost_equal(Xt.toarray(), X.toarray())
+    else:
+        assert_array_almost_equal(Xt, X - expected_X_mean)
     assert_array_almost_equal(yt, y - expected_y_mean)
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    # normalize is True
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=True, normalize=True,
-                         sample_weight=sample_weight)
+                         sample_weight=sample_weight, return_mean=True)
+
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_norm, expected_X_norm)
-    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
+    assert_array_almost_equal(X_scale, expected_X_scale)
+
+    if is_sparse:
+        # X is not centered
+        assert_array_almost_equal(
+            Xt.toarray(), X.toarray() / expected_X_scale
+        )
+    else:
+        assert_array_almost_equal(
+            Xt, (X - expected_X_mean) / expected_X_scale
+        )
+
+    # _preprocess_data with normalize=True scales the data by the feature-wise
+    # Euclidean norms, while StandardScaler scales the data by the feature-wise
+    # standard deviations.
+    # The two are equivalent up to a ratio of np.sqrt(n_samples).
+    if is_sparse:
+        scaler = StandardScaler(with_mean=False).fit(
+            X, sample_weight=sample_weight)
+
+        assert_array_almost_equal(
+            scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray()
+        )
+    else:
+        scaler = StandardScaler(with_mean=True).fit(
+            X, sample_weight=sample_weight)
+        assert_array_almost_equal(scaler.mean_, X_mean)
+        assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt)
     assert_array_almost_equal(yt, y - expected_y_mean)
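
The closing assertions above rely on the identity that, with sample weights, `_preprocess_data(..., normalize=True)` and `StandardScaler` differ only by a factor of `np.sqrt(n_samples)`. A standalone sketch of that identity (assumes scikit-learn >= 0.24, where `StandardScaler.fit` accepts `sample_weight`):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(42)
X = rng.rand(50, 2)
w = rng.rand(50)

scaler = StandardScaler(with_mean=True).fit(X, sample_weight=w)

# Reproduce the normalize=True transform: weighted centering followed by
# division by the weighted std times sqrt(n_samples).
X_offset = np.average(X, axis=0, weights=w)
X_var = np.average((X - X_offset) ** 2, axis=0, weights=w)
Xt = (X - X_offset) / np.sqrt(X_var * X.shape[0])

np.testing.assert_allclose(scaler.transform(X) / np.sqrt(X.shape[0]), Xt)
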
@@ -502,33 +562,33 @@ def test_sparse_preprocess_data_with_return_mean():
     X = X.tolil()
     y = rng.rand(n_samples)
     XA = X.toarray()
-    expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0])
+    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=False, normalize=False,
                          return_mean=True)
     assert_array_almost_equal(X_mean, np.zeros(n_features))
     assert_array_almost_equal(y_mean, 0)
-    assert_array_almost_equal(X_norm, np.ones(n_features))
+    assert_array_almost_equal(X_scale, np.ones(n_features))
     assert_array_almost_equal(Xt.A, XA)
     assert_array_almost_equal(yt, y)
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=True, normalize=False,
                          return_mean=True)
     assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
     assert_array_almost_equal(y_mean, np.mean(y, axis=0))
-    assert_array_almost_equal(X_norm, np.ones(n_features))
+    assert_array_almost_equal(X_scale, np.ones(n_features))
     assert_array_almost_equal(Xt.A, XA)
     assert_array_almost_equal(yt, y - np.mean(y, axis=0))
 
-    Xt, yt, X_mean, y_mean, X_norm = \
+    Xt, yt, X_mean, y_mean, X_scale = \
         _preprocess_data(X, y, fit_intercept=True, normalize=True,
                          return_mean=True)
     assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
     assert_array_almost_equal(y_mean, np.mean(y, axis=0))
-    assert_array_almost_equal(X_norm, expected_X_norm)
-    assert_array_almost_equal(Xt.A, XA / expected_X_norm)
+    assert_array_almost_equal(X_scale, expected_X_scale)
+    assert_array_almost_equal(Xt.A, XA / expected_X_scale)
     assert_array_almost_equal(yt, y - np.mean(y, axis=0))
@@ -577,19 +637,19 @@ def test_dtype_preprocess_data():
 
     for fit_intercept in [True, False]:
         for normalize in [True, False]:
-            Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data(
+            Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data(
                 X_32, y_32, fit_intercept=fit_intercept,
                 normalize=normalize, return_mean=True)
 
-            Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data(
+            Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data(
                 X_64, y_64, fit_intercept=fit_intercept,
                 normalize=normalize, return_mean=True)
 
-            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = (
+            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = (
                 _preprocess_data(X_32, y_64, fit_intercept=fit_intercept,
                                  normalize=normalize, return_mean=True))
 
-            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = (
+            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = (
                 _preprocess_data(X_64, y_32, fit_intercept=fit_intercept,
                                  normalize=normalize, return_mean=True))
@@ -597,25 +657,25 @@ def test_dtype_preprocess_data():
             assert yt_32.dtype == np.float32
             assert X_mean_32.dtype == np.float32
             assert y_mean_32.dtype == np.float32
-            assert X_norm_32.dtype == np.float32
+            assert X_scale_32.dtype == np.float32
 
             assert Xt_64.dtype == np.float64
             assert yt_64.dtype == np.float64
             assert X_mean_64.dtype == np.float64
             assert y_mean_64.dtype == np.float64
-            assert X_norm_64.dtype == np.float64
+            assert X_scale_64.dtype == np.float64
 
             assert Xt_3264.dtype == np.float32
             assert yt_3264.dtype == np.float32
             assert X_mean_3264.dtype == np.float32
             assert y_mean_3264.dtype == np.float32
-            assert X_norm_3264.dtype == np.float32
+            assert X_scale_3264.dtype == np.float32
 
             assert Xt_6432.dtype == np.float64
             assert yt_6432.dtype == np.float64
             assert X_mean_6432.dtype == np.float64
             assert y_mean_6432.dtype == np.float64
-            assert X_norm_6432.dtype == np.float64
+            assert X_scale_6432.dtype == np.float64
 
             assert X_32.dtype == np.float32
             assert y_32.dtype == np.float32
@@ -626,7 +686,7 @@ def test_dtype_preprocess_data():
             assert_array_almost_equal(yt_32, yt_64)
             assert_array_almost_equal(X_mean_32, X_mean_64)
             assert_array_almost_equal(y_mean_32, y_mean_64)
-            assert_array_almost_equal(X_norm_32, X_norm_64)
+            assert_array_almost_equal(X_scale_32, X_scale_64)
 
 
 @pytest.mark.parametrize('n_targets', [None, 2])
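
On the sparse path, the weighted statistics come from `mean_variance_axis`, which accepts a `weights` argument since scikit-learn 0.24. A small standalone check of what it computes (illustrative):

import numpy as np
from scipy import sparse
from sklearn.utils.sparsefuncs import mean_variance_axis

rng = np.random.RandomState(0)
X = sparse.random(30, 4, density=0.5, format="csr", random_state=rng)
w = rng.rand(30)

# Weighted column means and variances, computed without densifying X.
mean, var = mean_variance_axis(X, axis=0, weights=w)

Xd = X.toarray()
np.testing.assert_allclose(mean, np.average(Xd, axis=0, weights=w))
np.testing.assert_allclose(
    var, np.average((Xd - mean) ** 2, axis=0, weights=w)
)
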
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index b6acb78838a33..3eba535d70c89 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -13,6 +13,7 @@
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import make_pipeline
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.utils._testing import assert_allclose
@@ -25,6 +26,7 @@
 from sklearn.utils._testing import _convert_container
 from sklearn.utils._testing import TempMemmap
 from sklearn.utils.fixes import parse_version
+from sklearn.utils.sparsefuncs import mean_variance_axis
 
 from sklearn.linear_model import (
     ARDRegression,
@@ -298,7 +300,33 @@ def test_lasso_cv_positive_constraint():
     assert min(clf_constrained.coef_) >= 0
 
 
-# FIXME: 'normalize' to be removed in 1.2
+def _scale_alpha_inplace(estimator, n_samples):
+    """Rescale the parameter alpha so that an estimator fit with
+    normalize=True is equivalent to the same estimator fit with
+    normalize=False in a Pipeline with a StandardScaler.
+    """
+    if 'alpha' not in estimator.get_params():
+        return
+
+    if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)):
+        alpha = estimator.alpha * np.sqrt(n_samples)
+    if isinstance(estimator, (Ridge, RidgeClassifier)):
+        alpha = estimator.alpha * n_samples
+    if isinstance(estimator, (ElasticNet, MultiTaskElasticNet)):
+        if estimator.l1_ratio == 1:
+            alpha = estimator.alpha * np.sqrt(n_samples)
+        elif estimator.l1_ratio == 0:
+            alpha = estimator.alpha * n_samples
+        else:
+            # To avoid silent errors in case of refactoring
+            raise NotImplementedError
+
+    estimator.set_params(alpha=alpha)
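
The scaling factors in `_scale_alpha_inplace` follow from the penalized objectives: dividing the standardized features by an extra `np.sqrt(n_samples)` (which is what `normalize=True` does) multiplies the fitted coefficients by `np.sqrt(n_samples)`, so an L1 penalty must grow by `np.sqrt(n_samples)` and a squared L2 penalty by `n_samples` for the optimum to stay unchanged. An end-to-end sketch of the Lasso case (assumes a scikit-learn version where `normalize` is still accepted; tolerances may need adjusting):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=100, n_features=5, noise=1.0,
                       random_state=0)
alpha = 0.1

# Built-in normalization (deprecated in 1.0, removed in 1.2).
lasso_normalize = Lasso(alpha=alpha, normalize=True, tol=1e-12).fit(X, y)

# Equivalent pipeline: StandardScaler plus alpha rescaled by sqrt(n).
pipe = make_pipeline(
    StandardScaler(),
    Lasso(alpha=alpha * np.sqrt(X.shape[0]), tol=1e-12),
).fit(X, y)

np.testing.assert_allclose(lasso_normalize.predict(X), pipe.predict(X),
                           rtol=1e-6)
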
+ """ + if 'alpha' not in estimator.get_params(): + return + + if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): + alpha = estimator.alpha * np.sqrt(n_samples) + if isinstance(estimator, (Ridge, RidgeClassifier)): + alpha = estimator.alpha * n_samples + if isinstance(estimator, (ElasticNet, MultiTaskElasticNet)): + if estimator.l1_ratio == 1: + alpha = estimator.alpha * np.sqrt(n_samples) + elif estimator.l1_ratio == 0: + alpha = estimator.alpha * n_samples + else: + # To avoid silent errors in case of refactoring + raise NotImplementedError + + estimator.set_params(alpha=alpha) + + +# FIXME: 'normalize' to be removed in 1.2 for all the models excluding: +# OrthogonalMatchingPursuit, Lars, LassoLars, LarsCV, LassoLarsCV +# for which it is to be removed in 1.4 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "LinearModel, params", @@ -324,7 +352,6 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # in the pipeline and with normalize set to False # normalize is True - model_name = LinearModel.__name__ model_normalize = LinearModel(normalize=True, fit_intercept=True, **params) pipeline = make_pipeline( @@ -351,22 +378,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - if 'alpha' in params: - model_normalize.set_params(alpha=params['alpha']) - if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if model_name in ['Ridge', 'RidgeClassifier']: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - if model_name in ['ElasticNet', 'MultiTaskElasticNet']: - if params['l1_ratio'] == 1: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if params['l1_ratio'] == 0: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - - if 'new_params' in locals(): - pipeline[1].set_params(**new_params) + _scale_alpha_inplace(pipeline[1], X_train.shape[0]) model_normalize.fit(X_train, y_train) y_pred_normalize = model_normalize.predict(X_test) @@ -386,24 +398,47 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # FIXME: 'normalize' to be removed in 1.2 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( - "estimator, is_sparse, with_mean", - [(LinearRegression, True, False), - (LinearRegression, False, True), - (LinearRegression, False, False)] + "estimator, params", + [ + (Lasso, {"tol": 1e-16, "alpha": 0.1}), + (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), + (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), + (LinearRegression, {}), + ] +) +@pytest.mark.parametrize( + "is_sparse, with_mean", [ + (False, True), + (False, False), + (True, False) + # No need to test sparse and with_mean=True + ] ) def test_linear_model_sample_weights_normalize_in_pipeline( - estimator, is_sparse, with_mean + is_sparse, with_mean, estimator, params ): - # Test that the results for running linear regression LinearRegression with - # sample_weight set and with normalize set to True gives similar results as - # LinearRegression with no normalize in a pipeline with a StandardScaler - # and set sample_weight. 
+    # Test that fitting a linear model with sample_weight and with
+    # normalize set to True gives results similar to fitting the same
+    # model with normalize set to False in a pipeline with
+    # a StandardScaler and sample_weight.
+    model_name = estimator.__name__
+
+    if model_name in ['Lasso', 'ElasticNet'] and is_sparse:
+        pytest.skip(f'{model_name} does not support sample_weight with sparse')
+
     rng = np.random.RandomState(0)
     X, y = make_regression(n_samples=20, n_features=5, noise=1e-2,
                            random_state=rng)
+
+    if is_classifier(estimator):
+        y = np.sign(y)
+
     # make sure the data is not centered to make the problem more
-    # difficult
-    X += 10
+    # difficult, and add zeros for the sparse case
+    X[X < 0] = 0
+
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                         random_state=rng)
     if is_sparse:
@@ -412,27 +447,41 @@ def test_linear_model_sample_weights_normalize_in_pipeline(
     sample_weight = rng.rand(X_train.shape[0])
 
-    # linear estimator with explicit sample_weight
-    reg_with_normalize = estimator(normalize=True)
+    # linear estimator with built-in feature normalization
+    reg_with_normalize = estimator(normalize=True, fit_intercept=True,
+                                   **params)
     reg_with_normalize.fit(X_train, y_train, sample_weight=sample_weight)
 
-    # linear estimator in a pipeline
-    reg_with_scaler = make_pipeline(
-        StandardScaler(with_mean=with_mean),
-        estimator(normalize=False)
-    )
-    kwargs = {reg_with_scaler.steps[-1][0] + '__sample_weight':
-              sample_weight}
-    reg_with_scaler.fit(X_train, y_train, **kwargs)
-
-    y_pred_norm = reg_with_normalize.predict(X_test)
-    y_pred_pip = reg_with_scaler.predict(X_test)
-
-    assert_allclose(
-        reg_with_normalize.coef_ * reg_with_scaler[0].scale_,
-        reg_with_scaler[1].coef_
-    )
-    assert_allclose(y_pred_norm, y_pred_pip)
+    # linear estimator in a pipeline with a StandardScaler, normalize=False
+    linear_regressor = estimator(normalize=False, fit_intercept=True, **params)
+    _scale_alpha_inplace(linear_regressor, X_train.shape[0])  # rescale alpha
+    reg_with_scaler = Pipeline([
+        ("scaler", StandardScaler(with_mean=with_mean)),
+        ("linear_regressor", linear_regressor)
+    ])
+
+    fit_params = {
+        "scaler__sample_weight": sample_weight,
+        "linear_regressor__sample_weight": sample_weight,
+    }
+
+    reg_with_scaler.fit(X_train, y_train, **fit_params)
+
+    # Check that the two regression models are exactly equivalent in the
+    # sense that they predict exactly the same outcome.
+    y_pred_normalize = reg_with_normalize.predict(X_test)
+    y_pred_scaler = reg_with_scaler.predict(X_test)
+    assert_allclose(y_pred_normalize, y_pred_scaler)
+
+    # Check intercept computation when normalize is True
+    y_train_mean = np.average(y_train, weights=sample_weight)
+    if is_sparse:
+        X_train_mean, _ = mean_variance_axis(X_train, axis=0,
+                                             weights=sample_weight)
+    else:
+        X_train_mean = np.average(X_train, weights=sample_weight, axis=0)
+    assert (reg_with_normalize.intercept_ ==
+            pytest.approx(y_train_mean -
+                          reg_with_normalize.coef_.dot(X_train_mean)))
 
 
 # FIXME: 'normalize' to be removed in 1.2
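
The final assertion pins down the intercept the fix guarantees: after weighted centering, `intercept_` must equal the weighted mean of `y` minus `coef_` dotted with the weighted mean of `X`. As a standalone sketch (again assuming a version where `normalize` is still accepted):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X, y = rng.rand(30, 4), rng.rand(30)
sample_weight = rng.rand(30)

reg = Ridge(alpha=0.1, normalize=True).fit(X, y,
                                           sample_weight=sample_weight)

# With the fix, both means are weighted, so the identity holds up to
# floating point error.
y_mean = np.average(y, weights=sample_weight)
X_mean = np.average(X, axis=0, weights=sample_weight)
np.testing.assert_allclose(reg.intercept_, y_mean - reg.coef_ @ X_mean)
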