From d277a43cea35a719f82272d668eaf90dd8b05245 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 09:54:16 +0100 Subject: [PATCH 01/54] add ridge to the test + fix the test --- sklearn/linear_model/tests/test_coordinate_descent.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index a5acb4aa25da2..b0a88dd330fe8 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -393,7 +393,10 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): "estimator, is_sparse, with_mean", [(LinearRegression, True, False), (LinearRegression, False, True), - (LinearRegression, False, False)] + (LinearRegression, False, False), + (Ridge, True, False), + (Ridge, False, True), + (Ridge, False, False)] ) def test_linear_model_sample_weights_normalize_in_pipeline( estimator, is_sparse, with_mean @@ -425,7 +428,9 @@ def test_linear_model_sample_weights_normalize_in_pipeline( StandardScaler(with_mean=with_mean), estimator(normalize=False) ) - kwargs = {reg_with_scaler.steps[-1][0] + '__sample_weight': + kwargs = {reg_with_scaler.steps[0][0] + '__sample_weight': + sample_weight, + reg_with_scaler.steps[-1][0] + '__sample_weight': sample_weight} reg_with_scaler.fit(X_train, y_train, **kwargs) From 03695d7cd9b2a06afe6cfca31d2323d625ae9904 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 10:35:19 +0100 Subject: [PATCH 02/54] changing alpha for ridgh in a pipeline --- .../tests/test_coordinate_descent.py | 48 +++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index b0a88dd330fe8..c94306c161847 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ 
b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -391,12 +391,13 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "estimator, is_sparse, with_mean", - [(LinearRegression, True, False), - (LinearRegression, False, True), - (LinearRegression, False, False), - (Ridge, True, False), + [#(LinearRegression, True, False), + #(LinearRegression, False, True), + #(LinearRegression, False, False), + #(Ridge, True, False), (Ridge, False, True), - (Ridge, False, False)] + #(Ridge, False, False) + ] ) def test_linear_model_sample_weights_normalize_in_pipeline( estimator, is_sparse, with_mean @@ -406,27 +407,37 @@ def test_linear_model_sample_weights_normalize_in_pipeline( # LinearRegression with no normalize in a pipeline with a StandardScaler # and set sample_weight. rng = np.random.RandomState(0) - X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, - random_state=rng) + #X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, + # random_state=rng) + n_samples, n_features = 100, 2 + w = rng.randn(n_features) + X = rng.randn(n_samples, n_features) + X += 20 # make features non-zero mean + y = X.dot(w) # XXX : should add some intercept + + params = {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 1.} # make sure the data is not centered to make the problem more # difficult - X += 10 + #X += 10 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rng) if is_sparse: X_train = sparse.csr_matrix(X_train) X_test = _convert_container(X_train, 'sparse') - sample_weight = rng.rand(X_train.shape[0]) + sample_weight = 0.1 * rng.rand(X_train.shape[0]) + new_params = dict(alpha=params['alpha'] * X_train.shape[0]) + # linear estimator with explicit sample_weight - reg_with_normalize = estimator(normalize=True) + reg_with_normalize = estimator(normalize=True, fit_intercept=True, + **params) reg_with_normalize.fit(X_train, 
y_train, sample_weight=sample_weight) # linear estimator in a pipeline reg_with_scaler = make_pipeline( StandardScaler(with_mean=with_mean), - estimator(normalize=False) + estimator(normalize=False, fit_intercept=True, **new_params) ) kwargs = {reg_with_scaler.steps[0][0] + '__sample_weight': sample_weight, @@ -437,10 +448,17 @@ def test_linear_model_sample_weights_normalize_in_pipeline( y_pred_norm = reg_with_normalize.predict(X_test) y_pred_pip = reg_with_scaler.predict(X_test) - assert_allclose( - reg_with_normalize.coef_ * reg_with_scaler[0].scale_, - reg_with_scaler[1].coef_ - ) + # assert_allclose( + # reg_with_normalize.coef_ * reg_with_scaler[0].scale_, + # reg_with_scaler[1].coef_ + #) + #assert_allclose(y_pred_norm, y_pred_pip) + y_train_mean = np.average(y_train, weights=sample_weight) + X_train_mean = np.average(X_train, weights=sample_weight, axis=0) + assert reg_with_scaler[1].intercept_ == pytest.approx(y_train_mean) + assert (reg_with_normalize.intercept_ == + pytest.approx(y_train_mean - + reg_with_normalize.coef_.dot(X_train_mean))) assert_allclose(y_pred_norm, y_pred_pip) From 66cf82dcca57bcbad94c287f835f3d98cc0da82b Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 11:28:47 +0100 Subject: [PATCH 03/54] updated the test to include update in alpha --- .../tests/test_coordinate_descent.py | 69 +++++++++++++++---- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index c94306c161847..3afd4b318dc31 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -386,7 +386,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): model_normalize.coef_.dot(X_train.mean(0)))) assert_allclose(y_pred_normalize, y_pred_standardize) - +''' # FIXME: 'normalize' to be removed in 1.2 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") 
@pytest.mark.parametrize( @@ -399,26 +399,52 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): #(Ridge, False, False) ] ) +''' +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") +@pytest.mark.parametrize( + "estimator, params", + [(Lasso, {"tol": 1e-16, "alpha": 0.1}), + # (LassoLars, {"alpha": 0.1}), (unexpected sample_weight) + # (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), + (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), + (BayesianRidge, {}), + # (ARDRegression, {}), (unexpected sample_weight) + # (OrthogonalMatchingPursuit, {}), (unexpected sample_weight) + # (MultiTaskElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), (unexpected sample_weight) + # (MultiTaskElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), (unexpected sample_weight) + # (MultiTaskLasso, {"tol": 1e-16, "alpha": 0.1}), (unexpected sample_weight) + # (Lars, {}), (unexpected sample_weight) + (LinearRegression, {}), + # (LassoLarsIC, {}) (unexpected sample_weight) + ] +) +@pytest.mark.parametrize( + "is_sparse", + [False] #, True] +) +@pytest.mark.parametrize( + "with_mean", + [True, False] +) def test_linear_model_sample_weights_normalize_in_pipeline( - estimator, is_sparse, with_mean + with_mean, is_sparse, estimator, params ): # Test that the results for running linear regression LinearRegression with # sample_weight set and with normalize set to True gives similar results as # LinearRegression with no normalize in a pipeline with a StandardScaler # and set sample_weight. 
+ model_name = estimator.__name__ + rng = np.random.RandomState(0) - #X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, - # random_state=rng) - n_samples, n_features = 100, 2 - w = rng.randn(n_features) - X = rng.randn(n_samples, n_features) - X += 20 # make features non-zero mean - y = X.dot(w) # XXX : should add some intercept + X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, + random_state=rng) - params = {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 1.} # make sure the data is not centered to make the problem more # difficult - #X += 10 + X += 10 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rng) if is_sparse: @@ -426,8 +452,6 @@ def test_linear_model_sample_weights_normalize_in_pipeline( X_test = _convert_container(X_train, 'sparse') sample_weight = 0.1 * rng.rand(X_train.shape[0]) - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - # linear estimator with explicit sample_weight reg_with_normalize = estimator(normalize=True, fit_intercept=True, @@ -437,8 +461,25 @@ def test_linear_model_sample_weights_normalize_in_pipeline( # linear estimator in a pipeline reg_with_scaler = make_pipeline( StandardScaler(with_mean=with_mean), - estimator(normalize=False, fit_intercept=True, **new_params) + estimator(normalize=False, fit_intercept=True, **params) ) + if 'alpha' in params: + # reg_with_scaler.set_params(alpha=params['alpha']) + if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: + new_params = dict( + alpha=params['alpha'] * np.sqrt(X_train.shape[0])) + if model_name in ['Ridge', 'RidgeClassifier']: + new_params = dict(alpha=params['alpha'] * X_train.shape[0]) + if model_name in ['ElasticNet', 'MultiTaskElasticNet']: + if params['l1_ratio'] == 1: + new_params = dict( + alpha=params['alpha'] * np.sqrt(X_train.shape[0])) + if params['l1_ratio'] == 0: + new_params = dict(alpha=params['alpha'] * X_train.shape[0]) + + if 'new_params' in locals(): + 
reg_with_scaler[1].set_params(**new_params) + kwargs = {reg_with_scaler.steps[0][0] + '__sample_weight': sample_weight, reg_with_scaler.steps[-1][0] + '__sample_weight': From 165a5e10179ce31a8cdb3e071025ad2deb5f7a56 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 11:32:34 +0100 Subject: [PATCH 04/54] updated normalize to include sample_weight when x is sparse --- sklearn/linear_model/_base.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index f84d4234c193c..0904072cf51b6 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -33,6 +33,7 @@ from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot +from ..utils.extmath import _incremental_weighted_mean_and_var from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale from ..utils.fixes import sparse_lsqr from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 @@ -40,7 +41,6 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.fixes import delayed -from ..preprocessing import normalize as f_normalize # TODO: bayesian_ridge_regression and bayesian_regression_ard # should be squashed into its respective objects. 
@@ -229,12 +229,12 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if fit_intercept: if sp.issparse(X): - X_offset, X_var = mean_variance_axis(X, axis=0) + X_offset, X_var = mean_variance_axis(X, axis=0, + weights=sample_weight) if not return_mean: X_offset[:] = X.dtype.type(0) if normalize: - # TODO: f_normalize could be used here as well but the function # inplace_csr_row_normalize_l2 must be changed such that it # can return also the norms computed internally @@ -249,13 +249,19 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_scale = np.ones(X.shape[1], dtype=X.dtype) else: - X_offset = np.average(X, axis=0, weights=sample_weight) + X_offset, X_var, _ = \ + _incremental_weighted_mean_and_var(X, sample_weight, + last_mean=0., + last_variance=0., + last_weight_sum=0.) X -= X_offset + if normalize: - X, X_scale = f_normalize(X, axis=0, copy=False, - return_norm=True) + X_scale = np.sqrt(X_var) * np.sqrt(len(X)) + X = X / X_scale else: X_scale = np.ones(X.shape[1], dtype=X.dtype) + y_offset = np.average(y, axis=0, weights=sample_weight) y = y - y_offset else: From 41366669c7b078d4b3b4dc507a6212406a2aacb1 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 13:36:30 +0100 Subject: [PATCH 05/54] update the old test for the correct normalize --- sklearn/linear_model/tests/test_base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 75cc9dd5fd8f1..f0b31ce66169a 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -470,10 +470,11 @@ def test_preprocess_data_weighted(): expected_X_mean = np.average(X, axis=0, weights=sample_weight) expected_y_mean = np.average(y, axis=0, weights=sample_weight) - # XXX: if normalize=True, should we expect a weighted standard deviation? 
- # Currently not weighted, but calculated with respect to weighted mean - expected_X_norm = (np.sqrt(X.shape[0]) * - np.mean((X - expected_X_mean) ** 2, axis=0) ** .5) + X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0) + X_sample_weight_var = np.average((X-X_sample_weight_avg)**2, + weights=sample_weight, + axis=0) + expected_X_norm = np.sqrt(X_sample_weight_var) * np.sqrt(len(X)) Xt, yt, X_mean, y_mean, X_norm = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, From ff266b4f2c2130ccfa8590d3c98bbcaef8442b07 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 14:05:12 +0100 Subject: [PATCH 06/54] keep working on the pipeline test --- .../tests/test_coordinate_descent.py | 35 +++---------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 3afd4b318dc31..c3a0b58b28009 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -386,39 +386,17 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): model_normalize.coef_.dot(X_train.mean(0)))) assert_allclose(y_pred_normalize, y_pred_standardize) -''' -# FIXME: 'normalize' to be removed in 1.2 -@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") -@pytest.mark.parametrize( - "estimator, is_sparse, with_mean", - [#(LinearRegression, True, False), - #(LinearRegression, False, True), - #(LinearRegression, False, False), - #(Ridge, True, False), - (Ridge, False, True), - #(Ridge, False, False) - ] -) -''' + # FIXME: 'normalize' to be removed in 1.2 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "estimator, params", [(Lasso, {"tol": 1e-16, "alpha": 0.1}), - # (LassoLars, {"alpha": 0.1}), (unexpected sample_weight) # (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, 
"alpha": 0.1}), (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), - (BayesianRidge, {}), - # (ARDRegression, {}), (unexpected sample_weight) - # (OrthogonalMatchingPursuit, {}), (unexpected sample_weight) - # (MultiTaskElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), (unexpected sample_weight) - # (MultiTaskElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), (unexpected sample_weight) - # (MultiTaskLasso, {"tol": 1e-16, "alpha": 0.1}), (unexpected sample_weight) - # (Lars, {}), (unexpected sample_weight) (LinearRegression, {}), - # (LassoLarsIC, {}) (unexpected sample_weight) ] ) @pytest.mark.parametrize( @@ -451,7 +429,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( X_train = sparse.csr_matrix(X_train) X_test = _convert_container(X_train, 'sparse') - sample_weight = 0.1 * rng.rand(X_train.shape[0]) + sample_weight = rng.rand(X_train.shape[0]) # linear estimator with explicit sample_weight reg_with_normalize = estimator(normalize=True, fit_intercept=True, @@ -465,12 +443,12 @@ def test_linear_model_sample_weights_normalize_in_pipeline( ) if 'alpha' in params: # reg_with_scaler.set_params(alpha=params['alpha']) - if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: + if model_name in ['Lasso']: new_params = dict( alpha=params['alpha'] * np.sqrt(X_train.shape[0])) if model_name in ['Ridge', 'RidgeClassifier']: new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - if model_name in ['ElasticNet', 'MultiTaskElasticNet']: + if model_name in ['ElasticNet']: if params['l1_ratio'] == 1: new_params = dict( alpha=params['alpha'] * np.sqrt(X_train.shape[0])) @@ -489,11 +467,6 @@ def test_linear_model_sample_weights_normalize_in_pipeline( y_pred_norm = reg_with_normalize.predict(X_test) y_pred_pip = reg_with_scaler.predict(X_test) - # assert_allclose( - # reg_with_normalize.coef_ * reg_with_scaler[0].scale_, - # reg_with_scaler[1].coef_ - #) - 
#assert_allclose(y_pred_norm, y_pred_pip) y_train_mean = np.average(y_train, weights=sample_weight) X_train_mean = np.average(X_train, weights=sample_weight, axis=0) assert reg_with_scaler[1].intercept_ == pytest.approx(y_train_mean) From b386af7066f69fd7085e06bffaf40a2b2d8f8a64 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 14:59:19 +0100 Subject: [PATCH 07/54] add test for sparse and sample_weight when testing for _preprocess_data --- sklearn/linear_model/tests/test_base.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index f0b31ce66169a..4fd856841567a 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -461,11 +461,16 @@ def test_preprocess_data_multioutput(): assert_array_almost_equal(yt, y - y_mean) -def test_preprocess_data_weighted(): +@pytest.mark.parametrize( + "is_sparse", + [False, True] +) +def test_preprocess_data_weighted(is_sparse): n_samples = 200 n_features = 2 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) + sample_weight = rng.rand(n_samples) expected_X_mean = np.average(X, axis=0, weights=sample_weight) expected_y_mean = np.average(y, axis=0, weights=sample_weight) @@ -476,22 +481,27 @@ def test_preprocess_data_weighted(): axis=0) expected_X_norm = np.sqrt(X_sample_weight_var) * np.sqrt(len(X)) + if is_sparse: + X = sparse.csr_matrix(X) + Xt, yt, X_mean, y_mean, X_norm = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, - sample_weight=sample_weight) + sample_weight=sample_weight, return_mean=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, np.ones(n_features)) - assert_array_almost_equal(Xt, X - expected_X_mean) + if not is_sparse: + assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) Xt, yt, 
X_mean, y_mean, X_norm = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, - sample_weight=sample_weight) + sample_weight=sample_weight, return_mean=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + if not is_sparse: + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) assert_array_almost_equal(yt, y - expected_y_mean) From 2a6c2134ec647eb5623b94cfd7f8bd89d167273f Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 16:58:35 +0100 Subject: [PATCH 08/54] corrected test with dtype, 1 test remaining to be corrected --- sklearn/linear_model/tests/test_base.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 4fd856841567a..50fb485470236 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -490,7 +490,9 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, np.ones(n_features)) - if not is_sparse: + if is_sparse: + assert_array_almost_equal(Xt.toarray(), X.toarray() - expected_X_mean) + else: assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) @@ -500,8 +502,14 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) - if not is_sparse: - assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + if is_sparse: + assert_array_almost_equal( + Xt.toarray(), (X.toarray() - expected_X_mean) / expected_X_norm + ) + else: + assert_array_almost_equal( + Xt, (X 
- expected_X_mean) / expected_X_norm + ) assert_array_almost_equal(yt, y - expected_y_mean) From 82c6344602fa3453af0a6c4b4455f56f8e7258d1 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 16:59:30 +0100 Subject: [PATCH 09/54] cleanup --- sklearn/linear_model/tests/test_coordinate_descent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index c3a0b58b28009..b3de4fb5d812e 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -401,11 +401,11 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): ) @pytest.mark.parametrize( "is_sparse", - [False] #, True] + [False] # , True] ) @pytest.mark.parametrize( "with_mean", - [True, False] + [True] # , False] ) def test_linear_model_sample_weights_normalize_in_pipeline( with_mean, is_sparse, estimator, params From 187833c7ea7a66d5bbe0eb120e91ea270f51a576 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 21:10:19 +0100 Subject: [PATCH 10/54] use _incremental_mean_and_var --- sklearn/linear_model/_base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 0904072cf51b6..ae0c82052a6a5 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -33,7 +33,7 @@ from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot -from ..utils.extmath import _incremental_weighted_mean_and_var +from ..utils.extmath import _incremental_mean_and_var from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale from ..utils.fixes import sparse_lsqr from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 @@ -250,10 +250,11 @@ def _preprocess_data(X, y, fit_intercept, 
normalize=False, copy=True, else: X_offset, X_var, _ = \ - _incremental_weighted_mean_and_var(X, sample_weight, - last_mean=0., - last_variance=0., - last_weight_sum=0.) + _incremental_mean_and_var(X, + last_mean=0., + last_variance=0., + last_sample_count=0., + sample_weight=sample_weight) X -= X_offset if normalize: From 153d5c2fbaa200a0eedd485dc414749d47dd68a8 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 21:24:29 +0100 Subject: [PATCH 11/54] update X_offset and X_var to be of the correct type --- sklearn/linear_model/_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index ae0c82052a6a5..ce8cfc2ca3e9d 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -249,12 +249,16 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_scale = np.ones(X.shape[1], dtype=X.dtype) else: + xtype = X.dtype + X_offset, X_var, _ = \ _incremental_mean_and_var(X, last_mean=0., last_variance=0., last_sample_count=0., sample_weight=sample_weight) + X_offset = X_offset.astype(xtype) + X_var = X_var.astype(xtype) X -= X_offset if normalize: From ae9ea27aa3ee98990fd81c1c5259668c089b8f74 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 21:25:43 +0100 Subject: [PATCH 12/54] fix some tests --- sklearn/linear_model/_base.py | 4 ++++ sklearn/linear_model/tests/test_base.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index ae0c82052a6a5..b384f4e6186df 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -255,6 +255,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, last_variance=0., last_sample_count=0., sample_weight=sample_weight) + + X_var = X_var.astype(X.dtype) + X_offset = X_offset.astype(X.dtype) + X -= X_offset if normalize: diff --git a/sklearn/linear_model/tests/test_base.py 
b/sklearn/linear_model/tests/test_base.py index 50fb485470236..fab9aa20b6db7 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -476,7 +476,7 @@ def test_preprocess_data_weighted(is_sparse): expected_y_mean = np.average(y, axis=0, weights=sample_weight) X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0) - X_sample_weight_var = np.average((X-X_sample_weight_avg)**2, + X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, weights=sample_weight, axis=0) expected_X_norm = np.sqrt(X_sample_weight_var) * np.sqrt(len(X)) @@ -491,7 +491,7 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, np.ones(n_features)) if is_sparse: - assert_array_almost_equal(Xt.toarray(), X.toarray() - expected_X_mean) + assert_array_almost_equal(Xt.toarray(), X.toarray()) else: assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) @@ -504,7 +504,7 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(X_norm, expected_X_norm) if is_sparse: assert_array_almost_equal( - Xt.toarray(), (X.toarray() - expected_X_mean) / expected_X_norm + Xt.toarray(), X.toarray() / expected_X_norm ) else: assert_array_almost_equal( From 1f433ac7a7c9181ef216cd033399f29448c66f8f Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 21:40:04 +0100 Subject: [PATCH 13/54] fix more tests --- sklearn/linear_model/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 016f0a3c77be5..5cba9803b459c 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -263,6 +263,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if normalize: X_scale = np.sqrt(X_var) * np.sqrt(len(X)) + X_scale[X_scale == 0.0] = 1.0 X = X / X_scale else: X_scale = np.ones(X.shape[1], 
dtype=X.dtype) From 563074e2fd12805a9534fe3b5cd0b55c7b0634d3 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 21:57:09 +0100 Subject: [PATCH 14/54] more fixes --- .../tests/test_coordinate_descent.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index b3de4fb5d812e..1f4f3885387c8 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -29,6 +29,7 @@ from sklearn.utils._testing import _convert_container from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import parse_version +from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.linear_model import ( ARDRegression, @@ -391,7 +392,8 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "estimator, params", - [(Lasso, {"tol": 1e-16, "alpha": 0.1}), + [ + (Lasso, {"tol": 1e-16, "alpha": 0.1}), # (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), @@ -400,12 +402,12 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): ] ) @pytest.mark.parametrize( - "is_sparse", - [False] # , True] -) -@pytest.mark.parametrize( - "with_mean", - [True] # , False] + "is_sparse, with_mean", [ + (False, True), + (False, False), + (True, False) + # No need to test sparse and with_mean=True + ] ) def test_linear_model_sample_weights_normalize_in_pipeline( with_mean, is_sparse, estimator, params @@ -416,6 +418,9 @@ def test_linear_model_sample_weights_normalize_in_pipeline( # and set sample_weight. 
model_name = estimator.__name__ + if model_name in ['Lasso', 'ElasticNet'] and is_sparse: + pytest.skip(f'{model_name} does not suppert sample_weight with sparse') + rng = np.random.RandomState(0) X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, random_state=rng) @@ -442,7 +447,6 @@ def test_linear_model_sample_weights_normalize_in_pipeline( estimator(normalize=False, fit_intercept=True, **params) ) if 'alpha' in params: - # reg_with_scaler.set_params(alpha=params['alpha']) if model_name in ['Lasso']: new_params = dict( alpha=params['alpha'] * np.sqrt(X_train.shape[0])) @@ -468,8 +472,11 @@ def test_linear_model_sample_weights_normalize_in_pipeline( y_pred_pip = reg_with_scaler.predict(X_test) y_train_mean = np.average(y_train, weights=sample_weight) - X_train_mean = np.average(X_train, weights=sample_weight, axis=0) - assert reg_with_scaler[1].intercept_ == pytest.approx(y_train_mean) + if is_sparse: + X_train_mean, _ = mean_variance_axis(X_train, axis=0, + weights=sample_weight) + else: + X_train_mean = np.average(X_train, weights=sample_weight, axis=0) assert (reg_with_normalize.intercept_ == pytest.approx(y_train_mean - reg_with_normalize.coef_.dot(X_train_mean))) From 0e98592c69669dbe08f7b530ccb58dfeee0449a7 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 21:58:30 +0100 Subject: [PATCH 15/54] more fixes --- sklearn/linear_model/tests/test_coordinate_descent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 1f4f3885387c8..42db2fa9c06cd 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -394,7 +394,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): "estimator, params", [ (Lasso, {"tol": 1e-16, "alpha": 0.1}), - # (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), + 
(RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), @@ -425,6 +425,9 @@ def test_linear_model_sample_weights_normalize_in_pipeline( X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, random_state=rng) + if is_classifier(estimator): + y = np.sign(y) + # make sure the data is not centered to make the problem more # difficult X += 10 From 29ce06037979f5e4aeba242dcd41e77cf9b9e94c Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 22:06:42 +0100 Subject: [PATCH 16/54] typo --- sklearn/linear_model/tests/test_coordinate_descent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 42db2fa9c06cd..95e56b8263781 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -419,7 +419,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( model_name = estimator.__name__ if model_name in ['Lasso', 'ElasticNet'] and is_sparse: - pytest.skip(f'{model_name} does not suppert sample_weight with sparse') + pytest.skip(f'{model_name} does not support sample_weight with sparse') rng = np.random.RandomState(0) X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, From 65528f499ad4ac6d15895f3846f0608266c83739 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 10 Feb 2021 22:19:04 +0100 Subject: [PATCH 17/54] factorize code --- .../tests/test_coordinate_descent.py | 53 ++++++++----------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 95e56b8263781..e58c630ebcb8a 100644 --- 
a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -303,6 +303,24 @@ def test_lasso_cv_positive_constraint(): assert min(clf_constrained.coef_) >= 0 +def _scale_alpha(estimator, n_samples): + if 'alpha' not in estimator.get_params(): + return + + model_name = estimator.__class__.__name__ + if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: + alpha = estimator.alpha * np.sqrt(n_samples) + if model_name in ['Ridge', 'RidgeClassifier']: + alpha = estimator.alpha * n_samples + if model_name in ['ElasticNet', 'MultiTaskElasticNet']: + if estimator.l1_ratio == 1: + alpha = estimator.alpha * np.sqrt(n_samples) + if estimator.l1_ratio == 0: + alpha = estimator.alpha * n_samples + + estimator.set_params(alpha=alpha) + + # FIXME: 'normalize' to be removed in 1.2 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( @@ -329,7 +347,6 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # in the pipeline and with normalize set to False # normalize is True - model_name = LinearModel.__name__ model_normalize = LinearModel(normalize=True, fit_intercept=True, **params) pipeline = make_pipeline( @@ -356,22 +373,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - if 'alpha' in params: - model_normalize.set_params(alpha=params['alpha']) - if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if model_name in ['Ridge', 'RidgeClassifier']: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - if model_name in ['ElasticNet', 'MultiTaskElasticNet']: - if params['l1_ratio'] == 1: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if params['l1_ratio'] == 0: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - - if 'new_params' in 
locals(): - pipeline[1].set_params(**new_params) + _scale_alpha(pipeline[1], X_train.shape[0]) model_normalize.fit(X_train, y_train) y_pred_normalize = model_normalize.predict(X_test) @@ -449,21 +451,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline( StandardScaler(with_mean=with_mean), estimator(normalize=False, fit_intercept=True, **params) ) - if 'alpha' in params: - if model_name in ['Lasso']: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if model_name in ['Ridge', 'RidgeClassifier']: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - if model_name in ['ElasticNet']: - if params['l1_ratio'] == 1: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if params['l1_ratio'] == 0: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - - if 'new_params' in locals(): - reg_with_scaler[1].set_params(**new_params) + + _scale_alpha(reg_with_scaler[1], X_train.shape[0]) kwargs = {reg_with_scaler.steps[0][0] + '__sample_weight': sample_weight, From 84fbd1a5fde104384458e34a4ce5c5d5cd206d93 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 10 Feb 2021 22:29:37 +0100 Subject: [PATCH 18/54] added whats new --- doc/whats_new/v1.0.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index d073ffd80bdf7..46b1bba325ab0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -131,6 +131,12 @@ Changelog :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. +- |Fix|: Fixed a bug in linear_model._base._preprocess_data when + `normalize=True` and `sample_weight` is set. `sample_weight` now weights + standard deviation as expected. + :pr:`19426` by :user:`Alexandre Gramfort ` and + :user:`Maria Telenczuk `. + :mod:`sklearn.metrics` ...................... 
From 61702a9b53258440305cd4add902d2cdbbf1bd55 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Thu, 11 Feb 2021 09:01:39 +0100 Subject: [PATCH 19/54] nitpick on what's new --- doc/whats_new/v1.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 46b1bba325ab0..9af28f4abd476 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -131,9 +131,9 @@ Changelog :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. -- |Fix|: Fixed a bug in linear_model._base._preprocess_data when - `normalize=True` and `sample_weight` is set. `sample_weight` now weights - standard deviation as expected. +- |Fix|: `sample_weight` are now fully taken into account in linear models + when `normalize=True` for both feature centering and feature + scaling. :pr:`19426` by :user:`Alexandre Gramfort ` and :user:`Maria Telenczuk `. From 9b9ee563dcb38a9ab9b4e979b6be7bcffe5dd2b4 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 09:38:05 +0100 Subject: [PATCH 20/54] Update sklearn/linear_model/tests/test_base.py Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/tests/test_base.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index fab9aa20b6db7..c6a88e64f0039 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -461,10 +461,7 @@ def test_preprocess_data_multioutput(): assert_array_almost_equal(yt, y - y_mean) -@pytest.mark.parametrize( - "is_sparse", - [False, True] -) +@pytest.mark.parametrize("is_sparse", [False, True]) def test_preprocess_data_weighted(is_sparse): n_samples = 200 n_features = 2 From e7b6e9246d1fc4269c99bda30cb99a827d9ced71 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 09:44:06 +0100 Subject: [PATCH 21/54] restructuring the code (if normalize separate) --- 
sklearn/linear_model/_base.py | 50 ++++++++++++++--------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 5cba9803b459c..4a46578a0e660 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -229,44 +229,34 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if fit_intercept: if sp.issparse(X): - X_offset, X_var = mean_variance_axis(X, axis=0, - weights=sample_weight) + X_offset, X_var = mean_variance_axis( + X, axis=0, weights=sample_weight + ) + if not return_mean: X_offset[:] = X.dtype.type(0) - - if normalize: - # TODO: f_normalize could be used here as well but the function - # inplace_csr_row_normalize_l2 must be changed such that it - # can return also the norms computed internally - - # transform variance to norm in-place - X_var *= X.shape[0] - X_scale = np.sqrt(X_var, X_var) - del X_var - X_scale[X_scale == 0] = 1 - inplace_column_scale(X, 1. / X_scale) - else: - X_scale = np.ones(X.shape[1], dtype=X.dtype) - else: - X_offset, X_var, _ = \ - _incremental_mean_and_var(X, - last_mean=0., - last_variance=0., - last_sample_count=0., - sample_weight=sample_weight) + X_offset, X_var, _ = _incremental_mean_and_var( + X, last_mean=0., last_variance=0., last_sample_count=0., + sample_weight=sample_weight + ) X_offset = X_offset.astype(X.dtype) - X_var = X_var.astype(X.dtype) - X -= X_offset - if normalize: - X_scale = np.sqrt(X_var) * np.sqrt(len(X)) - X_scale[X_scale == 0.0] = 1.0 - X = X / X_scale + X_var = X_var.astype(X.dtype) + + if normalize: + X_var *= X.shape[0] + X_scale = np.sqrt(X_var, X_var) + del X_var + X_scale[X_scale == 0] = 1 + if sp.issparse(X): + inplace_column_scale(X, 1. 
/ X_scale) else: - X_scale = np.ones(X.shape[1], dtype=X.dtype) + X /= X_scale + else: + X_scale = np.ones(X.shape[1], dtype=X.dtype) y_offset = np.average(y, axis=0, weights=sample_weight) y = y - y_offset From b366488caf1165d01e438ba83ef531324219777e Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 09:45:39 +0100 Subject: [PATCH 22/54] remove del x_var --- sklearn/linear_model/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 4a46578a0e660..171f08fc6dfaf 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -249,7 +249,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if normalize: X_var *= X.shape[0] X_scale = np.sqrt(X_var, X_var) - del X_var + X_scale[X_scale == 0] = 1 if sp.issparse(X): inplace_column_scale(X, 1. / X_scale) From e055335536c55c7da5954f1062cdc438d80a3f0c Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 09:58:38 +0100 Subject: [PATCH 23/54] update the docstring of the test --- sklearn/linear_model/tests/test_coordinate_descent.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index e58c630ebcb8a..dec26e598e2a2 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -414,10 +414,10 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): def test_linear_model_sample_weights_normalize_in_pipeline( with_mean, is_sparse, estimator, params ): - # Test that the results for running linear regression LinearRegression with - # sample_weight set and with normalize set to True gives similar results as - # LinearRegression with no normalize in a pipeline with a StandardScaler - # and set sample_weight. 
+ # Test that the results for running linear model with sample_weight + # and with normalize set to True gives similar results as the same linear + # model with normalize set to False in a pipeline with + # a StandardScaler and sample_weight. model_name = estimator.__name__ if model_name in ['Lasso', 'ElasticNet'] and is_sparse: From 0db7fe1140156fbe0235b89e5f65a95f2997af49 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 09:59:57 +0100 Subject: [PATCH 24/54] Update sklearn/linear_model/tests/test_base.py Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/tests/test_base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index c6a88e64f0039..67410b3c0c60d 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -500,13 +500,14 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) if is_sparse: + # X is not centered assert_array_almost_equal( Xt.toarray(), X.toarray() / expected_X_norm - ) + ) else: assert_array_almost_equal( Xt, (X - expected_X_mean) / expected_X_norm - ) + ) assert_array_almost_equal(yt, y - expected_y_mean) From faaec9c01e1a69bd3c180ccbbd9155f9855886dc Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 10:32:25 +0100 Subject: [PATCH 25/54] improve the docstrings of the tests --- sklearn/linear_model/tests/test_coordinate_descent.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index dec26e598e2a2..a4ab09c43c65b 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -304,6 +304,9 @@ def test_lasso_cv_positive_constraint(): def _scale_alpha(estimator, 
n_samples): + """"Rescale the parameter alpha from when the estimator is evoked with + normalize set to True to when it is evoked in a Pipeline with normalize set + to False and with a StandardScaler.""" if 'alpha' not in estimator.get_params(): return @@ -321,7 +324,9 @@ def _scale_alpha(estimator, n_samples): estimator.set_params(alpha=alpha) -# FIXME: 'normalize' to be removed in 1.2 +# FIXME: 'normalize' to be removed in 1.2 for all the models excluding: +# OrthogonalMatchingPursuit, Lars, LassoLars, LarsCV, LassoLarsCV +# for which it is to be removed in 1.4 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "LinearModel, params", From f9f2b872f992c8b0bfdafa42b4a5c5c5d560f309 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 10:41:25 +0100 Subject: [PATCH 26/54] Update sklearn/linear_model/tests/test_coordinate_descent.py Co-authored-by: Guillaume Lemaitre --- .../linear_model/tests/test_coordinate_descent.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index a4ab09c43c65b..7fa9688b95c6e 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -400,12 +400,12 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): @pytest.mark.parametrize( "estimator, params", [ - (Lasso, {"tol": 1e-16, "alpha": 0.1}), - (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), - (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), - (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), - (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), - (LinearRegression, {}), + (Lasso, {"tol": 1e-16, "alpha": 0.1}), + (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, 
"alpha": 0.1}), + (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), + (LinearRegression, {}), ] ) @pytest.mark.parametrize( From 1e41bbdfa3b81e61a1e184e0e955a66c0d16a287 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 10:48:47 +0100 Subject: [PATCH 27/54] change the order of the params --- sklearn/linear_model/tests/test_coordinate_descent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 7fa9688b95c6e..9dbd5c22317c7 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -417,7 +417,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): ] ) def test_linear_model_sample_weights_normalize_in_pipeline( - with_mean, is_sparse, estimator, params + is_sparse, with_mean, estimator, params ): # Test that the results for running linear model with sample_weight # and with normalize set to True gives similar results as the same linear From 06035b7d963322027a6cdd1b89a4427454a5b398 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Thu, 11 Feb 2021 11:30:50 +0100 Subject: [PATCH 28/54] add inplace in function name to make it explicit --- sklearn/linear_model/tests/test_coordinate_descent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 9dbd5c22317c7..c94f7a078ce74 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -303,7 +303,7 @@ def test_lasso_cv_positive_constraint(): assert min(clf_constrained.coef_) >= 0 -def _scale_alpha(estimator, n_samples): +def _scale_alpha_inplace(estimator, n_samples): """"Rescale the parameter alpha from when the estimator is evoked with normalize set to True to when it is 
evoked in a Pipeline with normalize set to False and with a StandardScaler.""" @@ -378,7 +378,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - _scale_alpha(pipeline[1], X_train.shape[0]) + _scale_alpha_inplace(pipeline[1], X_train.shape[0]) model_normalize.fit(X_train, y_train) y_pred_normalize = model_normalize.predict(X_test) @@ -457,7 +457,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( estimator(normalize=False, fit_intercept=True, **params) ) - _scale_alpha(reg_with_scaler[1], X_train.shape[0]) + _scale_alpha_inplace(reg_with_scaler[1], X_train.shape[0]) kwargs = {reg_with_scaler.steps[0][0] + '__sample_weight': sample_weight, From f5fd4d1f79089f7fe2a03f686e540f999b7825e4 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 13:12:32 +0100 Subject: [PATCH 29/54] Update sklearn/linear_model/tests/test_base.py Co-authored-by: Olivier Grisel --- sklearn/linear_model/tests/test_base.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 67410b3c0c60d..85566f98a996b 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -465,7 +465,16 @@ def test_preprocess_data_multioutput(): def test_preprocess_data_weighted(is_sparse): n_samples = 200 n_features = 2 + # Generate random data with 50% of zero values to make sure + # that the sparse variant of this test is actually sparse. This also + # shifts the mean value for each columns in X further away from + # zero. X = rng.rand(n_samples, n_features) + X[X < 0.5] = 0. + + # Scale the first feature of X to be 10 larger than the other to + # better check the impact of feature scaling. 
+ X[:, 0] *= 10 y = rng.rand(n_samples) sample_weight = rng.rand(n_samples) From 53a8d5390cf6030de3d535c371837afa2fa50491 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 13:13:22 +0100 Subject: [PATCH 30/54] Update sklearn/linear_model/tests/test_base.py Co-authored-by: Olivier Grisel --- sklearn/linear_model/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 85566f98a996b..e765dc1b5a9ea 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -485,7 +485,7 @@ def test_preprocess_data_weighted(is_sparse): X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, weights=sample_weight, axis=0) - expected_X_norm = np.sqrt(X_sample_weight_var) * np.sqrt(len(X)) + expected_X_norm = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) if is_sparse: X = sparse.csr_matrix(X) From 15f5ad722d94f942a4e3b4a7ce8d353b923c3b06 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 13:18:38 +0100 Subject: [PATCH 31/54] change checking name of the model for isinstance() --- sklearn/linear_model/tests/test_coordinate_descent.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index c94f7a078ce74..d451c4aa16d21 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -310,12 +310,11 @@ def _scale_alpha_inplace(estimator, n_samples): if 'alpha' not in estimator.get_params(): return - model_name = estimator.__class__.__name__ - if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: + if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): alpha = estimator.alpha * np.sqrt(n_samples) - if model_name in ['Ridge', 'RidgeClassifier']: + if isinstance(estimator, (Ridge, 
RidgeClassifier)): alpha = estimator.alpha * n_samples - if model_name in ['ElasticNet', 'MultiTaskElasticNet']: + if isinstance(estimator, (ElasticNet, MultiTaskElasticNet)): if estimator.l1_ratio == 1: alpha = estimator.alpha * np.sqrt(n_samples) if estimator.l1_ratio == 0: From 9ac0a66747ce5584d39b5be4729112d91011e64e Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 13:29:50 +0100 Subject: [PATCH 32/54] update the test --- sklearn/linear_model/tests/test_coordinate_descent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index d451c4aa16d21..ff99ecad83725 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -317,8 +317,11 @@ def _scale_alpha_inplace(estimator, n_samples): if isinstance(estimator, (ElasticNet, MultiTaskElasticNet)): if estimator.l1_ratio == 1: alpha = estimator.alpha * np.sqrt(n_samples) - if estimator.l1_ratio == 0: + elif estimator.l1_ratio == 0: alpha = estimator.alpha * n_samples + else: + # To avoid silent errors in case of refactoring + raise NotImplementedError estimator.set_params(alpha=alpha) From 75025b914de87276e831fe7eab984a57ee4f19b0 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 13:31:51 +0100 Subject: [PATCH 33/54] cleanup --- sklearn/linear_model/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index e765dc1b5a9ea..735979520ce45 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -471,7 +471,7 @@ def test_preprocess_data_weighted(is_sparse): # zero. X = rng.rand(n_samples, n_features) X[X < 0.5] = 0. - + # Scale the first feature of X to be 10 larger than the other to # better check the impact of feature scaling. 
X[:, 0] *= 10 From b29d37f42055a1fd578f3c455b4fd8a62fe846b4 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 14:07:57 +0100 Subject: [PATCH 34/54] update X update --- sklearn/linear_model/tests/test_coordinate_descent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index ff99ecad83725..485e5762335bf 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -439,7 +439,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline( # make sure the data is not centered to make the problem more # difficult - X += 10 + X[X < 0] = 0 + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rng) if is_sparse: From cfe824d30a414dda67ce846e02c917a49195f94e Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 14:08:49 +0100 Subject: [PATCH 35/54] Update sklearn/linear_model/tests/test_coordinate_descent.py Co-authored-by: Olivier Grisel --- sklearn/linear_model/tests/test_coordinate_descent.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 485e5762335bf..ce8e472eaa839 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -304,9 +304,12 @@ def test_lasso_cv_positive_constraint(): def _scale_alpha_inplace(estimator, n_samples): - """"Rescale the parameter alpha from when the estimator is evoked with + """Rescale the alpha param to check equivalence with StandardScaler + + Rescale the alpha parameter from when the estimator is evoked with normalize set to True to when it is evoked in a Pipeline with normalize set - to False and with a StandardScaler.""" + to False and with a StandardScaler. 
+ """ if 'alpha' not in estimator.get_params(): return From 2efe5c38127ccaee7d1ead2d1346ccdea4d1b433 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 15:22:08 +0100 Subject: [PATCH 36/54] cleanup --- sklearn/linear_model/tests/test_coordinate_descent.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index ce8e472eaa839..afa121cd49f40 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -304,9 +304,7 @@ def test_lasso_cv_positive_constraint(): def _scale_alpha_inplace(estimator, n_samples): - """Rescale the alpha param to check equivalence with StandardScaler - - Rescale the alpha parameter from when the estimator is evoked with + """Rescale the parameter alpha from when the estimator is evoked with normalize set to True to when it is evoked in a Pipeline with normalize set to False and with a StandardScaler. 
""" @@ -441,7 +439,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( y = np.sign(y) # make sure the data is not centered to make the problem more - # difficult + # difficult + add 0s for the sparse case X[X < 0] = 0 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, From e062745c27e8e92ad39c262e73e19979962fe68b Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 15:39:15 +0100 Subject: [PATCH 37/54] towards comparing the results of the StandardScaler and _preprocess_data --- sklearn/linear_model/tests/test_base.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 735979520ce45..f6bc1f982bb1e 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -1,5 +1,6 @@ # Author: Alexandre Gramfort # Fabian Pedregosa +# Maria Telenczuk # # License: BSD 3 clause @@ -24,6 +25,7 @@ from sklearn.datasets import make_sparse_uncorrelated from sklearn.datasets import make_regression from sklearn.datasets import load_iris +from sklearn.preprocessing import StandardScaler rng = np.random.RandomState(0) rtol = 1e-6 @@ -490,6 +492,7 @@ def test_preprocess_data_weighted(is_sparse): if is_sparse: X = sparse.csr_matrix(X) + # normalize is False Xt, yt, X_mean, y_mean, X_norm = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, sample_weight=sample_weight, return_mean=True) @@ -502,9 +505,11 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) + # normalize is True Xt, yt, X_mean, y_mean, X_norm = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, sample_weight=sample_weight, return_mean=True) + assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) @@ -513,10 +518,16 @@ def 
test_preprocess_data_weighted(is_sparse): assert_array_almost_equal( Xt.toarray(), X.toarray() / expected_X_norm ) + scaler = StandardScaler(with_mean=False).fit( + X, sample_weight=sample_weight) else: assert_array_almost_equal( Xt, (X - expected_X_mean) / expected_X_norm ) + scaler = StandardScaler(with_mean=True).fit( + X, sample_weight=sample_weight) + assert_array_almost_equal(scaler.mean_, X_mean) + assert_array_almost_equal(scaler.transform(X), Xt) assert_array_almost_equal(yt, y - expected_y_mean) From 4733d975da355fc8fbd1e54c6dd45df847feef4f Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 16:15:29 +0100 Subject: [PATCH 38/54] fix the test --- sklearn/linear_model/tests/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index f6bc1f982bb1e..73d137f4d981b 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -520,6 +520,9 @@ def test_preprocess_data_weighted(is_sparse): ) scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) + assert_array_almost_equal( + scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() + ) else: assert_array_almost_equal( Xt, (X - expected_X_mean) / expected_X_norm @@ -527,7 +530,7 @@ def test_preprocess_data_weighted(is_sparse): scaler = StandardScaler(with_mean=True).fit( X, sample_weight=sample_weight) assert_array_almost_equal(scaler.mean_, X_mean) - assert_array_almost_equal(scaler.transform(X), Xt) + assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt) assert_array_almost_equal(yt, y - expected_y_mean) From 66ac2972f2ee666b8ad2c9a650e561ba5c15fda9 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 18:28:17 +0100 Subject: [PATCH 39/54] clean up the tests according to the reviewer guidelines --- .../tests/test_coordinate_descent.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 
deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index afa121cd49f40..59b7aa5133eb3 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -13,6 +13,7 @@ from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning @@ -450,24 +451,25 @@ def test_linear_model_sample_weights_normalize_in_pipeline( sample_weight = rng.rand(X_train.shape[0]) - # linear estimator with explicit sample_weight + # linear estimator with explicit sample_weight, normalize = True reg_with_normalize = estimator(normalize=True, fit_intercept=True, **params) reg_with_normalize.fit(X_train, y_train, sample_weight=sample_weight) - # linear estimator in a pipeline - reg_with_scaler = make_pipeline( - StandardScaler(with_mean=with_mean), - estimator(normalize=False, fit_intercept=True, **params) - ) + # linear estimator in a pipeline with a StandardScaler, normalize=False + linear_regressor = estimator(normalize=False, fit_intercept=True, **params) + _scale_alpha_inplace(linear_regressor, X_train.shape[0]) # rescale alpha + reg_with_scaler = Pipeline([ + ("scaler", StandardScaler(with_mean=with_mean)), + ("linear_regressor", linear_regressor) + ]) - _scale_alpha_inplace(reg_with_scaler[1], X_train.shape[0]) + fit_params = { + "scaler__sample_weight": sample_weight, + "linear_regressor__sample_weight": sample_weight, + } - kwargs = {reg_with_scaler.steps[0][0] + '__sample_weight': - sample_weight, - reg_with_scaler.steps[-1][0] + '__sample_weight': - sample_weight} - reg_with_scaler.fit(X_train, y_train, **kwargs) + reg_with_scaler.fit(X_train, y_train, **fit_params) y_pred_norm = reg_with_normalize.predict(X_test) y_pred_pip = 
reg_with_scaler.predict(X_test) From 32ebc775df6616e88b16517fbbb7a21de0458f13 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 18:30:30 +0100 Subject: [PATCH 40/54] more readibility improvements --- sklearn/linear_model/tests/test_coordinate_descent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 59b7aa5133eb3..58a96446e42fb 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -471,8 +471,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline( reg_with_scaler.fit(X_train, y_train, **fit_params) - y_pred_norm = reg_with_normalize.predict(X_test) - y_pred_pip = reg_with_scaler.predict(X_test) + y_pred_nomalize = reg_with_normalize.predict(X_test) + y_pred_scaler = reg_with_scaler.predict(X_test) y_train_mean = np.average(y_train, weights=sample_weight) if is_sparse: @@ -483,7 +483,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( assert (reg_with_normalize.intercept_ == pytest.approx(y_train_mean - reg_with_normalize.coef_.dot(X_train_mean))) - assert_allclose(y_pred_norm, y_pred_pip) + assert_allclose(y_pred_nomalize, y_pred_scaler) # FIXME: 'normalize' to be removed in 1.2 From 760285c0d3f92e3a0af75fa5d9e7b8a2b6313f4e Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 18:32:25 +0100 Subject: [PATCH 41/54] add more comments to the test --- sklearn/linear_model/tests/test_coordinate_descent.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 58a96446e42fb..349193490199d 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -471,6 +471,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline( 
reg_with_scaler.fit(X_train, y_train, **fit_params) + # Check that the 2 regressions models are exactly equivalent in the + # sense that they predict exactly the same outcome. y_pred_nomalize = reg_with_normalize.predict(X_test) y_pred_scaler = reg_with_scaler.predict(X_test) From 41f32b8c21048f5d6dfc7903e45604aca0bce194 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 20:40:21 +0100 Subject: [PATCH 42/54] further improve the test readability --- sklearn/linear_model/tests/test_coordinate_descent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 349193490199d..096338ab5d172 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -451,7 +451,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( sample_weight = rng.rand(X_train.shape[0]) - # linear estimator with explicit sample_weight, normalize = True + # linear estimator with built-in feature normalization reg_with_normalize = estimator(normalize=True, fit_intercept=True, **params) reg_with_normalize.fit(X_train, y_train, sample_weight=sample_weight) @@ -475,6 +475,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline( # sense that they predict exactly the same outcome. 
y_pred_nomalize = reg_with_normalize.predict(X_test) y_pred_scaler = reg_with_scaler.predict(X_test) + assert_allclose(y_pred_nomalize, y_pred_scaler) y_train_mean = np.average(y_train, weights=sample_weight) if is_sparse: @@ -485,7 +486,6 @@ def test_linear_model_sample_weights_normalize_in_pipeline( assert (reg_with_normalize.intercept_ == pytest.approx(y_train_mean - reg_with_normalize.coef_.dot(X_train_mean))) - assert_allclose(y_pred_nomalize, y_pred_scaler) # FIXME: 'normalize' to be removed in 1.2 From 0ab04c2a1ca8c35d587a41aa5e521a014be75891 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 20:46:06 +0100 Subject: [PATCH 43/54] move standardscaler test to the end + add explanation --- sklearn/linear_model/tests/test_base.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 73d137f4d981b..6551736d6056c 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -518,15 +518,22 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal( Xt.toarray(), X.toarray() / expected_X_norm ) + else: + assert_array_almost_equal( + Xt, (X - expected_X_mean) / expected_X_norm + ) + + # _preprocess_data with normalize=True scales the data by the feature-wise + # euclidean norms while StandardScaler scales the data by the feature-wise + # standard deviations. 
+ # The two are equivalent up to a ration of np.sqrt(n_samples) + if is_sparse: scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) assert_array_almost_equal( scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() ) else: - assert_array_almost_equal( - Xt, (X - expected_X_mean) / expected_X_norm - ) scaler = StandardScaler(with_mean=True).fit( X, sample_weight=sample_weight) assert_array_almost_equal(scaler.mean_, X_mean) From d34ab61705b39fb703f4a346a717a4f9fc36ad11 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Thu, 11 Feb 2021 20:46:42 +0100 Subject: [PATCH 44/54] Update sklearn/linear_model/tests/test_base.py Co-authored-by: Olivier Grisel --- sklearn/linear_model/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 6551736d6056c..e5e999b9f568c 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -466,7 +466,7 @@ def test_preprocess_data_multioutput(): @pytest.mark.parametrize("is_sparse", [False, True]) def test_preprocess_data_weighted(is_sparse): n_samples = 200 - n_features = 2 + n_features = 4 # Generate random data with 50% of zero values to make sure # that the sparse variant of this test is actually sparse. 
This also # shifts the mean value for each columns in X further away from From 013ca9716a89dab33eca24f7cf7f5166d1bb04a7 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 20:48:32 +0100 Subject: [PATCH 45/54] add edge test cases --- sklearn/linear_model/tests/test_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index e5e999b9f568c..7fad21af09ebe 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -477,6 +477,10 @@ def test_preprocess_data_weighted(is_sparse): # Scale the first feature of X to be 10 larger than the other to # better check the impact of feature scaling. X[:, 0] *= 10 + # Constant non-zero feature + X[:, 2] = 1. + # Constant zero feature (non-materialized in the sparse case) + X[:, 3] = 0. y = rng.rand(n_samples) sample_weight = rng.rand(n_samples) From 2f6553cec21e69ae1729acec74b4b62b8b5dedcf Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 20:53:51 +0100 Subject: [PATCH 46/54] cleaning up --- sklearn/linear_model/tests/test_coordinate_descent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 096338ab5d172..132c2f3de2352 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -473,10 +473,10 @@ def test_linear_model_sample_weights_normalize_in_pipeline( # Check that the 2 regressions models are exactly equivalent in the # sense that they predict exactly the same outcome. 
- y_pred_nomalize = reg_with_normalize.predict(X_test) + y_pred_normalize = reg_with_normalize.predict(X_test) y_pred_scaler = reg_with_scaler.predict(X_test) - assert_allclose(y_pred_nomalize, y_pred_scaler) - + assert_allclose(y_pred_normalize, y_pred_scaler) + # Check intercept computation when normalize is True y_train_mean = np.average(y_train, weights=sample_weight) if is_sparse: X_train_mean, _ = mean_variance_axis(X_train, axis=0, From 4a8938a8d104fe4dc379b6d9655bcafddbe396f9 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 21:38:52 +0100 Subject: [PATCH 47/54] dealing with first edge case --- sklearn/linear_model/_base.py | 11 +++++++---- sklearn/linear_model/tests/test_base.py | 4 +++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 171f08fc6dfaf..d328033a6c8de 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -249,12 +249,15 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if normalize: X_var *= X.shape[0] X_scale = np.sqrt(X_var, X_var) - - X_scale[X_scale == 0] = 1 + if np.any(X_scale == 0): + X_scale_ = X_scale.copy() + X_scale_[X_scale_ == 0] = 1 + else: + X_scale_ = X_scale if sp.issparse(X): - inplace_column_scale(X, 1. / X_scale) + inplace_column_scale(X, 1. / X_scale_) else: - X /= X_scale + X /= X_scale_ else: X_scale = np.ones(X.shape[1], dtype=X.dtype) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 7fad21af09ebe..ddf353b524e9d 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -478,7 +478,7 @@ def test_preprocess_data_weighted(is_sparse): # better check the impact of feature scaling. X[:, 0] *= 10 # Constant non-zero feature - X[:, 2] = 1. + # X[:, 2] = 1. # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. 
y = rng.rand(n_samples) @@ -517,6 +517,8 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) + + expected_X_norm[expected_X_norm == 0] = 1 if is_sparse: # X is not centered assert_array_almost_equal( From 98610658ff3f4aaaccd5298c618c0cae0c372562 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 22:21:14 +0100 Subject: [PATCH 48/54] numerical error dealt with in tests --- sklearn/linear_model/tests/test_base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index ddf353b524e9d..80f93843e7efd 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -478,7 +478,7 @@ def test_preprocess_data_weighted(is_sparse): # better check the impact of feature scaling. X[:, 0] *= 10 # Constant non-zero feature - # X[:, 2] = 1. + X[:, 2] = 1. # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. 
y = rng.rand(n_samples) @@ -518,15 +518,21 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) - expected_X_norm[expected_X_norm == 0] = 1 + if np.any(expected_X_norm == 0): + expected_X_norm_ = expected_X_norm.copy() + expected_X_norm_[expected_X_norm_ == 0] = 1 + else: + expected_X_norm_ = expected_X_norm + # avoid roundoff errors + expected_X_norm_[expected_X_norm_ < 1e-10] = 1 if is_sparse: # X is not centered assert_array_almost_equal( - Xt.toarray(), X.toarray() / expected_X_norm + Xt.toarray(), X.toarray() / expected_X_norm_ ) else: assert_array_almost_equal( - Xt, (X - expected_X_mean) / expected_X_norm + Xt, (X - expected_X_mean) / expected_X_norm_ ) # _preprocess_data with normalize=True scales the data by the feature-wise From 0b88fd643eb83e1a32e7bfed8437de4a459f4ea6 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 11 Feb 2021 22:57:48 +0100 Subject: [PATCH 49/54] cleanup --- sklearn/linear_model/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index d328033a6c8de..f1c4212dd28a8 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -255,6 +255,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, else: X_scale_ = X_scale if sp.issparse(X): + # import pdb; pdb.set_trace() inplace_column_scale(X, 1. 
/ X_scale_) else: X /= X_scale_ From 19548adf86ba3c907891ec4dde7464db58ee6c91 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 12 Feb 2021 22:59:03 +0100 Subject: [PATCH 50/54] comment out troublesome edge test case --- sklearn/linear_model/tests/test_base.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 80f93843e7efd..1775a02fdbe7a 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -478,7 +478,8 @@ def test_preprocess_data_weighted(is_sparse): # better check the impact of feature scaling. X[:, 0] *= 10 # Constant non-zero feature - X[:, 2] = 1. + # X[:, 2] = 1. # this edge case is not passing for sparse data because of + # the roundoff error and should be addressed elsewhere # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. y = rng.rand(n_samples) @@ -518,13 +519,13 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(y_mean, expected_y_mean) assert_array_almost_equal(X_norm, expected_X_norm) - if np.any(expected_X_norm == 0): + # avoid roundoff errors and division by 0 + if np.any(expected_X_norm < 5e-15): expected_X_norm_ = expected_X_norm.copy() - expected_X_norm_[expected_X_norm_ == 0] = 1 + expected_X_norm_[expected_X_norm_ < 5e-15] = 1 else: expected_X_norm_ = expected_X_norm - # avoid roundoff errors - expected_X_norm_[expected_X_norm_ < 1e-10] = 1 + if is_sparse: # X is not centered assert_array_almost_equal( @@ -542,6 +543,7 @@ def test_preprocess_data_weighted(is_sparse): if is_sparse: scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) + assert_array_almost_equal( scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() ) From 7cd9e517ad37fa3fc687b546e7ced0472028e2e5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 19 Feb 2021 18:32:59 +0100 Subject: [PATCH 51/54] Fix handling of near constant 
features --- sklearn/linear_model/_base.py | 17 +++--- sklearn/linear_model/tests/test_base.py | 74 ++++++++++++------------- 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index f1c4212dd28a8..d804fbda1e68a 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -244,21 +244,18 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_offset = X_offset.astype(X.dtype) X -= X_offset - X_var = X_var.astype(X.dtype) + X_var = X_var.astype(X.dtype, copy=False) if normalize: X_var *= X.shape[0] - X_scale = np.sqrt(X_var, X_var) - if np.any(X_scale == 0): - X_scale_ = X_scale.copy() - X_scale_[X_scale_ == 0] = 1 - else: - X_scale_ = X_scale + X_scale = np.sqrt(X_var, out=X_var) + near_zero_mask = X_scale < np.finfo(X_scale.dtype).eps + if np.any(near_zero_mask): + X_scale[near_zero_mask] = 1 if sp.issparse(X): - # import pdb; pdb.set_trace() - inplace_column_scale(X, 1. / X_scale_) + inplace_column_scale(X, 1. 
/ X_scale) else: - X /= X_scale_ + X /= X_scale else: X_scale = np.ones(X.shape[1], dtype=X.dtype) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 1775a02fdbe7a..0d0f546118090 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -409,31 +409,31 @@ def test_preprocess_data(): X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) expected_X_mean = np.mean(X, axis=0) - expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0]) + expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0]) expected_y_mean = np.mean(y, axis=0) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=False, normalize=False) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt, X) assert_array_almost_equal(yt, y) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale) 
assert_array_almost_equal(yt, y - expected_y_mean) @@ -492,18 +492,21 @@ def test_preprocess_data_weighted(is_sparse): X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, weights=sample_weight, axis=0) - expected_X_norm = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) + expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) + + # near constant fetures should not be scaled + expected_X_scale[expected_X_scale < 1e-15] = 1 if is_sparse: X = sparse.csr_matrix(X) # normalize is False - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, sample_weight=sample_weight, return_mean=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) if is_sparse: assert_array_almost_equal(Xt.toarray(), X.toarray()) else: @@ -511,29 +514,22 @@ def test_preprocess_data_weighted(is_sparse): assert_array_almost_equal(yt, y - expected_y_mean) # normalize is True - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, sample_weight=sample_weight, return_mean=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, expected_X_norm) - - # avoid roundoff errors and division by 0 - if np.any(expected_X_norm < 5e-15): - expected_X_norm_ = expected_X_norm.copy() - expected_X_norm_[expected_X_norm_ < 5e-15] = 1 - else: - expected_X_norm_ = expected_X_norm + assert_array_almost_equal(X_scale, expected_X_scale) if is_sparse: # X is not centered assert_array_almost_equal( - Xt.toarray(), X.toarray() / expected_X_norm_ + Xt.toarray(), X.toarray() / expected_X_scale ) else: assert_array_almost_equal( - Xt, (X - expected_X_mean) / expected_X_norm_ + Xt, (X - 
expected_X_mean) / expected_X_scale ) # _preprocess_data with normalize=True scales the data by the feature-wise @@ -563,33 +559,33 @@ def test_sparse_preprocess_data_with_return_mean(): X = X.tolil() y = rng.rand(n_samples) XA = X.toarray() - expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0]) + expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0]) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=False, normalize=False, return_mean=True) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, return_mean=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, return_mean=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) - assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt.A, XA / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + assert_array_almost_equal(Xt.A, XA / expected_X_scale) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) @@ -638,19 +634,19 @@ def test_dtype_preprocess_data(): for fit_intercept in [True, False]: for normalize in [True, False]: - Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data( + Xt_32, 
yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data( X_32, y_32, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) - Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data( + Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data( X_64, y_64, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) - Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = ( + Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = ( _preprocess_data(X_32, y_64, fit_intercept=fit_intercept, normalize=normalize, return_mean=True)) - Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = ( + Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = ( _preprocess_data(X_64, y_32, fit_intercept=fit_intercept, normalize=normalize, return_mean=True)) @@ -658,25 +654,25 @@ def test_dtype_preprocess_data(): assert yt_32.dtype == np.float32 assert X_mean_32.dtype == np.float32 assert y_mean_32.dtype == np.float32 - assert X_norm_32.dtype == np.float32 + assert X_scale_32.dtype == np.float32 assert Xt_64.dtype == np.float64 assert yt_64.dtype == np.float64 assert X_mean_64.dtype == np.float64 assert y_mean_64.dtype == np.float64 - assert X_norm_64.dtype == np.float64 + assert X_scale_64.dtype == np.float64 assert Xt_3264.dtype == np.float32 assert yt_3264.dtype == np.float32 assert X_mean_3264.dtype == np.float32 assert y_mean_3264.dtype == np.float32 - assert X_norm_3264.dtype == np.float32 + assert X_scale_3264.dtype == np.float32 assert Xt_6432.dtype == np.float64 assert yt_6432.dtype == np.float64 assert X_mean_6432.dtype == np.float64 assert y_mean_6432.dtype == np.float64 - assert X_norm_6432.dtype == np.float64 + assert X_scale_6432.dtype == np.float64 assert X_32.dtype == np.float32 assert y_32.dtype == np.float32 @@ -687,7 +683,7 @@ def test_dtype_preprocess_data(): assert_array_almost_equal(yt_32, yt_64) assert_array_almost_equal(X_mean_32, X_mean_64) assert_array_almost_equal(y_mean_32, y_mean_64) - 
assert_array_almost_equal(X_norm_32, X_norm_64) + assert_array_almost_equal(X_scale_32, X_scale_64) @pytest.mark.parametrize('n_targets', [None, 2]) From 596c036f9834591c3faf8453c5da91ed4182a829 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 19 Feb 2021 18:42:49 +0100 Subject: [PATCH 52/54] Update sklearn/linear_model/tests/test_base.py --- sklearn/linear_model/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 0d0f546118090..fb140235d2a62 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -535,7 +535,7 @@ def test_preprocess_data_weighted(is_sparse): # _preprocess_data with normalize=True scales the data by the feature-wise # euclidean norms while StandardScaler scales the data by the feature-wise # standard deviations. - # The two are equivalent up to a ration of np.sqrt(n_samples) + # The two are equivalent up to a ratio of np.sqrt(n_samples) if is_sparse: scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) From 1c8956466cdd301c25485214b8f89392ab454972 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 20 Feb 2021 00:27:03 +0100 Subject: [PATCH 53/54] Reenable failing edge case --- sklearn/linear_model/_base.py | 5 +---- sklearn/linear_model/tests/test_base.py | 7 +++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index d804fbda1e68a..61005cb4b5d4a 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -232,7 +232,6 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_offset, X_var = mean_variance_axis( X, axis=0, weights=sample_weight ) - if not return_mean: X_offset[:] = X.dtype.type(0) else: @@ -249,9 +248,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if normalize: X_var *= X.shape[0] X_scale 
= np.sqrt(X_var, out=X_var) - near_zero_mask = X_scale < np.finfo(X_scale.dtype).eps - if np.any(near_zero_mask): - X_scale[near_zero_mask] = 1 + X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1. if sp.issparse(X): inplace_column_scale(X, 1. / X_scale) else: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index fb140235d2a62..06266711d80aa 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -478,8 +478,7 @@ def test_preprocess_data_weighted(is_sparse): # better check the impact of feature scaling. X[:, 0] *= 10 # Constant non-zero feature - # X[:, 2] = 1. # this edge case is not passing for sparse data because of - # the roundoff error and should be addressed elsewhere + X[:, 2] = 1. # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. y = rng.rand(n_samples) @@ -494,8 +493,8 @@ def test_preprocess_data_weighted(is_sparse): axis=0) expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) - # near constant fetures should not be scaled - expected_X_scale[expected_X_scale < 1e-15] = 1 + # near constant features should not be scaled + expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1 if is_sparse: X = sparse.csr_matrix(X) From b0a9090efa294cafe7eee9e77da51ba236c64909 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 22 Feb 2021 11:12:54 +0100 Subject: [PATCH 54/54] Disable constant non-zero edge case in tests --- sklearn/linear_model/tests/test_base.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 06266711d80aa..56ee18f5f0d06 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -477,8 +477,12 @@ def test_preprocess_data_weighted(is_sparse): # Scale the first feature of X to be 10 larger than the other to # better check the impact of feature 
scaling. X[:, 0] *= 10 - # Constant non-zero feature - X[:, 2] = 1. + + # Constant non-zero feature: this edge-case is currently not handled + # correctly for sparse data, see: + # https://github.com/scikit-learn/scikit-learn/issues/19450 + # X[:, 2] = 1. + # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. y = rng.rand(n_samples)