[MRG] FIX sample_weight invariance for linear models (#19616) · scikit-learn/scikit-learn@5f3bfcc

Commit 5f3bfcc

Authored by ogrisel, agramfort and lorentzenchr

[MRG] FIX sample_weight invariance for linear models (#19616)

Co-authored-by: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>

1 parent 038c5cd commit 5f3bfcc

File tree: 5 files changed (+147, -36 lines)


doc/whats_new/v1.0.rst

Lines changed: 5 additions & 1 deletion
@@ -352,7 +352,7 @@ Changelog
   BayesianRidge, ARDRegression were deprecated in:
   :pr:`17746` by :user:`Maria Telenczuk <maikia>`.

-- |Fix|: `sample_weight` are now fully taken into account in linear models
+- |Fix| `sample_weight` are now fully taken into account in linear models
   when `normalize=True` for both feature centering and feature
   scaling.
   :pr:`19426` by :user:`Alexandre Gramfort <agramfort>` and
@@ -366,6 +366,10 @@ Changelog
   :class:`Lars`, :class:`LassoLars`, :class:`LassoLars`, :class:`LarsCV` and
   :class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura <takoika>`.

+- |Fix| Sample weight invariance for :class:`Ridge` was fixed in :pr:`19616` by
+  :user:`Oliver Grisel <ogrisel>` and
+  :user:`Christian Lorentzen <lorentzenchr>`.
+
 :mod:`sklearn.manifold`
 .......................

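The invariance property referenced in the new changelog entry can be illustrated outside the test suite. The following is a minimal sketch, not part of this commit, assuming a scikit-learn build that includes the fix: doubling the sample weights should be equivalent to duplicating every training sample.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=50, n_features=10, random_state=0)
sw = np.random.RandomState(0).uniform(0.5, 2.0, size=X.shape[0])

# Fit once with doubled weights, once on a dataset where every row is repeated.
ridge_2sw = Ridge(alpha=1.0, tol=1e-12).fit(X, y, sample_weight=2 * sw)
ridge_dup = Ridge(alpha=1.0, tol=1e-12).fit(
    np.vstack([X, X]), np.concatenate([y, y]),
    sample_weight=np.concatenate([sw, sw]),
)

# With the fix, both fits yield the same model.
np.testing.assert_allclose(ridge_2sw.coef_, ridge_dup.coef_)
np.testing.assert_allclose(ridge_2sw.intercept_, ridge_dup.intercept_)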

sklearn/linear_model/_base.py

Lines changed: 4 additions & 1 deletion
@@ -272,7 +272,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
             # the np.sqrt. Otherwise constant features cannot be detected with
             # sample weights.
             constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0])
-            X_var *= X.shape[0]
+            if sample_weight is None:
+                X_var *= X.shape[0]
+            else:
+                X_var *= sample_weight.sum()
             X_scale = np.sqrt(X_var, out=X_var)
             X_scale[constant_mask] = 1.
             if sp.issparse(X):
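For context, the change above makes the feature scale consistent with the weighted variance: the variance is multiplied by the total sample weight rather than by the number of rows. Below is a rough standalone sketch of that scaling logic under simplifying assumptions (dense input only, no constant-feature masking); `weighted_feature_scale` is a hypothetical helper used for illustration, not the actual private function.

import numpy as np

def weighted_feature_scale(X, sample_weight=None):
    """Simplified sketch of the normalize=True scaling in _preprocess_data."""
    if sample_weight is None:
        X_offset = X.mean(axis=0)
        X_var = X.var(axis=0)
        X_var *= X.shape[0]           # unweighted case: scale by n_samples
    else:
        X_offset = np.average(X, axis=0, weights=sample_weight)
        X_var = np.average((X - X_offset) ** 2, axis=0, weights=sample_weight)
        X_var *= sample_weight.sum()  # the fix: sum of weights, not n_samples
    return np.sqrt(X_var)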

sklearn/linear_model/tests/test_base.py

Lines changed: 11 additions & 4 deletions
@@ -467,7 +467,9 @@ def test_preprocess_data_weighted(is_sparse):
                                      axis=0)
     constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
     assert_array_equal(constant_mask, [0, 0, 1, 1])
-    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)
+    expected_X_scale = (
+        np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())
+    )

     # near constant features should not be scaled
     expected_X_scale[constant_mask] = 1
@@ -510,14 +512,16 @@ def test_preprocess_data_weighted(is_sparse):
     # _preprocess_data with normalize=True scales the data by the feature-wise
     # euclidean norms while StandardScaler scales the data by the feature-wise
     # standard deviations.
-    # The two are equivalent up to a ratio of np.sqrt(n_samples).
+    # The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
+    # or np.sqrt(sample_weight.sum()) if weighted.
     if is_sparse:
         scaler = StandardScaler(with_mean=False).fit(
             X, sample_weight=sample_weight)

         # Non-constant features are scaled similarly with np.sqrt(n_samples)
         assert_array_almost_equal(
-            scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples),
+            scaler.transform(X).toarray()[:, :2]
+            / np.sqrt(sample_weight.sum()),
             Xt.toarray()[:, :2]
         )

@@ -530,7 +534,10 @@ def test_preprocess_data_weighted(is_sparse):
         scaler = StandardScaler(with_mean=True).fit(
             X, sample_weight=sample_weight)
         assert_array_almost_equal(scaler.mean_, X_mean)
-        assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt)
+        assert_array_almost_equal(
+            scaler.transform(X) / np.sqrt(sample_weight.sum()),
+            Xt,
+        )
         assert_array_almost_equal(yt, y - expected_y_mean)

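The updated comment states that the normalize=True preprocessing and StandardScaler differ only by a factor of np.sqrt(sample_weight.sum()) in the weighted case. Here is a small self-contained check of that ratio using only public APIs; the variable names and data are illustrative, not taken from the test.

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
sw = rng.uniform(0.1, 5.0, size=X.shape[0])

# Weighted mean and the euclidean-norm style scale used by normalize=True.
X_mean = np.average(X, axis=0, weights=sw)
X_var = np.average((X - X_mean) ** 2, axis=0, weights=sw)
Xt = (X - X_mean) / np.sqrt(X_var * sw.sum())

# StandardScaler divides by the weighted standard deviation instead.
scaler = StandardScaler(with_mean=True).fit(X, sample_weight=sw)
np.testing.assert_allclose(
    scaler.transform(X) / np.sqrt(sw.sum()), Xt, rtol=1e-6)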

sklearn/linear_model/tests/test_coordinate_descent.py

Lines changed: 76 additions & 30 deletions
@@ -9,6 +9,7 @@
 import joblib

 from sklearn.base import is_classifier
+from sklearn.base import clone
 from sklearn.datasets import load_diabetes
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
@@ -453,7 +454,7 @@ def test_linear_model_sample_weights_normalize_in_pipeline(
         X_train = sparse.csr_matrix(X_train)
         X_test = _convert_container(X_train, 'sparse')

-    sample_weight = rng.rand(X_train.shape[0])
+    sample_weight = rng.uniform(low=0.1, high=100, size=X_train.shape[0])

     # linear estimator with built-in feature normalization
     reg_with_normalize = estimator(normalize=True, fit_intercept=True,
@@ -462,7 +463,12 @@ def test_linear_model_sample_weights_normalize_in_pipeline(

     # linear estimator in a pipeline with a StandardScaler, normalize=False
     linear_regressor = estimator(normalize=False, fit_intercept=True, **params)
-    _scale_alpha_inplace(linear_regressor, X_train.shape[0])  # rescale alpha
+
+    # rescale alpha
+    if model_name in ["Lasso", "ElasticNet"]:
+        _scale_alpha_inplace(linear_regressor, y_test.shape[0])
+    else:
+        _scale_alpha_inplace(linear_regressor, sample_weight.sum())
     reg_with_scaler = Pipeline([
         ("scaler", StandardScaler(with_mean=with_mean)),
         ("linear_regressor", linear_regressor)
@@ -479,7 +485,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline(
     # sense that they predict exactly the same outcome.
     y_pred_normalize = reg_with_normalize.predict(X_test)
     y_pred_scaler = reg_with_scaler.predict(X_test)
-    assert_allclose(y_pred_normalize, y_pred_scaler)
+    assert_allclose(y_pred_normalize, y_pred_scaler)
+
     # Check intercept computation when normalize is True
     y_train_mean = np.average(y_train, weights=sample_weight)
     if is_sparse:
@@ -1446,39 +1453,78 @@ def test_enet_ridge_consistency(normalize, ridge_alpha):
     # effective_rank are more problematic in particular.

     rng = np.random.RandomState(42)
+    n_samples = 300
     X, y = make_regression(
-        n_samples=100,
-        n_features=300,
-        effective_rank=100,
+        n_samples=n_samples,
+        n_features=100,
+        effective_rank=10,
         n_informative=50,
         random_state=rng,
     )
-    sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
-
-    ridge = Ridge(
-        alpha=ridge_alpha,
+    sw = rng.uniform(low=0.01, high=10, size=X.shape[0])
+    alpha = 1.
+    common_params = dict(
         normalize=normalize,
-    ).fit(X, y, sample_weight=sw)
-
-    enet = ElasticNet(
-        alpha=ridge_alpha / sw.sum(),
-        normalize=normalize,
-        l1_ratio=0.,
-        max_iter=1000,
+        tol=1e-12,
     )
-    # Even when the ElasticNet model has actually converged, the duality gap
-    # convergence criterion is never met when l1_ratio is 0 and for any value
-    # of the `tol` parameter. The convergence message should point the user to
-    # Ridge instead:
-    expected_msg = (
-        r"Objective did not converge\. .* "
-        r"Linear regression models with null weight for the "
-        r"l1 regularization term are more efficiently fitted "
-        r"using one of the solvers implemented in "
-        r"sklearn\.linear_model\.Ridge/RidgeCV instead\."
+    ridge = Ridge(alpha=alpha, **common_params).fit(
+        X, y, sample_weight=sw
+    )
+    if normalize:
+        alpha_enet = alpha / n_samples
+    else:
+        alpha_enet = alpha / sw.sum()
+    enet = ElasticNet(alpha=alpha_enet, l1_ratio=0, **common_params).fit(
+        X, y, sample_weight=sw
     )
-    with pytest.warns(ConvergenceWarning, match=expected_msg):
-        enet.fit(X, y, sample_weight=sw)
-
     assert_allclose(ridge.coef_, enet.coef_)
     assert_allclose(ridge.intercept_, enet.intercept_)
+
+
+@pytest.mark.parametrize(
+    "estimator", [
+        Lasso(alpha=1.),
+        ElasticNet(alpha=1., l1_ratio=0.1),
+    ]
+)
+def test_sample_weight_invariance(estimator):
+    rng = np.random.RandomState(42)
+    X, y = make_regression(
+        n_samples=100,
+        n_features=300,
+        effective_rank=10,
+        n_informative=50,
+        random_state=rng,
+    )
+    normalize = False  # These tests don't work for normalize=True.
+    sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
+    params = dict(normalize=normalize, tol=1e-12)
+
+    # Check that setting some weights to 0 is equivalent to trimming the
+    # samples:
+    cutoff = X.shape[0] // 3
+    sw_with_null = sw.copy()
+    sw_with_null[:cutoff] = 0.
+    X_trimmed, y_trimmed = X[cutoff:, :], y[cutoff:]
+    sw_trimmed = sw[cutoff:]
+
+    reg_trimmed = clone(estimator).set_params(**params).fit(
+        X_trimmed, y_trimmed, sample_weight=sw_trimmed)
+    reg_null_weighted = clone(estimator).set_params(**params).fit(
+        X, y, sample_weight=sw_with_null)
+    assert_allclose(reg_null_weighted.coef_, reg_trimmed.coef_)
+    assert_allclose(reg_null_weighted.intercept_, reg_trimmed.intercept_)
+
+    # Check that duplicating the training dataset is equivalent to multiplying
+    # the weights by 2:
+    X_dup = np.concatenate([X, X], axis=0)
+    y_dup = np.concatenate([y, y], axis=0)
+    sw_dup = np.concatenate([sw, sw], axis=0)

+    reg_2sw = clone(estimator).set_params(**params).fit(
+        X, y, sample_weight=2 * sw)
+    reg_dup = clone(estimator).set_params(**params).fit(
+        X_dup, y_dup, sample_weight=sw_dup)
+
+    assert_allclose(reg_2sw.coef_, reg_dup.coef_)
+    assert_allclose(reg_2sw.intercept_, reg_dup.intercept_)
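The rewritten consistency test relies on the relationship that, without normalization, an ElasticNet with l1_ratio=0 matches Ridge once alpha is divided by sample_weight.sum(). A condensed sketch of that relationship follows; it mirrors the test above with illustrative parameters and is not a supported public equivalence. Expect a ConvergenceWarning from the coordinate descent solver in the pure-l2 case.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, ElasticNet

rng = np.random.RandomState(42)
X, y = make_regression(n_samples=300, n_features=100, effective_rank=10,
                       n_informative=50, random_state=rng)
sw = rng.uniform(low=0.01, high=10, size=X.shape[0])
alpha = 1.0

ridge = Ridge(alpha=alpha, tol=1e-12).fit(X, y, sample_weight=sw)
# ElasticNet expresses its penalty per unit of total sample weight,
# hence the rescaling of alpha before comparing with Ridge.
enet = ElasticNet(alpha=alpha / sw.sum(), l1_ratio=0, tol=1e-12).fit(
    X, y, sample_weight=sw)

np.testing.assert_allclose(ridge.coef_, enet.coef_)
np.testing.assert_allclose(ridge.intercept_, enet.intercept_)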

sklearn/linear_model/tests/test_ridge.py

Lines changed: 51 additions & 0 deletions
@@ -11,6 +11,7 @@
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import ignore_warnings
+from sklearn.utils.estimator_checks import check_sample_weights_invariance

 from sklearn.exceptions import ConvergenceWarning

@@ -1414,3 +1415,53 @@ def test_ridge_sag_with_X_fortran():
     X = X[::2, :]
     y = y[::2]
     Ridge(solver='sag').fit(X, y)
+
+
+# FIXME: 'normalize' to be removed in 1.2
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize(
+    "solver",
+    ["cholesky", "lsqr", "sparse_cg", "svd", "sag", "saga"]
+)
+def test_ridge_sample_weight_invariance(normalize, solver):
+    """Test that Ridge fulfils sample weight invariance.
+
+    Note that this test is stricter than the common test
+    check_sample_weights_invariance alone.
+    """
+    params = dict(
+        alpha=1.,
+        normalize=normalize,
+        solver=solver,
+        tol=1e-12,
+    )
+    reg = Ridge(**params)
+    name = reg.__class__.__name__
+    check_sample_weights_invariance(name, reg, kind="ones")
+    check_sample_weights_invariance(name, reg, kind="zeros")
+
+    # Check that duplicating the training dataset is equivalent to multiplying
+    # the weights by 2:
+    if solver.startswith("sag") and normalize:
+        pytest.xfail("sag/saga diverge on the second part of this test")
+
+    rng = np.random.RandomState(42)
+    X, y = make_regression(
+        n_samples=100,
+        n_features=300,
+        effective_rank=10,
+        n_informative=50,
+        random_state=rng,
+    )
+    sw = rng.uniform(low=0.01, high=2, size=X.shape[0])
+    X_dup = np.concatenate([X, X], axis=0)
+    y_dup = np.concatenate([y, y], axis=0)
+    sw_dup = np.concatenate([sw, sw], axis=0)
+
+    ridge_2sw = Ridge(**params).fit(X, y, sample_weight=2 * sw)
+    ridge_dup = Ridge(**params).fit(
+        X_dup, y_dup, sample_weight=sw_dup)
+
+    assert_allclose(ridge_2sw.coef_, ridge_dup.coef_)
+    assert_allclose(ridge_2sw.intercept_, ridge_dup.intercept_)
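The common check check_sample_weights_invariance(..., kind="zeros") used above verifies that zero-weighted samples do not influence the fit. Spelled out directly for Ridge, as a minimal sketch not taken from the test file:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=100, n_features=20, random_state=0)
sw = np.ones(X.shape[0])
sw[:30] = 0.0  # give the first 30 samples zero weight

ridge_zero_weighted = Ridge(alpha=1.0, tol=1e-12).fit(X, y, sample_weight=sw)
ridge_trimmed = Ridge(alpha=1.0, tol=1e-12).fit(X[30:], y[30:])

# Zero-weighted samples must be equivalent to dropping them entirely.
np.testing.assert_allclose(ridge_zero_weighted.coef_, ridge_trimmed.coef_)
np.testing.assert_allclose(
    ridge_zero_weighted.intercept_, ridge_trimmed.intercept_)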
