FIX Don't scale near-constant features to large values (#19527) · scikit-learn/scikit-learn@c748e46 · GitHub

Commit c748e46

FIX Don't scale near-constant features to large values (#19527)
1 parent 638b768 commit c748e46

File tree

5 files changed: +113, -21 lines

doc/whats_new/v1.0.rst

Lines changed: 7 additions & 0 deletions

@@ -193,6 +193,13 @@ Changelog
   positioning strategy ``knots``.
   :pr:`18368` by :user:`Christian Lorentzen <lorentzenchr>`.
 
+- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler`
+  and similar scalers detect near-constant features to avoid scaling them to
+  very large values. This problem happens in particular when using a scaler on
+  sparse data with a constant column with sample weights, in which case
+  centering is typically disabled. :pr:`19527` by :user:`Olivier Grisel
+  <ogrisel>` and :user:`Maria Telenczuk <maikia>`.
+
 :mod:`sklearn.tree`
 ...................

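As a quick illustration of the behaviour this changelog entry describes (a minimal sketch added for this write-up, not part of the commit; the data and weights are invented), a constant column in a sparse matrix scaled without centering should now pass through with a scale of 1 instead of being blown up to very large values:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = np.ones((100, 2))               # second column is constant
X[:, 0] = rng.normal(size=100)      # first column actually varies
X_sparse = sparse.csr_matrix(X)
sample_weight = rng.uniform(size=100)

# Centering is disabled because the input is sparse.
scaler = StandardScaler(with_mean=False).fit(
    X_sparse, sample_weight=sample_weight)
Xt = scaler.transform(X_sparse)

print(scaler.scale_)                # second entry is 1.0 for the constant column
print(Xt.toarray()[:5, 1])          # the constant column is left untouched
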
sklearn/linear_model/_base.py

Lines changed: 5 additions & 1 deletion

@@ -246,9 +246,13 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
         X_var = X_var.astype(X.dtype, copy=False)
 
         if normalize:
+            # Detect constant features on the computed variance, before taking
+            # the np.sqrt. Otherwise constant features cannot be detected with
+            # sample_weights.
+            constant_mask = X_var < 10 * np.finfo(X.dtype).eps
             X_var *= X.shape[0]
             X_scale = np.sqrt(X_var, out=X_var)
-            X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1.
+            X_scale[constant_mask] = 1.
             if sp.issparse(X):
                 inplace_column_scale(X, 1. / X_scale)
             else:

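A short sketch (an editorial illustration, not from the commit; the variance value is invented) of why the constant_mask above is computed on X_var before np.sqrt: with sample weights, a constant column typically ends up with a tiny but non-zero variance, and its square root sits many orders of magnitude above the 10 * eps threshold, so the old check on X_scale could not flag it.

import numpy as np

var = np.array([2.5e-17])           # "constant" column, variance not exactly zero
eps = np.finfo(np.float64).eps      # about 2.2e-16

print(var < 10 * eps)               # [ True]  detected on the variance
print(np.sqrt(var) < 10 * eps)      # [False]  missed after the square root:
                                    # sqrt(2.5e-17) is about 5e-9
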
sklearn/linear_model/tests/test_base.py

Lines changed: 16 additions & 8 deletions

@@ -478,10 +478,8 @@ def test_preprocess_data_weighted(is_sparse):
     # better check the impact of feature scaling.
     X[:, 0] *= 10
 
-    # Constant non-zero feature: this edge-case is currently not handled
-    # correctly for sparse data, see:
-    # https://github.com/scikit-learn/scikit-learn/issues/19450
-    # X[:, 2] = 1.
+    # Constant non-zero feature.
+    X[:, 2] = 1.
 
     # Constant zero feature (non-materialized in the sparse case)
     X[:, 3] = 0.

@@ -495,10 +493,12 @@ def test_preprocess_data_weighted(is_sparse):
     X_sample_weight_var = np.average((X - X_sample_weight_avg)**2,
                                      weights=sample_weight,
                                      axis=0)
+    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
+    assert_array_equal(constant_mask, [0, 0, 1, 1])
     expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)
 
     # near constant features should not be scaled
-    expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1
+    expected_X_scale[constant_mask] = 1
 
     if is_sparse:
         X = sparse.csr_matrix(X)

@@ -538,14 +538,22 @@ def test_preprocess_data_weighted(is_sparse):
     # _preprocess_data with normalize=True scales the data by the feature-wise
     # euclidean norms while StandardScaler scales the data by the feature-wise
     # standard deviations.
-    # The two are equivalent up to a ratio of np.sqrt(n_samples)
+    # The two are equivalent up to a ratio of np.sqrt(n_samples).
     if is_sparse:
         scaler = StandardScaler(with_mean=False).fit(
             X, sample_weight=sample_weight)
 
+        # Non-constant features are scaled similarly with np.sqrt(n_samples)
         assert_array_almost_equal(
-            scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray()
-        )
+            scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples),
+            Xt.toarray()[:, :2]
+        )
+
+        # Constant features go through un-scaled.
+        assert_array_almost_equal(
+            scaler.transform(X).toarray()[:, 2:],
+            Xt.toarray()[:, 2:]
+        )
     else:
         scaler = StandardScaler(with_mean=True).fit(
             X, sample_weight=sample_weight)

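The "equivalent up to a ratio of np.sqrt(n_samples)" comment in the hunk above can be checked in isolation (a standalone sketch, not part of the test file): the euclidean norm of a centered column equals its standard deviation times np.sqrt(n_samples).

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=100)
centered = x - x.mean()

euclidean_norm = np.linalg.norm(centered)
std_times_sqrt_n = x.std() * np.sqrt(len(x))

print(np.allclose(euclidean_norm, std_times_sqrt_n))  # True
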
sklearn/preprocessing/_data.py

Lines changed: 26 additions & 8 deletions

@@ -60,22 +60,36 @@
 ]
 
 
-def _handle_zeros_in_scale(scale, copy=True):
-    """Makes sure that whenever scale is zero, we handle it correctly.
+def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
+    """Set scales of near constant features to 1.
 
-    This happens in most scalers when we have constant features.
-    """
+    The goal is to avoid division by very small or zero values.
+
+    Near constant features are detected automatically by identifying
+    scales close to machine precision unless they are precomputed by
+    the caller and passed with the `constant_mask` kwarg.
 
+    Typically for standard scaling, the scales are the standard
+    deviation while near constant features are better detected on the
+    computed variances which are closer to machine precision by
+    construction.
+    """
     # if we are fitting on 1D arrays, scale might be a scalar
     if np.isscalar(scale):
         if scale == .0:
             scale = 1.
         return scale
     elif isinstance(scale, np.ndarray):
+        if constant_mask is None:
+            # Detect near constant values to avoid dividing by a very small
+            # value that could lead to surprising results and numerical
+            # stability issues.
+            constant_mask = scale < 10 * np.finfo(scale.dtype).eps
+
         if copy:
             # New array to avoid side-effects
             scale = scale.copy()
-        scale[scale == 0.0] = 1.0
+        scale[constant_mask] = 1.0
         return scale
 
 

@@ -408,7 +422,7 @@ def partial_fit(self, X, y=None):
 
         data_range = data_max - data_min
         self.scale_ = ((feature_range[1] - feature_range[0]) /
-                       _handle_zeros_in_scale(data_range))
+                       _handle_zeros_in_scale(data_range, copy=True))
         self.min_ = feature_range[0] - data_min * self.scale_
         self.data_min_ = data_min
         self.data_max_ = data_max

@@ -850,7 +864,11 @@ def partial_fit(self, X, y=None, sample_weight=None):
             self.n_samples_seen_ = self.n_samples_seen_[0]
 
         if self.with_std:
-            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
+            # Extract the list of near constant features on the raw variances,
+            # before taking the square root.
+            constant_mask = self.var_ < 10 * np.finfo(X.dtype).eps
+            self.scale_ = _handle_zeros_in_scale(
+                np.sqrt(self.var_), copy=False, constant_mask=constant_mask)
         else:
             self.scale_ = None
 

@@ -1078,7 +1096,7 @@ def partial_fit(self, X, y=None):
             self.n_samples_seen_ += X.shape[0]
 
         self.max_abs_ = max_abs
-        self.scale_ = _handle_zeros_in_scale(max_abs)
+        self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)
         return self
 
     def transform(self, X):

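For reference, a small sketch (not part of the commit; the values are invented) of the two ways _handle_zeros_in_scale is now called in the hunks above: with automatic detection of near-zero scales, or with a constant_mask precomputed on the raw variances as StandardScaler.partial_fit does.

import numpy as np
from sklearn.preprocessing._data import _handle_zeros_in_scale

# 1) Automatic detection: scales below 10 * eps are replaced by 1.
scales = np.array([0.0, 1e-16, 0.5, 2.0])
print(_handle_zeros_in_scale(scales, copy=True))
# [1.  1.  0.5 2. ]

# 2) Precomputed mask on the raw variances, before np.sqrt, so that a
#    near-constant feature is still caught once sample weights make its
#    variance slightly non-zero.
var = np.array([3e-17, 0.25])
constant_mask = var < 10 * np.finfo(var.dtype).eps
print(_handle_zeros_in_scale(np.sqrt(var), copy=False,
                             constant_mask=constant_mask))
# [1.  0.5]
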
sklearn/preprocessing/tests/test_data.py

Lines changed: 59 additions & 4 deletions

@@ -414,6 +414,62 @@ def test_standard_scaler_dtype(add_sample_weight, sparse_constructor):
     assert scaler.scale_.dtype == np.float64
 
 
+@pytest.mark.parametrize("scaler", [
+    StandardScaler(with_mean=False),
+    RobustScaler(with_centering=False),
+])
+@pytest.mark.parametrize("sparse_constructor",
+                         [np.asarray, sparse.csc_matrix, sparse.csr_matrix])
+@pytest.mark.parametrize("add_sample_weight", [False, True])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("constant", [0, 1., 100.])
+def test_standard_scaler_constant_features(
+        scaler, add_sample_weight, sparse_constructor, dtype, constant):
+    if (isinstance(scaler, StandardScaler)
+            and constant > 1
+            and sparse_constructor is not np.asarray
+            and add_sample_weight):
+        # https://github.com/scikit-learn/scikit-learn/issues/19546
+        pytest.xfail("Computation of weighted variance is numerically unstable"
+                     " for sparse data. See: #19546.")
+
+    if isinstance(scaler, RobustScaler) and add_sample_weight:
+        pytest.skip(f"{scaler.__class__.__name__} does not yet support"
+                    f" sample_weight")
+
+    rng = np.random.RandomState(0)
+    n_samples = 100
+    n_features = 1
+    if add_sample_weight:
+        fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2)
+    else:
+        fit_params = {}
+    X_array = np.full(shape=(n_samples, n_features), fill_value=constant,
+                      dtype=dtype)
+    X = sparse_constructor(X_array)
+    X_scaled = scaler.fit(X, **fit_params).transform(X)
+
+    if isinstance(scaler, StandardScaler):
+        # The variance info should be close to zero for constant features.
+        assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7)
+
+    # Constant features should not be scaled (scale of 1.):
+    assert_allclose(scaler.scale_, np.ones(X.shape[1]))
+
+    if hasattr(X_scaled, "toarray"):
+        assert_allclose(X_scaled.toarray(), X_array)
+    else:
+        assert_allclose(X_scaled, X)
+
+    if isinstance(scaler, StandardScaler) and not add_sample_weight:
+        # Also check consistency with the standard scale function.
+        X_scaled_2 = scale(X, with_mean=scaler.with_mean)
+        if hasattr(X_scaled_2, "toarray"):
+            assert_allclose(X_scaled_2.toarray(), X_scaled_2.toarray())
+        else:
+            assert_allclose(X_scaled_2, X_scaled_2)
+
+
 def test_scale_1d():
     # 1-d inputs
     X_list = [1., 3., 5., 0.]

@@ -538,12 +594,11 @@ def test_scaler_float16_overflow():
 
 
 def test_handle_zeros_in_scale():
-    s1 = np.array([0, 1, 2, 3])
+    s1 = np.array([0, 1e-16, 1, 2, 3])
     s2 = _handle_zeros_in_scale(s1, copy=True)
 
-    assert not s1[0] == s2[0]
-    assert_array_equal(s1, np.array([0, 1, 2, 3]))
-    assert_array_equal(s2, np.array([1, 1, 2, 3]))
+    assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3]))
+    assert_allclose(s2, np.array([1, 1, 1, 2, 3]))
 
 
 def test_minmax_scaler_partial_fit():