10000 ENH Ignore and pass-through NaN values in MaxAbsScaler and maxabs_sca… · wdevazelhes/scikit-learn@f43dd0e · GitHub
[go: up one dir, main page]

Skip to content

Commit f43dd0e

Browse files
LucijaGregov authored and
jnothman committed
ENH Ignore and pass-through NaN values in MaxAbsScaler and maxabs_scale (scikit-learn#11011)
1 parent 93382cc commit f43dd0e

File tree

5 files changed

+56
-24
lines changed

5 files changed

+56
-24
lines changed

doc/whats_new/v0.20.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,11 @@ Preprocessing
266266
ignore and pass-through NaN values.
267267
:issue:`11206` by :user:`Guillaume Lemaitre <glemaitre>`.
268268

269+
- :class:`preprocessing.MaxAbsScaler` and :func:`preprocessing.maxabs_scale`
270+
ignore and pass-through NaN values.
271+
:issue:`11011` by :user:`Lucija Gregov <LucijaGregov>` and
272+
:user:`Guillaume Lemaitre <glemaitre>`.
273+
269274
- :class:`preprocessing.PowerTransformer` and
270275
:func:`preprocessing.power_transform` ignore and pass-through NaN values.
271276
:issue:`11306` by :user:`Guillaume Lemaitre <glemaitre>`.

sklearn/preprocessing/data.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -890,13 +890,14 @@ def partial_fit(self, X, y=None):
890890
Ignored
891891
"""
892892
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
893-
estimator=self, dtype=FLOAT_DTYPES)
893+
estimator=self, dtype=FLOAT_DTYPES,
894+
force_all_finite='allow-nan')
894895

895896
if sparse.issparse(X):
896-
mins, maxs = min_max_axis(X, axis=0)
897+
mins, maxs = min_max_axis(X, axis=0, ignore_nan=True)
897898
max_abs = np.maximum(np.abs(mins), np.abs(maxs))
898899
else:
899-
max_abs = np.abs(X).max(axis=0)
900+
max_abs = np.nanmax(np.abs(X), axis=0)
900901

901902
# First pass
902903
if not hasattr(self, 'n_samples_seen_'):
@@ -920,7 +921,8 @@ def transform(self, X):
920921
"""
921922
check_is_fitted(self, 'scale_')
922923
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
923-
estimator=self, dtype=FLOAT_DTYPES)
924+
estimator=self, dtype=FLOAT_DTYPES,
925+
force_all_finite='allow-nan')
924926

925927
if sparse.issparse(X):
926928
inplace_column_scale(X, 1.0 / self.scale_)
@@ -938,7 +940,8 @@ def inverse_transform(self, X):
938940
"""
939941
check_is_fitted(self, 'scale_')
940942
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
941-
estimator=self, dtype=FLOAT_DTYPES)
943+
estimator=self, dtype=FLOAT_DTYPES,
944+
force_all_finite='allow-nan')
942945

943946
if sparse.issparse(X):
944947
inplace_column_scale(X, self.scale_)
@@ -987,7 +990,8 @@ def maxabs_scale(X, axis=0, copy=True):
987990

988991
# If copy is required, it will be done inside the scaler object.
989992
X = check_array(X, accept_sparse=('csr', 'csc'), copy=False,
990-
ensure_2d=False, dtype=FLOAT_DTYPES)
993+
ensure_2d=False, dtype=FLOAT_DTYPES,
994+
force_all_finite='allow-nan')
991995
original_ndim = X.ndim
992996

993997
if original_ndim == 1:
@@ -2110,7 +2114,8 @@ def _transform_col(self, X_col, quantiles, inverse):
21102114
lower_bound_y = quantiles[0]
21112115
upper_bound_y = quantiles[-1]
21122116
# for inverse transform, match a uniform PDF
2113-
X_col = output_distribution.cdf(X_col)
2117+
with np.errstate(invalid='ignore'): # hide NaN comparison warnings
2118+
X_col = output_distribution.cdf(X_col)
21142119
# find index for lower and higher bounds
21152120
with np.errstate(invalid='ignore'): # hide NaN comparison warnings
21162121
lower_bounds_idx = (X_col - BOUNDS_THRESHOLD <
@@ -2563,9 +2568,13 @@ def _check_input(self, X, check_positive=False, check_shape=False,
25632568
X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy,
25642569
force_all_finite='allow-nan')
25652570

2566-
if check_positive and self.method == 'box-cox' and np.nanmin(X) <= 0:
2567-
raise ValueError("The Box-Cox transformation can only be applied "
2568-
"to strictly positive data")
2571+
with np.warnings.catch_warnings():
2572+
np.warnings.filterwarnings(
2573+
'ignore', r'All-NaN (slice|axis) encountered')
2574+
if (check_positive and self.method == 'box-cox' and
2575+
np.nanmin(X) <= 0):
2576+
raise ValueError("The Box-Cox transformation can only be "
2577+
"applied to strictly positive data")
25692578

25702579
if check_shape and not X.shape[1] == len(self.lambdas_):
25712580
raise ValueError("Input data has a different number of features "

sklearn/preprocessing/tests/test_common.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@
88

99
from sklearn.base import clone
1010

11+
from sklearn.preprocessing import maxabs_scale
1112
from sklearn.preprocessing import minmax_scale
1213
from sklearn.preprocessing import scale
1314
from sklearn.preprocessing import power_transform
1415
from sklearn.preprocessing import quantile_transform
1516

17+
from sklearn.preprocessing import MaxAbsScaler
1618
from sklearn.preprocessing import MinMaxScaler
1719
from sklearn.preprocessing import StandardScaler
1820
from sklearn.preprocessing import PowerTransformer
@@ -31,7 +33,8 @@ def _get_valid_samples_by_column(X, col):
3133

3234
@pytest.mark.parametrize(
3335
"est, func, support_sparse, strictly_positive",
34-
[(MinMaxScaler(), minmax_scale, False, False),
36+
[(MaxAbsScaler(), maxabs_scale, True, False),
37+
(MinMaxScaler(), minmax_scale, False, False),
3538
(StandardScaler(), scale, False, False),
3639
(StandardScaler(with_mean=False), scale, True, False),
3740
(PowerTransformer(), power_transform, False, True),
@@ -53,12 +56,17 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive):
5356
assert np.any(np.isnan(X_test), axis=0).all()
5457
X_test[:, 0] = np.nan # make sure this boundary case is tested
5558

56-
Xt = est.fit(X_train).transform(X_test)
59+
with pytest.warns(None) as records:
60+
Xt = est.fit(X_train).transform(X_test)
61+
# ensure no warnings are raised
62+
assert len(records) == 0
5763
# missing values should still be missing, and only them
5864
assert_array_equal(np.isnan(Xt), np.isnan(X_test))
5965

6066
# check that the function leads to the same results as the class
61-
Xt_class = est.transform(X_train)
67+
with pytest.warns(None) as records:
68+
Xt_class = est.transform(X_train)
69+
assert len(records) == 0
6270
Xt_func = func(X_train, **est.get_params())
6371
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
6472
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
@@ -74,7 +82,9 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive):
7482
# train only on non-NaN
7583
est.fit(_get_valid_samples_by_column(X_train, i))
7684
# check transforming with NaN works even when training without NaN
77-
Xt_col = est.transform(X_test[:, [i]])
85+
with pytest.warns(None) as records:
86+
Xt_col = est.transform(X_test[:, [i]])
87+
assert len(records) == 0
7888
assert_allclose(Xt_col, Xt[:, [i]])
7989
# check non-NaN is handled as before - the 1st column is all nan
8090
if not np.isnan(X_test[:, i]).all():
@@ -87,15 +97,23 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive):
8797
est_dense = clone(est)
8898
est_sparse = clone(est)
8999

90-
Xt_dense = est_dense.fit(X_train).transform(X_test)
91-
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
100+
with pytest.warns(None) as records:
101+
Xt_dense = est_dense.fit(X_train).transform(X_test)
102+
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
103+
assert len(records) == 0
92104
for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
93105
sparse.bsr_matrix, sparse.coo_matrix,
94106
sparse.dia_matrix, sparse.dok_matrix,
95107
sparse.lil_matrix):
96108
# check that the dense and sparse inputs lead to the same results
97-
Xt_sparse = (est_sparse.fit(sparse_constructor(X_train))
98-
.transform(sparse_constructor(X_test)))
99-
assert_allclose(Xt_sparse.A, Xt_dense)
100-
Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse)
101-
assert_allclose(Xt_inv_sparse.A, Xt_inv_dense)
109+
# precompute the matrix to avoid catching side warnings
110+
X_train_sp = sparse_constructor(X_train)
111+
X_test_sp = sparse_constructor(X_test)
112+
with pytest.warns(None) as records:
113+
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
114+
assert len(records) == 0
115+
assert_allclose(Xt_sp.A, Xt_dense)
116+
with pytest.warns(None) as records:
117+
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
118+
assert len(records) == 0
119+
assert_allclose(Xt_inv_sp.A, Xt_inv_dense)

sklearn/utils/estimator_checks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@
7878
'RandomForestRegressor', 'Ridge', 'RidgeCV']
7979

8080
ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer',
81-
'MinMaxScaler', 'StandardScaler', 'PowerTransformer',
82-
'QuantileTransformer']
81+
'MaxAbsScaler', 'MinMaxScaler', 'StandardScaler',
82+
'PowerTransformer', 'QuantileTransformer']
8383

8484

8585
def _yield_non_meta_checks(name, estimator):

sklearn/utils/extmath.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,7 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
710710
new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
711711
last_unnormalized_variance = last_variance * last_sample_count
712712

713-
with np.errstate(divide='ignore'):
713+
with np.errstate(divide='ignore', invalid='ignore'):
714714
last_over_new_count = last_sample_count / new_sample_count
715715
updated_unnormalized_variance = (
716716
last_unnormalized_variance + new_unnormalized_variance +

0 commit comments

Comments
 (0)
0