8000 DEP Adding a warning in the SimpleImputer when strategy mode is const… · scikit-learn/scikit-learn@c1b1f87 · GitHub
[go: up one dir, main page]

Skip to content

Commit c1b1f87

Browse files
DEP Adding a warning in the SimpleImputer when strategy mode is constant and keep_empty_features is False (#29950)
Co-authored-by: Guillaume Lemaitre <guillaume@probabl.ai>
1 parent b9a7496 commit c1b1f87

File tree

4 files changed

+106
-8
lines changed

4 files changed

+106
-8
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- Add a warning in :class:`impute.SimpleImputer` when `keep_empty_features=False` and
2+
`strategy="constant"`. In this case empty features are not dropped and this behaviour
3+
will change in 1.8.
4+
By :user:`Arthur Courselle <ArthurCourselle>` and :user:`Simon Riou <simon-riou>`

sklearn/impute/_base.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,11 @@ class SimpleImputer(_BaseImputer):
225225
226226
.. versionadded:: 1.2
227227
228+
.. versionchanged:: 1.6
229+
Currently, when `keep_empty_features=False` and `strategy="constant"`,
230+
empty features are not dropped. This behaviour will change in version
231+
1.8. Set `keep_empty_features=True` to preserve this behaviour.
232+
228233
Attributes
229234
----------
230235
statistics_ : array of shape (n_features,)
@@ -458,6 +463,19 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value):
458463
statistics = np.empty(X.shape[1])
459464

460465
if strategy == "constant":
466+
# TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic
467+
# for empty features to drop them later.
468+
if not self.keep_empty_features and any(
469+
[all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])]
470+
):
471+
warnings.warn(
472+
"Currently, when `keep_empty_feature=False` and "
473+
'`strategy="constant"`, empty features are not dropped. '
474+
"This behaviour will change in version 1.8. Set "
475+
"`keep_empty_feature=True` to preserve this behaviour.",
476+
FutureWarning,
477+
)
478+
461479
# for constant strategy, self.statistics_ is used to store
462480
# fill_value in each column
463481
statistics.fill(fill_value)
@@ -548,6 +566,17 @@ def _dense_fit(self, X, strategy, missing_values, fill_value):
548566

549567
# Constant
550568
elif strategy == "constant":
569+
# TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic
570+
# for empty features to drop them later.
571+
if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any():
572+
warnings.warn(
573+
"Currently, when `keep_empty_feature=False` and "
574+
'`strategy="constant"`, empty features are not dropped. '
575+
"This behaviour will change in version 1.8. Set "
576+
"`keep_empty_feature=True` to preserve this behaviour.",
577+
FutureWarning,
578+
)
579+
551580
# for constant strategy, self.statistics_ is used to store
552581
# fill_value in each column
553582
return np.full(X.shape[1], fill_value, dtype=X.dtype)

sklearn/impute/_iterative.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -636,16 +636,38 @@ def _initial_imputation(self, X, in_fit=False):
636636

637637
X_missing_mask = _get_mask(X, self.missing_values)
638638
mask_missing_values = X_missing_mask.copy()
639+
640+
# TODO (1.8): remove this once the deprecation is removed. In the meantime,
641+
# we need to catch the warning to avoid false positives.
642+
catch_warning = (
643+
self.initial_strategy == "constant" and not self.keep_empty_features
644+
)
645+
639646
if self.initial_imputer_ is None:
640647
self.initial_imputer_ = SimpleImputer(
641648
missing_values=self.missing_values,
642649
strategy=self.initial_strategy,
643650
fill_value=self.fill_value,
644651
keep_empty_features=self.keep_empty_features,
645652
).set_output(transform="default")
646-
X_filled = self.initial_imputer_.fit_transform(X)
653+
654+
# TODO (1.8): remove this once the deprecation is removed to keep only
655+
# the code in the else case.
656+
if catch_warning:
657+
with warnings.catch_warnings():
658+
warnings.simplefilter("ignore", FutureWarning)
659+
X_filled = self.initial_imputer_.fit_transform(X)
660+
else:
661+
X_filled = self.initial_imputer_.fit_transform(X)
647662
else:
648-
X_filled = self.initial_imputer_.transform(X)
663+
# TODO (1.8): remove this once the deprecation is removed to keep only
664+
# the code in the else case.
665+
if catch_warning:
666+
with warnings.catch_warnings():
667+
warnings.simplefilter("ignore", FutureWarning)
668+
X_filled = self.initial_imputer_.transform(X)
669+
else:
670+
X_filled = self.initial_imputer_.transform(X)
649671

650672
if in_fit:
651673
self._is_empty_feature = np.all(mask_missing_values, axis=0)
@@ -659,7 +681,8 @@ def _initial_imputation(self, X, in_fit=False):
659681
# The constant strategy has a specific behavior and preserves empty
660682
# features even with ``keep_empty_features=False``. We need to drop
661683
# the column for consistency.
662-
# TODO: remove this `if` branch once the following issue is addressed:
684+
# TODO (1.8): remove this `if` branch once the following issue is
685+
# addressed:
663686
# https://github.com/scikit-learn/scikit-learn/issues/29827
664687
X_filled = X_filled[:, ~self._is_empty_feature]
665688

sklearn/impute/tests/test_impute.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -410,18 +410,24 @@ def test_imputation_constant_error_invalid_type(X_data, missing_value):
410410
imputer.fit_transform(X)
411411

412412

413+
# TODO (1.8): check that `keep_empty_features=False` drops the
414+
# empty features due to the behaviour change.
413415
def test_imputation_constant_integer():
414416
# Test imputation using the constant strategy on integers
415417
X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
416418

417419
X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]])
418420

419-
imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0)
421+
imputer = SimpleImputer(
422+
missing_values=-1, strategy="constant", fill_value=0, keep_empty_features=True
423+
)
420424
X_trans = imputer.fit_transform(X)
421425

422426
assert_array_equal(X_trans, X_true)
423427

424428

429+
# TODO (1.8): check that `keep_empty_features=False` drops the
430+
# empty features due to the behaviour change.
425431
@pytest.mark.parametrize("array_constructor", CSR_CONTAINERS + [np.asarray])
426432
def test_imputation_constant_float(array_constructor):
427433
# Test imputation using the constant strategy on floats
@@ -442,12 +448,16 @@ def test_imputation_constant_float(array_constructor):
442448

443449
X_true = array_constructor(X_true)
444450

445-
imputer = SimpleImputer(strategy="constant", fill_value=-1)
451+
imputer = SimpleImputer(
452+
strategy="constant", fill_value=-1, keep_empty_features=True
453+
)
446454
X_trans = imputer.fit_transform(X)
447455

448456
assert_allclose_dense_sparse(X_trans, X_true)
449457

450458

459+
# TODO (1.8): check that `keep_empty_features=False` drops the
460+
# empty features due to the behaviour change.
451461
@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0])
452462
def test_imputation_constant_object(marker):
453463
# Test imputation using the constant strategy on objects
@@ -472,13 +482,18 @@ def test_imputation_constant_object(marker):
472482
)
473483

474484
imputer = SimpleImputer(
475-
missing_values=marker, strategy="constant", fill_value="missing"
485+
missing_values=marker,
486+
strategy="constant",
487+
fill_value="missing",
488+
keep_empty_features=True,
476489
)
477490
X_trans = imputer.fit_transform(X)
478491

479492
assert_array_equal(X_trans, X_true)
480493

481494

495+
# TODO (1.8): check that `keep_empty_features=False` drops the
496+
# empty features due to the behaviour change.
482497
@pytest.mark.parametrize("dtype", [object, "category"])
483498
def test_imputation_constant_pandas(dtype):
484499
# Test imputation using the constant strategy on pandas df
@@ -498,7 +513,7 @@ def test_imputation_constant_pandas(dtype):
498513
dtype=object,
499514
)
500515

501-
imputer = SimpleImputer(strategy="constant")
516+
imputer = SimpleImputer(strategy="constant", keep_empty_features=True)
502517
X_trans = imputer.fit_transform(df)
503518

504519
assert_array_equal(X_trans, X_true)
@@ -1514,6 +1529,26 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
15141529
)
15151530

15161531

1532+
@pytest.mark.parametrize(
1533+
"initial_strategy", ["mean", "median", "most_frequent", "constant"]
1534+
)
1535+
def test_iterative_imputer_keep_empty_features(initial_strategy):
1536+
"""Check the behaviour of the iterative imputer with different initial strategy
1537+
and keeping empty features (i.e. features containing only missing values).
1538+
"""
1539+
X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]])
1540+
1541+
imputer = IterativeImputer(
1542+
initial_strategy=initial_strategy, keep_empty_features=True
1543+
)
1544+
X_imputed = imputer.fit_transform(X)
1545+
assert_allclose(X_imputed[:, 1], 0)
1546+
X_imputed = imputer.transform(X)
1547+
assert_allclose(X_imputed[:, 1], 0)
1548+
1549+
1550+
# TODO (1.8): check that `keep_empty_features=False` drops the
1551+
# empty features due to the behaviour change.
15171552
def test_iterative_imputer_constant_fill_value():
15181553
"""Check that we propagate properly the parameter `fill_value`."""
15191554
X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
@@ -1524,6 +1559,7 @@ def test_iterative_imputer_constant_fill_value():
15241559
initial_strategy="constant",
15251560
fill_value=fill_value,
15261561
max_iter=0,
1562+
keep_empty_features=True,
15271563
)
15281564
imputer.fit_transform(X)
15291565
assert_array_equal(imputer.initial_imputer_.statistics_, fill_value)
@@ -1722,7 +1758,13 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat
17221758
)
17231759

17241760
for method in ["fit_transform", "transform"]:
1725-
X_imputed = getattr(imputer, method)(X)
1761+
# TODO(1.8): Remove the condition and still call getattr(imputer, method)(X)
1762+
if method.startswith("fit") and not keep_empty_features:
1763+
warn_msg = '`strategy="constant"`, empty features are not dropped. '
1764+
with pytest.warns(FutureWarning, match=warn_msg):
1765+
X_imputed = getattr(imputer, method)(X)
1766+
else:
1767+
X_imputed = getattr(imputer, method)(X)
17261768
assert X_imputed.shape == X.shape
17271769
constant_feature = (
17281770
X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]

0 commit comments

Comments
 (0)
0