ENH Adds feature_names_out to impute module (#21078) · scikit-learn/scikit-learn@8f621ad · GitHub

Commit 8f621ad

ENH Adds feature_names_out to impute module (#21078)
1 parent 2ad5f15 commit 8f621ad

File tree: 7 files changed (+177 / -3 lines changed)


doc/whats_new/v1.1.rst

Lines changed: 7 additions & 0 deletions
@@ -68,6 +68,13 @@ Changelog
   error when 'min_idf' or 'max_idf' are floating-point numbers greater than 1.
   :pr:`20752` by :user:`Alek Lefebvre <AlekLefebvre>`.
 
+:mod:`sklearn.impute`
+.....................
+
+- |API| Adds :meth:`get_feature_names_out` to :class:`impute.SimpleImputer`,
+  :class:`impute.KNNImputer`, :class:`impute.IterativeImputer`, and
+  :class:`impute.MissingIndicator`. :pr:`21078` by `Thomas Fan`_.
+
 :mod:`sklearn.linear_model`
 ...........................
 
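
As a quick illustration of the new API (a minimal usage sketch, not part of the commit; the data and column names below are made up), the imputers now report the names of the columns they produce, including the indicator columns appended when add_indicator=True:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    # Illustrative frame: both columns contain at least one missing value.
    X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 5.0, 6.0]})

    imputer = SimpleImputer(add_indicator=True).fit(X)
    print(imputer.get_feature_names_out())
    # ['a' 'b' 'missingindicator_a' 'missingindicator_b']

Columns that are entirely missing at fit time are dropped by the imputers and therefore do not appear in the imputed part of the names; the tests added further down exercise exactly that case.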

sklearn/impute/_base.py

Lines changed: 61 additions & 0 deletions
@@ -15,6 +15,7 @@
 from ..utils.sparsefuncs import _get_median
 from ..utils.validation import check_is_fitted
 from ..utils.validation import FLOAT_DTYPES
+from ..utils.validation import _check_feature_names_in
 from ..utils._mask import _get_mask
 from ..utils import is_scalar_nan
 
@@ -113,6 +114,13 @@ def _concatenate_indicator(self, X_imputed, X_indicator):
 
         return hstack((X_imputed, X_indicator))
 
+    def _concatenate_indicator_feature_names_out(self, names, input_features):
+        if not self.add_indicator:
+            return names
+
+        indicator_names = self.indicator_.get_feature_names_out(input_features)
+        return np.concatenate([names, indicator_names])
+
     def _more_tags(self):
         return {"allow_nan": is_scalar_nan(self.missing_values)}
 
@@ -596,6 +604,30 @@ def inverse_transform(self, X):
         X_original[full_mask] = self.missing_values
         return X_original
 
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names for transformation.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features.
+
+            - If `input_features` is `None`, then `feature_names_in_` is
+              used as feature names in. If `feature_names_in_` is not defined,
+              then names are generated: `[x0, x1, ..., x(n_features_in_)]`.
+            - If `input_features` is an array-like, then `input_features` must
+              match `feature_names_in_` if `feature_names_in_` is defined.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Transformed feature names.
+        """
+        input_features = _check_feature_names_in(self, input_features)
+        non_missing_mask = np.logical_not(_get_mask(self.statistics_, np.nan))
+        names = input_features[non_missing_mask]
+        return self._concatenate_indicator_feature_names_out(names, input_features)
+
 
 class MissingIndicator(TransformerMixin, BaseEstimator):
     """Binary indicators for missing values.
@@ -922,6 +954,35 @@ def fit_transform(self, X, y=None):
 
         return imputer_mask
 
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names for transformation.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features.
+
+            - If `input_features` is `None`, then `feature_names_in_` is
+              used as feature names in. If `feature_names_in_` is not defined,
+              then names are generated: `[x0, x1, ..., x(n_features_in_)]`.
+            - If `input_features` is an array-like, then `input_features` must
+              match `feature_names_in_` if `feature_names_in_` is defined.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Transformed feature names.
+        """
+        input_features = _check_feature_names_in(self, input_features)
+        prefix = self.__class__.__name__.lower()
+        return np.asarray(
+            [
+                f"{prefix}_{feature_name}"
+                for feature_name in input_features[self.features_]
+            ],
+            dtype=object,
+        )
+
     def _more_tags(self):
         return {
             "allow_nan": True,

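To make the MissingIndicator naming concrete, here is a small sketch (illustrative data, not code from the commit): the prefix is the lowercased class name, and only the columns selected in features_ (with the default features="missing-only", those containing missing values at fit time) receive a name:

    import numpy as np
    import pandas as pd
    from sklearn.impute import MissingIndicator

    # "c" has no missing values, so with features="missing-only" it is not in
    # features_ and gets neither an indicator column nor an output name.
    X = pd.DataFrame({"a": [np.nan, 4.0], "b": [np.nan, np.nan], "c": [1.0, 2.0]})

    indicator = MissingIndicator().fit(X)
    print(indicator.get_feature_names_out())
    # ['missingindicator_a' 'missingindicator_b']
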
sklearn/impute/_iterative.py

Lines changed: 24 additions & 0 deletions
@@ -10,6 +10,7 @@
 from ..preprocessing import normalize
 from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan
 from ..utils.validation import FLOAT_DTYPES, check_is_fitted
+from ..utils.validation import _check_feature_names_in
 from ..utils._mask import _get_mask
 
 from ._base import _BaseImputer
@@ -774,3 +775,26 @@ def fit(self, X, y=None):
         """
         self.fit_transform(X)
         return self
+
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names for transformation.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features.
+
+            - If `input_features` is `None`, then `feature_names_in_` is
+              used as feature names in. If `feature_names_in_` is not defined,
+              then names are generated: `[x0, x1, ..., x(n_features_in_)]`.
+            - If `input_features` is an array-like, then `input_features` must
+              match `feature_names_in_` if `feature_names_in_` is defined.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Transformed feature names.
+        """
+        input_features = _check_feature_names_in(self, input_features)
+        names = self.initial_imputer_.get_feature_names_out(input_features)
+        return self._concatenate_indicator_feature_names_out(names, input_features)
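
For IterativeImputer, a hedged usage sketch (illustrative data, not part of the commit): the output names are delegated to the internal initial_imputer_, a SimpleImputer fitted during fit, with indicator names appended when add_indicator=True. The experimental flag import is still required to use the estimator at all:

    import numpy as np
    import pandas as pd
    # IterativeImputer is experimental and must be enabled explicitly.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})

    imputer = IterativeImputer(add_indicator=True).fit(X)
    print(imputer.get_feature_names_out())
    # ['a' 'b' 'missingindicator_a' 'missingindicator_b']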

sklearn/impute/_knn.py

Lines changed: 26 additions & 1 deletion
@@ -13,6 +13,7 @@
 from ..utils import is_scalar_nan
 from ..utils._mask import _get_mask
 from ..utils.validation import check_is_fitted
+from ..utils.validation import _check_feature_names_in
 
 
 class KNNImputer(_BaseImputer):
@@ -217,6 +218,7 @@ def fit(self, X, y=None):
         _check_weights(self.weights)
         self._fit_X = X
         self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
+        self._valid_mask = ~np.all(self._mask_fit_X, axis=0)
 
         super()._fit_indicator(self._mask_fit_X)
 
@@ -253,7 +255,7 @@ def transform(self, X):
 
         mask = _get_mask(X, self.missing_values)
         mask_fit_X = self._mask_fit_X
-        valid_mask = ~np.all(mask_fit_X, axis=0)
+        valid_mask = self._valid_mask
 
         X_indicator = super()._transform_indicator(mask)
 
@@ -338,3 +340,26 @@ def process_chunk(dist_chunk, start):
             pass
 
         return super()._concatenate_indicator(X[:, valid_mask], X_indicator)
+
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names for transformation.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features.
+
+            - If `input_features` is `None`, then `feature_names_in_` is
+              used as feature names in. If `feature_names_in_` is not defined,
+              then names are generated: `[x0, x1, ..., x(n_features_in_)]`.
+            - If `input_features` is an array-like, then `input_features` must
+              match `feature_names_in_` if `feature_names_in_` is defined.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Transformed feature names.
+        """
+        input_features = _check_feature_names_in(self, input_features)
+        names = input_features[self._valid_mask]
+        return self._concatenate_indicator_feature_names_out(names, input_features)
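
For KNNImputer, a short sketch of what the cached mask enables (illustrative data, not part of the commit): fit now stores _valid_mask, the mask of columns that are not entirely missing in the training data, and get_feature_names_out reuses it, so fully missing columns are absent from the names just as they are dropped by transform:

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer

    # "c" is entirely missing at fit time, so _valid_mask excludes it: it shows
    # up neither in the transformed output nor in the output feature names.
    X = pd.DataFrame(
        {"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, 6.0], "c": [np.nan] * 3}
    )

    imputer = KNNImputer().fit(X)
    print(imputer.get_feature_names_out())
    # ['a' 'b']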

sklearn/impute/tests/test_common.py

Lines changed: 40 additions & 1 deletion
@@ -14,7 +14,7 @@
 from sklearn.impute import SimpleImputer
 
 
-IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()]
+IMPUTERS = [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()]
 SPARSE_IMPUTERS = [SimpleImputer()]
 
 
@@ -122,3 +122,42 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
     X_trans = imputer.fit_transform(X_df)
 
     assert_allclose(X_trans_expected, X_trans)
+
+
+@pytest.mark.parametrize("imputer", IMPUTERS, ids=lambda x: x.__class__.__name__)
+@pytest.mark.parametrize("add_indicator", [True, False])
+def test_imputers_feature_names_out_pandas(imputer, add_indicator):
+    """Check feature names out for imputers."""
+    pd = pytest.importorskip("pandas")
+    marker = np.nan
+    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)
+
+    X = np.array(
+        [
+            [marker, 1, 5, 3, marker, 1],
+            [2, marker, 1, 4, marker, 2],
+            [6, 3, 7, marker, marker, 3],
+            [1, 2, 9, 8, marker, 4],
+        ]
+    )
+    X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
+    imputer.fit(X_df)
+
+    names = imputer.get_feature_names_out()
+
+    if add_indicator:
+        expected_names = [
+            "a",
+            "b",
+            "c",
+            "d",
+            "f",
+            "missingindicator_a",
+            "missingindicator_b",
+            "missingindicator_d",
+            "missingindicator_e",
+        ]
+        assert_array_equal(expected_names, names)
+    else:
+        expected_names = ["a", "b", "c", "d", "f"]
+        assert_array_equal(expected_names, names)

sklearn/impute/tests/test_impute.py

Lines changed: 19 additions & 0 deletions
@@ -1493,3 +1493,22 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
     assert expected == _most_frequent(
         np.array(array, dtype=dtype), extra_value, n_repeat
     )
+
+
+def test_missing_indicator_feature_names_out():
+    """Check that missing indicator return the feature names with a prefix."""
+    pd = pytest.importorskip("pandas")
+
+    missing_values = np.nan
+    X = pd.DataFrame(
+        [
+            [missing_values, missing_values, 1, missing_values],
+            [4, missing_values, 2, 10],
+        ],
+        columns=["a", "b", "c", "d"],
+    )
+
+    indicator = MissingIndicator(missing_values=missing_values).fit(X)
+    feature_names = indicator.get_feature_names_out()
+    expected_names = ["missingindicator_a", "missingindicator_b", "missingindicator_d"]
+    assert_array_equal(expected_names, feature_names)

sklearn/tests/test_common.py

Lines changed: 0 additions & 1 deletion
@@ -365,7 +365,6 @@ def test_pandas_column_name_consistency(estimator):
     "decomposition",
     "discriminant_analysis",
     "ensemble",
-    "impute",
     "isotonic",
     "kernel_approximation",
     "preprocessing",

0 commit comments
