Revert "FIX add support for non numeric values in MissingIndicator (#13046)" · xhluca/scikit-learn@f9ac7e7 · GitHub
[go: up one dir, main page]

Skip to content

Commit f9ac7e7

Browse files
author
Xing
committed
Revert "FIX add support for non numeric values in MissingIndicator (scikit-learn#13046)"
This reverts commit 29c51c4.
1 parent b8dfc3f commit f9ac7e7

File tree

4 files changed: 21 additions (+21) and 75 deletions (-75).

4 files changed

+21
-75
lines changed

doc/whats_new/v0.20.rst

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,6 @@ Changelog
3636
threaded when `n_jobs > 1` or `n_jobs = -1`.
3737
:issue:`13005` by :user:`Prabakaran Kumaresshan <nixphix>`.
3838

39-
:mod:`sklearn.impute`
40-
.....................
41-
42-
- |Fix| add support for non-numeric data in
43-
:class:`sklearn.impute.MissingIndicator` which was not supported while
44-
:class:`sklearn.impute.SimpleImputer` was supporting this for some
45-
imputation strategies.
46-
:issue:`13046` by :user:`Guillaume Lemaitre <glemaitre>`.
47-
4839
:mod:`sklearn.linear_model`
4940
...........................
5041

sklearn/impute.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -533,23 +533,6 @@ def _get_missing_features_info(self, X):
533533

534534
return imputer_mask, features_with_missing
535535

536-
def _validate_input(self, X):
537-
if not is_scalar_nan(self.missing_values):
538-
force_all_finite = True
539-
else:
540-
force_all_finite = "allow-nan"
541-
X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
542-
force_all_finite=force_all_finite)
543-
_check_inputs_dtype(X, self.missing_values)
544-
if X.dtype.kind not in ("i", "u", "f", "O"):
545-
raise ValueError("MissingIndicator does not support data with "
546-
"dtype {0}. Please provide either a numeric array"
547-
" (with a floating point or integer dtype) or "
548-
"categorical data represented either as an array "
549-
"with integer dtype or an array of string values "
550-
"with an object dtype.".format(X.dtype))
551-
return X
552-
553536
def fit(self, X, y=None):
554537
"""Fit the transformer on X.
555538
@@ -564,7 +547,14 @@ def fit(self, X, y=None):
564547
self : object
565548
Returns self.
566549
"""
567-
X = self._validate_input(X)
550+
if not is_scalar_nan(self.missing_values):
551+
force_all_finite = True
552+
else:
553+
force_all_finite = "allow-nan"
554+
X = check_array(X, accept_sparse=('csc', 'csr'),
555+
force_all_finite=force_all_finite)
556+
_check_inputs_dtype(X, self.missing_values)
557+
568558
self._n_features = X.shape[1]
569559

570560
if self.features not in ('missing-only', 'all'):
@@ -598,7 +588,14 @@ def transform(self, X):
598588
599589
"""
600590
check_is_fitted(self, "features_")
601-
X = self._validate_input(X)
591+
592+
if not is_scalar_nan(self.missing_values):
593+
force_all_finite = True
594+
else:
595+
force_all_finite = "allow-nan"
596+
X = check_array(X, accept_sparse=('csc', 'csr'),
597+
force_all_finite=force_all_finite)
598+
_check_inputs_dtype(X, self.missing_values)
602599

603600
if X.shape[1] != self._n_features:
604601
raise ValueError("X has a different number of features "

sklearn/tests/test_impute.py

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from sklearn.impute import MissingIndicator
1414
from sklearn.impute import SimpleImputer
1515
from sklearn.pipeline import Pipeline
16-
from sklearn.pipeline import make_union
1716
from sklearn.model_selection import GridSearchCV
1817
from sklearn import tree
1918
from sklearn.random_projection import sparse_random_matrix
@@ -510,10 +509,7 @@ def test_imputation_copy():
510509
"'features' has to be either 'missing-only' or 'all'"),
511510
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
512511
{'features': 'all', 'sparse': 'random'},
513-
"'sparse' has to be a boolean or 'auto'"),
514-
(np.array([['a', 'b'], ['c', 'a']], dtype=str),
515-
np.array([['a', 'b'], ['c', 'a']], dtype=str),
516-
{}, "MissingIndicator does not support data with dtype")]
512+
"'sparse' has to be a boolean or 'auto'")]
517513
)
518514
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
519515
indicator = MissingIndicator(missing_values=-1)
@@ -618,37 +614,6 @@ def test_missing_indicator_sparse_param(arr_type, missing_values,
618614
assert isinstance(X_trans_mask, np.ndarray)
619615

620616

621-
def test_missing_indicator_string():
622-
X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
623-
indicator = MissingIndicator(missing_values='a', features='all')
624-
X_trans = indicator.fit_transform(X)
625-
assert_array_equal(X_trans, np.array([[True, False, False],
626-
[False, False, True]]))
627-
628-
629-
@pytest.mark.parametrize(
630-
"X, missing_values, X_trans_exp",
631-
[(np.array([['a', 'b'], ['b', 'a']], dtype=object), 'a',
632-
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
633-
dtype=object)),
634-
(np.array([[np.nan, 1.], [1., np.nan]]), np.nan,
635-
np.array([[1., 1., True, False], [1., 1., False, True]])),
636-
(np.array([[np.nan, 'b'], ['b', np.nan]], dtype=object), np.nan,
637-
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
638-
dtype=object)),
639-
(np.array([[None, 'b'], ['b', None]], dtype=object), None,
640-
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
641-
dtype=object))]
642-
)
643-
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
644-
trans = make_union(
645-
SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
646-
MissingIndicator(missing_values=missing_values)
647-
)
648-
X_trans = trans.fit_transform(X)
649-
assert_array_equal(X_trans, X_trans_exp)
650-
651-
652617
@pytest.mark.parametrize("imputer_constructor",
653618
[SimpleImputer])
654619
@pytest.mark.parametrize(

sklearn/utils/estimator_checks.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,10 @@
7272
'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
7373
'RANSACRegressor', 'RadiusNeighborsRegressor',
7474
'RandomForestRegressor', 'Ridge', 'RidgeCV']
75+
7576
ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator',
7677
'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler',
7778
'PowerTransformer', 'QuantileTransformer']
78-
SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator']
7979

8080

8181
def _yield_non_meta_checks(name, estimator):
@@ -623,16 +623,9 @@ def check_dtype_object(name, estimator_orig):
623623
if "Unknown label type" not in str(e):
624624
raise
625625

626-
if name not in SUPPORT_STRING:
627-
X[0, 0] = {'foo': 'bar'}
628-
msg = "argument must be a string or a number"
629-
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
630-
else:
631-
# Estimators supporting string will not call np.asarray to convert the
632-
# data to numeric and therefore, the error will not be raised.
633-
# Checking for each element dtype in the input array will be costly.
634-
# Refer to #11401 for full discussion.
635-
estimator.fit(X, y)
626+
X[0, 0] = {'foo': 'bar'}
627+
msg = "argument must be a string or a number"
628+
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
636629

637630

638631
def check_complex_data(name, estimator_orig):

0 commit comments

Comments
 (0)
0