Revert "FIX add support for non numeric values in MissingIndicator (#… · xhluca/scikit-learn@457f128 · GitHub
  • [go: up one dir, main page]

    Skip to content

    Commit 457f128

    Browse files
    author
    Xing
    authored
    Revert "FIX add support for non numeric values in MissingIndicator (scikit-learn#13046)"
    This reverts commit 29c51c4.
    1 parent ce116d1 commit 457f128

    File tree

    4 files changed

    +21
    -75
    lines changed

    4 files changed

    +21
    -75
    lines changed

    doc/whats_new/v0.20.rst

    Lines changed: 0 additions & 9 deletions
    Original file line number | Diff line number | Diff line change
    @@ -36,15 +36,6 @@ Changelog
    3636
    threaded when `n_jobs > 1` or `n_jobs = -1`.
    3737
    :issue:`13005` by :user:`Prabakaran Kumaresshan <nixphix>`.
    3838

    39-
    :mod:`sklearn.impute`
    40-
    .....................
    41-
    42-
    - |Fix| add support for non-numeric data in
    43-
    :class:`sklearn.impute.MissingIndicator` which was not supported while
    44-
    :class:`sklearn.impute.SimpleImputer` was supporting this for some
    45-
    imputation strategies.
    46-
    :issue:`13046` by :user:`Guillaume Lemaitre <glemaitre>`.
    47-
    4839
    :mod:`sklearn.linear_model`
    4940
    ...........................
    5041

    sklearn/impute.py

    Lines changed: 16 additions & 19 deletions
    Original file line number | Diff line number | Diff line change
    @@ -533,23 +533,6 @@ def _get_missing_features_info(self, X):
    533533

    534534
    return imputer_mask, features_with_missing
    535535

    536-
    def _validate_input(self, X):
    537-
    if not is_scalar_nan(self.missing_values):
    538-
    force_all_finite = True
    539-
    else:
    540-
    force_all_finite = "allow-nan"
    541-
    X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
    542-
    force_all_finite=force_all_finite)
    543-
    _check_inputs_dtype(X, self.missing_values)
    544-
    if X.dtype.kind not in ("i", "u", "f", "O"):
    545-
    raise ValueError("MissingIndicator does not support data with "
    546-
    "dtype {0}. Please provide either a numeric array"
    547-
    " (with a floating point or integer dtype) or "
    548-
    "categorical data represented either as an array "
    549-
    "with integer dtype or an array of string values "
    550-
    "with an object dtype.".format(X.dtype))
    551-
    return X
    552-
    553536
    def fit(self, X, y=None):
    554537
    """Fit the transformer on X.
    555538
    @@ -564,7 +547,14 @@ def fit(self, X, y=None):
    564547
    self : object
    565548
    Returns self.
    566549
    """
    567-
    X = self._validate_input(X)
    550+
    if not is_scalar_nan(self.missing_values):
    551+
    force_all_finite = True
    552+
    else:
    553+
    force_all_finite = "allow-nan"
    554+
    X = check_array(X, accept_sparse=('csc', 'csr'),
    555+
    force_all_finite=force_all_finite)
    556+
    _check_inputs_dtype(X, self.missing_values)
    557+
    568558
    self._n_features = X.shape[1]
    569559

    570560
    if self.features not in ('missing-only', 'all'):
    @@ -598,7 +588,14 @@ def transform(self, X):
    598588
    599589
    """
    600590
    check_is_fitted(self, "features_")
    601-
    X = self._validate_input(X)
    591+
    592+
    if not is_scalar_nan(self.missing_values):
    593+
    force_all_finite = True
    594+
    else:
    595+
    force_all_finite = "allow-nan"
    596+
    X = check_array(X, accept_sparse=('csc', 'csr'),
    597+
    force_all_finite=force_all_finite)
    598+
    _check_inputs_dtype(X, self.missing_values)
    602599

    603600
    if X.shape[1] != self._n_features:
    604601
    raise ValueError("X has a different number of features "

    sklearn/tests/test_impute.py

    Lines changed: 1 addition & 36 deletions
    Original file line number | Diff line number | Diff line change
    @@ -13,7 +13,6 @@
    1313
    from sklearn.impute import MissingIndicator
    1414
    from sklearn.impute import SimpleImputer
    1515
    from sklearn.pipeline import Pipeline
    16-
    from sklearn.pipeline import make_union
    1716
    from sklearn.model_selection import GridSearchCV
    1817
    from sklearn import tree
    1918
    from sklearn.random_projection import sparse_random_matrix
    @@ -510,10 +509,7 @@ def test_imputation_copy():
    510509
    "'features' has to be either 'missing-only' or 'all'"),
    511510
    (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
    512511
    {'features': 'all', 'sparse': 'random'},
    513-
    "'sparse' has to be a boolean or 'auto'"),
    514-
    (np.array([['a', 'b'], ['c', 'a']], dtype=str),
    515-
    np.array([['a', 'b'], ['c', 'a']], dtype=str),
    516-
    {}, "MissingIndicator does not support data with dtype")]
    512+
    "'sparse' has to be a boolean or 'auto'")]
    517513
    )
    518514
    def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    519515
    indicator = MissingIndicator(missing_values=-1)
    @@ -618,37 +614,6 @@ def test_missing_indicator_sparse_param(arr_type, missing_values,
    618614
    assert isinstance(X_trans_mask, np.ndarray)
    619615

    620616

    621-
    def test_missing_indicator_string():
    622-
    X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
    623-
    indicator = MissingIndicator(missing_values='a', features='all')
    624-
    X_trans = indicator.fit_transform(X)
    625-
    assert_array_equal(X_trans, np.array([[True, False, False],
    626-
    [False, False, True]]))
    627-
    628-
    629-
    @pytest.mark.parametrize(
    630-
    "X, missing_values, X_trans_exp",
    631-
    [(np.array([['a', 'b'], ['b', 'a']], dtype=object), 'a',
    632-
    np.array([['b', 'b', True, False], ['b', 'b', False, True]],
    633-
    dtype=object)),
    634-
    (np.array([[np.nan, 1.], [1., np.nan]]), np.nan,
    635-
    np.array([[1., 1., True, False], [1., 1., False, True]])),
    636-
    (np.array([[np.nan, 'b'], ['b', np.nan]], dtype=object), np.nan,
    637-
    np.array([['b', 'b', True, False], ['b', 'b', False, True]],
    638-
    dtype=object)),
    639-
    (np.array([[None, 'b'], ['b', None]], dtype=object), None,
    640-
    np.array([['b', 'b', True, False], ['b', 'b', False, True]],
    641-
    dtype=object))]
    642-
    )
    643-
    def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    644-
    trans = make_union(
    645-
    SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
    646-
    MissingIndicator(missing_values=missing_values)
    647-
    )
    648-
    X_trans = trans.fit_transform(X)
    649-
    assert_array_equal(X_trans, X_trans_exp)
    650-
    651-
    652617
    @pytest.mark.parametrize("imputer_constructor",
    653618
    [SimpleImputer])
    654619
    @pytest.mark.parametrize(

    sklearn/utils/estimator_checks.py

    Lines changed: 4 additions & 11 deletions
    Original file line number | Diff line number | Diff line change
    @@ -72,10 +72,10 @@
    7272
    'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
    7373
    'RANSACRegressor', 'RadiusNeighborsRegressor',
    7474
    'RandomForestRegressor', 'Ridge', 'RidgeCV']
    75+
    7576
    ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator',
    7677
    'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler',
    7778
    'PowerTransformer', 'QuantileTransformer']
    78-
    SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator']
    7979

    8080

    8181
    def _yield_non_meta_checks(name, estimator):
    @@ -623,16 +623,9 @@ def check_dtype_object(name, estimator_orig):
    623623
    if "Unknown label type" not in str(e):
    624624
    raise
    625625

    626-
    if name not in SUPPORT_STRING:
    627-
    X[0, 0] = {'foo': 'bar'}
    628-
    msg = "argument must be a string or a number"
    629-
    assert_raises_regex(TypeError, msg, estimator.fit, X, y)
    630-
    else:
    631-
    # Estimators supporting string will not call np.asarray to convert the
    632-
    # data to numeric and therefore, the error will not be raised.
    633-
    # Checking for each element dtype in the input array will be costly.
    634-
    # Refer to #11401 for full discussion.
    635-
    estimator.fit(X, y)
    626+
    X[0, 0] = {'foo': 'bar'}
    627+
    msg = "argument must be a string or a number"
    628+
    assert_raises_regex(TypeError, msg, estimator.fit, X, y)
    636629

    637630

    638631
    def check_complex_data(name, estimator_orig):

    0 commit comments

    Comments
     (0)
    0