Deprecate ``warn_on_dtype`` in check_array / check_X_y (:issue:`13382`)

* sklearn/utils/validation.py: ``warn_on_dtype`` now defaults to None in
  check_array and check_X_y; check_array emits a DeprecationWarning
  whenever the argument is not None (deprecated in 0.21, removal in 0.23).
* sklearn/metrics/pairwise.py: check_pairwise_arrays no longer forwards
  ``warn_on_dtype``. pairwise_distances emits the DataConversionWarning
  itself when a boolean metric receives non-boolean data. ``Y`` is only
  inspected when it is not None, so boolean ``X`` with ``Y`` omitted does
  not fail on ``None.dtype``.
* sklearn/preprocessing/data.py: drop the explicit ``warn_on_dtype=False``.
* Tests and the what's-new entry are updated accordingly.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy.
Indentation and blank context lines were inferred from the hunk headers;
verify against the target tree (or apply with whitespace tolerance).

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 9723f29d26500..115164b752882 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -627,6 +627,15 @@ Support for Python 3.4 and below has been officially dropped.
   affects all ensemble methods using decision trees.
   :issue:`12344` by :user:`Adrin Jalali <adrinjalali>`.
 
+:mod:`sklearn.utils`
+....................
+
+- |API| Deprecated ``warn_on_dtype`` parameter from :func:`utils.check_array`
+  and :func:`utils.check_X_y`. Added explicit warning for dtype conversion
+  in :func:`check_pairwise_arrays` if the ``metric`` being passed is a
+  pairwise boolean metric.
+  :issue:`13382` by :user:`Prathmesh Savale <praths007>`.
+
 Multiple modules
 ................
 
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 315e3c8460b06..5e443f6c5f795 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -30,6 +30,7 @@
 from ..utils._joblib import effective_n_jobs
 
 from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
+from ..exceptions import DataConversionWarning
 
 
 # Utility Functions
@@ -99,19 +100,18 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
     """
     X, Y, dtype_float = _return_float_dtype(X, Y)
 
-    warn_on_dtype = dtype is not None
     estimator = 'check_pairwise_arrays'
     if dtype is None:
         dtype = dtype_float
 
     if Y is X or Y is None:
         X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
-                            warn_on_dtype=warn_on_dtype, estimator=estimator)
+                            estimator=estimator)
     else:
         X = check_array(X, accept_sparse='csr', dtype=dtype,
-                        warn_on_dtype=warn_on_dtype, estimator=estimator)
+                        estimator=estimator)
         Y = check_array(Y, accept_sparse='csr', dtype=dtype,
-                        warn_on_dtype=warn_on_dtype, estimator=estimator)
+                        estimator=estimator)
 
     if precomputed:
         if X.shape[1] != Y.shape[0]:
@@ -1421,6 +1421,12 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds):
                             " support sparse matrices.")
 
         dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None
+
+        if (dtype == bool and
+                (X.dtype != bool or (Y is not None and Y.dtype != bool))):
+            msg = "Data was converted to boolean for metric %s" % metric
+            warnings.warn(msg, DataConversionWarning)
+
         X, Y = check_pairwise_arrays(X, Y, dtype=dtype)
 
         # precompute data-derived metric params
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index 356caffff107f..e221efba4c3c7 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -138,6 +138,28 @@ def test_pairwise_boolean_distance(metric):
             res[np.isnan(res)] = 0
             assert np.sum(res != 0) == 0
 
+    # non-boolean arrays are converted to boolean for boolean
+    # distance metrics with a data conversion warning
+    msg = "Data was converted to boolean for metric %s" % metric
+    with pytest.warns(DataConversionWarning, match=msg):
+        pairwise_distances(X, metric=metric)
+
+
+def test_no_data_conversion_warning():
+    # No warnings issued if metric is not a boolean distance function
+    rng = np.random.RandomState(0)
+    X = rng.randn(5, 4)
+    with pytest.warns(None) as records:
+        pairwise_distances(X, metric="minkowski")
+    assert len(records) == 0
+
+
+def test_pairwise_boolean_distance_bool_X_without_Y():
+    # boolean X with Y omitted must not raise (Y is None has no dtype)
+    X = np.array([[True, False], [False, True]])
+    D = pairwise_distances(X, metric="rogerstanimoto")
+    assert D.shape == (2, 2)
+
 
 @pytest.mark.parametrize('func', [pairwise_distances, pairwise_kernels])
 def test_pairwise_precomputed(func):
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index bab41f3bdd492..8c8524ef6505c 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -137,8 +137,8 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
 
     """  # noqa
     X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
-                    warn_on_dtype=False, estimator='the scale function',
-                    dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
+                    estimator='the scale function', dtype=FLOAT_DTYPES,
+                    force_all_finite='allow-nan')
     if sparse.issparse(X):
         if with_mean:
             raise ValueError(
@@ -348,7 +348,7 @@ def partial_fit(self, X, y=None):
             raise TypeError("MinMaxScaler does no support sparse input. "
                             "You may consider to use MaxAbsScaler instead.")
 
-        X = check_array(X, copy=self.copy, warn_on_dtype=False,
+        X = check_array(X, copy=self.copy,
                         estimator=self, dtype=FLOAT_DTYPES,
                         force_all_finite="allow-nan")
 
@@ -468,7 +468,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True):
     """  # noqa
     # Unlike the scaler object, this function allows 1d input.
     # If copy is required, it will be done inside the scaler object.
-    X = check_array(X, copy=False, ensure_2d=False, warn_on_dtype=False,
+    X = check_array(X, copy=False, ensure_2d=False,
                     dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
     original_ndim = X.ndim
 
@@ -659,8 +659,8 @@ def partial_fit(self, X, y=None):
             Ignored
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        warn_on_dtype=False, estimator=self,
-                        dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
+                        estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')
 
         # Even in the case of `with_mean=False`, we update the mean anyway
         # This is needed for the incremental computation of the var
@@ -753,7 +753,7 @@ def transform(self, X, copy=None):
         check_is_fitted(self, 'scale_')
 
         copy = copy if copy is not None else self.copy
-        X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=False,
+        X = check_array(X, accept_sparse='csr', copy=copy,
                         estimator=self, dtype=FLOAT_DTYPES,
                         force_all_finite='allow-nan')
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 67fc9b33d404b..ca4fd34816b50 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -387,12 +387,15 @@ def test_check_array_dtype_warning():
         assert_equal(X_checked.dtype, np.float64)
 
     for X in float64_data:
-        X_checked = assert_no_warnings(check_array, X, dtype=np.float64,
-                                       accept_sparse=True, warn_on_dtype=True)
-        assert_equal(X_checked.dtype, np.float64)
-        X_checked = assert_no_warnings(check_array, X, dtype=np.float64,
-                                       accept_sparse=True, warn_on_dtype=False)
-        assert_equal(X_checked.dtype, np.float64)
+        with pytest.warns(None) as record:
+            warnings.simplefilter("ignore", DeprecationWarning)  # 0.23
+            X_checked = check_array(X, dtype=np.float64,
+                                    accept_sparse=True, warn_on_dtype=True)
+            assert_equal(X_checked.dtype, np.float64)
+            X_checked = check_array(X, dtype=np.float64,
+                                    accept_sparse=True, warn_on_dtype=False)
+            assert_equal(X_checked.dtype, np.float64)
+        assert len(record) == 0
 
     for X in float32_data:
         X_checked = assert_no_warnings(check_array, X,
@@ -417,6 +420,17 @@ def test_check_array_dtype_warning():
     assert_equal(X_checked.format, 'csr')
 
 
+def test_check_array_warn_on_dtype_deprecation():
+    X = np.asarray([[0.0], [1.0]])
+    Y = np.asarray([[2.0], [3.0]])
+    with pytest.warns(DeprecationWarning,
+                      match="'warn_on_dtype' is deprecated"):
+        check_array(X, warn_on_dtype=True)
+    with pytest.warns(DeprecationWarning,
+                      match="'warn_on_dtype' is deprecated"):
+        check_X_y(X, Y, warn_on_dtype=True)
+
+
 def test_check_array_accept_sparse_type_exception():
     X = [[1, 2], [3, 4]]
     X_csr = sp.csr_matrix(X)
@@ -690,8 +704,7 @@ def test_suppress_validation():
 def test_check_array_series():
     # regression test that check_array works on pandas Series
     pd = importorskip("pandas")
-    res = check_array(pd.Series([1, 2, 3]), ensure_2d=False,
-                      warn_on_dtype=True)
+    res = check_array(pd.Series([1, 2, 3]), ensure_2d=False)
     assert_array_equal(res, np.array([1, 2, 3]))
 
     # with categorical dtype (not a numpy dtype) (GH12699)
@@ -712,7 +725,10 @@ def test_check_dataframe_warns_on_dtype():
                          check_array, df, dtype=np.float64, warn_on_dtype=True)
     assert_warns(DataConversionWarning, check_array, df,
                  dtype='numeric', warn_on_dtype=True)
-    assert_no_warnings(check_array, df, dtype='object', warn_on_dtype=True)
+    with pytest.warns(None) as record:
+        warnings.simplefilter("ignore", DeprecationWarning)  # 0.23
+        check_array(df, dtype='object', warn_on_dtype=True)
+    assert len(record) == 0
 
     # Also check that it raises a warning for mixed dtypes in a DataFrame.
     df_mixed = pd.DataFrame([['1', 2, 3], ['4', 5, 6]])
@@ -728,8 +744,11 @@ def test_check_dataframe_warns_on_dtype():
     df_mixed_numeric = pd.DataFrame([[1., 2, 3], [4., 5, 6]])
     assert_warns(DataConversionWarning, check_array, df_mixed_numeric,
                  dtype='numeric', warn_on_dtype=True)
-    assert_no_warnings(check_array, df_mixed_numeric.astype(int),
-                       dtype='numeric', warn_on_dtype=True)
+    with pytest.warns(None) as record:
+        warnings.simplefilter("ignore", DeprecationWarning)  # 0.23
+        check_array(df_mixed_numeric.astype(int),
+                    dtype='numeric', warn_on_dtype=True)
+    assert len(record) == 0
 
 
 class DummyMemory:
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index f2b298dcba5b2..32cad0197317b 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -332,7 +332,7 @@ def _ensure_no_complex_data(array):
 def check_array(array, accept_sparse=False, accept_large_sparse=True,
                 dtype="numeric", order=None, copy=False, force_all_finite=True,
                 ensure_2d=True, allow_nd=False, ensure_min_samples=1,
-                ensure_min_features=1, warn_on_dtype=False, estimator=None):
+                ensure_min_features=1, warn_on_dtype=None, estimator=None):
 
     """Input validation on an array, list, sparse matrix or similar.
 
@@ -407,10 +407,14 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
         dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
         disables this check.
 
-    warn_on_dtype : boolean (default=False)
+    warn_on_dtype : boolean or None, optional (default=None)
         Raise DataConversionWarning if the dtype of the input data structure
         does not match the requested dtype, causing a memory copy.
 
+        .. deprecated:: 0.21
+            ``warn_on_dtype`` is deprecated in version 0.21 and will be
+            removed in 0.23.
+
     estimator : str or estimator instance (default=None)
         If passed, include the name of the estimator in warning messages.
 
@@ -418,8 +422,15 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     -------
     array_converted : object
         The converted and validated array.
-
     """
+    # warn_on_dtype deprecation
+    if warn_on_dtype is not None:
+        warnings.warn(
+            "'warn_on_dtype' is deprecated in version 0.21 and will be "
+            "removed in 0.23. Don't set `warn_on_dtype` to remove this "
+            "warning.",
+            DeprecationWarning)
+
     # store reference to original array to check if copy is needed when
     # function returns
     array_orig = array
@@ -590,7 +601,7 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
               dtype="numeric", order=None, copy=False, force_all_finite=True,
               ensure_2d=True, allow_nd=False, multi_output=False,
               ensure_min_samples=1, ensure_min_features=1, y_numeric=False,
-              warn_on_dtype=False, estimator=None):
+              warn_on_dtype=None, estimator=None):
     """Input validation for standard estimators.
 
     Checks X and y for consistent length, enforces X to be 2D and y 1D. By
@@ -675,10 +686,14 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
         it is converted to float64. Should only be used for regression
         algorithms.
 
-    warn_on_dtype : boolean (default=False)
+    warn_on_dtype : boolean or None, optional (default=None)
         Raise DataConversionWarning if the dtype of the input data structure
         does not match the requested dtype, causing a memory copy.
 
+        .. deprecated:: 0.21
+            ``warn_on_dtype`` is deprecated in version 0.21 and will be
+            removed in 0.23.
+
     estimator : str or estimator instance (default=None)
         If passed, include the name of the estimator in warning messages.
 