diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index b70114979fb16..9f88fc4e2c447 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -623,6 +623,10 @@ Changelog NaN to integer. :pr:`14872` by `Roman Yurchak`_. +- |Fix| :func:`utils.check_array` will now correctly detect numeric dtypes in + pandas dataframes, fixing a bug where ``float32`` was upcast to ``float64`` + unnecessarily. :pr:`15094` by `Andreas Müller`_. + - |API| The following utils have been deprecated and are now private: - ``choose_check_classifiers_labels`` - ``enforce_estimator_tags_y`` diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index fc6fdec72cbaa..ea02b8dffdd2d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,7 +42,8 @@ _num_samples, check_scalar, _check_sample_weight, - _allclose_dense_sparse) + _allclose_dense_sparse, + FLOAT_DTYPES) import sklearn from sklearn.exceptions import NotFittedError @@ -351,6 +352,45 @@ def test_check_array_pandas_dtype_object_conversion(): assert check_array(X_df, ensure_2d=False).dtype.kind == "f" +def test_check_array_pandas_dtype_casting(): + # test that data-frames with homogeneous dtype are not upcast + pd = pytest.importorskip('pandas') + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32) + X_df = pd.DataFrame(X) + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + X_df.iloc[:, 0] = X_df.iloc[:, 0].astype(np.float16) + assert_array_equal(X_df.dtypes, + (np.float16, np.float32, np.float32)) + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + X_df.iloc[:, 1] = X_df.iloc[:, 1].astype(np.int16) + # float16, int16, float32 casts to float32 + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + X_df.iloc[:, 2] = 
X_df.iloc[:, 2].astype(np.float16) + # float16, int16, float16 casts to float32 + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + X_df = X_df.astype(np.int16) + assert check_array(X_df).dtype == np.int16 + # we're not using upcasting rules for determining + # the target type yet, so we cast to the default of float64 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64 + + # check that we handle pandas dtypes in a semi-reasonable way + # this is actually tricky because we can't really know that this + # should be integer ahead of converting it. + cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])]) + assert (check_array(cat_df).dtype == np.int64) + assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype + == np.float64) + + def test_check_array_on_mock_dataframe(): arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) mock_df = MockDataFrame(arr) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4b953cd135398..57f37ec038b5d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -453,6 +453,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): dtypes_orig = np.array(array.dtypes) + if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): + dtype_orig = np.result_type(*array.dtypes) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O":