scikit-learn · jnothman · Oct 8, 2019 · Sep 25, 2019 · Sep 25, 2019 · Sep 26, 2019
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -623,6 +623,10 @@ Changelog
   NaN to integer.
   :pr:`14872` by `Roman Yurchak`_.
 
+- |Fix| :func:`utils.check_array` will now correctly detect numeric dtypes in
+  pandas dataframes, fixing a bug where ``float32`` was upcast to ``float64``
+  unnecessarily. :pre:`15094` by `Andreas Müller`_.
+
 - |API| The following utils have been deprecated and are now private:
   - ``choose_check_classifiers_labels``
   - ``enforce_estimator_tags_y``

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -42,7 +42,8 @@
     _num_samples,
     check_scalar,
     _check_sample_weight,
-    _allclose_dense_sparse)
+    _allclose_dense_sparse,
+    FLOAT_DTYPES)
 import sklearn
 
 from sklearn.exceptions import NotFittedError
@@ -351,6 +352,45 @@ def test_check_array_pandas_dtype_object_conversion():
     assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
 
 
+def test_check_array_pandas_dtype_casting():
+    # test that data-frames with homogeneous dtype are not upcast
+    pd = pytest.importorskip('pandas')
+    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
+    X_df = pd.DataFrame(X)
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df.iloc[:, 0] = X_df.iloc[:, 0].astype(np.float16)
+    assert_array_equal(X_df.dtypes,
+                       (np.float16, np.float32, np.float32))
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df.iloc[:, 1] = X_df.iloc[:, 1].astype(np.int16)
+    # float16, int16, float32 casts to float32
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df.iloc[:, 2] = X_df.iloc[:, 2].astype(np.float16)
+    # float16, int16, float16 casts to float32
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df = X_df.astype(np.int16)
+    assert check_array(X_df).dtype == np.int16
+    # we're not using upcasting rules for determining
+    # the target type yet, so we cast to the default of float64
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64
+
+    # check that we handle pandas dtypes in a semi-reasonable way
+    # this is actually tricky because we can't really know that this
+    # should be integer ahead of converting it.
+    cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])])
+    assert (check_array(cat_df).dtype == np.int64)
+    assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype
+            == np.float64)
+
+
 def test_check_array_on_mock_dataframe():
     arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
     mock_df = MockDataFrame(arr)

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -453,6 +453,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     dtypes_orig = None
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
         dtypes_orig = np.array(array.dtypes)
+        if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
+            dtype_orig = np.result_type(*array.dtypes)
 
     if dtype_numeric:
         if dtype_orig is not None and dtype_orig.kind == "O":