crankycoder
diff --git a/doc/whats_new/v0.22.rst
Lines changed: 4 additions & 0 deletions b/doc/whats_new/v0.22.rst
Lines changed: 4 additions & 0 deletions
diff --git a/sklearn/utils/tests/test_validation.py
Lines changed: 41 additions & 1 deletion b/sklearn/utils/tests/test_validation.py
Lines changed: 41 additions & 1 deletion
diff --git a/sklearn/utils/validation.py
Lines changed: 2 additions & 0 deletions b/sklearn/utils/validation.py
Lines changed: 2 additions & 0 deletions
@@ -645,6 +645,10 @@ Changelog
   NaN to integer.
   :pr:`14872` by `Roman Yurchak`_.
 
+- |Fix| :func:`utils.check_array` will now correctly detect numeric dtypes in
+  pandas dataframes, fixing a bug where ``float32`` was upcast to ``float64``
+  unnecessarily. :pr:`15094` by `Andreas Müller`_.
+  
 - |API| The following utils have been deprecated and are now private:
   - ``choose_check_classifiers_labels``
   - ``enforce_estimator_tags_y``
 
@@ -43,7 +43,8 @@
     check_scalar,
     _deprecate_positional_args,
     _check_sample_weight,
-    _allclose_dense_sparse)
+    _allclose_dense_sparse,
+    FLOAT_DTYPES)
 import sklearn
 
 from sklearn.exceptions import NotFittedError
@@ -352,6 +353,45 @@ def test_check_array_pandas_dtype_object_conversion():
     assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
 
 
+def test_check_array_pandas_dtype_casting():
+    # Numeric DataFrames must not be upcast to float64 unnecessarily.
+    pd = pytest.importorskip('pandas')  # skip if pandas is unavailable
+    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
+    X_df = pd.DataFrame(X)
+    assert check_array(X_df).dtype == np.float32  # homogeneous float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df.iloc[:, 0] = X_df.iloc[:, 0].astype(np.float16)
+    assert_array_equal(X_df.dtypes,
+                       (np.float16, np.float32, np.float32))
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df.iloc[:, 1] = X_df.iloc[:, 1].astype(np.int16)
+    # float16, int16 and float32 columns are expected to yield float32
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df.iloc[:, 2] = X_df.iloc[:, 2].astype(np.float16)
+    # float16, int16 and float16 columns are expected to yield float32
+    assert check_array(X_df).dtype == np.float32
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32
+
+    X_df = X_df.astype(np.int16)
+    assert check_array(X_df).dtype == np.int16  # all-int16 frame kept
+    # check_array does not yet use upcasting rules to pick the float
+    # target type, so integer input is cast to the float64 default.
+    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64
+
+    # Pandas-specific dtypes should be handled semi-reasonably; this is
+    # tricky as we cannot know the result is integer before converting.
+    # NOTE(review): confirm this really builds a categorical column.
+    cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])])
+    assert (check_array(cat_df).dtype == np.int64)
+    assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype
+            == np.float64)
+
+
 def test_check_array_on_mock_dataframe():
     arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
     mock_df = MockDataFrame(arr)
 
@@ -454,6 +454,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     dtypes_orig = None
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
         dtypes_orig = np.array(array.dtypes)
+        if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
+            dtype_orig = np.result_type(*array.dtypes)
 
     if dtype_numeric:
         if dtype_orig is not None and dtype_orig.kind == "O":