MRG respect dtypes in pandas dataframes if homogeneous by amueller · Pull Request #15094 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

MRG respect dtypes in pandas dataframes if homogeneous #15094

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
4 changes: 4 additions & 0 deletions doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,10 @@ Changelog
NaN to integer.
:pr:`14872` by `Roman Yurchak`_.

- |Fix| :func:`utils.check_array` will now correctly detect numeric dtypes in
pandas dataframes, fixing a bug where ``float32`` was upcast to ``float64``
unnecessarily. :pr:`15094` by `Andreas Müller`_.

- |API| The following utils have been deprecated and are now private:
- ``choose_check_classifiers_labels``
- ``enforce_estimator_tags_y``
Expand Down
42 changes: 41 additions & 1 deletion sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
_num_samples,
check_scalar,
_check_sample_weight,
_allclose_dense_sparse)
_allclose_dense_sparse,
FLOAT_DTYPES)
import sklearn

from sklearn.exceptions import NotFittedError
Expand Down Expand Up @@ -351,6 +352,45 @@ def test_check_array_pandas_dtype_object_conversion():
assert check_array(X_df, ensure_2d=False).dtype.kind == "f"


def test_check_array_pandas_dtype_casting():
    """Check the dtype ``check_array`` returns for pandas DataFrames.

    Covers a homogeneous float32 frame, frames mixing float16/int16/float32
    columns, a homogeneous int16 frame, and a frame built from a pandas
    Categorical, each with the default ``dtype`` and with
    ``dtype=FLOAT_DTYPES``.
    """
    # test that data-frames with homogeneous dtype are not upcast
    pd = pytest.importorskip('pandas')
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
    X_df = pd.DataFrame(X)
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    # Columns are re-typed with ``astype({label: dtype})`` instead of
    # ``X_df.iloc[:, i] = ...``: whether a full-column ``iloc`` assignment
    # changes the column dtype depends on the pandas version (newer
    # versions write in place and keep the old dtype), which would make the
    # dtype assertions below fail for reasons unrelated to check_array.
    X_df = X_df.astype({0: np.float16})
    assert_array_equal(X_df.dtypes,
                       (np.float16, np.float32, np.float32))
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df = X_df.astype({1: np.int16})
    # float16, int16, float32 casts to float32
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df = X_df.astype({2: np.float16})
    # float16, int16, float16 casts to float32
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df = X_df.astype(np.int16)
    assert check_array(X_df).dtype == np.int16
    # we're not using upcasting rules for determining
    # the target type yet, so we cast to the default of float64
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64

    # check that we handle pandas dtypes in a semi-reasonable way
    # this is actually tricky because we can't really know that this
    # should be integer ahead of converting it.
    # NOTE(review): wrapping the Categorical in a list may not yield a
    # categorical-dtype column -- confirm this exercises the intended path.
    cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])])
    assert (check_array(cat_df).dtype == np.int64)
    assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype
            == np.float64)


def test_check_array_on_mock_dataframe():
arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
mock_df = MockDataFrame(arr)
Expand Down
2 changes: 2 additions & 0 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
dtypes_orig = None
if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
dtypes_orig = np.array(array.dtypes)
if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
dtype_orig = np.result_type(*array.dtypes)

if dtype_numeric:
if dtype_orig is not None and dtype_orig.kind == "O":
Expand Down
0