From a87ee3ecfe1ba994a64ca0a30b140ab8687b99bd Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 25 Sep 2019 18:03:03 -0400 Subject: [PATCH 1/8] respect dtypes in pandas dataframes if homogeneous --- sklearn/utils/tests/test_validation.py | 12 +++++++++++- sklearn/utils/validation.py | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ac0e2c7cbf431..1671a12f820f9 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,7 +42,8 @@ _num_samples, check_scalar, _check_sample_weight, - _allclose_dense_sparse) + _allclose_dense_sparse, + FLOAT_DTYPES) import sklearn from sklearn.exceptions import NotFittedError @@ -351,6 +352,15 @@ def test_check_array_pandas_dtype_object_conversion(): assert check_array(X_df, ensure_2d=False).dtype.kind == "f" +def test_check_array_pandas_homogeneous_dtype(): + # test that data-frames with homogeneous dtype are not upcast + pd = pytest.importorskip('pandas') + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32) + X_df = pd.DataFrame(X) + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + def test_check_array_on_mock_dataframe(): arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) mock_df = MockDataFrame(arr) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4b953cd135398..279e4414640b8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -453,6 +453,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): dtypes_orig = np.array(array.dtypes) + if len(np.unique(dtypes_orig)) == 1: + dtype_orig = dtypes_orig[0] if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From dc2ab9dd2a9a6cc39fcdc5a81ac8d985cbc7c013 Mon Sep 17 
00:00:00 2001 From: Andreas Mueller Date: Wed, 25 Sep 2019 18:10:17 -0400 Subject: [PATCH 2/8] upcast to smallest common type if possible --- sklearn/utils/tests/test_validation.py | 8 +++++++- sklearn/utils/validation.py | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 1671a12f820f9..7d5f311fad5ef 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -352,7 +352,7 @@ def test_check_array_pandas_dtype_object_conversion(): assert check_array(X_df, ensure_2d=False).dtype.kind == "f" -def test_check_array_pandas_homogeneous_dtype(): +def test_check_array_pandas_dtype_casting(): # test that data-frames with homogeneous dtype are not upcast pd = pytest.importorskip('pandas') X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32) @@ -360,6 +360,12 @@ def test_check_array_pandas_homogeneous_dtype(): assert check_array(X_df).dtype == np.float32 assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + X_df.iloc[:, 0] = X_df.iloc[:, 0].astype(np.float16) + assert_array_equal(X_df.dtypes, + (np.float16, np.float32, np.float32)) + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + def test_check_array_on_mock_dataframe(): arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 279e4414640b8..5fe9493ba2ad0 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -453,8 +453,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): dtypes_orig = np.array(array.dtypes) - if len(np.unique(dtypes_orig)) == 1: - dtype_orig = dtypes_orig[0] + dtype_orig = np.find_common_type(array.dtypes, []) if dtype_numeric: if dtype_orig is not None and 
dtype_orig.kind == "O": From 0ae32037f9902d195a417b943c2d11267faa02ce Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 26 Sep 2019 11:27:28 -0400 Subject: [PATCH 3/8] add whatsnew --- doc/whats_new/v0.22.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e4b9f5507d0ab..917dec03a96fb 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -579,6 +579,10 @@ Changelog NaN to integer. :pr:`14872` by `Roman Yurchak`_. +- |Fix| :func:`utils.check_array` will now correctly detect numeric dtypes in + pandas dataframes, fixing a bug where ``float32`` was upcast to ``float64`` + unnecessarily. :pr:`15094` by `Andreas Müller`_. + :mod:`sklearn.isotonic` .................................. From cf7d0be7d2ba7ebecccbe0d861a6cc43fb730aec Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 26 Sep 2019 11:49:11 -0400 Subject: [PATCH 4/8] use result_type instead of find_common_type --- sklearn/utils/tests/test_validation.py | 16 ++++++++++++++++ sklearn/utils/validation.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 7d5f311fad5ef..967a4edb268b4 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -366,6 +366,22 @@ def test_check_array_pandas_dtype_casting(): assert check_array(X_df).dtype == np.float32 assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + X_df.iloc[:, 1] = X_df.iloc[:, 1].astype(np.int16) + # float16, int16, float32 casts to float32 + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + X_df.iloc[:, 2] = X_df.iloc[:, 2].astype(np.float16) + # float16, int16, float16 casts to float32 + assert check_array(X_df).dtype == np.float32 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32 + + X_df = X_df.astype(np.int16) + assert 
check_array(X_df).dtype == np.int16 + # we're not using upcasting rules for determining + # the target type yet, so we cast to the default of float64 + assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64 + def test_check_array_on_mock_dataframe(): arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5fe9493ba2ad0..0af11c62506ec 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -453,7 +453,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): dtypes_orig = np.array(array.dtypes) - dtype_orig = np.find_common_type(array.dtypes, []) + dtype_orig = np.result_type(*array.dtypes) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From 0dc1ce80ea0b5948b5940edc85f2171d4e7994e5 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 27 Sep 2019 12:48:47 -0400 Subject: [PATCH 5/8] don't try to sniff dtypes if there's pandas dtypes around --- sklearn/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 0af11c62506ec..57f37ec038b5d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -453,7 +453,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): dtypes_orig = np.array(array.dtypes) - dtype_orig = np.result_type(*array.dtypes) + if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): + dtype_orig = np.result_type(*array.dtypes) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From a36dc1341602e55d24bfa482598c909aaf5b3322 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 27 Sep 2019 12:51:25 -0400 Subject: [PATCH 6/8] add test for integers within 
pd.Categorical --- sklearn/utils/tests/test_validation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 967a4edb268b4..fb13cec4abe8d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -382,6 +382,12 @@ def test_check_array_pandas_dtype_casting(): # the target type yet, so we cast to the default of float64 assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64 + # check that we handle pandas dtypes in a semi-reasonable way + # this is actually tricky because we can't really know that this + # should be integer ahead of converting it. + assert (check_array(pd.DataFrame(pd.Categorical([1, 2, 3]))).dtype + == np.int64) + def test_check_array_on_mock_dataframe(): arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) From 0db26893d7c9c06a9532bfba9c0a830600c7056a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 27 Sep 2019 15:04:23 -0400 Subject: [PATCH 7/8] try to appease old pandas --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index fb13cec4abe8d..54898d20ff8da 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -385,7 +385,7 @@ def test_check_array_pandas_dtype_casting(): # check that we handle pandas dtypes in a semi-reasonable way # this is actually tricky because we can't really know that this # should be integer ahead of converting it. 
- assert (check_array(pd.DataFrame(pd.Categorical([1, 2, 3]))).dtype + assert (check_array(pd.DataFrame([pd.Categorical([1, 2, 3])])).dtype == np.int64) From 8a3c932315e37dae70e1a7135f71c343a1c7fee7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 4 Oct 2019 20:11:45 +0200 Subject: [PATCH 8/8] check for casting categorical dtypes to float --- sklearn/utils/tests/test_validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 54898d20ff8da..d7da45ccd418d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -385,8 +385,10 @@ def test_check_array_pandas_dtype_casting(): # check that we handle pandas dtypes in a semi-reasonable way # this is actually tricky because we can't really know that this # should be integer ahead of converting it. - assert (check_array(pd.DataFrame([pd.Categorical([1, 2, 3])])).dtype - == np.int64) + cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])]) + assert (check_array(cat_df).dtype == np.int64) + assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype + == np.float64) def test_check_array_on_mock_dataframe():