From 0d85e3e8a5409ce9459512a8fb58e08ebbef15a2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 11:08:56 -0500 Subject: [PATCH 1/7] fix check_array dtype check for pandas series --- sklearn/utils/tests/test_validation.py | 6 ++++++ sklearn/utils/validation.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 997f233807d67..e6180a975a626 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -694,6 +694,12 @@ def test_suppress_validation(): assert_raises(ValueError, assert_all_finite, X) +def test_check_array_series(): + # regression test that check_array works on pandas Series + pd = importorskip("pandas") + check_array(pd.Series([1, 2, 3]), ensure_2d=False, warn_on_dtype=True) + + def test_check_dataframe_warns_on_dtype(): # Check that warn_on_dtype also works for DataFrames. # https://github.com/scikit-learn/scikit-learn/issues/10948 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 3181b925ba83a..f71bd17c72db2 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -478,7 +478,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. 
dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array, "__array__"): - dtypes_orig = np.array(array.dtypes) + dtypes_orig = np.array(array.dtypes, ndmin=1) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From 2e351652c975f096883dc908160b40f7739cf114 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 11:11:39 -0500 Subject: [PATCH 2/7] add comment explaining fix --- sklearn/utils/validation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f71bd17c72db2..5f83deaecd34d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -478,6 +478,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array, "__array__"): + # ndmin in case dtypes is a scalar (for Series) dtypes_orig = np.array(array.dtypes, ndmin=1) if dtype_numeric: From 75ed85bf7f1ba7c7f2a50c5c23951e4dbbb936d6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 15:24:53 -0500 Subject: [PATCH 3/7] don't define dtypes for series --- sklearn/utils/validation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5f83deaecd34d..ea2606fe6b6eb 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -477,9 +477,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # check if the object contains several dtypes (typically a pandas # DataFrame), and store them. If not, store None. 
dtypes_orig = None - if hasattr(array, "dtypes") and hasattr(array, "__array__"): - # ndmin in case dtypes is a scalar (for Series) - dtypes_orig = np.array(array.dtypes, ndmin=1) + if hasattr(array, "dtypes") and len(array.dtypes): + dtypes_orig = np.array(array.dtypes) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From 87cd3fa89f12d81a4428901a3faa0e773417f965 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 16:20:33 -0500 Subject: [PATCH 4/7] check output of check_array on series --- sklearn/utils/tests/test_validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index e6180a975a626..6dcbd4e0ba2f8 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -697,7 +697,8 @@ def test_suppress_validation(): def test_check_array_series(): # regression test that check_array works on pandas Series pd = importorskip("pandas") - check_array(pd.Series([1, 2, 3]), ensure_2d=False, warn_on_dtype=True) + res = check_array(pd.Series([1, 2, 3]), ensure_2d=False, warn_on_dtype=True) + assert_array_equal(res, np.array([1, 2, 3])) def test_check_dataframe_warns_on_dtype(): From 7bfaf375850dcbe298c77abcea632307333a57e4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 16:25:20 -0500 Subject: [PATCH 5/7] add whatsnew for pandas series fix, fix link to my website --- doc/whats_new/_contributors.rst | 2 +- doc/whats_new/v0.20.rst | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index 937fce63219df..270df1fb837fc 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -48,7 +48,7 @@ .. _Bertrand Thirion: https://team.inria.fr/parietal/bertrand-thirions-page -.. _Andreas Müller: https://peekaboo-vision.blogspot.com/ +.. 
_Andreas Müller: https://amueller.github.io/ .. _Matthieu Perrot: http://brainvisa.info/biblio/lnao/en/Author/PERROT-M.html diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 98328cf60457c..b38b9a8bff553 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -176,6 +176,10 @@ Changelog precision issues in :class:`preprocessing.StandardScaler` and :class:`decomposition.IncrementalPCA` when using float32 datasets. :issue:`12338` by :user:`bauks <bauks>`. + +- |Fix| Calling :func:`utils.check_array` on pandas `Series`, which + raised an error in 0.20.0, now returns the expected output again. + :issue:`12625` by `Andreas Müller`_ Miscellaneous ............. From 6734c93ac1ede0a93ad46f1d93fcffe4b58147ce Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Nov 2018 16:36:55 -0500 Subject: [PATCH 6/7] fix flake8 --- sklearn/utils/tests/test_validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 6dcbd4e0ba2f8..11128d106a50e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -697,7 +697,8 @@ def test_suppress_validation(): def test_check_array_series(): # regression test that check_array works on pandas Series pd = importorskip("pandas") - res = check_array(pd.Series([1, 2, 3]), ensure_2d=False, warn_on_dtype=True) + res = check_array(pd.Series([1, 2, 3]), ensure_2d=False, + warn_on_dtype=True) assert_array_equal(res, np.array([1, 2, 3])) From b2e0849796b73b466455d59d2ca748e638f6ca85 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 21 Nov 2018 09:22:12 +1100 Subject: [PATCH 7/7] Update v0.20.rst --- doc/whats_new/v0.20.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index b38b9a8bff553..7762a3a021d7a 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -177,7 +177,7 @@ Changelog 
:class:`decomposition.IncrementalPCA` when using float32 datasets. :issue:`12338` by :user:`bauks <bauks>`. -- |Fix| Calling :func:`utils.check_array` on pandas `Series`, which +- |Fix| Calling :func:`utils.check_array` on `pandas.Series`, which raised an error in 0.20.0, now returns the expected output again. :issue:`12625` by `Andreas Müller`_