From 04c4c7b586fd9e01d5b58c3794cf7b9133ffb239 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 10 Mar 2023 12:08:27 -0500 Subject: [PATCH 1/8] FIX Fixes pandas extension arrays in check_array --- doc/whats_new/v1.3.rst | 3 +++ sklearn/preprocessing/tests/test_label.py | 6 ++++-- sklearn/utils/validation.py | 13 ++++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 1f5f14ce77800..37c09f1a30d07 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -238,6 +238,9 @@ Changelog and the `values` key will be removed in 1.5. :pr:`21809` and :pr:`25732` by `Thomas Fan`_. +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`xxxxx` by `Thomas Fan`_. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index b90218a97016b..6fd068914350f 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -118,15 +118,17 @@ def test_label_binarizer_set_label_encoding(): @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_label_binarizer_pandas_nullable(dtype): +@pytest.mark.parametrize("unique_first", [True, False]) +def test_label_binarizer_pandas_nullable(dtype, unique_first): """Checks that LabelBinarizer works with pandas nullable dtypes. Non-regression test for gh-25637. """ pd = pytest.importorskip("pandas") - from sklearn.preprocessing import LabelBinarizer y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + if unique_first: + y_true = y_true.unique() lb = LabelBinarizer().fit(y_true) y_out = lb.transform([1, 0]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index eb56caa5e65e4..2d5f323195e97 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -626,6 +626,15 @@ def _pandas_dtype_needs_early_conversion(pd_dtype): return False +def _is_extension_array_dtype(array): + try: + from pandas.api.types import is_extension_array_dtype + + return is_extension_array_dtype(array) + except ImportError: + return False + + def check_array( array, accept_sparse=False, @@ -777,7 +786,9 @@ def check_array( if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) - elif hasattr(array, "iloc") and hasattr(array, "dtype"): + elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr( + array, "dtype" + ): # array is a pandas series pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype) if isinstance(array.dtype, np.dtype): From a8b7c29741a615e88cf5b01acd36ee0831bc3528 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 10 Mar 2023 12:17:19 -0500 Subject: [PATCH 2/8] DOC Adds PR number --- doc/whats_new/v1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 37c09f1a30d07..b3d5942624ac9 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -239,7 +239,7 @@ Changelog :pr:`21809` and :pr:`25732` by `Thomas Fan`_. - |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas - extension arrays. :pr:`xxxxx` by `Thomas Fan`_. + extension arrays. :pr:`25813` by `Thomas Fan`_. :mod:`sklearn.linear_model` ........................... From 8f2b8359f32681d29fdc96b2872ef0922ef28552 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 10 Mar 2023 12:28:30 -0500 Subject: [PATCH 3/8] DOC Adds PR number --- doc/whats_new/v1.3.rst | 3 +++ sklearn/utils/tests/test_validation.py | 26 ++++++++++++++++++++++++++ sklearn/utils/validation.py | 3 +++ 3 files changed, 32 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index b3d5942624ac9..dd743b9453261 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -353,6 +353,9 @@ Changelog :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. +- |Fix| :func:`utils.validation.check_array` now suports extension arrays with object + dtypes by return an ndarray with object dtype. :pr:`xxxxx` by `Thomas Fan`_. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5c9379facf366..19b610c1ec78e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1776,3 +1776,29 @@ def test_check_array_array_api_has_non_finite(array_namespace): with config_context(array_api_dispatch=True): with pytest.raises(ValueError, match="infinity or a value too large"): check_array(X_inf) + + +@pytest.mark.parametrize( + "extension_dtype, regular_dtype", + [("boolean", "bool"), ("Int64", "int64"), ("Float64", "float64")], +) +@pytest.mark.parametrize("include_object", [True, False]) +def test_check_array_multiple_extensions( + extension_dtype, regular_dtype, include_object +): + """Check pandas extension arrays give the same result as non-extension arrays.""" + pd = pytest.importorskip("pandas") + X_regular = pd.DataFrame( + { + "a": pd.Series([1, 0, 1, 0], dtype=regular_dtype), + "c": pd.Series([9, 8, 7, 6], dtype="int64"), + } + ) + if include_object: + X_regular["b"] = pd.Series(["a", "b", "c", "d"], dtype="object") + + X_extension = X_regular.assign(a=X_regular["a"].astype(extension_dtype)) + + X_regular_checked = check_array(X_regular, dtype=None) + X_extension_checked = check_array(X_extension, dtype=None) + assert_array_equal(X_regular_checked, X_extension_checked) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2d5f323195e97..e8978a086d001 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -785,6 +785,9 @@ def check_array( ) if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) + elif pandas_requires_conversion and any(d == object for d in dtypes_orig): + # Force object if any of the dtypes is an object + dtype_orig = object elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr( array, "dtype" From 457362e17f5a54c6dcf91db2931bd49417507f9c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 13 Mar 2023 12:29:25 -0400 Subject: [PATCH 4/8] REV Remove unneeded changes --- doc/whats_new/v1.3.rst | 3 --- sklearn/utils/tests/test_validation.py | 26 -------------------------- sklearn/utils/validation.py | 16 +--------------- 3 files changed, 1 insertion(+), 44 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 5af9ebb175cc1..6ae2df9ea1b2c 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -359,9 +359,6 @@ Changelog :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. -- |Fix| :func:`utils.validation.check_array` now suports extension arrays with object - dtypes by return an ndarray with object dtype. :pr:`xxxxx` by `Thomas Fan`_. - :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 19b610c1ec78e..5c9379facf366 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1776,29 +1776,3 @@ def test_check_array_array_api_has_non_finite(array_namespace): with config_context(array_api_dispatch=True): with pytest.raises(ValueError, match="infinity or a value too large"): check_array(X_inf) - - -@pytest.mark.parametrize( - "extension_dtype, regular_dtype", - [("boolean", "bool"), ("Int64", "int64"), ("Float64", "float64")], -) -@pytest.mark.parametrize("include_object", [True, False]) -def test_check_array_multiple_extensions( - extension_dtype, regular_dtype, include_object -): - """Check pandas extension arrays give the same result as non-extension arrays.""" - pd = pytest.importorskip("pandas") - X_regular = pd.DataFrame( - { - "a": pd.Series([1, 0, 1, 0], dtype=regular_dtype), - "c": pd.Series([9, 8, 7, 6], dtype="int64"), - } - ) - if include_object: - X_regular["b"] = pd.Series(["a", "b", "c", "d"], dtype="object") - - X_extension = X_regular.assign(a=X_regular["a"].astype(extension_dtype)) - - X_regular_checked = check_array(X_regular, dtype=None) - X_extension_checked = check_array(X_extension, dtype=None) - assert_array_equal(X_regular_checked, X_extension_checked) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index e8978a086d001..eb56caa5e65e4 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -626,15 +626,6 @@ def _pandas_dtype_needs_early_conversion(pd_dtype): return False -def _is_extension_array_dtype(array): - try: - from pandas.api.types import is_extension_array_dtype - - return is_extension_array_dtype(array) - except ImportError: - return False - - def check_array( array, accept_sparse=False, @@ -785,13 +776,8 @@ def check_array( ) if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) - elif pandas_requires_conversion and any(d == object for d in dtypes_orig): - # Force object if any of the dtypes is an object - dtype_orig = object - elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr( - array, "dtype" - ): + elif hasattr(array, "iloc") and hasattr(array, "dtype"): # array is a pandas series pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype) if isinstance(array.dtype, np.dtype): From 04a6e15e8b279c431435c7d6a8b4feafedce8857 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 13 Mar 2023 12:32:13 -0400 Subject: [PATCH 5/8] ENH Adds back required changes --- sklearn/preprocessing/tests/test_label.py | 2 ++ sklearn/utils/validation.py | 13 ++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 6fd068914350f..d8566c85e7b73 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -128,6 +128,8 @@ def test_label_binarizer_pandas_nullable(dtype, unique_first): y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) if unique_first: + # Calling unique creates a pandas array which has a different interface + # compared to a pandas Series. Specifically, pandas arrays do not have "iloc". y_true = y_true.unique() lb = LabelBinarizer().fit(y_true) y_out = lb.transform([1, 0]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index eb56caa5e65e4..2d5f323195e97 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -626,6 +626,15 @@ def _pandas_dtype_needs_early_conversion(pd_dtype): return False +def _is_extension_array_dtype(array): + try: + from pandas.api.types import is_extension_array_dtype + + return is_extension_array_dtype(array) + except ImportError: + return False + + def check_array( array, accept_sparse=False, @@ -777,7 +786,9 @@ def check_array( if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) - elif hasattr(array, "iloc") and hasattr(array, "dtype"): + elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr( + array, "dtype" + ): # array is a pandas series pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype) if isinstance(array.dtype, np.dtype): From 71480ced54eb2d839f7e870e40b57143eff58e6f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 21 Mar 2023 14:07:25 -0400 Subject: [PATCH 6/8] CLN Address comments --- doc/whats_new/v1.3.rst | 6 +++--- sklearn/utils/tests/test_validation.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 2a9884badd952..808ef6afafd66 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -262,9 +262,6 @@ Changelog and the `values` key will be removed in 1.5. :pr:`21809` and :pr:`25732` by `Thomas Fan`_. -- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas - extension arrays. :pr:`25813` by `Thomas Fan`_. - :mod:`sklearn.linear_model` ........................... @@ -405,6 +402,9 @@ Changelog :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`25813` by `Thomas Fan`_. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5c9379facf366..3a5416ab2e046 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1762,6 +1762,17 @@ def test_boolean_series_remains_boolean(): assert_array_equal(res, expected) +def test_pandas_array_returns_ndarray(): + """Check pandas array works and returns a ndarray. + + Non-regression test for gh-25637. + """ + pd = importorskip("pandas") + input_values = [0, 1, 0, 1, 0, 1] + result = check_array(pd.array(input_values, dtype="Int64"), ensure_2d=False) + assert_array_equal(result, input_values) + + @pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"]) def test_check_array_array_api_has_non_finite(array_namespace): """Checks that Array API arrays checks non-finite correctly.""" From 2d32dbdee605562edc0d8201f518ca94f4554d50 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 21 Mar 2023 14:28:06 -0400 Subject: [PATCH 7/8] TST Adds test for check_array directly --- sklearn/utils/tests/test_validation.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 3a5416ab2e046..d3319ecfc924b 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1762,15 +1762,23 @@ def test_boolean_series_remains_boolean(): assert_array_equal(res, expected) -def test_pandas_array_returns_ndarray(): - """Check pandas array works and returns a ndarray. +@pytest.mark.parametrize("input_values", [[0, 1, 0, 1, 0, np.nan], [0, 1, 0, 1, 0, 1]]) +def test_pandas_array_returns_ndarray(input_values): + """Check pandas array with extensions dtypes returns a numeric ndarray. Non-regression test for gh-25637. """ pd = importorskip("pandas") - input_values = [0, 1, 0, 1, 0, 1] - result = check_array(pd.array(input_values, dtype="Int64"), ensure_2d=False) - assert_array_equal(result, input_values) + input_series = pd.array(input_values, dtype="Int32") + result = check_array( + input_series, + dtype=None, + ensure_2d=False, + allow_nd=False, + force_all_finite=False, + ) + assert result.dtype.kind == "f" + assert_allclose(result, input_values) @pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"]) From 0eff901f58fe64ba540aaef5002e8e4c088a9cea Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 21 Mar 2023 14:29:39 -0400 Subject: [PATCH 8/8] TST Use issubdtype --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index d3319ecfc924b..028a5ee3042bf 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1777,7 +1777,7 @@ def test_pandas_array_returns_ndarray(input_values): allow_nd=False, force_all_finite=False, ) - assert result.dtype.kind == "f" + assert np.issubdtype(result.dtype.kind, np.floating) assert_allclose(result, input_values)