From 04c4c7b586fd9e01d5b58c3794cf7b9133ffb239 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 10 Mar 2023 12:08:27 -0500
Subject: [PATCH 1/8] FIX Fixes pandas extension arrays in check_array

---
 doc/whats_new/v1.3.rst                    |  3 +++
 sklearn/preprocessing/tests/test_label.py |  6 ++++--
 sklearn/utils/validation.py               | 13 ++++++++++++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 1f5f14ce77800..37c09f1a30d07 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -238,6 +238,9 @@ Changelog
   and the `values` key will be removed in 1.5.
   :pr:`21809` and :pr:`25732` by `Thomas Fan`_.
 
+- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas
+  extension arrays. :pr:`xxxxx` by `Thomas Fan`_.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index b90218a97016b..6fd068914350f 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -118,15 +118,17 @@ def test_label_binarizer_set_label_encoding():
 
 
 @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
-def test_label_binarizer_pandas_nullable(dtype):
+@pytest.mark.parametrize("unique_first", [True, False])
+def test_label_binarizer_pandas_nullable(dtype, unique_first):
     """Checks that LabelBinarizer works with pandas nullable dtypes.
 
     Non-regression test for gh-25637.
     """
     pd = pytest.importorskip("pandas")
-    from sklearn.preprocessing import LabelBinarizer
 
     y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
+    if unique_first:
+        y_true = y_true.unique()
     lb = LabelBinarizer().fit(y_true)
     y_out = lb.transform([1, 0])
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index eb56caa5e65e4..2d5f323195e97 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -626,6 +626,15 @@ def _pandas_dtype_needs_early_conversion(pd_dtype):
     return False
 
 
+def _is_extension_array_dtype(array):
+    try:
+        from pandas.api.types import is_extension_array_dtype
+
+        return is_extension_array_dtype(array)
+    except ImportError:
+        return False
+
+
 def check_array(
     array,
     accept_sparse=False,
@@ -777,7 +786,9 @@ def check_array(
         if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
 
-    elif hasattr(array, "iloc") and hasattr(array, "dtype"):
+    elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
+        array, "dtype"
+    ):
         # array is a pandas series
         pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
         if isinstance(array.dtype, np.dtype):

From a8b7c29741a615e88cf5b01acd36ee0831bc3528 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 10 Mar 2023 12:17:19 -0500
Subject: [PATCH 2/8] DOC Adds PR number

---
 doc/whats_new/v1.3.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 37c09f1a30d07..b3d5942624ac9 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -239,7 +239,7 @@ Changelog
   :pr:`21809` and :pr:`25732` by `Thomas Fan`_.
 
 - |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas
-  extension arrays. :pr:`xxxxx` by `Thomas Fan`_.
+  extension arrays. :pr:`25813` by `Thomas Fan`_.
 
 :mod:`sklearn.linear_model`
 ...........................

From 8f2b8359f32681d29fdc96b2872ef0922ef28552 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 10 Mar 2023 12:28:30 -0500
Subject: [PATCH 3/8] DOC Adds PR number

---
 doc/whats_new/v1.3.rst                 |  3 +++
 sklearn/utils/tests/test_validation.py | 26 ++++++++++++++++++++++++++
 sklearn/utils/validation.py            |  3 +++
 3 files changed, 32 insertions(+)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index b3d5942624ac9..dd743b9453261 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -353,6 +353,9 @@ Changelog
   :pr:`25733` by :user:`Brigitta Sipőcz <bsipocz>` and
   :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Fix| :func:`utils.validation.check_array` now suports extension arrays with object
+  dtypes by return an ndarray with object dtype. :pr:`xxxxx` by `Thomas Fan`_.
+
 :mod:`sklearn.semi_supervised`
 ..............................
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 5c9379facf366..19b610c1ec78e 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1776,3 +1776,29 @@ def test_check_array_array_api_has_non_finite(array_namespace):
     with config_context(array_api_dispatch=True):
         with pytest.raises(ValueError, match="infinity or a value too large"):
             check_array(X_inf)
+
+
+@pytest.mark.parametrize(
+    "extension_dtype, regular_dtype",
+    [("boolean", "bool"), ("Int64", "int64"), ("Float64", "float64")],
+)
+@pytest.mark.parametrize("include_object", [True, False])
+def test_check_array_multiple_extensions(
+    extension_dtype, regular_dtype, include_object
+):
+    """Check pandas extension arrays give the same result as non-extension arrays."""
+    pd = pytest.importorskip("pandas")
+    X_regular = pd.DataFrame(
+        {
+            "a": pd.Series([1, 0, 1, 0], dtype=regular_dtype),
+            "c": pd.Series([9, 8, 7, 6], dtype="int64"),
+        }
+    )
+    if include_object:
+        X_regular["b"] = pd.Series(["a", "b", "c", "d"], dtype="object")
+
+    X_extension = X_regular.assign(a=X_regular["a"].astype(extension_dtype))
+
+    X_regular_checked = check_array(X_regular, dtype=None)
+    X_extension_checked = check_array(X_extension, dtype=None)
+    assert_array_equal(X_regular_checked, X_extension_checked)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 2d5f323195e97..e8978a086d001 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -785,6 +785,9 @@ def check_array(
         )
         if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
+        elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
+            # Force object if any of the dtypes is an object
+            dtype_orig = object
 
     elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
         array, "dtype"

From 457362e17f5a54c6dcf91db2931bd49417507f9c Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 13 Mar 2023 12:29:25 -0400
Subject: [PATCH 4/8] REV Remove unneeded changes

---
 doc/whats_new/v1.3.rst                 |  3 ---
 sklearn/utils/tests/test_validation.py | 26 --------------------------
 sklearn/utils/validation.py            | 16 +---------------
 3 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 5af9ebb175cc1..6ae2df9ea1b2c 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -359,9 +359,6 @@ Changelog
   :pr:`25733` by :user:`Brigitta Sipőcz <bsipocz>` and
   :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
-- |Fix| :func:`utils.validation.check_array` now suports extension arrays with object
-  dtypes by return an ndarray with object dtype. :pr:`xxxxx` by `Thomas Fan`_.
-
 :mod:`sklearn.semi_supervised`
 ..............................
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 19b610c1ec78e..5c9379facf366 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1776,29 +1776,3 @@ def test_check_array_array_api_has_non_finite(array_namespace):
     with config_context(array_api_dispatch=True):
         with pytest.raises(ValueError, match="infinity or a value too large"):
             check_array(X_inf)
-
-
-@pytest.mark.parametrize(
-    "extension_dtype, regular_dtype",
-    [("boolean", "bool"), ("Int64", "int64"), ("Float64", "float64")],
-)
-@pytest.mark.parametrize("include_object", [True, False])
-def test_check_array_multiple_extensions(
-    extension_dtype, regular_dtype, include_object
-):
-    """Check pandas extension arrays give the same result as non-extension arrays."""
-    pd = pytest.importorskip("pandas")
-    X_regular = pd.DataFrame(
-        {
-            "a": pd.Series([1, 0, 1, 0], dtype=regular_dtype),
-            "c": pd.Series([9, 8, 7, 6], dtype="int64"),
-        }
-    )
-    if include_object:
-        X_regular["b"] = pd.Series(["a", "b", "c", "d"], dtype="object")
-
-    X_extension = X_regular.assign(a=X_regular["a"].astype(extension_dtype))
-
-    X_regular_checked = check_array(X_regular, dtype=None)
-    X_extension_checked = check_array(X_extension, dtype=None)
-    assert_array_equal(X_regular_checked, X_extension_checked)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index e8978a086d001..eb56caa5e65e4 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -626,15 +626,6 @@ def _pandas_dtype_needs_early_conversion(pd_dtype):
     return False
 
 
-def _is_extension_array_dtype(array):
-    try:
-        from pandas.api.types import is_extension_array_dtype
-
-        return is_extension_array_dtype(array)
-    except ImportError:
-        return False
-
-
 def check_array(
     array,
     accept_sparse=False,
@@ -785,13 +776,8 @@ def check_array(
         )
         if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
-        elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
-            # Force object if any of the dtypes is an object
-            dtype_orig = object
 
-    elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
-        array, "dtype"
-    ):
+    elif hasattr(array, "iloc") and hasattr(array, "dtype"):
         # array is a pandas series
         pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
         if isinstance(array.dtype, np.dtype):

From 04a6e15e8b279c431435c7d6a8b4feafedce8857 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 13 Mar 2023 12:32:13 -0400
Subject: [PATCH 5/8] ENH Adds back required changes

---
 sklearn/preprocessing/tests/test_label.py |  2 ++
 sklearn/utils/validation.py               | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 6fd068914350f..d8566c85e7b73 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -128,6 +128,8 @@ def test_label_binarizer_pandas_nullable(dtype, unique_first):
 
     y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
     if unique_first:
+        # Calling unique creates a pandas array which has a different interface
+        # compared to a pandas Series. Specifically, pandas arrays do not have "iloc".
         y_true = y_true.unique()
     lb = LabelBinarizer().fit(y_true)
     y_out = lb.transform([1, 0])
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index eb56caa5e65e4..2d5f323195e97 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -626,6 +626,15 @@ def _pandas_dtype_needs_early_conversion(pd_dtype):
     return False
 
 
+def _is_extension_array_dtype(array):
+    try:
+        from pandas.api.types import is_extension_array_dtype
+
+        return is_extension_array_dtype(array)
+    except ImportError:
+        return False
+
+
 def check_array(
     array,
     accept_sparse=False,
@@ -777,7 +786,9 @@ def check_array(
         if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
 
-    elif hasattr(array, "iloc") and hasattr(array, "dtype"):
+    elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
+        array, "dtype"
+    ):
         # array is a pandas series
         pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
         if isinstance(array.dtype, np.dtype):

From 71480ced54eb2d839f7e870e40b57143eff58e6f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 21 Mar 2023 14:07:25 -0400
Subject: [PATCH 6/8] CLN Address comments

---
 doc/whats_new/v1.3.rst                 |  6 +++---
 sklearn/utils/tests/test_validation.py | 11 +++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 2a9884badd952..808ef6afafd66 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -262,9 +262,6 @@ Changelog
   and the `values` key will be removed in 1.5.
   :pr:`21809` and :pr:`25732` by `Thomas Fan`_.
 
-- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas
-  extension arrays. :pr:`25813` by `Thomas Fan`_.
-
 :mod:`sklearn.linear_model`
 ...........................
 
@@ -405,6 +402,9 @@ Changelog
   :pr:`25733` by :user:`Brigitta Sipőcz <bsipocz>` and
   :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas
+  extension arrays. :pr:`25813` by `Thomas Fan`_.
+
 :mod:`sklearn.semi_supervised`
 ..............................
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 5c9379facf366..3a5416ab2e046 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1762,6 +1762,17 @@ def test_boolean_series_remains_boolean():
     assert_array_equal(res, expected)
 
 
+def test_pandas_array_returns_ndarray():
+    """Check pandas array works and returns a ndarray.
+
+    Non-regression test for gh-25637.
+    """
+    pd = importorskip("pandas")
+    input_values = [0, 1, 0, 1, 0, 1]
+    result = check_array(pd.array(input_values, dtype="Int64"), ensure_2d=False)
+    assert_array_equal(result, input_values)
+
+
 @pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"])
 def test_check_array_array_api_has_non_finite(array_namespace):
     """Checks that Array API arrays checks non-finite correctly."""

From 2d32dbdee605562edc0d8201f518ca94f4554d50 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 21 Mar 2023 14:28:06 -0400
Subject: [PATCH 7/8] TST Adds test for check_array directly

---
 sklearn/utils/tests/test_validation.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 3a5416ab2e046..d3319ecfc924b 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1762,15 +1762,23 @@ def test_boolean_series_remains_boolean():
     assert_array_equal(res, expected)
 
 
-def test_pandas_array_returns_ndarray():
-    """Check pandas array works and returns a ndarray.
+@pytest.mark.parametrize("input_values", [[0, 1, 0, 1, 0, np.nan], [0, 1, 0, 1, 0, 1]])
+def test_pandas_array_returns_ndarray(input_values):
+    """Check pandas array with extensions dtypes returns a numeric ndarray.
 
     Non-regression test for gh-25637.
     """
     pd = importorskip("pandas")
-    input_values = [0, 1, 0, 1, 0, 1]
-    result = check_array(pd.array(input_values, dtype="Int64"), ensure_2d=False)
-    assert_array_equal(result, input_values)
+    input_series = pd.array(input_values, dtype="Int32")
+    result = check_array(
+        input_series,
+        dtype=None,
+        ensure_2d=False,
+        allow_nd=False,
+        force_all_finite=False,
+    )
+    assert result.dtype.kind == "f"
+    assert_allclose(result, input_values)
 
 
 @pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"])

From 0eff901f58fe64ba540aaef5002e8e4c088a9cea Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 21 Mar 2023 14:29:39 -0400
Subject: [PATCH 8/8] TST Use issubdtype

---
 sklearn/utils/tests/test_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index d3319ecfc924b..028a5ee3042bf 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1777,7 +1777,7 @@ def test_pandas_array_returns_ndarray(input_values):
         allow_nd=False,
         force_all_finite=False,
     )
-    assert result.dtype.kind == "f"
+    assert np.issubdtype(result.dtype.kind, np.floating)
     assert_allclose(result, input_values)