FIX _safe_indexing for pyarrow (#31040) · scikit-learn/scikit-learn@81bb708 · GitHub
[go: up one dir, main page]

Skip to content

Commit 81bb708

Browse files
authored
FIX _safe_indexing for pyarrow (#31040)
1 parent 7a88bf1 commit 81bb708

File tree

6 files changed

+133
-17
lines changed

6 files changed

+133
-17
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- The private helper function :func:`utils._safe_indexing` now officially supports
2+
pyarrow data. For instance, passing a pyarrow `Table` as `X` in a
3+
:class:`compose.ColumnTransformer` is now possible.
4+
By :user:`Christian Lorentzen <lorentzenchr>`

sklearn/utils/_indexing.py

+72-6
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
_is_arraylike_not_scalar,
1919
_is_pandas_df,
2020
_is_polars_df_or_series,
21+
_is_pyarrow_data,
2122
_use_interchange_protocol,
2223
check_array,
2324
check_consistent_length,
@@ -65,7 +66,7 @@ def _list_indexing(X, key, key_dtype):
6566

6667

6768
def _polars_indexing(X, key, key_dtype, axis):
68-
"""Indexing X with polars interchange protocol."""
69+
"""Index a polars dataframe or series."""
6970
# Polars behavior is more consistent with lists
7071
if isinstance(key, np.ndarray):
7172
# Convert each element of the array to a Python scalar
@@ -93,6 +94,55 @@ def _polars_indexing(X, key, key_dtype, axis):
9394
return X_indexed
9495

9596

97+
def _pyarrow_indexing(X, key, key_dtype, axis):
98+
"""Index a pyarrow data."""
99+
scalar_key = np.isscalar(key)
100+
if isinstance(key, slice):
101+
if isinstance(key.stop, str):
102+
start = X.column_names.index(key.start)
103+
stop = X.column_names.index(key.stop) + 1
104+
else:
105+
start = 0 if not key.start else key.start
106+
stop = key.stop
107+
step = 1 if not key.step else key.step
108+
key = list(range(start, stop, step))
109+
110+
if axis == 1:
111+
# Here we are certain that X is a pyarrow Table or RecordBatch.
112+
if key_dtype == "int" and not isinstance(key, list):
113+
# pyarrow's X.select behavior is more consistent with integer lists.
114+
key = np.asarray(key).tolist()
115+
if key_dtype == "bool":
116+
key = np.asarray(key).nonzero()[0].tolist()
117+
118+
if scalar_key:
119+
return X.column(key)
120+
121+
return X.select(key)
122+
123+
# axis == 0 from here on
124+
if scalar_key:
125+
if hasattr(X, "shape"):
126+
# X is a Table or RecordBatch
127+
key = [key]
128+
else:
129+
return X[key].as_py()
130+
elif not isinstance(key, list):
131+
key = np.asarray(key)
132+
133+
if key_dtype == "bool":
134+
X_indexed = X.filter(key)
135+
else:
136+
X_indexed = X.take(key)
137+
138+
if scalar_key and len(getattr(X, "shape", [0])) == 2:
139+
# X_indexed is a dataframe-like with a single row; we return a Series to be
140+
# consistent with pandas
141+
pa = sys.modules["pyarrow"]
142+
return pa.array(X_indexed.to_pylist()[0].values())
143+
return X_indexed
144+
145+
96146
def _determine_key_type(key, accept_slice=True):
97147
"""Determine the data type of key.
98148
@@ -245,11 +295,11 @@ def _safe_indexing(X, indices, *, axis=0):
245295
if axis == 1 and isinstance(X, list):
246296
raise ValueError("axis=1 is not supported for lists")
247297

248-
if axis == 1 and hasattr(X, "shape") and len(X.shape) != 2:
298+
if axis == 1 and (ndim := len(getattr(X, "shape", [0]))) != 2:
249299
raise ValueError(
250300
"'X' should be a 2D NumPy array, 2D sparse matrix or "
251301
"dataframe when indexing the columns (i.e. 'axis=1'). "
252-
"Got {} instead with {} dimension(s).".format(type(X), len(X.shape))
302+
f"Got {type(X)} instead with {ndim} dimension(s)."
253303
)
254304

255305
if (
@@ -262,12 +312,28 @@ def _safe_indexing(X, indices, *, axis=0):
262312
)
263313

264314
if hasattr(X, "iloc"):
265-
# TODO: we should probably use _is_pandas_df_or_series(X) instead but this
266-
# would require updating some tests such as test_train_test_split_mock_pandas.
315+
# TODO: we should probably use _is_pandas_df_or_series(X) instead but:
316+
# 1) Currently, it (probably) works for dataframes compliant to pandas' API.
317+
# 2) Updating would require updating some tests such as
318+
# test_train_test_split_mock_pandas.
267319
return _pandas_indexing(X, indices, indices_dtype, axis=axis)
268320
elif _is_polars_df_or_series(X):
269321
return _polars_indexing(X, indices, indices_dtype, axis=axis)
270-
elif hasattr(X, "shape"):
322+
elif _is_pyarrow_data(X):
323+
return _pyarrow_indexing(X, indices, indices_dtype, axis=axis)
324+
elif _use_interchange_protocol(X): # pragma: no cover
325+
# Once the dataframe X is converted into its dataframe interchange protocol
326+
# version by calling X.__dataframe__(), it becomes very hard to turn it back
327+
# into its original type, e.g., a pyarrow.Table, see
328+
# https://github.com/data-apis/dataframe-api/issues/85.
329+
raise warnings.warn(
330+
message="A data object with support for the dataframe interchange protocol"
331+
"was passed, but scikit-learn does currently not know how to handle this "
332+
"kind of data. Some array/list indexing will be tried.",
333+
category=UserWarning,
334+
)
335+
336+
if hasattr(X, "shape"):
271337
return _array_indexing(X, indices, indices_dtype, axis=axis)
272338
else:
273339
return _list_indexing(X, indices, indices_dtype)

sklearn/utils/_testing.py

+4
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,7 @@ def _convert_container(
10211021
elif constructor_name == "pyarrow":
10221022
pa = pytest.importorskip("pyarrow", minversion=minversion)
10231023
array = np.asarray(container)
1024+
array = array[:, None] if array.ndim == 1 else array
10241025
if columns_name is None:
10251026
columns_name = [f"col{i}" for i in range(array.shape[1])]
10261027
data = {name: array[:, i] for i, name in enumerate(columns_name)}
@@ -1042,6 +1043,9 @@ def _convert_container(
10421043
elif constructor_name == "series":
10431044
pd = pytest.importorskip("pandas", minversion=minversion)
10441045
return pd.Series(container, dtype=dtype)
1046+
elif constructor_name == "pyarrow_array":
1047+
pa = pytest.importorskip("pyarrow", minversion=minversion)
1048+
return pa.array(container)
10451049
elif constructor_name == "polars_series":
10461050
pl = pytest.importorskip("polars", minversion=minversion)
10471051
return pl.Series(values=container)

sklearn/utils/tests/test_indexing.py

+29-10
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def test_determine_key_type_array_api(array_namespace, device, dtype_name):
134134

135135

136136
@pytest.mark.parametrize(
137-
"array_type", ["list", "array", "sparse", "dataframe", "polars"]
137+
"array_type", ["list", "array", "sparse", "dataframe", "polars", "pyarrow"]
138138
)
139139
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
140140
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
@@ -149,7 +149,9 @@ def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
149149
)
150150

151151

152-
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
152+
@pytest.mark.parametrize(
153+
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
154+
)
153155
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
154156
def test_safe_indexing_1d_container(array_type, indices_type):
155157
indices = [1, 2]
@@ -161,7 +163,9 @@ def test_safe_indexing_1d_container(array_type, indices_type):
161163
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
162164

163165

164-
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
166+
@pytest.mark.parametrize(
167+
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
168+
)
165169
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
166170
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
167171
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
@@ -177,7 +181,7 @@ def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
177181
)
178182
indices_converted = _convert_container(indices_converted, indices_type)
179183

180-
if isinstance(indices[0], str) and array_type not in ("dataframe", "polars"):
184+
if isinstance(indices[0], str) and array_type in ("array", "sparse"):
181185
err_msg = (
182186
"Specifying the columns using strings is only supported for dataframes"
183187
)
@@ -192,7 +196,9 @@ def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
192196

193197
@pytest.mark.parametrize("array_read_only", [True, False])
194198
@pytest.mark.parametrize("indices_read_only", [True, False])
195-
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
199+
@pytest.mark.parametrize(
200+
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
201+
)
196202
@pytest.mark.parametrize("indices_type", ["array", "series"])
197203
@pytest.mark.parametrize(
198204
"axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
@@ -212,7 +218,9 @@ def test_safe_indexing_2d_read_only_axis_1(
212218
assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))
213219

214220

215-
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
221+
@pytest.mark.parametrize(
222+
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
223+
)
216224
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
217225
def test_safe_indexing_1d_container_mask(array_type, indices_type):
218226
indices = [False] + [True] * 2 + [False] * 6
@@ -222,7 +230,9 @@ def test_safe_indexing_1d_container_mask(array_type, indices_type):
222230
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
223231

224232

225-
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
233+
@pytest.mark.parametrize(
234+
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
235+
)
226236
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
227237
@pytest.mark.parametrize(
228238
"axis, expected_subset",
@@ -250,6 +260,7 @@ def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
250260
("sparse", "sparse"),
251261
("dataframe", "series"),
252262
("polars", "polars_series"),
263+
("pyarrow", "pyarrow_array"),
253264
],
254265
)
255266
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
@@ -260,7 +271,9 @@ def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
260271
assert_allclose_dense_sparse(subset, expected_array)
261272

262273

263-
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
274+
@pytest.mark.parametrize(
275+
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
276+
)
264277
def test_safe_indexing_1d_scalar(array_type):
265278
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
266279
indices = 2
@@ -275,6 +288,7 @@ def test_safe_indexing_1d_scalar(array_type):
275288
("sparse", "sparse"),
276289
("dataframe", "series"),
277290
("polars", "polars_series"),
291+
("pyarrow", "pyarrow_array"),
278292
],
279293
)
280294
@pytest.mark.parametrize("indices", [2, "col_2"])
@@ -284,7 +298,7 @@ def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indice
284298
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
285299
)
286300

287-
if isinstance(indices, str) and array_type not in ("dataframe", "polars"):
301+
if isinstance(indices, str) and array_type in ("array", "sparse"):
288302
err_msg = (
289303
"Specifying the columns using strings is only supported for dataframes"
290304
)
@@ -321,7 +335,9 @@ def test_safe_indexing_error_axis(axis):
321335
_safe_indexing(X_toy, [0, 1], axis=axis)
322336

323337

324-
@pytest.mark.parametrize("X_constructor", ["array", "series", "polars_series"])
338+
@pytest.mark.parametrize(
339+
"X_constructor", ["array", "series", "polars_series", "pyarrow_array"]
340+
)
325341
def test_safe_indexing_1d_array_error(X_constructor):
326342
# check that we are raising an error if the array-like passed is 1D and
327343
# we try to index on the 2nd dimension
@@ -334,6 +350,9 @@ def test_safe_indexing_1d_array_error(X_constructor):
334350
elif X_constructor == "polars_series":
335351
pl = pytest.importorskip("polars")
336352
X_constructor = pl.Series(values=X)
353+
elif X_constructor == "pyarrow_array":
354+
pa = pytest.importorskip("pyarrow")
355+
X_constructor = pa.array(X)
337356

338357
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or dataframe"
339358
with pytest.raises(ValueError, match=err_msg):

sklearn/utils/tests/test_testing.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,10 @@ def test_create_memmap_backed_data(monkeypatch):
896896
("dataframe", lambda: pytest.importorskip("pandas").DataFrame),
897897
("series", lambda: pytest.importorskip("pandas").Series),
898898
("index", lambda: pytest.importorskip("pandas").Index),
899+
("pyarrow", lambda: pytest.importorskip("pyarrow").Table),
900+
("pyarrow_array", lambda: pytest.importorskip("pyarrow").Array),
901+
("polars", lambda: pytest.importorskip("polars").DataFrame),
902+
("polars_series", lambda: pytest.importorskip("polars").Series),
899903
("slice", slice),
900904
],
901905
)
@@ -916,7 +920,15 @@ def test_convert_container(
916920
):
917921
"""Check that we convert the container to the right type of array with the
918922
right data type."""
919-
if constructor_name in ("dataframe", "polars", "series", "polars_series", "index"):
923+
if constructor_name in (
924+
"dataframe",
925+
"index",
926+
"polars",
927+
"polars_series",
928+
"pyarrow",
929+
"pyarrow_array",
930+
"series",
931+
):
920932
# delay the import of pandas/polars within the function to only skip this test
921933
# instead of the whole file
922934
container_type = container_type()
@@ -933,6 +945,8 @@ def test_convert_container(
933945
# list and tuple will use Python class dtype: int, float
934946
# pandas index will always use high precision: np.int64 and np.float64
935947
assert np.issubdtype(type(container_converted[0]), superdtype)
948+
elif constructor_name in ("polars", "polars_series", "pyarrow", "pyarrow_array"):
949+
return
936950
elif hasattr(container_converted, "dtype"):
937951
assert container_converted.dtype == dtype
938952
elif hasattr(container_converted, "dtypes"):

sklearn/utils/validation.py

+9
Original file line numberDiff line numberDiff line change
@@ -2348,6 +2348,15 @@ def _is_pandas_df(X):
23482348
return isinstance(X, pd.DataFrame)
23492349

23502350

2351+
def _is_pyarrow_data(X):
2352+
"""Return True if the X is a pyarrow Table, RecordBatch, Array or ChunkedArray."""
2353+
try:
2354+
pa = sys.modules["pyarrow"]
2355+
except KeyError:
2356+
return False
2357+
return isinstance(X, (pa.Table, pa.RecordBatch, pa.Array, pa.ChunkedArray))
2358+
2359+
23512360
def _is_polars_df_or_series(X):
23522361
"""Return True if the X is a polars dataframe or series."""
23532362
try:

0 commit comments

Comments
 (0)
0