8000 TST Create dedicated dataframe / feature count tests category by adrinjalali · Pull Request #29713 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

TST Create dedicated dataframe / feature count tests category #29713

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion doc/whats_new/v1.6.rst
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,13 @@ Changelog
calling :func:`utils.validation.check_non_negative`.
:pr:`29540` by :user:`Tamara Atanasoska <tamaraatanasoska>`.

- |FIX| :func:`utils.estimator_checks.parametrize_with_checks` and
- |Enhancement| :func:`utils.estimator_checks.parametrize_with_checks` and
:func:`utils.estimator_checks.check_estimator` now have started putting tests into
categories which can be enabled / disabled using their `dataframe` and `legacy`
parameters.
:pr:`29699`, :pr:`29713` by `Adrin Jalali`_.

- |Fix| :func:`utils.estimator_checks.parametrize_with_checks` and
:func:`utils.estimator_checks.check_estimator` now support estimators that
have `set_output` called on them.
:pr:`29869` by `Adrin Jalali`_.
Expand Down
1 change: 1 addition & 0 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,7 @@ def __sklearn_tags__(self):
"check_fit2d_predict1d": "FIXME",
"check_complex_data": "FIXME",
"check_fit2d_1feature": "FIXME",
"check_pandas_column_name_consistency": "FIXME",
}
return tags

Expand Down
8 changes: 4 additions & 4 deletions sklearn/neighbors/_lof.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
import numpy as np

from ..base import OutlierMixin, _fit_context
from ..utils import check_array
from ..utils._param_validation import Interval, StrOptions
from ..utils.metaestimators import available_if
from ..utils.validation import check_is_fitted
from ..utils.validation import check_is_fitted, validate_data
from ._base import KNeighborsMixin, NeighborsBase

__all__ = ["LocalOutlierFactor"]
Expand Down Expand Up @@ -471,13 +470,14 @@ def score_samples(self, X):
The lower, the more abnormal.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse="csr")
# not replacing X since we need to pass raw X to kneighbors
X_validated = validate_data(self, X, reset=False, accept_sparse="csr")

distances_X, neighbors_indices_X = self.kneighbors(
X, n_neighbors=self.n_neighbors_
)

if X.dtype == np.float32:
if X_validated.dtype == np.float32:
distances_X = distances_X.astype(X.dtype, copy=False)

X_lrd = self._local_reachability_density(
Expand Down
53 changes: 0 additions & 53 deletions sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import os
import pkgutil
import re
import warnings
from functools import partial
from inspect import isgenerator
from itertools import chain
Expand All @@ -18,7 +17,6 @@

import sklearn
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning

# make it possible to discover experimental estimators when calling `all_estimators`
Expand All @@ -45,7 +43,6 @@
ignore_warnings,
)
from sklearn.utils.estimator_checks import (
check_dataframe_column_names_consistency,
check_estimator,
check_get_feature_names_out_error,
check_global_output_transform_pandas,
Expand Down Expand Up @@ -242,56 +239,6 @@ def check_field_types(tags, defaults):
check_field_types(tags.transformer_tags, defaults.transformer_tags)


def _estimators_that_predict_in_fit():
    """Yield tested estimators reconfigured so that they predict during ``fit``.

    Estimators exposing ``oob_score`` or an early-stopping mechanism validate
    held-out data inside ``fit``; they are yielded with those options turned on
    so the column-name-consistency check exercises that code path.
    """
    for estimator in _tested_estimators():
        params = set(estimator.get_params())
        if "oob_score" in params:
            yield estimator.set_params(oob_score=True, bootstrap=True)
        elif "early_stopping" in params:
            configured = estimator.set_params(early_stopping=True, n_iter_no_change=1)
            if configured.__class__.__name__ not in ("MLPClassifier", "MLPRegressor"):
                yield configured
            else:
                # TODO: FIX MLP to not check validation set during MLP
                yield pytest.param(
                    configured,
                    marks=pytest.mark.xfail(msg="MLP still validates in fit"),
                )
        elif "n_iter_no_change" in params:
            yield estimator.set_params(n_iter_no_change=1)


# NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that
# delegates validation to a base estimator, the check is testing that the base estimator
# is checking for column name consistency.
column_name_estimators = [
    *_tested_estimators(),
    make_pipeline(LogisticRegression(C=1)),
    *_estimators_that_predict_in_fit(),
]


@pytest.mark.parametrize(
    "estimator", column_name_estimators, ids=_get_check_estimator_ids
)
def test_pandas_column_name_consistency(estimator):
    """Run the dataframe column-name consistency check on each tested estimator.

    Skips estimators that opt out via their ``_xfail_checks`` tag, and asserts
    that the check never triggers a "fitted without feature names" warning.
    """
    if isinstance(estimator, ColumnTransformer):
        pytest.skip("ColumnTransformer is not tested here")
    tags = get_tags(estimator)
    if "check_dataframe_column_names_consistency" in tags._xfail_checks:
        pytest.skip(
            "Estimator does not support check_dataframe_column_names_consistency"
        )
    # Silence deprecations while recording every other warning the check emits.
    with ignore_warnings(category=(FutureWarning)), warnings.catch_warnings(
        record=True
    ) as record:
        check_dataframe_column_names_consistency(
            estimator.__class__.__name__, estimator
        )
    for captured in record:
        assert "was fitted without feature names" not in str(captured.message)


# TODO: As more modules support get_feature_names_out they should be removed
# from this list to be tested
GET_FEATURES_OUT_MODULES_TO_IGNORE = [
Expand Down
2 changes: 1 addition & 1 deletion sklearn/tree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1962,5 +1962,5 @@ def __sklearn_tags__(self):
"friedman_mse",
"poisson",
}
tags.input_tags.allow_nan: allow_nan
tags.input_tags.allow_nan = allow_nan
return tags
65 changes: 65 additions & 0 deletions sklearn/utils/_test_common/instance_generator.py
F1FC
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,8 @@
# TODO(devtools): check that function names here exist in checks for the estimator
# TODO(devtools): write a test for the same thing with tags._xfail_checks
AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)},
BaggingClassifier: {"check_pandas_column_name_consistency": dict(oob_score=True)},
BaggingRegressor: {"check_pandas_column_name_consistency": dict(oob_score=True)},
BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)},
Birch: {"check_dict_unchanged": dict(n_clusters=1)},
Expand All @@ -495,11 +497,33 @@
max_iter=20, n_components=1, transform_algorithm="lasso_lars"
)
},
ExtraTreesClassifier: {
"check_pandas_column_name_consistency": dict(bootstrap=True, oob_score=True)
},
ExtraTreesRegressor: {
"check_pandas_column_name_consistency": dict(bootstrap=True, oob_score=True)
},
FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)},
GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)},
GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
GradientBoostingClassifier: {
"check_pandas_column_name_consistency": dict(n_iter_no_change=1)
},
GradientBoostingRegressor: {
"check_pandas_column_name_consistency": dict(n_iter_no_change=1)
},
HistGradientBoostingClassifier: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
HistGradientBoostingRegressor: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)},
Isomap: {"check_dict_unchanged": dict(n_components=1)},
KMeans: {"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)},
Expand All @@ -525,17 +549,58 @@
MiniBatchSparsePCA: {
"check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1)
},
MLPClassifier: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
MLPRegressor: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)},
NeighborhoodComponentsAnalysis: {
"check_dict_unchanged": dict(max_iter=5, n_components=1)
},
Nystroem: {"check_dict_unchanged": dict(n_components=1)},
PassiveAggressiveClassifier: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
PassiveAggressiveRegressor: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
PCA: {"check_dict_unchanged": dict(n_components=1)},
Perceptron: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
PLSSVD: {"check_dict_unchanged": dict(n_components=1)},
PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)},
RandomForestClassifier: {
"check_pandas_column_name_consistency": dict(oob_score=True)
},
RandomForestRegressor: {
"check_pandas_column_name_consistency": dict(oob_score=True)
},
RBFSampler: {"check_dict_unchanged": dict(n_components=1)},
SGDClassifier: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
SGDRegressor: {
"check_pandas_column_name_consistency": dict(
early_stopping=True, n_iter_no_change=1
)
},
SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)},
SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)},
SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)},
Expand Down
Loading
Loading
0