diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index f617b1a924ec0..eefe83b678166 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -367,7 +367,13 @@ Changelog
   calling :func:`utils.validation.check_non_negative`.
   :pr:`29540` by :user:`Tamara Atanasoska `.
 
-- |FIX| :func:`utils.estimator_checks.parametrize_with_checks` and
+- |Enhancement| :func:`utils.estimator_checks.parametrize_with_checks` and
+  :func:`utils.estimator_checks.check_estimator` now categorise the checks into
+  groups which can be enabled / disabled using their `dataframe` and `legacy`
+  parameters.
+  :pr:`29699`, :pr:`29713` by `Adrin Jalali`_.
+
+- |Fix| :func:`utils.estimator_checks.parametrize_with_checks` and
   :func:`utils.estimator_checks.check_estimator` now support estimators that have
   `set_output` called on them. :pr:`29869` by `Adrin Jalali`_.
 
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 73a0f5e2bd8d1..a138646c1defb 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -1332,6 +1332,7 @@ def __sklearn_tags__(self):
             "check_fit2d_predict1d": "FIXME",
             "check_complex_data": "FIXME",
             "check_fit2d_1feature": "FIXME",
+            "check_pandas_column_name_consistency": "FIXME",
         }
         return tags
 
diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py
index c05a4f60773b0..52de9e2fe22b9 100644
--- a/sklearn/neighbors/_lof.py
+++ b/sklearn/neighbors/_lof.py
@@ -7,10 +7,9 @@
 import numpy as np
 
 from ..base import OutlierMixin, _fit_context
-from ..utils import check_array
 from ..utils._param_validation import Interval, StrOptions
 from ..utils.metaestimators import available_if
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, validate_data
 from ._base import KNeighborsMixin, NeighborsBase
 
 __all__ = ["LocalOutlierFactor"]
@@ -471,13 +470,14 @@ def score_samples(self, X):
             The lower, the more abnormal.
        
""" check_is_fitted(self) - X = check_array(X, accept_sparse="csr") + # not replacing X since we need to pass raw X to kneighbors + X_validated = validate_data(self, X, reset=False, accept_sparse="csr") distances_X, neighbors_indices_X = self.kneighbors( X, n_neighbors=self.n_neighbors_ ) - if X.dtype == np.float32: + if X_validated.dtype == np.float32: distances_X = distances_X.astype(X.dtype, copy=False) X_lrd = self._local_reachability_density( diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index a985f6a02289a..eeabe60dc41e0 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -8,7 +8,6 @@ import os import pkgutil import re -import warnings from functools import partial from inspect import isgenerator from itertools import chain @@ -18,7 +17,6 @@ import sklearn from sklearn.base import BaseEstimator -from sklearn.compose import ColumnTransformer from sklearn.exceptions import ConvergenceWarning # make it possible to discover experimental estimators when calling `all_estimators` @@ -45,7 +43,6 @@ ignore_warnings, ) from sklearn.utils.estimator_checks import ( - check_dataframe_column_names_consistency, check_estimator, check_get_feature_names_out_error, check_global_output_transform_pandas, @@ -242,56 +239,6 @@ def check_field_types(tags, defaults): check_field_types(tags.transformer_tags, defaults.transformer_tags) -def _estimators_that_predict_in_fit(): - for estimator in _tested_estimators(): - est_params = set(estimator.get_params()) - if "oob_score" in est_params: - yield estimator.set_params(oob_score=True, bootstrap=True) - elif "early_stopping" in est_params: - est = estimator.set_params(early_stopping=True, n_iter_no_change=1) - if est.__class__.__name__ in {"MLPClassifier", "MLPRegressor"}: - # TODO: FIX MLP to not check validation set during MLP - yield pytest.param( - est, marks=pytest.mark.xfail(msg="MLP still validates in fit") - ) - else: - yield est - elif "n_iter_no_change" in est_params: - yield estimator.set_params(n_iter_no_change=1) - - -# NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that -# delegates validation to a base estimator, the check is testing that the base estimator -# is checking for column name consistency. 
-column_name_estimators = list( - chain( - _tested_estimators(), - [make_pipeline(LogisticRegression(C=1))], - _estimators_that_predict_in_fit(), - ) -) - - -@pytest.mark.parametrize( - "estimator", column_name_estimators, ids=_get_check_estimator_ids -) -def test_pandas_column_name_consistency(estimator): - if isinstance(estimator, ColumnTransformer): - pytest.skip("ColumnTransformer is not tested here") - tags = get_tags(estimator) - if "check_dataframe_column_names_consistency" in tags._xfail_checks: - pytest.skip( - "Estimator does not support check_dataframe_column_names_consistency" - ) - with ignore_warnings(category=(FutureWarning)): - with warnings.catch_warnings(record=True) as record: - check_dataframe_column_names_consistency( - estimator.__class__.__name__, estimator - ) - for warning in record: - assert "was fitted without feature names" not in str(warning.message) - - # TODO: As more modules support get_feature_names_out they should be removed # from this list to be tested GET_FEATURES_OUT_MODULES_TO_IGNORE = [ diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7366d53846b74..5a3d02cc996e2 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1962,5 +1962,5 @@ def __sklearn_tags__(self): "friedman_mse", "poisson", } - tags.input_tags.allow_nan: allow_nan + tags.input_tags.allow_nan = allow_nan return tags diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 093b66207449e..60d9bccf335bc 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -485,6 +485,8 @@ # TODO(devtools): check that function names here exist in checks for the estimator # TODO(devtools): write a test for the same thing with tags._xfail_checks AgglomerativeClustering: {"check_dict_unchanged": dict(n_clusters=1)}, + BaggingClassifier: {"check_pandas_column_name_consistency": dict(oob_score=True)}, + BaggingRegressor: {"check_pandas_column_name_consistency": dict(oob_score=True)}, BayesianGaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)}, BernoulliRBM: {"check_dict_unchanged": dict(n_components=1, n_iter=5)}, Birch: {"check_dict_unchanged": dict(n_clusters=1)}, @@ -495,11 +497,33 @@ max_iter=20, n_components=1, transform_algorithm="lasso_lars" ) }, + ExtraTreesClassifier: { + "check_pandas_column_name_consistency": dict(bootstrap=True, oob_score=True) + }, + ExtraTreesRegressor: { + "check_pandas_column_name_consistency": dict(bootstrap=True, oob_score=True) + }, FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)}, GaussianMixture: {"check_dict_unchanged": dict(max_iter=5, n_init=2)}, GaussianRandomProjection: {"check_dict_unchanged": dict(n_components=1)}, + GradientBoostingClassifier: { + "check_pandas_column_name_consistency": dict(n_iter_no_change=1) + }, + GradientBoostingRegressor: { + "check_pandas_column_name_consistency": dict(n_iter_no_change=1) + }, + HistGradientBoostingClassifier: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, + HistGradientBoostingRegressor: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, IncrementalPCA: {"check_dict_unchanged": dict(batch_size=10, n_components=1)}, Isomap: {"check_dict_unchanged": dict(n_components=1)}, KMeans: 
{"check_dict_unchanged": dict(max_iter=5, n_clusters=1, n_init=2)}, @@ -525,17 +549,58 @@ MiniBatchSparsePCA: { "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1) }, + MLPClassifier: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, + MLPRegressor: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, NMF: {"check_dict_unchanged": dict(max_iter=500, n_components=1)}, NeighborhoodComponentsAnalysis: { "check_dict_unchanged": dict(max_iter=5, n_components=1) }, Nystroem: {"check_dict_unchanged": dict(n_components=1)}, + PassiveAggressiveClassifier: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, + PassiveAggressiveRegressor: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, PCA: {"check_dict_unchanged": dict(n_components=1)}, + Perceptron: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, PLSCanonical: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, PLSRegression: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, PLSSVD: {"check_dict_unchanged": dict(n_components=1)}, PolynomialCountSketch: {"check_dict_unchanged": dict(n_components=1)}, + RandomForestClassifier: { + "check_pandas_column_name_consistency": dict(oob_score=True) + }, + RandomForestRegressor: { + "check_pandas_column_name_consistency": dict(oob_score=True) + }, RBFSampler: {"check_dict_unchanged": dict(n_components=1)}, + SGDClassifier: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, + SGDRegressor: { + "check_pandas_column_name_consistency": dict( + early_stopping=True, n_iter_no_change=1 + ) + }, SkewedChi2Sampler: {"check_dict_unchanged": dict(n_components=1)}, SparsePCA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, SparseRandomProjection: {"check_dict_unchanged": dict(n_components=1)}, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f7ce03b56b3ee..4151d812621ef 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -99,6 +99,11 @@ def _yield_api_checks(estimator): yield check_n_features_in_after_fitting +def _yield_dataframe_checks(estimator): + yield check_n_features_in_after_fitting + yield check_pandas_column_name_consistency + + def _yield_checks(estimator): name = estimator.__class__.__name__ tags = get_tags(estimator) @@ -331,7 +336,7 @@ def _yield_array_api_checks(estimator): ) -def _yield_all_checks(estimator, legacy: bool): +def _yield_all_checks(estimator, dataframe: bool, legacy: bool): name = estimator.__class__.__name__ tags = get_tags(estimator) if not tags.input_tags.two_d_array: @@ -352,6 +357,10 @@ def _yield_all_checks(estimator, legacy: bool): for check in _yield_api_checks(estimator): yield check + if dataframe: + for check in _yield_dataframe_checks(estimator): + yield check + if not legacy: return # pragma: no cover @@ -443,7 +452,7 @@ def _should_be_skipped_or_marked(estimator, check): return False, "placeholder reason that will never be used" -def parametrize_with_checks(estimators, *, legacy: bool = True): +def parametrize_with_checks(estimators, *, dataframe: bool = True, legacy: bool = True): """Pytest specific decorator for parametrizing estimator checks. 
Checks are categorised into the following groups:
@@ -470,6 +479,12 @@
         .. versionadded:: 0.24
 
+    dataframe : bool, default=True
+        Whether to include checks related to inspecting feature counts and feature
+        names. These checks may require `polars` or `pandas` to be installed, and are
+        automatically skipped if they are not.
+
+        .. versionadded:: 1.6
     legacy : bool, default=True
         Whether to include legacy checks. Over time we remove checks from this category
@@ -513,7 +528,9 @@ def checks_generator():
             # of the checks to run
             name = type(estimator).__name__
             yield estimator, partial(check_estimator_cloneable, name)
-            for check in _yield_all_checks(estimator, legacy=legacy):
+            for check in _yield_all_checks(
+                estimator, dataframe=dataframe, legacy=legacy
+            ):
                 check_with_name = partial(check, name)
                 for check_instance in _yield_instances_for_check(check, estimator):
                     yield _maybe_mark_xfail(check_instance, check_with_name, pytest)
@@ -523,7 +540,9 @@
     )
 
 
-def check_estimator(estimator=None, generate_only=False, *, legacy: bool = True):
+def check_estimator(
+    estimator=None, generate_only=False, *, dataframe: bool = True, legacy: bool = True
+):
     """Check if estimator adheres to scikit-learn conventions.
 
     This function will run an extensive test-suite for input validation,
@@ -566,6 +585,13 @@
         .. versionadded:: 0.22
 
+    dataframe : bool, default=True
+        Whether to include checks related to inspecting feature counts and feature
+        names. These checks may require `polars` or `pandas` to be installed, and are
+        automatically skipped if they are not.
+
+        .. versionadded:: 1.6
+
     legacy : bool, default=True
         Whether to include legacy checks. Over time we remove checks from this category
         and move them into their specific category.
@@ -604,7 +630,7 @@ def checks_generator():
         # we first need to check if the estimator is cloneable for the rest of the tests
         # to run
        yield estimator, partial(check_estimator_cloneable, name)
-        for check in _yield_all_checks(estimator, legacy=legacy):
+        for check in _yield_all_checks(estimator, dataframe=dataframe, legacy=legacy):
             check = _maybe_skip(estimator, check)
             for check_instance in _yield_instances_for_check(check, estimator):
                 yield check_instance, partial(check, name)
@@ -4038,147 +4064,6 @@ def check_estimator_tags_renamed(name, estimator_orig):
     )
 
 
-def check_dataframe_column_names_consistency(name, estimator_orig):
-    try:
-        import pandas as pd
-    except ImportError:
-        raise SkipTest(
-            "pandas is not installed: not checking column name consistency for pandas"
-        )
-
-    tags = get_tags(estimator_orig)
-    is_supported_X_types = tags.input_tags.two_d_array or tags.input_tags.categorical
-
-    if not is_supported_X_types or tags.no_validation:
-        return
-
-    rng = np.random.RandomState(0)
-
-    estimator = clone(estimator_orig)
-    set_random_state(estimator)
-
-    X_orig = rng.normal(size=(150, 8))
-
-    X_orig = _enforce_estimator_tags_X(estimator, X_orig)
-    n_samples, n_features = X_orig.shape
-
-    names = np.array([f"col_{i}" for i in range(n_features)])
-    X = pd.DataFrame(X_orig, columns=names, copy=False)
-
-    if is_regressor(estimator):
-        y = rng.normal(size=n_samples)
-    else:
-        y = rng.randint(low=0, high=2, size=n_samples)
-    y = _enforce_estimator_tags_y(estimator, y)
-
-    # Check that calling `fit` does not raise any warnings about feature names.
- with warnings.catch_warnings(): - warnings.filterwarnings( - "error", - message="X does not have valid feature names", - category=UserWarning, - module="sklearn", - ) - estimator.fit(X, y) - - if not hasattr(estimator, "feature_names_in_"): - raise ValueError( - "Estimator does not have a feature_names_in_ " - "attribute after fitting with a dataframe" - ) - assert isinstance(estimator.feature_names_in_, np.ndarray) - assert estimator.feature_names_in_.dtype == object - assert_array_equal(estimator.feature_names_in_, names) - - # Only check sklearn estimators for feature_names_in_ in docstring - module_name = estimator_orig.__module__ - if ( - module_name.startswith("sklearn.") - and not ("test_" in module_name or module_name.endswith("_testing")) - and ("feature_names_in_" not in (estimator_orig.__doc__)) - ): - raise ValueError( - f"Estimator {name} does not document its feature_names_in_ attribute" - ) - - check_methods = [] - for method in ( - "predict", - "transform", - "decision_function", - "predict_proba", - "score", - "score_samples", - "predict_log_proba", - ): - if not hasattr(estimator, method): - continue - - callable_method = getattr(estimator, method) - if method == "score": - callable_method = partial(callable_method, y=y) - check_methods.append((method, callable_method)) - - for _, method in check_methods: - with warnings.catch_warnings(): - warnings.filterwarnings( - "error", - message="X does not have valid feature names", - category=UserWarning, - module="sklearn", - ) - method(X) # works without UserWarning for valid features - - invalid_names = [ - (names[::-1], "Feature names must be in the same order as they were in fit."), - ( - [f"another_prefix_{i}" for i in range(n_features)], - ( - "Feature names unseen at fit time:\n- another_prefix_0\n-" - " another_prefix_1\n" - ), - ), - ( - names[:3], - f"Feature names seen at fit time, yet now missing:\n- {min(names[3:])}\n", - ), - ] - params = { - key: value - for key, value in estimator.get_params().items() - if "early_stopping" in key - } - early_stopping_enabled = any(value is True for value in params.values()) - - for invalid_name, additional_message in invalid_names: - X_bad = pd.DataFrame(X, columns=invalid_name, copy=False) - - expected_msg = re.escape( - "The feature names should match those that were passed during fit.\n" - f"{additional_message}" - ) - for name, method in check_methods: - with raises( - ValueError, match=expected_msg, err_msg=f"{name} did not raise" - ): - method(X_bad) - - # partial_fit checks on second call - # Do not call partial fit if early_stopping is on - if not hasattr(estimator, "partial_fit") or early_stopping_enabled: - continue - - estimator = clone(estimator_orig) - if is_classifier(estimator): - classes = np.unique(y) - estimator.partial_fit(X, y, classes=classes) - else: - estimator.partial_fit(X, y) - - with raises(ValueError, match=expected_msg): - estimator.partial_fit(X_bad, y) - - def check_transformer_get_feature_names_out(name, transformer_orig): tags = get_tags(transformer_orig) if not tags.input_tags.two_d_array or tags.no_validation: @@ -4747,6 +4632,162 @@ def check_inplace_ensure_writeable(name, estimator_orig): assert_allclose(X, X_copy) +# Dataframe / Feature Names inspection tests +# ========================================== +def _check_dataframe_column_names_consistency(name, estimator_orig): + try: + import pandas as pd + except ImportError: + raise SkipTest( + "pandas is not installed: not checking column name consistency for pandas" + ) + + tags = 
get_tags(estimator_orig) + is_supported_X_types = tags.input_tags.two_d_array or tags.input_tags.categorical + + if not is_supported_X_types or tags.no_validation: + return + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + + X_orig = rng.normal(size=(150, 8)) + + X_orig = _enforce_estimator_tags_X(estimator, X_orig) + n_samples, n_features = X_orig.shape + + names = np.array([f"col_{i}" for i in range(n_features)]) + X = pd.DataFrame(X_orig, columns=names, copy=False) + + if is_regressor(estimator): + y = rng.normal(size=n_samples) + else: + y = rng.randint(low=0, high=2, size=n_samples) + y = _enforce_estimator_tags_y(estimator, y) + + # Check that calling `fit` does not raise any warnings about feature names. + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", + message="X does not have valid feature names", + category=UserWarning, + module="sklearn", + ) + estimator.fit(X, y) + + if not hasattr(estimator, "feature_names_in_"): + raise ValueError( + "Estimator does not have a feature_names_in_ " + "attribute after fitting with a dataframe" + ) + assert isinstance(estimator.feature_names_in_, np.ndarray) + assert estimator.feature_names_in_.dtype == object + assert_array_equal(estimator.feature_names_in_, names) + + # Only check sklearn estimators for feature_names_in_ in docstring + module_name = estimator.__module__ + if ( + module_name.startswith("sklearn.") + and not ("test_" in module_name or module_name.endswith("_testing")) + and ("feature_names_in_" not in (estimator.__doc__)) + ): + raise ValueError( + f"Estimator {name} does not document its feature_names_in_ attribute" + ) + + check_methods = [] + for method in ( + "predict", + "transform", + "decision_function", + "predict_proba", + "score", + "score_samples", + "predict_log_proba", + ): + if not hasattr(estimator, method): + continue + + callable_method = getattr(estimator, method) + if method == "score": + callable_method = partial(callable_method, y=y) + check_methods.append((method, callable_method)) + + for _, method in check_methods: + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", + message="X does not have valid feature names", + category=UserWarning, + module="sklearn", + ) + method(X) # works without UserWarning for valid features + + invalid_names = [ + (names[::-1], "Feature names must be in the same order as they were in fit."), + ( + [f"another_prefix_{i}" for i in range(n_features)], + ( + "Feature names unseen at fit time:\n- another_prefix_0\n-" + " another_prefix_1\n" + ), + ), + ( + names[:3], + f"Feature names seen at fit time, yet now missing:\n- {min(names[3:])}\n", + ), + ] + params = { + key: value + for key, value in estimator.get_params().items() + if "early_stopping" in key + } + early_stopping_enabled = any(value is True for value in params.values()) + + for invalid_name, additional_message in invalid_names: + X_bad = pd.DataFrame(X, columns=invalid_name, copy=False) + + expected_msg = re.escape( + "The feature names should match those that were passed during fit.\n" + f"{additional_message}" + ) + for name, method in check_methods: + with raises( + ValueError, match=expected_msg, err_msg=f"{name} did not raise" + ): + method(X_bad) + + # partial_fit checks on second call + # Do not call partial fit if early_stopping is on + if not hasattr(estimator, "partial_fit") or early_stopping_enabled: + continue + + estimator = clone(estimator_orig) + if is_classifier(estimator): + classes = np.unique(y) + 
estimator.partial_fit(X, y, classes=classes) + else: + estimator.partial_fit(X, y) + + with raises(ValueError, match=expected_msg): + estimator.partial_fit(X_bad, y) + + +@ignore_warnings(category=(FutureWarning)) +def check_pandas_column_name_consistency(name, estimator_orig): + estimator = clone(estimator_orig) + + # NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator + # that delegates validation to a base estimator, the check is testing that the base + # estimator is checking for column name consistency. + with warnings.catch_warnings(record=True) as record: + _check_dataframe_column_names_consistency(name, estimator) + for warning in record: + assert "was fitted without feature names" not in str(warning.message) + + def check_do_not_raise_errors_in_init_or_set_params(name, estimator_orig): """Check that init or set_param does not raise errors.""" Estimator = type(estimator_orig) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 53be77d96e901..8a6fa3925bd59 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -41,6 +41,7 @@ raises, ) from sklearn.utils.estimator_checks import ( + _check_dataframe_column_names_consistency, _NotAnArray, _yield_all_checks, check_array_api_input, @@ -50,7 +51,6 @@ check_classifiers_multilabel_output_format_predict, check_classifiers_multilabel_output_format_predict_proba, check_classifiers_one_label_sample_weights, - check_dataframe_column_names_consistency, check_decision_proba_consistency, check_dict_unchanged, check_dont_overwrite_parameters, @@ -918,17 +918,17 @@ def test_check_regressor_data_not_an_array(): def test_check_dataframe_column_names_consistency(): err_msg = "Estimator does not have a feature_names_in_" with raises(ValueError, match=err_msg): - check_dataframe_column_names_consistency("estimator_name", BaseBadClassifier()) - check_dataframe_column_names_consistency("estimator_name", PartialFitChecksName()) + _check_dataframe_column_names_consistency("estimator_name", BaseBadClassifier()) + _check_dataframe_column_names_consistency("estimator_name", PartialFitChecksName()) lr = LogisticRegression() - check_dataframe_column_names_consistency(lr.__class__.__name__, lr) + _check_dataframe_column_names_consistency(lr.__class__.__name__, lr) lr.__doc__ = "Docstring that does not document the estimator's attributes" err_msg = ( "Estimator LogisticRegression does not document its feature_names_in_ attribute" ) with raises(ValueError, match=err_msg): - check_dataframe_column_names_consistency(lr.__class__.__name__, lr) + _check_dataframe_column_names_consistency(lr.__class__.__name__, lr) class _BaseMultiLabelClassifierMock(ClassifierMixin, BaseEstimator): @@ -1266,7 +1266,7 @@ def test_non_deterministic_estimator_skip_tests(): # check estimators with non_deterministic tag set to True # will skip certain tests, refer to issue #22313 for details for est in [MinimalTransformer, MinimalRegressor, MinimalClassifier]: - all_tests = list(_yield_all_checks(est(), legacy=True)) + all_tests = list(_yield_all_checks(est(), dataframe=True, legacy=True)) assert check_methods_sample_order_invariance in all_tests assert check_methods_subset_invariance in all_tests @@ -1276,7 +1276,7 @@ def __sklearn_tags__(self): tags.non_deterministic = True return tags - all_tests = list(_yield_all_checks(Estimator(), legacy=True)) + all_tests = list(_yield_all_checks(Estimator(), dataframe=True, legacy=True)) assert 
check_methods_sample_order_invariance not in all_tests
     assert check_methods_subset_invariance not in all_tests
 
@@ -1345,14 +1345,14 @@ def test_decision_proba_tie_ranking():
     check_decision_proba_consistency("SGDClassifier", estimator)
 
 
-def test_yield_all_checks_legacy():
-    # Test that _yield_all_checks with legacy=True returns more checks.
+def test_yield_all_checks_api():
+    # Test that _yield_all_checks returns fewer checks when only API checks are run.
     estimator = MinimalClassifier()
-    legacy_checks = list(_yield_all_checks(estimator, legacy=True))
-    non_legacy_checks = list(_yield_all_checks(estimator, legacy=False))
+    all_checks = list(_yield_all_checks(estimator, dataframe=True, legacy=True))
+    api_only_checks = list(_yield_all_checks(estimator, dataframe=False, legacy=False))
 
-    assert len(legacy_checks) > len(non_legacy_checks)
+    assert len(all_checks) > len(api_only_checks)
 
     def get_check_name(check):
         try:
@@ -1361,9 +1361,9 @@
             return check.func.__name__
 
-    # Check that all non-legacy checks are included in legacy checks
-    non_legacy_check_names = {get_check_name(check) for check in non_legacy_checks}
-    legacy_check_names = {get_check_name(check) for check in legacy_checks}
-    assert non_legacy_check_names.issubset(legacy_check_names)
+    # Check that the API-only checks are a subset of the full set of checks
+    api_only_check_names = {get_check_name(check) for check in api_only_checks}
+    all_check_names = {get_check_name(check) for check in all_checks}
+    assert api_only_check_names.issubset(all_check_names)
 
 
 def test_check_estimator_cloneable_error():
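
Below is a minimal usage sketch (not part of the patch) showing how the `dataframe` and `legacy` check categories introduced in this diff would be used. `LogisticRegression` is only an example estimator; the dataframe checks are skipped automatically when pandas is not installed.

# Illustrative only: exercising the new check categories added in this diff.
from sklearn.linear_model import LogisticRegression
from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks


# Run the API checks plus the dataframe/feature-name checks, skipping the legacy suite.
@parametrize_with_checks([LogisticRegression()], legacy=False)
def test_logistic_regression_checks(estimator, check):
    check(estimator)


# Outside pytest: run everything except the dataframe checks, e.g. in an
# environment without pandas or polars.
if __name__ == "__main__":
    check_estimator(LogisticRegression(), dataframe=False)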