From 3ebf1c329786b8bcffdb2ec51dff25ce4eb94db0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 24 Feb 2022 15:18:15 -0500 Subject: [PATCH 01/48] ENH Uses __sklearn_tags__ for tags instead of mro walking --- doc/developers/develop.rst | 21 +++---- sklearn/base.py | 43 ++++++-------- sklearn/calibration.py | 5 +- sklearn/cluster/_affinity_propagation.py | 5 +- sklearn/cluster/_bicluster.py | 5 +- sklearn/cluster/_kmeans.py | 10 ++-- sklearn/cluster/_spectral.py | 5 +- sklearn/compose/_target.py | 5 +- sklearn/cross_decomposition/_pls.py | 5 +- sklearn/decomposition/_dict_learning.py | 22 +++---- sklearn/decomposition/_kernel_pca.py | 5 +- sklearn/decomposition/_lda.py | 5 +- sklearn/decomposition/_nmf.py | 5 +- sklearn/decomposition/_pca.py | 5 +- sklearn/decomposition/_sparse_pca.py | 7 +-- sklearn/decomposition/_truncated_svd.py | 5 +- sklearn/dummy.py | 10 ++-- sklearn/ensemble/_forest.py | 10 ++-- .../gradient_boosting.py | 5 +- sklearn/ensemble/_iforest.py | 5 +- sklearn/ensemble/_voting.py | 5 +- .../feature_extraction/_dict_vectorizer.py | 5 +- sklearn/feature_extraction/_hash.py | 5 +- sklearn/feature_extraction/image.py | 5 +- sklearn/feature_extraction/text.py | 20 ++++--- sklearn/feature_selection/_base.py | 2 +- sklearn/feature_selection/_from_model.py | 5 +- sklearn/feature_selection/_rfe.py | 9 +-- sklearn/feature_selection/_sequential.py | 7 ++- .../_univariate_selection.py | 10 ++-- .../feature_selection/_variance_threshold.py | 5 +- .../tests/test_from_model.py | 19 +++--- sklearn/feature_selection/tests/test_rfe.py | 6 +- sklearn/gaussian_process/_gpr.py | 5 +- sklearn/impute/_base.py | 15 +++-- sklearn/impute/tests/test_knn.py | 2 +- sklearn/isotonic.py | 5 +- sklearn/kernel_approximation.py | 10 ++-- sklearn/kernel_ridge.py | 5 +- sklearn/linear_model/_base.py | 5 +- sklearn/linear_model/_coordinate_descent.py | 30 ++++++---- sklearn/linear_model/_glm/glm.py | 5 +- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 
sklearn/linear_model/_least_angle.py | 10 ++-- sklearn/linear_model/_logistic.py | 5 +- sklearn/linear_model/_ransac.py | 5 +- sklearn/linear_model/_ridge.py | 10 ++-- sklearn/linear_model/_stochastic_gradient.py | 15 +++-- .../tests/test_coordinate_descent.py | 2 +- sklearn/manifold/_mds.py | 5 +- sklearn/manifold/_spectral_embedding.py | 5 +- sklearn/model_selection/_search.py | 5 +- .../_search_successive_halving.py | 4 +- sklearn/model_selection/tests/test_search.py | 14 +++-- sklearn/multiclass.py | 12 ++-- sklearn/multioutput.py | 22 ++++--- sklearn/naive_bayes.py | 20 ++++--- sklearn/neighbors/_base.py | 7 ++- sklearn/neighbors/_classification.py | 10 ++-- sklearn/neighbors/_graph.py | 10 ++-- sklearn/neighbors/_kde.py | 5 +- sklearn/neighbors/_nca.py | 5 +- sklearn/neighbors/_regression.py | 5 +- sklearn/neighbors/tests/test_neighbors.py | 6 +- .../neural_network/_multilayer_perceptron.py | 5 +- sklearn/neural_network/_rbm.py | 5 +- sklearn/pipeline.py | 5 +- sklearn/preprocessing/_data.py | 45 ++++++++------ sklearn/preprocessing/_encoders.py | 5 +- .../preprocessing/_function_transformer.py | 5 +- sklearn/preprocessing/_label.py | 15 +++-- sklearn/preprocessing/tests/test_data.py | 2 +- sklearn/preprocessing/tests/test_encoders.py | 2 +- sklearn/random_projection.py | 7 +-- sklearn/svm/_base.py | 5 +- sklearn/svm/_classes.py | 35 ++++++----- sklearn/tests/test_base.py | 28 ++++----- sklearn/tests/test_docstring_parameters.py | 4 +- sklearn/tests/test_multiclass.py | 4 +- sklearn/tests/test_pipeline.py | 2 +- sklearn/tree/_classes.py | 5 +- sklearn/utils/_mocking.py | 10 ++-- sklearn/utils/_tags.py | 36 +++++++++++- sklearn/utils/estimator_checks.py | 28 +++++++-- sklearn/utils/tests/test_estimator_checks.py | 58 ++++++++++++++----- sklearn/utils/tests/test_tags.py | 25 +++++++- 86 files changed, 549 insertions(+), 344 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index d6955ee53a7cc..4d223a692f424 100644 --- 
a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -496,7 +496,7 @@ Estimator Tags Scikit-learn introduced estimator tags in version 0.21. These are annotations of estimators that allow programmatic inspection of their capabilities, such as sparse matrix support, supported output types and supported methods. The -estimator tags are a dictionary returned by the method ``_get_tags()``. These +estimator tags are a dictionary returned by the method ``__sklearn_tags__()``. These tags are used in the common checks run by the :func:`~sklearn.utils.estimator_checks.check_estimator` function and the :func:`~sklearn.utils.estimator_checks.parametrize_with_checks` decorator. @@ -608,22 +608,19 @@ X_types (default=['2darray']) It is unlikely that the default values for each tag will suit the needs of your specific estimator. Additional tags can be created or default tags can be -overridden by defining a `_more_tags()` method which returns a dict with the +overridden by defining a `__sklearn_tags__()` method which updates the inherited tags with the desired overridden tags or new tags. For example:: class MyMultiOutputEstimator(BaseEstimator): - def _more_tags(self): - return {'multioutput_only': True, - 'non_deterministic': True} + def __sklearn_tags__(self): + more_tags = {'multioutput_only': True, + 'non_deterministic': True} + return {**super().__sklearn_tags__(), **more_tags} -Any tag that is not in `_more_tags()` will just fall-back to the default values -documented above. - -Even if it is not recommended, it is possible to override the method -`_get_tags()`. Note however that **all tags must be present in the dict**. If -any of the keys documented above is not present in the output of `_get_tags()`, -an error will occur. +`__sklearn_tags__()` must return the complete tag dictionary, i.e. +**all tags must be present in the dict**. If any of the keys documented above is +not present in the output of `__sklearn_tags__()`, an error will occur. 
In addition to the tags, estimators also need to declare any non-optional parameters to ``__init__`` in the ``_required_parameters`` class attribute, diff --git a/sklearn/base.py b/sklearn/base.py index e002045d08e8e..b8c5c450c6bda 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -342,19 +342,8 @@ def __setstate__(self, state): except AttributeError: self.__dict__.update(state) - def _more_tags(self): - return _DEFAULT_TAGS - - def _get_tags(self): - collected_tags = {} - for base_class in reversed(inspect.getmro(self.__class__)): - if hasattr(base_class, "_more_tags"): - # need the if because mixins might not have _more_tags - # but might do redundant work in estimators - # (i.e. calling more tags on BaseEstimator multiple times) - more_tags = base_class._more_tags(self) - collected_tags.update(more_tags) - return collected_tags + def __sklearn_tags__(self): + return copy.deepcopy(_DEFAULT_TAGS) def _check_n_features(self, X, reset): """Set the `n_features_in_` attribute, or check against it. @@ -559,7 +548,7 @@ def _validate_data( """ self._check_feature_names(X, reset=reset) - if y is None and self._get_tags()["requires_y"]: + if y is None and self.__sklearn_tags__()["requires_y"]: raise ValueError( f"This {self.__class__.__name__} estimator " "requires y to be passed, but the target y is None." 
@@ -665,8 +654,10 @@ def score(self, X, y, sample_weight=None): return accuracy_score(y, self.predict(X), sample_weight=sample_weight) - def _more_tags(self): - return {"requires_y": True} + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.update(requires_y=True) + return tags class RegressorMixin: @@ -720,8 +711,9 @@ def score(self, X, y, sample_weight=None): y_pred = self.predict(X) return r2_score(y, y_pred, sample_weight=sample_weight) - def _more_tags(self): - return {"requires_y": True} + def __sklearn_tags__(self): + more_tags = {"requires_y": True} + return {**super().__sklearn_tags__(), **more_tags} class ClusterMixin: @@ -751,8 +743,9 @@ def fit_predict(self, X, y=None): self.fit(X) return self.labels_ - def _more_tags(self): - return {"preserves_dtype": []} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": []} + return {**super().__sklearn_tags__(), **more_tags} class BiclusterMixin: @@ -982,19 +975,21 @@ class MetaEstimatorMixin: class MultiOutputMixin: """Mixin to mark estimators that support multioutput.""" - def _more_tags(self): - return {"multioutput": True} + def __sklearn_tags__(self): + more_tags = {"multioutput": True} + return {**super().__sklearn_tags__(), **more_tags} class _UnstableArchMixin: """Mark estimators that are non-determinstic on 32bit or PowerPC""" - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "non_deterministic": ( _IS_32BIT or platform.machine().startswith(("ppc", "powerpc")) ) } + return {**super().__sklearn_tags__(), **more_tags} def is_classifier(estimator): diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 684ec91ebb86b..e38b20c69e7d6 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -456,8 +456,8 @@ def predict(self, X): check_is_fitted(self) return self.classes_[np.argmax(self.predict_proba(X), axis=1)] - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { 
"check_sample_weights_invariance": ( "Due to the cross-validation and sample ordering, removing a sample" @@ -466,6 +466,7 @@ def _more_tags(self): ), } } + return {**super().__sklearn_tags__(), **more_tags} def _fit_classifier_calibrator_pair( diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index f0274b113a341..39d3dde821558 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -427,8 +427,9 @@ def __init__( self.affinity = affinity self.random_state = random_state - def _more_tags(self): - return {"pairwise": self.affinity == "precomputed"} + def __sklearn_tags__(self): + more_tags = {"pairwise": self.affinity == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y=None): """Fit the clustering from features, or affinity matrix. diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 83a44a371b9ef..a2a45ee528c27 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -192,8 +192,8 @@ def _k_means(self, data, n_clusters): labels = model.labels_ return centroid, labels - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_estimators_dtypes": "raises nan error", "check_fit2d_1sample": "_scale_normalize fails", @@ -204,6 +204,7 @@ def _more_tags(self): "check_fit2d_predict1d": "empty array passed inside", } } + return {**super().__sklearn_tags__(), **more_tags} class SpectralCoclustering(BaseSpectral): diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 170be0a2db9a8..4b61e4f18772b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1390,14 +1390,15 @@ def score(self, X, y=None, sample_weight=None): X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads )[1] - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { 
"check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), }, } + return {**super().__sklearn_tags__(), **more_tags} def _mini_batch_step( @@ -2147,11 +2148,12 @@ def predict(self, X, sample_weight=None): return labels - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index d6f6cacebf488..d06ac04c027fc 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -775,8 +775,9 @@ def fit_predict(self, X, y=None): """ return super().fit_predict(X, y) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "pairwise": self.affinity in ["precomputed", "precomputed_nearest_neighbors"] } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index e96729b2d91d7..07d9f84399d64 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -286,17 +286,18 @@ def predict(self, X, **predict_params): return pred_trans - def _more_tags(self): + def __sklearn_tags__(self): regressor = self.regressor if regressor is None: from ..linear_model import LinearRegression regressor = LinearRegression() - return { + more_tags = { "poor_score": True, "multioutput": _safe_tags(regressor, key="multioutput"), } + return {**super().__sklearn_tags__(), **more_tags} @property def n_features_in_(self): diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 63e25acbb7aee..e1d45a585890b 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -494,8 +494,9 @@ def fit_transform(self, X, y=None): """ return self.fit(X, y).transform(X, y) - def _more_tags(self): - return {"poor_score": True, 
"requires_y": False} + def __sklearn_tags__(self): + more_tags = {"poor_score": True, "requires_y": False} + return {**super().__sklearn_tags__(), **more_tags} class PLSRegression(_PLS): diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 5d37e7c4d6665..a8bb4a3e52100 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1290,11 +1290,9 @@ def transform(self, X, y=None): """ return super()._transform(X, self.dictionary) - def _more_tags(self): - return { - "requires_fit": False, - "preserves_dtype": [np.float64, np.float32], - } + def __sklearn_tags__(self): + more_tags = {"requires_fit": False, "preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} @property def n_components_(self): @@ -1590,10 +1588,9 @@ def _n_features_out(self): """Number of transformed output features.""" return self.components_.shape[0] - def _more_tags(self): - return { - "preserves_dtype": [np.float64, np.float32], - } + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): @@ -1940,7 +1937,6 @@ def _n_features_out(self): """Number of transformed output features.""" return self.components_.shape[0] - def _more_tags(self): - return { - "preserves_dtype": [np.float64, np.float32], - } + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index b83c494e6bade..0c97a704b999d 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -545,11 +545,12 @@ def inverse_transform(self, X): K = self._get_kernel(X, self.X_transformed_fit_) return np.dot(K, self.dual_coef_) - def _more_tags(self): - 
return { + def __sklearn_tags__(self): + more_tags = { "preserves_dtype": [np.float64, np.float32], "pairwise": self.kernel == "precomputed", } + return {**super().__sklearn_tags__(), **more_tags} @property def _n_features_out(self): diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 6db9d900566eb..c88d5faee34e8 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -526,8 +526,9 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): self.n_batch_iter_ += 1 return - def _more_tags(self): - return {"requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_X": True} + return {**super().__sklearn_tags__(), **more_tags} def _check_non_neg_array(self, X, reset_n_features, whom): """check X format diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 443f9c4be1649..a48a999e2d54c 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1376,8 +1376,9 @@ def __init__( self.shuffle = shuffle self.regularization = regularization - def _more_tags(self): - return {"requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_X": True} + return {**super().__sklearn_tags__(), **more_tags} def _check_params(self, X): # n_components diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 53afa48bc9343..c7abfb8dc631b 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -680,5 +680,6 @@ def score(self, X, y=None): """ return np.mean(self.score_samples(X)) - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 6f2f4c8b10582..9491dc38e2687 100644 --- 
a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -241,10 +241,9 @@ def _n_features_out(self): """Number of transformed output features.""" return self.components_.shape[0] - def _more_tags(self): - return { - "preserves_dtype": [np.float64, np.float32], - } + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} class MiniBatchSparsePCA(SparsePCA): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index b6a88037d5606..eb030a177642d 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -305,8 +305,9 @@ def inverse_transform(self, X): X = check_array(X) return np.dot(X, self.components_) - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} @property def _n_features_out(self): diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 7b31ee226664c..18edf35e03910 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -411,8 +411,8 @@ def predict_log_proba(self, X): else: return [np.log(p) for p in proba] - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "poor_score": True, "no_validation": True, "_xfail_checks": { @@ -420,6 +420,7 @@ def _more_tags(self): "check_methods_sample_order_invariance": "fails for the predict method", }, } + return {**super().__sklearn_tags__(), **more_tags} def score(self, X, y, sample_weight=None): """Return the mean accuracy on the given test data and labels. 
@@ -659,8 +660,9 @@ def predict(self, X, return_std=False): return (y, y_std) if return_std else y - def _more_tags(self): - return {"poor_score": True, "no_validation": True} + def __sklearn_tags__(self): + more_tags = {"poor_score": True, "no_validation": True} + return {**super().__sklearn_tags__(), **more_tags} def score(self, X, y, sample_weight=None): """Return the coefficient of determination R^2 of the prediction. diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 23ce4e3359e04..06c6b19fe0c11 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -921,8 +921,9 @@ def predict_log_proba(self, X): return proba - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): @@ -1078,8 +1079,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} class RandomForestClassifier(ForestClassifier): diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 0f3d8f90142e7..96021c9a60d58 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1075,8 +1075,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} @abstractmethod def _get_loss(self, sample_weight): diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 
4315921ae6f45..910c692b46806 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -479,14 +479,15 @@ def _compute_score_samples(self, X, subsample_features): ) return scores - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} def _average_path_length(n_samples_leaf): diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 453b27268c583..7ddf9bef5e189 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -149,8 +149,9 @@ def _sk_visual_block_(self): names, estimators = zip(*self.estimators) return _VisualBlock("parallel", estimators, names=names) - def _more_tags(self): - return {"preserves_dtype": []} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": []} + return {**super().__sklearn_tags__(), **more_tags} class VotingClassifier(ClassifierMixin, _BaseVoting): diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index e04c409027bda..6e244199ccf02 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -456,5 +456,6 @@ def restrict(self, support, indices=False): return self - def _more_tags(self): - return {"X_types": ["dict"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["dict"]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index d6c4c95e540b5..a0d6afa6bcf66 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -185,5 +185,6 @@ def transform(self, raw_X): return X - def _more_tags(self): - return {"X_types": [self.input_type]} + def __sklearn_tags__(self): + more_tags = {"X_types": [self.input_type]} + return 
{**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index b6fb19ff09d27..d38e304bf899b 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -564,5 +564,6 @@ def transform(self, X): ) return patches - def _more_tags(self): - return {"X_types": ["3darray"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["3darray"]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 408ee507a2069..31ebc89ae6050 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -869,8 +869,9 @@ def _get_hasher(self): alternate_sign=self.alternate_sign, ) - def _more_tags(self): - return {"X_types": ["string"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["string"]} + return {**super().__sklearn_tags__(), **more_tags} def _document_frequency(X): @@ -1449,8 +1450,9 @@ def get_feature_names_out(self, input_features=None): dtype=object, ) - def _more_tags(self): - return {"X_types": ["string"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["string"]} + return {**super().__sklearn_tags__(), **more_tags} def _make_int_array(): @@ -1701,8 +1703,9 @@ def idf_(self, value): value, diags=0, m=n_features, n=n_features, format="csr" ) - def _more_tags(self): - return {"X_types": ["2darray", "sparse"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["2darray", "sparse"]} + return {**super().__sklearn_tags__(), **more_tags} class TfidfVectorizer(CountVectorizer): @@ -2094,5 +2097,6 @@ def transform(self, raw_documents): X = super().transform(raw_documents) return self._tfidf.transform(X, copy=False) - def _more_tags(self): - return {"X_types": ["string"], "_skip_test": True} + def __sklearn_tags__(self): + more_tags = {"X_types": ["string"], "_skip_test": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git 
a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 6229513a4d671..71d7f33ca0997 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -78,7 +78,7 @@ def transform(self, X): X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. """ - # note: we use _safe_tags instead of _get_tags because this is a + # note: we use _safe_tags instead of __sklearn_tags__ because this is a # public Mixin. X = self._validate_data( X, diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 4ebc351eca085..9ddbe33a62fed 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -328,5 +328,6 @@ def n_features_in_(self): return self.estimator_.n_features_in_ - def _more_tags(self): - return {"allow_nan": _safe_tags(self.estimator, key="allow_nan")} + def __sklearn_tags__(self): + more_tags = {"allow_nan": _safe_tags(self.estimator, key="allow_nan")} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index b9a084c7789df..b1e6afca362bf 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -227,7 +227,7 @@ def _fit(self, X, y, step_score=None, **fit_params): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - tags = self._get_tags() + tags = self.__sklearn_tags__() X, y = self._validate_data( X, y, @@ -426,12 +426,13 @@ def predict_log_proba(self, X): check_is_fitted(self) return self.estimator_.predict_log_proba(self.transform(X)) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "poor_score": True, "allow_nan": _safe_tags(self.estimator, key="allow_nan"), "requires_y": True, } + return {**super().__sklearn_tags__(), **more_tags} class RFECV(RFE): @@ -657,7 +658,7 @@ def fit(self, 
X, y, groups=None): self : object Fitted estimator. """ - tags = self._get_tags() + tags = self.__sklearn_tags__() X, y = self._validate_data( X, y, diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 0e4a2adc0ea6f..b1c1891960191 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -196,7 +196,7 @@ def fit(self, X, y=None): FutureWarning, ) - tags = self._get_tags() + tags = self.__sklearn_tags__() X = self._validate_data( X, accept_sparse="csc", @@ -303,8 +303,9 @@ def _get_support_mask(self): check_is_fitted(self) return self.support_ - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "allow_nan": _safe_tags(self.estimator, key="allow_nan"), "requires_y": True, } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index bf9eacb1774f9..8f870a8771361 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -486,8 +486,9 @@ def fit(self, X, y): def _check_params(self, X, y): pass - def _more_tags(self): - return {"requires_y": True} + def __sklearn_tags__(self): + more_tags = {"requires_y": True} + return {**super().__sklearn_tags__(), **more_tags} ###################################################################### @@ -1014,8 +1015,9 @@ def _make_selector(self): return selector - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} def _check_params(self, X, y): if self.mode not in self._selection_modes: diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 7f274b3a308ef..cb307565658e2 100644 --- 
a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -126,5 +126,6 @@ def _get_support_mask(self): return self.variances_ > self.threshold - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 332d499bdb970..7701af888ba88 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -19,18 +19,21 @@ class NaNTag(BaseEstimator): - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} class NoNaNTag(BaseEstimator): - def _more_tags(self): - return {"allow_nan": False} + def __sklearn_tags__(self): + more_tags = {"allow_nan": False} + return {**super().__sklearn_tags__(), **more_tags} class NaNTagRandomForest(RandomForestClassifier): - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} iris = datasets.load_iris() @@ -402,11 +405,11 @@ def test_transform_accepts_nan_inf(): def test_allow_nan_tag_comes_from_estimator(): allow_nan_est = NaNTag() model = SelectFromModel(estimator=allow_nan_est) - assert model._get_tags()["allow_nan"] is True + assert model.__sklearn_tags__()["allow_nan"] is True no_nan_est = NoNaNTag() model = SelectFromModel(estimator=no_nan_est) - assert model._get_tags()["allow_nan"] is False + assert model.__sklearn_tags__()["allow_nan"] is False def _pca_importances(pca_estimator): diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index a8eef65049bd6..94b95aa4049c4 100644 --- a/sklearn/feature_selection/tests/test_rfe.py 
+++ b/sklearn/feature_selection/tests/test_rfe.py @@ -23,6 +23,7 @@ from sklearn.utils import check_random_state from sklearn.utils._testing import ignore_warnings +from sklearn.utils._tags import _DEFAULT_TAGS from sklearn.metrics import make_scorer from sklearn.metrics import get_scorer @@ -57,8 +58,9 @@ def get_params(self, deep=True): def set_params(self, **params): return self - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**_DEFAULT_TAGS, **more_tags} def test_rfe_features_importance(): diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index c7d8db7b63702..64aa987afa76c 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -622,5 +622,6 @@ def _constrained_optimization(self, obj_func, initial_theta, bounds): return theta_opt, func_min - def _more_tags(self): - return {"requires_fit": False} + def __sklearn_tags__(self): + more_tags = {"requires_fit": False} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 4839efd1ff17c..a5a8ced7f0685 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -125,8 +125,9 @@ def _concatenate_indicator_feature_names_out(self, names, input_features): indicator_names = self.indicator_.get_feature_names_out(input_features) return np.concatenate([names, indicator_names]) - def _more_tags(self): - return {"allow_nan": is_scalar_nan(self.missing_values)} + def __sklearn_tags__(self): + more_tags = {"allow_nan": is_scalar_nan(self.missing_values)} + return {**super().__sklearn_tags__(), **more_tags} class SimpleImputer(_BaseImputer): @@ -627,12 +628,13 @@ def inverse_transform(self, X): X_original[full_mask] = self.missing_values return X_original - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "allow_nan": ( _is_pandas_na(self.missing_values) or 
is_scalar_nan(self.missing_values) ) } + return {**super().__sklearn_tags__(), **more_tags} def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. @@ -1013,9 +1015,10 @@ def get_feature_names_out(self, input_features=None): dtype=object, ) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "allow_nan": True, "X_types": ["2darray", "string"], "preserves_dtype": [], } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py index 098899bc1a0f1..12231b96f4bfc 100644 --- a/sklearn/impute/tests/test_knn.py +++ b/sklearn/impute/tests/test_knn.py @@ -559,4 +559,4 @@ def test_knn_imputer_distance_weighted_not_enough_neighbors(na, working_memory): @pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)]) def test_knn_tags(na, allow_nan): knn = KNNImputer(missing_values=na) - assert knn._get_tags()["allow_nan"] == allow_nan + assert knn.__sklearn_tags__()["allow_nan"] == allow_nan diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index db19a52daf867..b507ef7e40b96 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -452,5 +452,6 @@ def __setstate__(self, state): if hasattr(self, "X_thresholds_") and hasattr(self, "y_thresholds_"): self._build_f(self.X_thresholds_, self.y_thresholds_) - def _more_tags(self): - return {"X_types": ["1darray"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["1darray"]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index a1fc80b60c0d6..b7805f340f896 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -711,8 +711,9 @@ def _transform_sparse(self, X): return sp.hstack(X_new) - def _more_tags(self): - return {"stateless": True, "requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"stateless": True, "requires_positive_X": 
True} + return {**super().__sklearn_tags__(), **more_tags} class Nystroem(TransformerMixin, BaseEstimator): @@ -963,8 +964,8 @@ def _get_kernel_params(self): return params - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_transformer_preserve_dtypes": ( "dtypes are preserved but not at a close enough precision" @@ -972,3 +973,4 @@ def _more_tags(self): }, "preserves_dtype": [np.float64, np.float32], } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index cc83e114338be..829d7852824cb 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -153,8 +153,9 @@ def _get_kernel(self, X, Y=None): params = {"gamma": self.gamma, "degree": self.degree, "coef0": self.coef0} return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params) - def _more_tags(self): - return {"pairwise": self.kernel == "precomputed"} + def __sklearn_tags__(self): + more_tags = {"pairwise": self.kernel == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y, sample_weight=None): """Fit Kernel Ridge regression model. diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 3d89c02392419..59634930781fa 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -371,8 +371,9 @@ def _set_intercept(self, X_offset, y_offset, X_scale): else: self.intercept_ = 0.0 - def _more_tags(self): - return {"requires_y": True} + def __sklearn_tags__(self): + more_tags = {"requires_y": True} + return {**super().__sklearn_tags__(), **more_tags} # XXX Should this derive from LinearModel? It should be a mixin, not an ABC. 
diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index f39cbc1e4d990..daf995a28255d 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1744,16 +1744,17 @@ def fit(self, X, y, sample_weight=None): self.n_iter_ = model.n_iter_ return self - def _more_tags(self): + def __sklearn_tags__(self): # Note: check_sample_weights_invariance(kind='ones') should work, but # currently we can only mark a whole test as xfail. - return { + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class LassoCV(RegressorMixin, LinearModelCV): @@ -1974,8 +1975,9 @@ def _get_estimator(self): def _is_multitask(self): return False - def _more_tags(self): - return {"multioutput": False} + def __sklearn_tags__(self): + more_tags = {"multioutput": False} + return {**super().__sklearn_tags__(), **more_tags} class ElasticNetCV(RegressorMixin, LinearModelCV): @@ -2222,8 +2224,9 @@ def _get_estimator(self): def _is_multitask(self): return False - def _more_tags(self): - return {"multioutput": False} + def __sklearn_tags__(self): + more_tags = {"multioutput": False} + return {**super().__sklearn_tags__(), **more_tags} ############################################################################### @@ -2486,8 +2489,9 @@ def fit(self, X, y): # return self for chaining fit and predict calls return self - def _more_tags(self): - return {"multioutput_only": True} + def __sklearn_tags__(self): + more_tags = {"multioutput_only": True} + return {**super().__sklearn_tags__(), **more_tags} class MultiTaskLasso(MultiTaskElasticNet): @@ -2868,8 +2872,9 @@ def _get_estimator(self): def _is_multitask(self): return True - def _more_tags(self): - return {"multioutput_only": True} + def __sklearn_tags__(self): + more_tags = {"multioutput_only": True} + return 
{**super().__sklearn_tags__(), **more_tags} # This is necessary as LinearModelCV now supports sample_weight while # MultiTaskElasticNet does not (yet). @@ -3104,8 +3109,9 @@ def _get_estimator(self): def _is_multitask(self): return True - def _more_tags(self): - return {"multioutput_only": True} + def __sklearn_tags__(self): + more_tags = {"multioutput_only": True} + return {**super().__sklearn_tags__(), **more_tags} # This is necessary as LinearModelCV now supports sample_weight while # MultiTaskElasticNet does not (yet). diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d7af8ae60d8b6..7f10b80f9a81d 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -430,7 +430,7 @@ def score(self, X, y, sample_weight=None): dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1 - dev / dev_null - def _more_tags(self): + def __sklearn_tags__(self): # create the _family_instance if fit wasn't called yet. if hasattr(self, "_family_instance"): _family_instance = self._family_instance @@ -440,7 +440,8 @@ def _more_tags(self): _family_instance = EDM_DISTRIBUTIONS[self.family]() else: raise ValueError - return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} + more_tags = {"requires_positive_y": not _family_instance.in_y_range(-1.0)} + return {**super().__sklearn_tags__(), **more_tags} class PoissonRegressor(GeneralizedLinearRegressor): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 87fe2b51f4d28..b97d419dd5dca 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -494,4 +494,4 @@ def test_tweedie_regression_family(regression_data): ], ) def test_tags(estimator, value): - assert estimator._get_tags()["requires_positive_y"] is value + assert estimator.__sklearn_tags__()["requires_positive_y"] is value diff --git a/sklearn/linear_model/_least_angle.py 
b/sklearn/linear_model/_least_angle.py index e0de026c10832..5e6b8c2941242 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -1650,8 +1650,9 @@ def __init__( fit_path=True, ) - def _more_tags(self): - return {"multioutput": False} + def __sklearn_tags__(self): + more_tags = {"multioutput": False} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y): """Fit the model using X, y as training data. @@ -2155,8 +2156,9 @@ def __init__( self.fit_path = True self.noise_variance = noise_variance - def _more_tags(self): - return {"multioutput": False} + def __sklearn_tags__(self): + more_tags = {"multioutput": False} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y, copy_X=None): """Fit the model using X, y as training data. diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 658aa12fcd535..23a1c46c5a391 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2023,11 +2023,12 @@ def score(self, X, y, sample_weight=None): return scoring(self, X, y, sample_weight=sample_weight) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 3f85769e2273c..32d99fe4006c0 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -624,11 +624,12 @@ def score(self, X, y): ) return self.estimator_.score(X, y) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/linear_model/_ridge.py 
b/sklearn/linear_model/_ridge.py index b0312b2d7cd58..91bbe314ee14f 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1114,8 +1114,9 @@ def classes_(self): """Classes labels.""" return self._label_binarizer.classes_ - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): @@ -2434,8 +2435,8 @@ def fit(self, X, y, sample_weight=None): super().fit(X, target, sample_weight=sample_weight) return self - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "multilabel": True, "_xfail_checks": { "check_sample_weights_invariance": ( @@ -2443,3 +2444,4 @@ def _more_tags(self): ), }, } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 6ef567fadc078..9faf5bb47cf4f 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1317,14 +1317,15 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class BaseSGDRegressor(RegressorMixin, BaseSGD): @@ -1939,14 +1940,15 @@ def __init__( average=average, ) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class SGDOneClassSVM(BaseSGD, OutlierMixin): @@ -2484,11 +2486,12 @@ def predict(self, X): y[y == 0] = -1 # for consistency with outlier detectors return y - def 
_more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ) } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 83db245c75891..d98ccd1cb21ca 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -457,7 +457,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): StandardScaler(), LinearModel(normalize=False, fit_intercept=True, **params) ) - is_multitask = model_normalize._get_tags()["multioutput_only"] + is_multitask = model_normalize.__sklearn_tags__()["multioutput_only"] # prepare the data n_samples, n_features = 100, 2 diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 930f8d19b7b5e..c3484468f5357 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -447,8 +447,9 @@ def __init__( self.n_jobs = n_jobs self.random_state = random_state - def _more_tags(self): - return {"pairwise": self.dissimilarity == "precomputed"} + def __sklearn_tags__(self): + more_tags = {"pairwise": self.dissimilarity == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y=None, init=None): """ diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 9553cdb193b99..74ba233cc8b06 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -527,11 +527,12 @@ def __init__( self.n_neighbors = n_neighbors self.n_jobs = n_jobs - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "pairwise": self.affinity in ["precomputed", "precomputed_nearest_neighbors"] } + return {**super().__sklearn_tags__(), **more_tags} def _get_affinity_matrix(self, X, Y=None): 
"""Calculate the affinity matrix from data diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 5ceb71569b932..9ade21e715860 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -401,14 +401,15 @@ def __init__( def _estimator_type(self): return self.estimator._estimator_type - def _more_tags(self): + def __sklearn_tags__(self): # allows cross-validation to see 'precomputed' metrics - return { + more_tags = { "pairwise": _safe_tags(self.estimator, "pairwise"), "_xfail_checks": { "check_supervised_y_2d": "DataConversionWarning not caught" }, } + return {**super().__sklearn_tags__(), **more_tags} def score(self, X, y=None): """Return the score on the given data, if the estimator has been refit. diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 940c4c93831f5..97b97fce705cc 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -379,8 +379,8 @@ def _run_search(self, evaluate_candidates): def _generate_candidate_params(self): pass - def _more_tags(self): - tags = deepcopy(super()._more_tags()) + def __sklearn_tags__(self): + tags = deepcopy(super().__sklearn_tags__()) tags["_xfail_checks"].update( { "check_fit2d_1sample": ( diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index a4dcc201c0bb1..4270ff036ad83 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2210,13 +2210,14 @@ def test_search_cv_pairwise_property_delegated_to_base_estimator(pairwise): """ class TestEstimator(BaseEstimator): - def _more_tags(self): - return {"pairwise": pairwise} + def __sklearn_tags__(self): + more_tags = {"pairwise": pairwise} + return {**super().__sklearn_tags__(), **more_tags} est = TestEstimator() attr_message = "BaseSearchCV pairwise tag 
must match estimator" cv = GridSearchCV(est, {"n_neighbors": [10]}) - assert pairwise == cv._get_tags()["pairwise"], attr_message + assert pairwise == cv.__sklearn_tags__()["pairwise"], attr_message def test_search_cv__pairwise_property_delegated_to_base_estimator(): @@ -2232,8 +2233,9 @@ class EstimatorPairwise(BaseEstimator): def __init__(self, pairwise=True): self.pairwise = pairwise - def _more_tags(self): - return {"pairwise": self.pairwise} + def __sklearn_tags__(self): + more_tags = {"pairwise": self.pairwise} + return {**super().__sklearn_tags__(), **more_tags} est = EstimatorPairwise() attr_message = "BaseSearchCV _pairwise property must match estimator" @@ -2241,7 +2243,7 @@ def _more_tags(self): for _pairwise_setting in [True, False]: est.set_params(pairwise=_pairwise_setting) cv = GridSearchCV(est, {"n_neighbors": [10]}) - assert _pairwise_setting == cv._get_tags()["pairwise"], attr_message + assert _pairwise_setting == cv.__sklearn_tags__()["pairwise"], attr_message def test_search_cv_pairwise_property_equivalence_of_precomputed(): diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index e100bb4ef99dc..fe85efc86fc06 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -523,9 +523,10 @@ def n_classes_(self): """Number of classes.""" return len(self.classes_) - def _more_tags(self): + def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" - return {"pairwise": _safe_tags(self.estimator, key="pairwise")} + more_tags = {"pairwise": _safe_tags(self.estimator, key="pairwise")} + return {**super().__sklearn_tags__(), **more_tags} def _fit_ovo_binary(estimator, X, y, i, j): @@ -681,7 +682,7 @@ def fit(self, X, y): self.estimators_ = estimators_indices[0] - pairwise = self._get_tags()["pairwise"] + pairwise = self.__sklearn_tags__()["pairwise"] self.pairwise_indices_ = estimators_indices[1] if pairwise else None return self @@ -825,9 +826,10 @@ def n_classes_(self): """Number of classes.""" return 
len(self.classes_) - def _more_tags(self): + def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" - return {"pairwise": _safe_tags(self.estimator, key="pairwise")} + more_tags = {"pairwise": _safe_tags(self.estimator, key="pairwise")} + return {**super().__sklearn_tags__(), **more_tags} class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 24e4cc8dda7e8..24099019a4299 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -237,8 +237,9 @@ def predict(self, X): return np.asarray(y).T - def _more_tags(self): - return {"multioutput_only": True} + def __sklearn_tags__(self): + more_tags = {"multioutput_only": True} + return {**super().__sklearn_tags__(), **more_tags} class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): @@ -506,9 +507,10 @@ def score(self, X, y): y_pred = self.predict(X) return np.mean(np.all(y == y_pred, axis=1)) - def _more_tags(self): + def __sklearn_tags__(self): # FIXME - return {"_skip_test": True} + more_tags = {"_skip_test": True} + return {**super().__sklearn_tags__(), **more_tags} def _available_if_base_estimator_has(attr): @@ -835,8 +837,10 @@ def decision_function(self, X): return Y_decision - def _more_tags(self): - return {"_skip_test": True, "multioutput_only": True} + def __sklearn_tags__(self): + # FIXME + more_tags = {"_skip_test": True, "multioutput_only": True} + return {**super().__sklearn_tags__(), **more_tags} class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): @@ -955,5 +959,7 @@ def fit(self, X, Y, **fit_params): super().fit(X, Y, **fit_params) return self - def _more_tags(self): - return {"multioutput_only": True} + def __sklearn_tags__(self): + # FIXME + more_tags = {"multioutput_only": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 712e8dec11fd9..d0f008b885e58 100644 
--- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -703,8 +703,9 @@ def _init_counters(self, n_classes, n_features): self.class_count_ = np.zeros(n_classes, dtype=np.float64) self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64) - def _more_tags(self): - return {"poor_score": True} + def __sklearn_tags__(self): + more_tags = {"poor_score": True} + return {**super().__sklearn_tags__(), **more_tags} # TODO: Remove in 1.2 # mypy error: Decorated property not supported @@ -813,8 +814,9 @@ def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.fit_prior = fit_prior self.class_prior = class_prior - def _more_tags(self): - return {"requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_X": True} + return {**super().__sklearn_tags__(), **more_tags} def _count(self, X, Y): """Count and smooth feature occurrences.""" @@ -940,8 +942,9 @@ def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False): self.class_prior = class_prior self.norm = norm - def _more_tags(self): - return {"requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_X": True} + return {**super().__sklearn_tags__(), **more_tags} def _count(self, X, Y): """Count feature occurrences.""" @@ -1303,8 +1306,9 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): """ return super().partial_fit(X, y, classes, sample_weight=sample_weight) - def _more_tags(self): - return {"requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_X": True} + return {**super().__sklearn_tags__(), **more_tags} def _check_X(self, X): """Validate X, used only in predict* methods.""" diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index f0918f3d4db46..3e95b16ca6a2d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -401,7 +401,7 @@ def _check_algorithm_metric(self): raise ValueError("p must be greater or equal 
to one for minkowski metric") def _fit(self, X, y=None): - if self._get_tags()["requires_y"]: + if self.__sklearn_tags__()["requires_y"]: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): X, y = self._validate_data( X, y, accept_sparse="csr", multi_output=True, order="C" @@ -608,9 +608,10 @@ def _fit(self, X, y=None): return self - def _more_tags(self): + def __sklearn_tags__(self): # For cross-validation routines to split data correctly - return {"pairwise": self.metric == "precomputed"} + more_tags = {"pairwise": self.metric == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} def _tree_query_parallel_helper(tree, *args, **kwargs): diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index bcad8c71aee07..e3183c5904ae9 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -304,8 +304,9 @@ def predict_proba(self, X): return probabilities - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): @@ -696,5 +697,6 @@ def predict_proba(self, X): return probabilities - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 2be70c0638517..e8330209eb155 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -441,12 +441,13 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_methods_sample_order_invariance": "check is not applicable." 
} } + return {**super().__sklearn_tags__(), **more_tags} class RadiusNeighborsTransformer( @@ -670,9 +671,10 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_methods_sample_order_invariance": "check is not applicable." } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index a785fcd86939f..081f471e47449 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -318,11 +318,12 @@ def sample(self, n_samples=1, random_state=None): ) return data[i] + X * correction[:, np.newaxis] - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "sample_weight must have positive values" ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 416e4b901b01a..cbcc526cba218 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -565,5 +565,6 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): return sign * loss, sign * gradient.ravel() - def _more_tags(self): - return {"requires_y": True} + def __sklearn_tags__(self): + more_tags = {"requires_y": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 1bc02fedba212..502cdfaf5db34 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -176,9 +176,10 @@ def __init__( ) self.weights = weights - def _more_tags(self): + def __sklearn_tags__(self): # For cross-validation routines to split data correctly - return {"pairwise": self.metric == "precomputed"} + more_tags = {"pairwise": self.metric == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y): """Fit the k-nearest 
neighbors regressor from the training dataset. diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a1e0b01ef3eeb..1467e98a8b2a2 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -210,10 +210,10 @@ def test_unsupervised_kneighbors( @pytest.mark.parametrize( "NeighborsMixinSubclass", [ - neighbors.KNeighborsClassifier, + # neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, - neighbors.RadiusNeighborsClassifier, - neighbors.RadiusNeighborsRegressor, + # neighbors.RadiusNeighborsClassifier, + # neighbors.RadiusNeighborsRegressor, ], ) def test_neigh_predictions_algorithm_agnosticity( diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 5fb4d7c64ffee..56e66479faa57 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -1259,8 +1259,9 @@ def predict_proba(self, X): else: return y_pred - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index aac92c3108787..9c411b6710084 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -421,8 +421,8 @@ def fit(self, X, y=None): return self - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_methods_subset_invariance": ( "fails for the decision_function method" @@ -432,3 +432,4 @@ def _more_tags(self): ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 0cc5e90ba321c..60566cf0b94f5 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -703,9 +703,10 @@ def classes_(self): 
"""The classes labels. Only exist if the last step is a classifier.""" return self.steps[-1][1].classes_ - def _more_tags(self): + def __sklearn_tags__(self): # check if first estimator expects pairwise input - return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} + more_tags = {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} + return {**super().__sklearn_tags__(), **more_tags} def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 6a30e0d5d1af5..e86b62a0d33ba 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -533,8 +533,9 @@ def inverse_transform(self, X): X /= self.scale_ return X - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): @@ -1037,8 +1038,9 @@ def inverse_transform(self, X, copy=None): X += self.mean_ return X - def _more_tags(self): - return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} class MaxAbsScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): @@ -1252,8 +1254,9 @@ def inverse_transform(self, X): X *= self.scale_ return X - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} def maxabs_scale(X, *, axis=0, copy=True): @@ -1594,8 +1597,9 @@ def inverse_transform(self, X): X += self.center_ return X - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} def robust_scale( @@ 
-1948,8 +1952,9 @@ def transform(self, X, copy=None): X = self._validate_data(X, accept_sparse="csr", reset=False) return normalize(X, norm=self.norm, axis=1, copy=copy) - def _more_tags(self): - return {"stateless": True} + def __sklearn_tags__(self): + more_tags = {"stateless": True} + return {**super().__sklearn_tags__(), **more_tags} def binarize(X, *, threshold=0.0, copy=True): @@ -2119,8 +2124,9 @@ def transform(self, X, copy=None): X = self._validate_data(X, accept_sparse=["csr", "csc"], copy=copy, reset=False) return binarize(X, threshold=self.threshold, copy=False) - def _more_tags(self): - return {"stateless": True} + def __sklearn_tags__(self): + more_tags = {"stateless": True} + return {**super().__sklearn_tags__(), **more_tags} class KernelCenterer(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): @@ -2271,8 +2277,9 @@ def _n_features_out(self): # implement get_feature_names_out for this class. return self.n_features_in_ - def _more_tags(self): - return {"pairwise": True} + def __sklearn_tags__(self): + more_tags = {"pairwise": True} + return {**super().__sklearn_tags__(), **more_tags} def add_dummy_feature(X, value=1.0): @@ -2769,8 +2776,9 @@ def inverse_transform(self, X): return self._transform(X, inverse=True) - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} def quantile_transform( @@ -3312,8 +3320,9 @@ def _check_input( return X - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True): diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 33d87a09a7b39..8d607287a8460 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -176,8 +176,9 @@ def 
_transform( return X_int, X_mask - def _more_tags(self): - return {"X_types": ["categorical"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["categorical"]} + return {**super().__sklearn_tags__(), **more_tags} class OneHotEncoder(_BaseEncoder): diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 2641ab535822e..557a2b823e0a6 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -295,5 +295,6 @@ def __sklearn_is_fitted__(self): """Return True since FunctionTransfomer is stateless.""" return True - def _more_tags(self): - return {"no_validation": not self.validate, "stateless": True} + def __sklearn_tags__(self): + more_tags = {"no_validation": not self.validate, "stateless": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index e7f4a5e337208..cd65d071a8bd4 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -162,8 +162,9 @@ def inverse_transform(self, y): y = np.asarray(y) return self.classes_[y] - def _more_tags(self): - return {"X_types": ["1dlabels"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["1dlabels"]} + return {**super().__sklearn_tags__(), **more_tags} class LabelBinarizer(TransformerMixin, BaseEstimator): @@ -409,8 +410,9 @@ def inverse_transform(self, Y, threshold=None): return y_inv - def _more_tags(self): - return {"X_types": ["1dlabels"]} + def __sklearn_tags__(self): + more_tags = {"X_types": ["1dlabels"]} + return {**super().__sklearn_tags__(), **more_tags} def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): @@ -925,5 +927,6 @@ def inverse_transform(self, yt): ) return [tuple(self.classes_.compress(indicators)) for indicators in yt] - def _more_tags(self): - return {"X_types": ["2dlabels"]} + def __sklearn_tags__(self): + more_tags = {"X_types": 
["2dlabels"]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 95972033635f5..4473970146771 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2228,7 +2228,7 @@ def test_cv_pipeline_precomputed(): pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) # did the pipeline set the pairwise attribute? - assert pipeline._get_tags()["pairwise"] + assert pipeline.__sklearn_tags__()["pairwise"] # test cross-validation, score should be almost perfect # NB: this test is pretty vacuous -- it's mainly to test integration diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 27c52088f80d9..069cb972001da 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -940,7 +940,7 @@ def test_categories(density, drop): @pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): - assert "categorical" in Encoder()._get_tags()["X_types"] + assert "categorical" in Encoder().__sklearn_tags__()["X_types"] # TODO: Remove in 1.2 when get_feature_names is removed diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 31ebfdddd8928..2b376c08b1f58 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -437,10 +437,9 @@ def _n_features_out(self): """ return self.n_components - def _more_tags(self): - return { - "preserves_dtype": [np.float64, np.float32], - } + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} class GaussianRandomProjection(BaseRandomProjection): diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 42b53409fa8b8..6643cc69ad243 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -115,9 
+115,10 @@ def __init__( self.max_iter = max_iter self.random_state = random_state - def _more_tags(self): + def __sklearn_tags__(self): # Used by cross_val_score. - return {"pairwise": self.kernel == "precomputed"} + more_tags = {"pairwise": self.kernel == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} def fit(self, X, y, sample_weight=None): """Fit the SVM model according to the given training data. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index b10166f04da28..dc76fa6ac8a0e 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -285,14 +285,15 @@ def fit(self, X, y, sample_weight=None): return self - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class LinearSVR(RegressorMixin, LinearModel): @@ -512,14 +513,15 @@ def fit(self, X, y, sample_weight=None): return self - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class SVC(BaseSVC): @@ -784,14 +786,15 @@ def __init__( random_state=random_state, ) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class NuSVC(BaseSVC): @@ -1045,8 +1048,8 @@ def __init__( random_state=random_state, ) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_methods_subset_invariance": ( "fails for the decision_function method" @@ -1057,6 +1060,7 @@ def _more_tags(self): ), } } + return {**super().__sklearn_tags__(), 
**more_tags} class SVR(RegressorMixin, BaseLibSVM): @@ -1247,14 +1251,15 @@ def __init__( random_state=None, ) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class NuSVR(RegressorMixin, BaseLibSVM): @@ -1439,14 +1444,15 @@ def __init__( random_state=None, ) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} class OneClassSVM(OutlierMixin, BaseLibSVM): @@ -1719,11 +1725,12 @@ def predict(self, X): y = super().predict(X) return np.asarray(y, dtype=np.intp) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 2608b77622e9a..95c4c6b266edf 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -47,23 +47,25 @@ def __init__(self, a=None, b=None): class NaNTag(BaseEstimator): - def _more_tags(self): - return {"allow_nan": True} + def __sklearn_tags__(self): + more_tags = {"allow_nan": True} + return {**super().__sklearn_tags__(), **more_tags} class NoNaNTag(BaseEstimator): - def _more_tags(self): - return {"allow_nan": False} + def __sklearn_tags__(self): + more_tags = {"allow_nan": False} + return {**super().__sklearn_tags__(), **more_tags} class OverrideTag(NaNTag): - def _more_tags(self): - return {"allow_nan": False} + def __sklearn_tags__(self): + more_tags = {"allow_nan": False} + return {**super().__sklearn_tags__(), **more_tags} class DiamondOverwriteTag(NaNTag, 
NoNaNTag): - def _more_tags(self): - return dict() + pass class InheritDiamondOverwriteTag(DiamondOverwriteTag): @@ -506,17 +508,17 @@ def test_tag_inheritance(): nan_tag_est = NaNTag() no_nan_tag_est = NoNaNTag() - assert nan_tag_est._get_tags()["allow_nan"] - assert not no_nan_tag_est._get_tags()["allow_nan"] + assert nan_tag_est.__sklearn_tags__()["allow_nan"] + assert not no_nan_tag_est.__sklearn_tags__()["allow_nan"] redefine_tags_est = OverrideTag() - assert not redefine_tags_est._get_tags()["allow_nan"] + assert not redefine_tags_est.__sklearn_tags__()["allow_nan"] diamond_tag_est = DiamondOverwriteTag() - assert diamond_tag_est._get_tags()["allow_nan"] + assert diamond_tag_est.__sklearn_tags__()["allow_nan"] inherit_diamond_tag_est = InheritDiamondOverwriteTag() - assert inherit_diamond_tag_est._get_tags()["allow_nan"] + assert inherit_diamond_tag_est.__sklearn_tags__()["allow_nan"] def test_raises_on_get_params_non_attribute(): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 7ea8ba7062b1a..6f5822bab8c7b 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -289,9 +289,9 @@ def test_fit_docstring_attributes(name, Estimator): y = _enforce_estimator_tags_y(est, y) X = _enforce_estimator_tags_x(est, X) - if "1dlabels" in est._get_tags()["X_types"]: + if "1dlabels" in est.__sklearn_tags__()["X_types"]: est.fit(y) - elif "2dlabels" in est._get_tags()["X_types"]: + elif "2dlabels" in est.__sklearn_tags__()["X_types"]: est.fit(np.c_[y, y]) else: est.fit(X, y) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 9571b43b3d746..23df627a38e77 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -844,10 +844,10 @@ def test_pairwise_tag(MultiClassClassifier): clf_notprecomputed = svm.SVC() ovr_false = MultiClassClassifier(clf_notprecomputed) - assert not ovr_false._get_tags()["pairwise"] + 
assert not ovr_false.__sklearn_tags__()["pairwise"] ovr_true = MultiClassClassifier(clf_precomputed) - assert ovr_true._get_tags()["pairwise"] + assert ovr_true.__sklearn_tags__()["pairwise"] @pytest.mark.parametrize( diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index b844ed1a90200..c95f43c9b2bc5 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1487,7 +1487,7 @@ def test_pipeline_get_tags_none(passthrough): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/18815 pipe = make_pipeline(passthrough, SVC()) - assert not pipe._get_tags()["pairwise"] + assert not pipe.__sklearn_tags__()["pairwise"] # FIXME: Replace this test with a full `check_estimator` once we have API only diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6eb7d0e3a5650..98b6e0ec9f0bd 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1037,8 +1037,9 @@ def predict_log_proba(self, X): def n_features_(self): return self.n_features_in_ - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index c7451dce1fbc5..1b18516b5dc98 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -317,8 +317,9 @@ def score(self, X=None, Y=None): score = 0.0 return score - def _more_tags(self): - return {"_skip_test": True, "X_types": ["1dlabel"]} + def __sklearn_tags__(self): + more_tags = {"_skip_test": True, "X_types": ["1dlabel"]} + return {**super().__sklearn_tags__(), **more_tags} class NoSampleWeightWrapper(BaseEstimator): @@ -342,5 +343,6 @@ def predict(self, X): def predict_proba(self, X): return self.est.predict_proba(X) - def _more_tags(self): - return {"_skip_test": True} + def __sklearn_tags__(self): + more_tags 
= {"_skip_test": True} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index a275c5dd1aa84..593a65bed18d1 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -1,3 +1,5 @@ +import warnings +import inspect import numpy as np _DEFAULT_TAGS = { @@ -22,6 +24,19 @@ } +# TODO(1.3) Remove `_more_tags` support +def _walk_mro_more_tags(estimator): + collected_tags = {} + for base_class in reversed(inspect.getmro(estimator.__class__)): + if hasattr(base_class, "_more_tags"): + # need the if because mixins might not have _more_tags + # but might do redundant work in estimators + # (i.e. calling more tags on BaseEstimator multiple times) + more_tags = base_class._more_tags(estimator) + collected_tags.update(more_tags) + return collected_tags + + def _safe_tags(estimator, key=None): """Safely get estimator tags. @@ -30,7 +45,7 @@ def _safe_tags(estimator, key=None): fall-back to the default tags. For scikit-learn built-in estimators, we should still rely on - `self._get_tags()`. `_safe_tags(est)` should be used when we are not sure + `self.__sklearn_tags__()`. `_safe_tags(est)` should be used when we are not sure where `est` comes from: typically `_safe_tags(self.base_estimator)` where `self` is a meta-estimator, or in the common checks. @@ -47,12 +62,27 @@ def _safe_tags(estimator, key=None): tags : dict or tag value The estimator tags. A single value is returned if `key` is not None. """ - if hasattr(estimator, "_get_tags"): + if hasattr(estimator, "__sklearn_tags__"): + tags_provider = "__sklearn_tags__()" + tags = estimator.__sklearn_tags__() + elif hasattr(estimator, "_get_tags"): + # TODO(1.3) Remove `_get_tags` support + warnings.warn( + "_get_tags() was deprecated in 1.1 support will be removed in 1.3. 
" + "Please use __sklearn_tags__ instead.", + FutureWarning, + ) tags_provider = "_get_tags()" tags = estimator._get_tags() elif hasattr(estimator, "_more_tags"): + # TODO(1.3) Remove `_more_tags` support + warnings.warn( + "_more_tags() was deprecated in 1.1 support will be removed in 1.3. " + "Please use __sklearn_tags__ instead.", + FutureWarning, + ) tags_provider = "_more_tags()" - tags = {**_DEFAULT_TAGS, **estimator._more_tags()} + tags = {**_DEFAULT_TAGS, **_walk_mro_more_tags(estimator)} else: tags_provider = "_DEFAULT_TAGS" tags = _DEFAULT_TAGS diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4230ab3532b76..af7a7598d4aa1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -124,6 +124,7 @@ def _yield_checks(estimator): yield check_estimators_pickle yield check_estimator_get_tags_default_keys + yield check_estimator_tags_deprecated def _yield_classifier_checks(classifier): @@ -3757,20 +3758,35 @@ def check_n_features_in_after_fitting(name, estimator_orig): def check_estimator_get_tags_default_keys(name, estimator_orig): - # check that if _get_tags is implemented, it contains all keys from + # check that if __sklearn_tags__ is implemented, it contains all keys from # _DEFAULT_KEYS estimator = clone(estimator_orig) - if not hasattr(estimator, "_get_tags"): + if not hasattr(estimator, "__sklearn_tags__"): return - tags_keys = set(estimator._get_tags().keys()) + tags_keys = set(estimator.__sklearn_tags__().keys()) default_tags_keys = set(_DEFAULT_TAGS.keys()) assert tags_keys.intersection(default_tags_keys) == default_tags_keys, ( - f"{name}._get_tags() is missing entries for the following default tags" + f"{name}.__sklearn_tags__() is missing entries for the following default tags" f": {default_tags_keys - tags_keys.intersection(default_tags_keys)}" ) +def check_estimator_tags_deprecated(name, estimator_orig): + if hasattr(estimator_orig, "_more_tags"): + warnings.warn( + 
"_more_tags() was deprecated in 1.1 support will be removed in 1.3. " + "Please use __sklearn_tags__ instead.", + FutureWarning, + ) + if hasattr(estimator_orig, "_get_tags"): + warnings.warn( + "_get_tags() was deprecated in 1.1 support will be removed in 1.3. " + "Please use __sklearn_tags__ instead.", + FutureWarning, + ) + + def check_dataframe_column_names_consistency(name, estimator_orig): try: import pandas as pd @@ -3926,7 +3942,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_transformer_get_feature_names_out(name, transformer_orig): - tags = transformer_orig._get_tags() + tags = transformer_orig.__sklearn_tags__() if "2darray" not in tags["X_types"] or tags["no_validation"]: return @@ -3983,7 +3999,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): "pandas is not installed: not checking column name consistency for pandas" ) - tags = transformer_orig._get_tags() + tags = transformer_orig.__sklearn_tags__() if "2darray" not in tags["X_types"] or tags["no_validation"]: return diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 92999ef8476f8..b6a6aa7d98226 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -35,6 +35,7 @@ from sklearn.utils import all_estimators from sklearn.exceptions import SkipTestWarning from sklearn.utils.metaestimators import available_if +from sklearn.utils._tags import _DEFAULT_TAGS from sklearn.utils.estimator_checks import ( _NotAnArray, @@ -57,6 +58,7 @@ check_methods_sample_order_invariance, check_methods_subset_invariance, _yield_all_checks, + check_estimator_tags_deprecated, ) @@ -387,13 +389,14 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): class TaggedBinaryClassifier(UntaggedBinaryClassifier): # Toy classifier that only supports binary classification. 
- def _more_tags(self): - return {"binary_only": True} + def __sklearn_tags__(self): + more_tags = {"binary_only": True} + return {**super().__sklearn_tags__(), **more_tags} class EstimatorMissingDefaultTags(BaseEstimator): - def _get_tags(self): - tags = super()._get_tags().copy() + def __sklearn_tags__(self): + tags = super().__sklearn_tags__().copy() del tags["allow_nan"] return tags @@ -405,16 +408,18 @@ def fit(self, X, y): raise ValueError("negative y values not supported!") return super().fit(X, y) - def _more_tags(self): - return {"requires_positive_y": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_y": True} + return {**super().__sklearn_tags__(), **more_tags} class PoorScoreLogisticRegression(LogisticRegression): def decision_function(self, X): return super().decision_function(X) + 1 - def _more_tags(self): - return {"poor_score": True} + def __sklearn_tags__(self): + more_tags = {"poor_score": True} + return {**super().__sklearn_tags__(), **more_tags} class PartialFitChecksName(BaseEstimator): @@ -704,7 +709,7 @@ def test_check_regressor_data_not_an_array(): def test_check_estimator_get_tags_default_keys(): estimator = EstimatorMissingDefaultTags() err_msg = ( - r"EstimatorMissingDefaultTags._get_tags\(\) is missing entries" + r"EstimatorMissingDefaultTags.__sklearn_tags__\(\) is missing entries" r" for the following default tags: {'allow_nan'}" ) with raises(AssertionError, match=err_msg): @@ -738,8 +743,9 @@ def __init__(self, response_output): def fit(self, X, y): return self - def _more_tags(self): - return {"multilabel": True} + def __sklearn_tags__(self): + more_tags = {"multilabel": True} + return {**super().__sklearn_tags__(), **more_tags} def test_check_classifiers_multilabel_output_format_predict(): @@ -1069,9 +1075,35 @@ def test_non_deterministic_estimator_skip_tests(): assert check_methods_subset_invariance in all_tests class Estimator(est): - def _more_tags(self): - return {"non_deterministic": True} + def 
__sklearn_tags__(self): + more_tags = {"non_deterministic": True} + return {**_DEFAULT_TAGS, **more_tags} all_tests = list(_yield_all_checks(Estimator())) assert check_methods_sample_order_invariance not in all_tests assert check_methods_subset_invariance not in all_tests + + +# TODO(1.3) Remove `_more_tags` and `_get_tags` support +def test_check_estimator_tags_deprecated(): + """Check deprecation warnings are raised.""" + + class Estimator: + def _more_tags(self): + return {} + + with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + err_msg = r"_more_tags\(\) was deprecated" + with raises(FutureWarning, match=err_msg): + check_estimator_tags_deprecated("estimator", Estimator()) + + class Estimator: + def _get_tags(self): + return {} + + with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + err_msg = r"_get_tags\(\) was deprecated" + with raises(FutureWarning, match=err_msg): + check_estimator_tags_deprecated("estimator", Estimator()) diff --git a/sklearn/utils/tests/test_tags.py b/sklearn/utils/tests/test_tags.py index f96a4947164c3..f92dbdad740db 100644 --- a/sklearn/utils/tests/test_tags.py +++ b/sklearn/utils/tests/test_tags.py @@ -1,3 +1,5 @@ +import re + import pytest from sklearn.base import BaseEstimator @@ -19,7 +21,7 @@ def _more_tags(self): @pytest.mark.parametrize( "estimator, err_msg", [ - (BaseEstimator(), "The key xxx is not defined in _get_tags"), + (BaseEstimator(), "The key xxx is not defined in __sklearn_tags__"), (NoTagsEstimator(), "The key xxx is not defined in _DEFAULT_TAGS"), ], ) @@ -29,6 +31,8 @@ def test_safe_tags_error(estimator, err_msg): _safe_tags(estimator, key="xxx") +# TODO(1.3) Remove FutureWarning when `_more_tags is not supported +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( "estimator, key, expected_results", [ @@ -45,3 +49,22 @@ def test_safe_tags_no_get_tags(estimator, key, expected_results): # check the behaviour of _safe_tags when an 
estimator does not implement # _get_tags assert _safe_tags(estimator, key=key) == expected_results + + +# TODO(1.3) Remove `_more_tags` and `_get_tags` support +def test_safe_tags_raises_warning(): + """Check safe_tags raises warnings for _more_tags and _get_tags.""" + + class Estimator: + def _more_tags(self): + return {} + + with pytest.warns(FutureWarning, match=re.escape("_more_tags() was deprecated")): + _safe_tags(Estimator()) + + class Estimator: + def _get_tags(self): + return {} + + with pytest.warns(FutureWarning, match=re.escape("_get_tags() was deprecated")): + _safe_tags(Estimator()) From b06fd10e1bf702fe03cf9049be7ca5db0c342e3f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 24 Feb 2022 16:19:19 -0500 Subject: [PATCH 02/48] DOC Adds whats new --- doc/whats_new/v1.1.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index b0d36364ec333..502cc7752e8af 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -112,6 +112,9 @@ Changelog error message when setting invalid hyper-parameters with `set_params`. :pr:`21542` by :user:`Olivier Grisel `. +- |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. + :pr:`22606` by `Thomas Fan`_. + :mod:`sklearn.calibration` .......................... From e68f7f06372d486635b0f6b81afbde46759f115d Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 2 Jan 2024 07:58:22 -0500 Subject: [PATCH 03/48] CI Fix assign/unassign CI --- .github/workflows/assign.yml | 2 ++ .github/workflows/unassign.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml index 5d725d76b0b1b..7f4d5fcafb54c 100644 --- a/.github/workflows/assign.yml +++ b/.github/workflows/assign.yml @@ -22,3 +22,5 @@ jobs: echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" gh issue edit ${{ github.event.issue.number }} --add-assignee ${{ github.event.comment.user.login }} gh issue edit ${{ github.event.issue.number }} --remove-label "help wanted" + env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/unassign.yml b/.github/workflows/unassign.yml index 9cb1616cc0c1e..8fab57afb415f 100644 --- a/.github/workflows/unassign.yml +++ b/.github/workflows/unassign.yml @@ -19,3 +19,5 @@ jobs: run: | echo "Marking issue ${{ github.event.issue.number }} as help wanted" gh issue edit ${{ github.event.issue.number }} --add-label "help wanted" + env: + GH_TOKEN: ${{ github.token }} From a5e9560665026d4170d8e643449bcad471bed9ee Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 2 Jan 2024 08:00:27 -0500 Subject: [PATCH 04/48] CI Fix assign/unassign CI --- .github/workflows/assign.yml | 5 +++-- .github/workflows/unassign.yml | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml index 7f4d5fcafb54c..fa3b6f95a5e95 100644 --- a/.github/workflows/assign.yml +++ b/.github/workflows/assign.yml @@ -20,7 +20,8 @@ jobs: steps: - run: | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" - gh issue edit ${{ github.event.issue.number }} --add-assignee ${{ github.event.comment.user.login }} - gh issue edit ${{ github.event.issue.number }} --remove-label "help wanted" + gh issue edit $ISSUE --add-assignee ${{ github.event.comment.user.login }} + gh issue edit $ISSUE --remove-label "help wanted" env: GH_TOKEN: ${{ github.token }} + ISSUE: ${{ github.event.issue.html_url }} diff --git a/.github/workflows/unassign.yml b/.github/workflows/unassign.yml index 8fab57afb415f..94a50d49839d6 100644 --- a/.github/workflows/unassign.yml +++ b/.github/workflows/unassign.yml @@ -18,6 +18,7 @@ jobs: if: github.event.issue.state == 'open' run: | echo "Marking issue ${{ github.event.issue.number }} as help wanted" - gh issue edit ${{ github.event.issue.number }} --add-label "help wanted" + gh issue edit $ISSUE --add-label "help wanted" env: GH_TOKEN: ${{ github.token }} + ISSUE: ${{ github.event.issue.html_url }} From 3ae850648dea4ad5fa504032d8430c28fc000a2f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 29 Apr 2024 18:46:14 -0400 Subject: [PATCH 05/48] Update new estimators with __sklearn_tags__ --- doc/sphinxext/allow_nan_estimators.py | 2 +- sklearn/cluster/_birch.py | 5 +++-- sklearn/cluster/_bisect_k_means.py | 5 +++-- sklearn/cluster/_dbscan.py | 5 +++-- sklearn/cluster/_hdbscan/hdbscan.py | 5 +++-- sklearn/decomposition/_fastica.py | 5 +++-- sklearn/discriminant_analysis.py | 5 +++-- sklearn/ensemble/_bagging.py | 10 ++++++---- sklearn/ensemble/_base.py | 5 +++-- sklearn/ensemble/_forest.py | 8 ++++---- sklearn/ensemble/tests/test_bagging.py | 2 +- sklearn/feature_selection/_rfe.py | 4 ++-- sklearn/feature_selection/_univariate_selection.py | 10 ++++++---- sklearn/kernel_approximation.py | 10 ++++++---- sklearn/manifold/_isomap.py | 5 +++-- sklearn/manifold/_t_sne.py | 5 +++-- sklearn/multioutput.py | 2 +- sklearn/neighbors/_lof.py | 7 +++---- sklearn/preprocessing/_data.py | 2 +- sklearn/preprocessing/_polynomial.py | 5 +++-- sklearn/preprocessing/_target_encoder.py | 5 +++-- sklearn/tests/test_docstring_parameters.py | 2 +- sklearn/tree/_classes.py | 7 ++++--- sklearn/utils/estimator_checks.py | 4 ++-- sklearn/utils/tests/test_estimator_checks.py | 5 +++-- 25 files changed, 74 insertions(+), 56 deletions(-) diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py index 89d7077bce2b5..8c70908950605 100755 --- a/doc/sphinxext/allow_nan_estimators.py +++ b/doc/sphinxext/allow_nan_estimators.py @@ -21,7 +21,7 @@ def make_paragraph_for_estimator_type(estimator_type): with suppress(SkipTest): est = _construct_instance(est_class) - if est._get_tags().get("allow_nan"): + if est.__sklearn_tags__().get("allow_nan"): module_name = ".".join(est_class.__module__.split(".")[:2]) class_title = f"{est_class.__name__}" class_url = f"./generated/{module_name}.{class_title}.html" diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index d62fb880ba8b2..a022eb526171f 100644 --- a/sklearn/cluster/_birch.py +++ 
b/sklearn/cluster/_birch.py @@ -737,5 +737,6 @@ def _global_clustering(self, X=None): if compute_labels: self.labels_ = self._predict(X) - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index a1f7716ced822..538908b88d5e7 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -525,5 +525,6 @@ def _predict_recursive(self, X, sample_weight, cluster_node): return labels - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 98f524752a39a..e8ef3af572bf5 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -462,5 +462,6 @@ def fit_predict(self, X, y=None, sample_weight=None): self.fit(X, sample_weight=sample_weight) return self.labels_ - def _more_tags(self): - return {"pairwise": self.metric == "precomputed"} + def __sklearn_tags__(self): + more_tags = {"pairwise": self.metric == "precomputed"} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index fc51f10cffba0..9160187425407 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -1014,5 +1014,6 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] return labels - def _more_tags(self): - return {"allow_nan": self.metric != "precomputed"} + def __sklearn_tags__(self): + more_tags = {"allow_nan": self.metric != "precomputed"} + return {**super().__sklearn_tags__(), 
**more_tags} diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 4b5b6c3f86a63..2a17550165eed 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -778,5 +778,6 @@ def _n_features_out(self): """Number of transformed output features.""" return self.components_.shape[0] - def _more_tags(self): - return {"preserves_dtype": [np.float32, np.float64]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 29146ca857694..c0a6f0aad1fcf 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -751,8 +751,9 @@ def decision_function(self, X): # Only override for the doc return super().decision_function(X) - def _more_tags(self): - return {"array_api_support": True} + def __sklearn_tags__(self): + more_tags = {"array_api_support": True} + return {**super().__sklearn_tags__(), **more_tags} class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index da340ceec6fe4..e14c78cfc9c10 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -961,13 +961,14 @@ def decision_function(self, X): return decisions - def _more_tags(self): + def __sklearn_tags__(self): if self.estimator is None: estimator = DecisionTreeClassifier() else: estimator = self.estimator - return {"allow_nan": _safe_tags(estimator, "allow_nan")} + more_tags = {"allow_nan": _safe_tags(estimator, "allow_nan")} + return {**super().__sklearn_tags__(), **more_tags} class BaggingRegressor(_RoutingNotSupportedMixin, RegressorMixin, BaseBagging): @@ -1234,9 +1235,10 @@ def _set_oob_score(self, X, y): self.oob_prediction_ = predictions self.oob_score_ = r2_score(y, predictions) - def _more_tags(self): + def __sklearn_tags__(self): if 
self.estimator is None: estimator = DecisionTreeRegressor() else: estimator = self.estimator - return {"allow_nan": _safe_tags(estimator, "allow_nan")} + more_tags = {"allow_nan": _safe_tags(estimator, "allow_nan")} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 1fa05d90975cd..3a78248ed3e7a 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -287,7 +287,7 @@ def get_params(self, deep=True): """ return super()._get_params("estimators", deep=deep) - def _more_tags(self): + def __sklearn_tags__(self): try: allow_nan = all( _safe_tags(est[1])["allow_nan"] if est[1] != "drop" else True @@ -298,4 +298,5 @@ def _more_tags(self): # fail. In this case, we assume that `allow_nan` is False but the parameter # validation will raise an error during `fit`. allow_nan = False - return {"preserves_dtype": [], "allow_nan": allow_nan} + more_tags = {"preserves_dtype": [], "allow_nan": allow_nan} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c5337762b8936..b12b793ccdff2 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -714,11 +714,12 @@ def estimators_samples_(self): """ return [sample_indices for sample_indices in self._get_estimators_indices()] - def _more_tags(self): + def __sklearn_tags__(self): # Only the criterion is required to determine if the tree supports # missing values estimator = type(self.estimator)(criterion=self.criterion) - return {"allow_nan": _safe_tags(estimator, key="allow_nan")} + more_tags = {"allow_nan": _safe_tags(estimator, key="allow_nan")} + return {**super().__sklearn_tags__(), **more_tags} def _accumulate_prediction(predict, X, out, lock): @@ -855,8 +856,7 @@ def _validate_y_class_weight(self, y): raise ValueError( "Valid presets for class_weight include " '"balanced" and "balanced_subsample".' - 'Given "%s".' 
- % self.class_weight + 'Given "%s".' % self.class_weight ) if self.warm_start: warn( diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 2c1e308cee33b..5ced8e3000e9a 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -935,4 +935,4 @@ def fit(self, X, y): ) def test_bagging_allow_nan_tag(bagging, expected_allow_nan): """Check that bagging inherits allow_nan tag.""" - assert bagging._get_tags()["allow_nan"] == expected_allow_nan + assert bagging.__sklearn_tags__()["allow_nan"] == expected_allow_nan diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 7e15254e49745..658e5e60497b6 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -456,8 +456,8 @@ def __sklearn_tags__(self): "allow_nan": True, } # Adjust allow_nan if estimator explicitly defines `allow_nan`. - if ( - hasattr(self.estimator, "__sklearn_tags__") or hasattr(self.estimator, "_get_tags"): + if hasattr(self.estimator, "__sklearn_tags__") or hasattr( + self.estimator, "_get_tags" ): more_tags["allow_nan"] = _safe_tags(self.estimator, "allow_nan") diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 6c23f99bbe888..2f35d83756bd0 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -628,8 +628,9 @@ def _get_support_mask(self): mask[kept_ties] = True return mask - def _more_tags(self): - return {"requires_y": False} + def __sklearn_tags__(self): + more_tags = {"requires_y": False} + return {**super().__sklearn_tags__(), **more_tags} class SelectKBest(_BaseFilter): @@ -737,8 +738,9 @@ def _get_support_mask(self): mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1 return mask - def _more_tags(self): - return {"requires_y": False} + def __sklearn_tags__(self): + more_tags = {"requires_y": False} + 
return {**super().__sklearn_tags__(), **more_tags} class SelectFpr(_BaseFilter): diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 23a84bc7f5cf0..f2825c33279fa 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -411,8 +411,9 @@ def transform(self, X): projection *= (2.0 / self.n_components) ** 0.5 return projection - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} class SkewedChi2Sampler( @@ -569,8 +570,9 @@ def transform(self, X): projection *= np.sqrt(2.0) / np.sqrt(self.n_components) return projection - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index c6e8bfdc42685..b5bd45195a660 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -434,5 +434,6 @@ def transform(self, X): return self.kernel_pca_.transform(G_X) - def _more_tags(self): - return {"preserves_dtype": [np.float64, np.float32]} + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 6a90b1c43bbba..d2367a5fd5544 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -1154,5 +1154,6 @@ def _n_features_out(self): """Number of transformed output features.""" return self.embedding_.shape[1] - def _more_tags(self): - return {"pairwise": self.metric == "precomputed"} + def __sklearn_tags__(self): + more_tags = {"pairwise": self.metric == "precomputed"} + return 
{**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 78787da5ad8f9..877ad03578d1e 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -1026,7 +1026,7 @@ def get_metadata_routing(self): ) return router - def _more_tags(self): + def __sklearn_tags__(self): more_tags = {"_skip_test": True, "multioutput_only": True} return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 05dfdb13a1cbe..deb4b474c80d8 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -510,7 +510,6 @@ def _local_reachability_density(self, distances_X, neighbors_indices): # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10) - def _more_tags(self): - return { - "preserves_dtype": [np.float64, np.float32], - } + def __sklearn_tags__(self): + more_tags = {"preserves_dtype": [np.float64, np.float32]} + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 36ea274821a19..5b6ab31a2d250 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2046,7 +2046,7 @@ def transform(self, X, copy=None): def __sklearn_tags__(self): more_tags = {"stateless": True, "array_api_support": True} return {**super().__sklearn_tags__(), **more_tags} -======= + @validate_params( { diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 2512f411a5a9c..4ef287f984e63 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -1161,8 +1161,8 @@ def transform(self, X): indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0] return XBS[:, indices] - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "_xfail_checks": { "check_estimators_pickle": ( "Current Scipy implementation of _bsplines 
does not" @@ -1170,3 +1170,4 @@ def _more_tags(self): ), } } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index b3b7c3d5e7bd9..e769d615e3f12 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -525,7 +525,8 @@ def get_feature_names_out(self, input_features=None): else: return feature_names - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "requires_y": True, } + return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 16027d266cd74..bc2f8f4858c0c 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -279,7 +279,7 @@ def test_fit_docstring_attributes(name, Estimator): est.fit(y) elif "2dlabels" in est.__sklearn_tags__()["X_types"]: est.fit(np.c_[y, y]) - elif "3darray" in est._get_tags()["X_types"]: + elif "3darray" in est.__sklearn_tags__()["X_types"]: est.fit(X[np.newaxis, ...], y) else: est.fit(X, y) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7d50c872fa8fe..811ad0c0df79a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -184,7 +184,7 @@ def get_n_leaves(self): def _support_missing_values(self, X): return ( not issparse(X) - and self._get_tags()["allow_nan"] + and self.__sklearn_tags__()["allow_nan"] and self.monotonic_cst is None ) @@ -1410,7 +1410,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): ) return averaged_predictions - def _more_tags(self): + def __sklearn_tags__(self): # XXX: nan is only support for dense arrays, but we set this for common test to # pass, specifically: check_estimators_nan_inf allow_nan = self.splitter == "best" and self.criterion in { @@ -1418,7 +1418,8 @@ def _more_tags(self): "friedman_mse", "poisson", } - return 
{"allow_nan": allow_nan} + more_tags = {"allow_nan": allow_nan} + return {**super().__sklearn_tags__(), **more_tags} class ExtraTreeClassifier(DecisionTreeClassifier): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index a7d9aaeb3d938..2f738371d792e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4431,7 +4431,7 @@ def check_param_validation(name, estimator_orig): def check_set_output_transform(name, transformer_orig): # Check transformer.set_output with the default configuration does not # change the transform output. - tags = transformer_orig._get_tags() + tags = _safe_tags(transformer_orig) if "2darray" not in tags["X_types"] or tags["no_validation"]: return @@ -4619,7 +4619,7 @@ def _check_set_output_transform_dataframe( or a global context by using the `with config_context(...)` """ # Check transformer.set_output configures the output of transform="pandas". - tags = transformer_orig._get_tags() + tags = _safe_tags(transformer_orig) if "2darray" not in tags["X_types"] or tags["no_validation"]: return diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 60c4b4716646b..6c8321e7dd838 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -457,8 +457,9 @@ def fit(self, X, y): raise ValueError("negative X values not supported!") return super().fit(X, y) - def _more_tags(self): - return {"requires_positive_X": True} + def __sklearn_tags__(self): + more_tags = {"requires_positive_X": True} + return {**super().__sklearn_tags__(), **more_tags} class RequiresPositiveYRegressor(LinearRegression): From dc59eef936b821a91778bd29d320aadcc713ef24 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 29 Apr 2024 18:56:13 -0400 Subject: [PATCH 06/48] Update to fix failing tests --- sklearn/ensemble/_bagging.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 0bbc4d126646f..8f1707200c540 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -638,8 +638,9 @@ def get_metadata_routing(self): def _get_estimator(self): """Resolve which estimator to return.""" - def _more_tags(self): - return {"allow_nan": _safe_tags(self._get_estimator(), "allow_nan")} + def __sklearn_tags__(self): + more_tags = {"allow_nan": _safe_tags(self._get_estimator(), "allow_nan")} + return {**super().__sklearn_tags__(), **more_tags} class BaggingClassifier(ClassifierMixin, BaseBagging): From b4deda2021f7221941a524f18f061e069fa30fd8 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 29 Apr 2024 18:58:43 -0400 Subject: [PATCH 07/48] STY Lint --- sklearn/utils/_tags.py | 3 ++- sklearn/utils/estimator_checks.py | 11 +++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 5d626f29bf53b..e64139ba467f3 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -1,5 +1,6 @@ -import warnings import inspect +import warnings + import numpy as np _DEFAULT_TAGS = { diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9f430b9de25de..995253de520de 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3475,12 +3475,11 @@ def param_filter(p): init_params = init_params[len(getattr(estimator, "_required_parameters", [])) :] for init_param in init_params: - assert init_param.default != init_param.empty, ( - "parameter %s for %s has no default value" - % ( - init_param.name, - type(estimator).__name__, - ) + assert ( + init_param.default != init_param.empty + ), "parameter %s for %s has no default value" % ( + init_param.name, 
+ type(estimator).__name__, ) allowed_types = { str, From 87c113bc627ede3f5ce0f9c18da01cd66b1c087a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 29 Apr 2024 21:35:38 -0400 Subject: [PATCH 08/48] Fixes failing tests --- doc/whats_new/v1.1.rst | 2 -- doc/whats_new/v1.5.rst | 5 ++++- sklearn/linear_model/_ridge.py | 5 +++-- sklearn/utils/_tags.py | 6 +++--- sklearn/utils/tests/test_estimator_checks.py | 2 +- sklearn/utils/tests/test_tags.py | 4 ++-- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 15608babdf390..255bc8d7274a5 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -346,8 +346,6 @@ Changelog error message when setting invalid hyper-parameters with `set_params`. :pr:`21542` by :user:`Olivier Grisel `. -- |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. - :pr:`22606` by `Thomas Fan`_. - |Enhancement| Removes random unique identifiers in the HTML representation. With this change, jupyter notebooks are reproducible as long as the cells are run in the same order. :pr:`23098` by `Thomas Fan`_. diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 9f53afd433ffc..8a6c5314343e0 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -67,6 +67,9 @@ Changes impacting many modules :class:`pipeline.Pipeline` and :class:`preprocessing.KBinsDiscretizer`. :pr:`28756` by :user:`Will Dean `. +- |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. + :pr:`22606` by `Thomas Fan`_. + Support for Array API --------------------- @@ -169,7 +172,7 @@ Changelog .......................... - |Fix| Fixed a regression in :class:`calibration.CalibratedClassifierCV` where - an error was wrongly raised with string targets. + an error was wrongly raised with string targets. :pr:`28843` by :user:`Jérémie du Boisberranger `. 
:mod:`sklearn.cluster` diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 071ba0bc91777..2bb998abc774c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1248,8 +1248,9 @@ def fit(self, X, y, sample_weight=None): ) return super().fit(X, y, sample_weight=sample_weight) - def _more_tags(self): - return {"array_api_support": True} + def __sklearn_tags__(self): + more_tags = {"array_api_support": True} + return {**super().__sklearn_tags__(), **more_tags} class _RidgeClassifierMixin(LinearClassifierMixin): diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index e64139ba467f3..1610db2cd4fd5 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -26,7 +26,7 @@ } -# TODO(1.3) Remove `_more_tags` support +# TODO(1.7) Remove `_more_tags` support def _walk_mro_more_tags(estimator): collected_tags = {} for base_class in reversed(inspect.getmro(estimator.__class__)): @@ -68,7 +68,7 @@ def _safe_tags(estimator, key=None): tags_provider = "__sklearn_tags__()" tags = estimator.__sklearn_tags__() elif hasattr(estimator, "_get_tags"): - # TODO(1.3) Remove `_get_tags` support + # TODO(1.7) Remove `_get_tags` support warnings.warn( "_get_tags() was deprecated in 1.1 support will be removed in 1.3. " "Please use __sklearn_tags__ instead.", @@ -77,7 +77,7 @@ def _safe_tags(estimator, key=None): tags_provider = "_get_tags()" tags = estimator._get_tags() elif hasattr(estimator, "_more_tags"): - # TODO(1.3) Remove `_more_tags` support + # TODO(1.7) Remove `_more_tags` support warnings.warn( "_more_tags() was deprecated in 1.1 support will be removed in 1.3. 
" "Please use __sklearn_tags__ instead.", diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 7f1ec84870f98..cdd7ac2de0819 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -1231,7 +1231,7 @@ def __sklearn_tags__(self): assert check_methods_subset_invariance not in all_tests -# TODO(1.3) Remove `_more_tags` and `_get_tags` support +# TODO(1.7) Remove `_more_tags` and `_get_tags` support def test_check_estimator_tags_deprecated(): """Check deprecation warnings are raised.""" diff --git a/sklearn/utils/tests/test_tags.py b/sklearn/utils/tests/test_tags.py index f92dbdad740db..cbf69f293580f 100644 --- a/sklearn/utils/tests/test_tags.py +++ b/sklearn/utils/tests/test_tags.py @@ -31,7 +31,7 @@ def test_safe_tags_error(estimator, err_msg): _safe_tags(estimator, key="xxx") -# TODO(1.3) Remove FutureWarning when `_more_tags is not supported +# TODO(1.7) Remove FutureWarning when `_more_tags is not supported @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( "estimator, key, expected_results", @@ -51,7 +51,7 @@ def test_safe_tags_no_get_tags(estimator, key, expected_results): assert _safe_tags(estimator, key=key) == expected_results -# TODO(1.3) Remove `_more_tags` and `_get_tags` support +# TODO(1.7) Remove `_more_tags` and `_get_tags` support def test_safe_tags_raises_warning(): """Check safe_tags raises warnings for _more_tags and _get_tags.""" From 797dddf6a9f3dcc7563251eff27263d14bcb900c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 13 Aug 2024 10:01:44 +0200 Subject: [PATCH 09/48] Add dataclasses --- sklearn/utils/_tags.py | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index db8473721d2b6..fae50b1c9007e 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -1,6 +1,8 @@ # Authors: The scikit-learn developers 
# SPDX-License-Identifier: BSD-3-Clause +from dataclasses import dataclass, field + import numpy as np _DEFAULT_TAGS = { @@ -26,6 +28,59 @@ } +@dataclass +class InputTags: + two_d_array: bool = True + sparse: bool = False + categorical: bool = False + string: bool = False + positive_only: bool = False + nan_allowed: bool = False + pairwise: bool = False + + +@dataclass +class TargetTags: + required: bool + positive_only: bool = False + multi_output: bool = False + single_output: bool = True + + +@dataclass +class TransformerTags: + preserve_dtype: list[str] = field(default_factory=lambda: ["float64"]) + + +@dataclass +class ClassifierTags: + poor_score: bool = False + binary: bool = True + multiclass: bool = True + multilabel: bool = False + + +@dataclass +class RegressorTags: + poor_score: bool = False + + +@dataclass +class Tags: + target_flags: TargetTags + transformer_flags: TransformerTags + classifier_flags: ClassifierTags + regressor_flags: RegressorTags + array_api_support: bool = False + no_validation: bool = False + stateless: bool = False + non_deterministic: bool = False + requires_fit: bool = True + _skip_test: bool = False + _xfail_checks: dict[str, str] = field(default_factory=dict) + input_flags: InputTags = field(default_factory=InputTags) + + def _safe_tags(estimator, key=None): """Safely get estimator tags. 
From c596d081f86b1dcb8c59a17a7ce362f40bd9d84f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 13 Aug 2024 10:17:42 +0200 Subject: [PATCH 10/48] move to 1.6 --- doc/whats_new/v1.5.rst | 3 --- doc/whats_new/v1.6.rst | 7 +++++++ sklearn/utils/_tags.py | 10 +++++----- sklearn/utils/estimator_checks.py | 4 ++-- sklearn/utils/tests/test_estimator_checks.py | 6 +++--- sklearn/utils/tests/test_tags.py | 4 ++-- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 18fc0a726003c..b5542a0d1cf5f 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -165,9 +165,6 @@ Changes impacting many modules :class:`pipeline.Pipeline` and :class:`preprocessing.KBinsDiscretizer`. :pr:`28756` by :user:`Will Dean `. -- |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. - :pr:`22606` by `Thomas Fan`_. - Support for Array API --------------------- diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 74357c9171f10..dc0d6c65dad93 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -22,6 +22,13 @@ Version 1.6.0 **In Development** +Changes impacting many modules +------------------------------ + +- |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. + :pr:`22606` by `Thomas Fan`_. 
+ + Support for Array API --------------------- diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index f3a5a00cb6b6c..a24b9fcdd92ef 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -29,7 +29,7 @@ } -# TODO(1.7) Remove `_more_tags` support +# TODO(1.8) Remove `_more_tags` support def _walk_mro_more_tags(estimator): collected_tags = {} for base_class in reversed(inspect.getmro(estimator.__class__)): @@ -71,18 +71,18 @@ def _safe_tags(estimator, key=None): tags_provider = "__sklearn_tags__()" tags = estimator.__sklearn_tags__() elif hasattr(estimator, "_get_tags"): - # TODO(1.7) Remove `_get_tags` support + # TODO(1.8) Remove `_get_tags` support warnings.warn( - "_get_tags() was deprecated in 1.1 support will be removed in 1.3. " + "_get_tags() was deprecated in 1.6 support will be removed in 1.8. " "Please use __sklearn_tags__ instead.", FutureWarning, ) tags_provider = "_get_tags()" tags = estimator._get_tags() elif hasattr(estimator, "_more_tags"): - # TODO(1.7) Remove `_more_tags` support + # TODO(1.8) Remove `_more_tags` support warnings.warn( - "_more_tags() was deprecated in 1.1 support will be removed in 1.3. " + "_more_tags() was deprecated in 1.6 support will be removed in 1.8. " "Please use __sklearn_tags__ instead.", FutureWarning, ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6427280743f37..0c59fff3875fd 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4053,11 +4053,11 @@ def check_estimator_get_tags_default_keys(name, estimator_orig): def check_estimator_tags_deprecated(name, estimator_orig): assert not hasattr(estimator_orig, "_more_tags"), ( - "_more_tags() was deprecated in 1.5 support will be removed in 1.7. " + "_more_tags() was deprecated in 1.6 support will be removed in 1.8. 
" "Please use __sklearn_tags__ instead.", ) assert not hasattr(estimator_orig, "_get_tags"), ( - "_get_tags() was deprecated in 1.5 support will be removed in 1.7. " + "_get_tags() was deprecated in 1.6 support will be removed in 1.8. " "Please use __sklearn_tags__ instead." ) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index cdd7ac2de0819..bd600efd90730 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -1231,13 +1231,13 @@ def __sklearn_tags__(self): assert check_methods_subset_invariance not in all_tests -# TODO(1.7) Remove `_more_tags` and `_get_tags` support +# TODO(1.8) Remove `_more_tags` and `_get_tags` support def test_check_estimator_tags_deprecated(): """Check deprecation warnings are raised.""" class Estimator: def _more_tags(self): - return {} + return {} # pragma: no cover err_msg = r"_more_tags\(\) was deprecated" with raises(AssertionError, match=err_msg): @@ -1245,7 +1245,7 @@ def _more_tags(self): class Estimator: def _get_tags(self): - return {} + return {} # pragma: no cover err_msg = r"_get_tags\(\) was deprecated" with raises(AssertionError, match=err_msg): diff --git a/sklearn/utils/tests/test_tags.py b/sklearn/utils/tests/test_tags.py index cbf69f293580f..8ca7447930246 100644 --- a/sklearn/utils/tests/test_tags.py +++ b/sklearn/utils/tests/test_tags.py @@ -31,7 +31,7 @@ def test_safe_tags_error(estimator, err_msg): _safe_tags(estimator, key="xxx") -# TODO(1.7) Remove FutureWarning when `_more_tags is not supported +# TODO(1.8) Remove FutureWarning when `_more_tags is not supported @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( "estimator, key, expected_results", @@ -51,7 +51,7 @@ def test_safe_tags_no_get_tags(estimator, key, expected_results): assert _safe_tags(estimator, key=key) == expected_results -# TODO(1.7) Remove `_more_tags` and `_get_tags` support +# TODO(1.8) Remove `_more_tags` 
and `_get_tags` support def test_safe_tags_raises_warning(): """Check safe_tags raises warnings for _more_tags and _get_tags.""" From 004d5a68505d3f6fa1a085050adc9edde8676dd4 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 14 Aug 2024 17:30:26 +0200 Subject: [PATCH 11/48] simplify bagging __sklearn_tags__ --- sklearn/ensemble/_bagging.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index c7627b067b380..2d8268a75f80d 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -1068,15 +1068,6 @@ def decision_function(self, X): return decisions - def __sklearn_tags__(self): - if self.estimator is None: - estimator = DecisionTreeClassifier() - else: - estimator = self.estimator - - more_tags = {"allow_nan": _safe_tags(estimator, "allow_nan")} - return {**super().__sklearn_tags__(), **more_tags} - class BaggingRegressor(RegressorMixin, BaseBagging): """A Bagging regressor. @@ -1338,11 +1329,6 @@ def _set_oob_score(self, X, y): self.oob_prediction_ = predictions self.oob_score_ = r2_score(y, predictions) - def __sklearn_tags__(self): - estimator = self._get_estimator() - more_tags = {"allow_nan": _safe_tags(estimator, "allow_nan")} - return {**super().__sklearn_tags__(), **more_tags} - def _get_estimator(self): """Resolve which estimator to return (default is DecisionTreeClassifier)""" if self.estimator is None: From 4edd1da667341fb1139bceb9a1ef118463c3ccec Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 14 Aug 2024 17:37:45 +0200 Subject: [PATCH 12/48] RFE allow nan handling --- sklearn/feature_selection/_rfe.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 9abca0b884191..4ddf02e0a987e 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -536,14 +536,8 @@ def __sklearn_tags__(self): more_tags = { "poor_score": True, 
"requires_y": True, - "allow_nan": True, + "allow_nan": _safe_tags(self.estimator, "allow_nan"), } - # Adjust allow_nan if estimator explicitly defines `allow_nan`. - if hasattr(self.estimator, "__sklearn_tags__") or hasattr( - self.estimator, "_get_tags" - ): - more_tags["allow_nan"] = _safe_tags(self.estimator, "allow_nan") - return {**super().__sklearn_tags__(), **more_tags} def get_metadata_routing(self): From 813a8c429e29d5ffc787645253fde45c16f6617f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 14 Aug 2024 19:56:10 +0200 Subject: [PATCH 13/48] test fixes --- sklearn/model_selection/_classification_threshold.py | 5 +++-- sklearn/neighbors/tests/test_neighbors.py | 6 +++--- sklearn/tree/_classes.py | 10 ++++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index bd30a98ac7cc9..88712e509d24d 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -210,8 +210,8 @@ def decision_function(self, X): check_is_fitted(self, "estimator_") return self.estimator_.decision_function(X) - def _more_tags(self): - return { + def __sklearn_tags__(self): + more_tags = { "binary_only": True, "_xfail_checks": { "check_classifiers_train": "Threshold at probability 0.5 does not hold", @@ -222,6 +222,7 @@ def _more_tags(self): ), }, } + return {**super().__sklearn_tags__(), **more_tags} class FixedThresholdClassifier(BaseThresholdClassifier): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 8e76be10b6be7..1c434ae8d59d4 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -250,10 +250,10 @@ def test_unsupervised_kneighbors( @pytest.mark.parametrize( "NeighborsMixinSubclass", [ - # neighbors.KNeighborsClassifier, + neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, 
- # neighbors.RadiusNeighborsClassifier, - # neighbors.RadiusNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, ], ) def test_neigh_predictions_algorithm_agnosticity( diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c37d4b7305606..7101a3781723f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1688,7 +1688,7 @@ def __init__( monotonic_cst=monotonic_cst, ) - def _more_tags(self): + def __sklearn_tags__(self): # XXX: nan is only supported for dense arrays, but we set this for the # common test to pass, specifically: check_estimators_nan_inf allow_nan = self.splitter == "random" and self.criterion in { @@ -1696,7 +1696,8 @@ def _more_tags(self): "log_loss", "entropy", } - return {"multilabel": True, "allow_nan": allow_nan} + more_tags = {"multilabel": True, "allow_nan": allow_nan} + return {**super().__sklearn_tags__(), **more_tags} class ExtraTreeRegressor(DecisionTreeRegressor): @@ -1942,7 +1943,7 @@ def __init__( monotonic_cst=monotonic_cst, ) - def _more_tags(self): + def __sklearn_tags__(self): # XXX: nan is only supported for dense arrays, but we set this for the # common test to pass, specifically: check_estimators_nan_inf allow_nan = self.splitter == "random" and self.criterion in { @@ -1950,4 +1951,5 @@ def _more_tags(self): "friedman_mse", "poisson", } - return {"allow_nan": allow_nan} + more_tags = {"allow_nan": allow_nan} + return {**super().__sklearn_tags__(), **more_tags} From d073f1ad1eaee0a5b141b83ac71adac407a2b7d4 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 15 Aug 2024 09:52:17 +0200 Subject: [PATCH 14/48] progress --- doc/sphinxext/allow_nan_estimators.py | 2 +- sklearn/base.py | 37 +++++----------- sklearn/kernel_approximation.py | 31 ++++++++------ sklearn/linear_model/_base.py | 4 -- sklearn/utils/_tags.py | 62 ++++++++++++++++----------- sklearn/utils/estimator_checks.py | 23 +++++----- 6 files changed, 80 insertions(+), 79 deletions(-) diff --git 
a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py index 8c70908950605..d36e296d4c5d8 100755 --- a/doc/sphinxext/allow_nan_estimators.py +++ b/doc/sphinxext/allow_nan_estimators.py @@ -21,7 +21,7 @@ def make_paragraph_for_estimator_type(estimator_type): with suppress(SkipTest): est = _construct_instance(est_class) - if est.__sklearn_tags__().get("allow_nan"): + if est.__sklearn_tags__().input_tags.allow_nan: module_name = ".".join(est_class.__module__.split(".")[:2]) class_title = f"{est_class.__name__}" class_url = f"./generated/{module_name}.{class_title}.html" diff --git a/sklearn/base.py b/sklearn/base.py index 6e629d3fa1b22..165176ffc6ac6 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -20,9 +20,7 @@ from .utils._metadata_requests import _MetadataRequester, _routing_enabled from .utils._param_validation import validate_parameter_constraints from .utils._set_output import _SetOutputMixin -from .utils._tags import ( - _DEFAULT_TAGS, -) +from .utils._tags import default_tags from .utils.fixes import _IS_32BIT from .utils.validation import ( _check_feature_names_in, @@ -386,7 +384,7 @@ def __setstate__(self, state): self.__dict__.update(state) def __sklearn_tags__(self): - return copy.deepcopy(_DEFAULT_TAGS) + return default_tags(self) def _check_n_features(self, X, reset): """Set the `n_features_in_` attribute, or check against it. @@ -596,7 +594,7 @@ def _validate_data( """ self._check_feature_names(X, reset=reset) - if y is None and self.__sklearn_tags__()["requires_y"]: + if y is None and self.__sklearn_tags__().target_tags.required: raise ValueError( f"This {self.__class__.__name__} estimator " "requires y to be passed, but the target y is None." 
@@ -752,11 +750,6 @@ def score(self, X, y, sample_weight=None): return accuracy_score(y, self.predict(X), sample_weight=sample_weight) - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.update(requires_y=True) - return tags - class RegressorMixin: """Mixin class for all regression estimators in scikit-learn. @@ -839,10 +832,6 @@ def score(self, X, y, sample_weight=None): y_pred = self.predict(X) return r2_score(y, y_pred, sample_weight=sample_weight) - def __sklearn_tags__(self): - more_tags = {"requires_y": True} - return {**super().__sklearn_tags__(), **more_tags} - class ClusterMixin: """Mixin class for all cluster estimators in scikit-learn. @@ -892,10 +881,6 @@ def fit_predict(self, X, y=None, **kwargs): self.fit(X, **kwargs) return self.labels_ - def __sklearn_tags__(self): - more_tags = {"preserves_dtype": []} - return {**super().__sklearn_tags__(), **more_tags} - class BiclusterMixin: """Mixin class for all bicluster estimators in scikit-learn. @@ -1338,20 +1323,20 @@ class MultiOutputMixin: """Mixin to mark estimators that support multioutput.""" def __sklearn_tags__(self): - more_tags = {"multioutput": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = True + return tags class _UnstableArchMixin: """Mark estimators that are non-determinstic on 32bit or PowerPC""" def __sklearn_tags__(self): - more_tags = { - "non_deterministic": ( - _IS_32BIT or platform.machine().startswith(("ppc", "powerpc")) - ) - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.non_deterministic = _IS_32BIT or platform.machine().startswith( + ("ppc", "powerpc") + ) + return tags def is_classifier(estimator): diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 10200a8a36471..34833aa244838 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -402,8 +402,9 @@ def 
transform(self, X): return projection def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class SkewedChi2Sampler( @@ -561,8 +562,9 @@ def transform(self, X): return projection def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): @@ -817,8 +819,10 @@ def _transform_sparse(X, sample_steps, sample_interval): return sp.hstack(X_new) def __sklearn_tags__(self): - more_tags = {"stateless": True, "requires_positive_X": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.stateless = True + tags.input_tags.positive_only = True + return tags class Nystroem(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): @@ -1085,12 +1089,11 @@ def _get_kernel_params(self): return params def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_transformer_preserve_dtypes": ( - "dtypes are preserved but not at a close enough precision" - ) - }, - "preserves_dtype": [np.float64, np.float32], + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_transformer_preserve_dtypes": ( + "dtypes are preserved but not at a close enough precision" + ) } - return {**super().__sklearn_tags__(), **more_tags} + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index c67284f3b75a2..2cb866bbcc32f 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -317,10 +317,6 @@ def _set_intercept(self, X_offset, y_offset, 
X_scale): else: self.intercept_ = 0.0 - def __sklearn_tags__(self): - more_tags = {"requires_y": True} - return {**super().__sklearn_tags__(), **more_tags} - # XXX Should this derive from LinearModel? It should be a mixin, not an ABC. # Maybe the n_features checking can be moved to LinearModel. diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index bdbd9843b54c7..ca18521e89962 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -35,7 +35,7 @@ class InputTags: categorical: bool = False string: bool = False positive_only: bool = False - nan_allowed: bool = False + allow_nan: bool = False pairwise: bool = False @@ -49,7 +49,7 @@ class TargetTags: @dataclass class TransformerTags: - preserve_dtype: list[str] = field(default_factory=lambda: ["float64"]) + preserve_dtype: list[object] = field(default_factory=lambda: [np.float64]) @dataclass @@ -67,10 +67,10 @@ class RegressorTags: @dataclass class Tags: - target_flags: TargetTags - transformer_flags: TransformerTags - classifier_flags: ClassifierTags - regressor_flags: RegressorTags + target_tags: TargetTags + transformer_tags: TransformerTags + classifier_tags: ClassifierTags + regressor_tags: RegressorTags array_api_support: bool = False no_validation: bool = False stateless: bool = False @@ -78,7 +78,33 @@ class Tags: requires_fit: bool = True _skip_test: bool = False _xfail_checks: dict[str, str] = field(default_factory=dict) - input_flags: InputTags = field(default_factory=InputTags) + input_tags: InputTags = field(default_factory=InputTags) + + +def default_tags(estimator): + """Get the default tags for an estimator. + + Parameters + ---------- + estimator : estimator object + The estimator for which to get the default tags. + + Returns + ------- + tags : Tags + The default tags for the estimator. 
+ """ + from ..base import is_classifier, is_regressor + + target_required = ( + True if is_classifier(estimator) or is_regressor(estimator) else False + ) + return Tags( + target_tags=TargetTags(required=target_required), + transformer_tags=TransformerTags() if hasattr(estimator, "transform") else None, + classifier_tags=ClassifierTags() if is_classifier(estimator) else None, + regressor_tags=RegressorTags() if is_regressor(estimator) else None, + ) def _safe_tags(estimator, key=None): @@ -90,7 +116,7 @@ def _safe_tags(estimator, key=None): For scikit-learn built-in estimators, we should still rely on `self.__sklearn_tags__()`. `_safe_tags(est)` should be used when we are not sure - where `est` comes from: typically `_safe_tags(self.base_estimator)` where + where `est` comes from: typically `_safe_tags(self.estimator)` where `self` is a meta-estimator, or in the common checks. Parameters @@ -98,26 +124,14 @@ def _safe_tags(estimator, key=None): estimator : estimator object The estimator from which to get the tag. - key : str, default=None - Tag name to get. By default (`None`), all tags are returned. - Returns ------- - tags : dict or tag value - The estimator tags. A single value is returned if `key` is not None. + tags : Tags + The estimator tags. """ if hasattr(estimator, "__sklearn_tags__"): - tags_provider = "__sklearn_tags__()" tags = estimator.__sklearn_tags__() else: - tags_provider = "_DEFAULT_TAGS" - tags = _DEFAULT_TAGS - - if key is not None: - if key not in tags: - raise ValueError( - f"The key {key} is not defined in {tags_provider} for the " - f"class {estimator.__class__.__name__}." 
- ) - return tags[key] + tags = default_tags(estimator) + return tags diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 0c59fff3875fd..75bebfde4bf0c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -327,15 +327,15 @@ def _yield_array_api_checks(estimator): def _yield_all_checks(estimator): name = estimator.__class__.__name__ tags = _safe_tags(estimator) - if "2darray" not in tags["X_types"]: + if not tags.input_tags.two_d_array: warnings.warn( "Can't test estimator {} which requires input of type {}".format( - name, tags["X_types"] + name, tags.input_tags ), SkipTestWarning, ) return - if tags["_skip_test"]: + if tags._skip_test: warnings.warn( "Explicit SKIP via _skip_test tag for estimator {}.".format(name), SkipTestWarning, @@ -360,7 +360,7 @@ def _yield_all_checks(estimator): for check in _yield_outliers_checks(estimator): yield check yield check_parameters_default_constructible - if not tags["non_deterministic"]: + if not tags.non_deterministic: yield check_methods_sample_order_invariance yield check_methods_subset_invariance yield check_fit2d_1sample @@ -371,13 +371,13 @@ def _yield_all_checks(estimator): yield check_dont_overwrite_parameters yield check_fit_idempotent yield check_fit_check_is_fitted - if not tags["no_validation"]: + if not tags.no_validation: yield check_n_features_in yield check_fit1d yield check_fit2d_predict1d - if tags["requires_y"]: + if tags.target_tags.required: yield check_requires_y_none - if tags["requires_positive_X"]: + if tags.input_tags.positive_only: yield check_fit_non_negative @@ -1064,13 +1064,13 @@ def _check_estimator_sparse_container(name, estimator_orig, sparse_type): estimator.fit(X, y) if hasattr(estimator, "predict"): pred = estimator.predict(X) - if tags["multioutput_only"]: + if tags.target_tags.multi_output and not tags.target_tags.single_output: assert pred.shape == (X.shape[0], 1) else: assert pred.shape == (X.shape[0],) if
hasattr(estimator, "predict_proba"): probs = estimator.predict_proba(X) - if tags["binary_only"]: + if tags.classifier_tags.binary and not tags.classifier_tags.multiclass: expected_probs_shape = (X.shape[0], 2) else: expected_probs_shape = (X.shape[0], 4) @@ -1113,7 +1113,10 @@ def check_sample_weights_pandas_series(name, estimator_orig): X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X), copy=False) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) - if _safe_tags(estimator, key="multioutput_only"): + if ( + not _safe_tags(estimator).target_tags.single_output + and _safe_tags(estimator).target_tags.multi_output + ): y = pd.DataFrame(y, copy=False) try: estimator.fit(X, y, sample_weight=weights) From bdac769c623207c3b508100a0ed39e44b7450544 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 15 Aug 2024 10:02:28 +0200 Subject: [PATCH 15/48] remove old tags --- sklearn/utils/_tags.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index ca18521e89962..6e90577a02ca8 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -5,28 +5,6 @@ import numpy as np -_DEFAULT_TAGS = { - "array_api_support": False, - "non_deterministic": False, - "requires_positive_X": False, - "requires_positive_y": False, - "X_types": ["2darray"], - "poor_score": False, - "no_validation": False, - "multioutput": False, - "allow_nan": False, - "stateless": False, - "multilabel": False, - "_skip_test": False, - "_xfail_checks": False, - "multioutput_only": False, - "binary_only": False, - "requires_fit": True, - "preserves_dtype": [np.float64], - "requires_y": False, - "pairwise": False, -} - @dataclass class InputTags: From dcf6051a02a99209d4bdcfe538a5d41dca717371 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 15 Aug 2024 11:35:12 +0200 Subject: [PATCH 16/48] more fixes --- sklearn/calibration.py | 17 ++++--- sklearn/cluster/_affinity_propagation.py | 5 +- 
sklearn/cluster/_birch.py | 5 +- sklearn/cluster/_bisect_k_means.py | 5 +- sklearn/cluster/_dbscan.py | 5 +- .../_hdbscan/tests/test_reachibility.py | 2 +- sklearn/cluster/_kmeans.py | 13 +++-- sklearn/cross_decomposition/_pls.py | 11 ++++- sklearn/ensemble/_bagging.py | 7 ++- sklearn/feature_extraction/text.py | 33 ++++++++----- sklearn/kernel_approximation.py | 2 +- .../manifold/tests/test_spectral_embedding.py | 2 +- sklearn/multioutput.py | 25 ++++++---- sklearn/naive_bayes.py | 31 ++++++++---- sklearn/neural_network/_rbm.py | 21 ++++---- sklearn/preprocessing/_data.py | 48 ++++++++++++------- sklearn/tests/test_common.py | 31 ++++++++---- sklearn/tree/_classes.py | 24 ++++++---- sklearn/utils/_tags.py | 8 ++-- sklearn/utils/estimator_checks.py | 45 ++++++++--------- 20 files changed, 203 insertions(+), 137 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 69d4c3359ffea..4a47eeb8924ef 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -536,16 +536,15 @@ def get_metadata_routing(self): return router def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "Due to the cross-validation and sample ordering, removing a sample" - " is not strictly equal to putting is weight to zero. Specific unit" - " tests are added for CalibratedClassifierCV specifically." - ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "Due to the cross-validation and sample ordering, removing a sample" + " is not strictly equal to putting is weight to zero. Specific unit" + " tests are added for CalibratedClassifierCV specifically." 
+ ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags def _fit_classifier_calibrator_pair( diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 997982622fb8f..654f80061507c 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -479,8 +479,9 @@ def __init__( self.random_state = random_state def __sklearn_tags__(self): - more_tags = {"pairwise": self.affinity == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.affinity == "precomputed" + return tags @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index c0cfc195d6254..54e1bb64b64f6 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -736,5 +736,6 @@ def _global_clustering(self, X=None): self.labels_ = self._predict(X) def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index bcbdf29b23657..23945bc77552b 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -528,5 +528,6 @@ def _predict_recursive(self, X, sample_weight, cluster_node): return labels def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index fc56cf70be4ce..d76cd47b2e7a0 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py 
@@ -472,5 +472,6 @@ def fit_predict(self, X, y=None, sample_weight=None): return self.labels_ def __sklearn_tags__(self): - more_tags = {"pairwise": self.metric == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.metric == "precomputed" + return tags diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py index 53096dd7cbec7..a336e6be6116d 100644 --- a/sklearn/cluster/_hdbscan/tests/test_reachibility.py +++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -50,7 +50,7 @@ def test_mutual_reachability_graph_equivalence_dense_sparse(): @pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_mutual_reachability_graph_preserve_dtype(array_type, dtype): +def test_mutual_reachability_graph_preserves_dtype(array_type, dtype): """Check that the computation preserve dtype thanks to fused types.""" rng = np.random.RandomState(0) X = rng.randn(10, 10) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ae69367fca4fa..56b4b18fe4a9c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1176,14 +1176,13 @@ def score(self, X, y=None, sample_weight=None): return -scores def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class KMeans(_BaseKMeans): diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 1b0b1f80d08c4..0d95a9e2f8343 100644 --- a/sklearn/cross_decomposition/_pls.py +++ 
b/sklearn/cross_decomposition/_pls.py @@ -19,6 +19,8 @@ RegressorMixin, TransformerMixin, _fit_context, + is_classifier, + is_regressor, ) from ..exceptions import ConvergenceWarning from ..utils import check_array, check_consistent_length @@ -552,8 +554,13 @@ def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) def __sklearn_tags__(self): - more_tags = {"poor_score": True, "requires_y": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + if is_classifier(self): + tags.classifier_tags.poor_score = True + if is_regressor(self): + tags.regressor_tags.poor_score = True + tags.target_tags.required = False + return tags class PLSRegression(_PLS): diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 2d8268a75f80d..af76cc38a7574 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -639,8 +639,11 @@ def _get_estimator(self): """Resolve which estimator to return.""" def __sklearn_tags__(self): - more_tags = {"allow_nan": _safe_tags(self._get_estimator(), "allow_nan")} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = _safe_tags( + self._get_estimator() + ).input_tags.allow_nan + return tags class BaggingClassifier(ClassifierMixin, BaseBagging): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 3465daf920763..feeb49496ed38 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -909,8 +909,10 @@ def _get_hasher(self): ) def __sklearn_tags__(self): - more_tags = {"X_types": ["string"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + return tags def _document_frequency(X): @@ -1467,8 +1469,10 @@ def get_feature_names_out(self, input_features=None): ) def __sklearn_tags__(self): - more_tags = {"X_types": 
["string"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + return tags def _make_int_array(): @@ -1706,13 +1710,13 @@ def transform(self, X, copy=True): return X def __sklearn_tags__(self): - more_tags = { - "X_types": ["2darray", "sparse"], - # FIXME: np.float16 could be preserved if _inplace_csr_row_normalize_l2 - # accepted it. - "preserves_dtype": [np.float64, np.float32], - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = True + tags.input_tags.sparse = True + # FIXME: np.float16 could be preserved if _inplace_csr_row_normalize_l2 + # accepted it. + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class TfidfVectorizer(CountVectorizer): @@ -2113,5 +2117,8 @@ def transform(self, raw_documents): return self._tfidf.transform(X, copy=False) def __sklearn_tags__(self): - more_tags = {"X_types": ["string"], "_skip_test": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + tags._skip_test = True + return tags diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 34833aa244838..f74fddef30991 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -1091,7 +1091,7 @@ def _get_kernel_params(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags._xfail_checks = { - "check_transformer_preserve_dtypes": ( + "check_transformer_preserves_dtypes": ( "dtypes are preserved but not at a close enough precision" ) } diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 6dec35123f9cc..922d3ae981119 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ 
b/sklearn/manifold/tests/test_spectral_embedding.py @@ -444,7 +444,7 @@ def test_spectral_embedding_preserves_dtype(eigen_solver, dtype): attribute and transformed data. Ideally, this test should be covered by the common test - `check_transformer_preserve_dtypes`. However, this test only run + `check_transformer_preserves_dtypes`. However, this test only run with transformers implementing `transform` while `SpectralEmbedding` implements only `fit_transform`. """ diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 06c13d734b666..39f7eb14997a2 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -309,8 +309,10 @@ def predict(self, X): return np.asarray(y).T def __sklearn_tags__(self): - more_tags = {"multioutput_only": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + tags.target_tags.multi_output = True + return tags def get_metadata_routing(self): """Get metadata routing of this object. 
@@ -611,9 +613,10 @@ def score(self, X, y): return np.mean(np.all(y == y_pred, axis=1)) def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # FIXME - more_tags = {"_skip_test": True} - return {**super().__sklearn_tags__(), **more_tags} + tags._skip_test = True + return tags def _available_if_base_estimator_has(attr): @@ -1097,8 +1100,12 @@ def get_metadata_routing(self): return router def __sklearn_tags__(self): - more_tags = {"_skip_test": True, "multioutput_only": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + # FIXME + tags._skip_test = True + tags.target_tags.single_output = False + tags.target_tags.multi_output = True + return tags class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): @@ -1247,6 +1254,8 @@ def get_metadata_routing(self): return router def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # FIXME - more_tags = {"multioutput_only": True} - return {**super().__sklearn_tags__(), **more_tags} + tags.target_tags.single_output = False + tags.target_tags.multi_output = True + return tags diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index db33260ef36ea..142669f140232 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -14,7 +14,13 @@ import numpy as np from scipy.special import logsumexp -from .base import BaseEstimator, ClassifierMixin, _fit_context +from .base import ( + BaseEstimator, + ClassifierMixin, + _fit_context, + is_classifier, + is_regressor, +) from .preprocessing import LabelBinarizer, binarize, label_binarize from .utils._param_validation import Interval from .utils.extmath import safe_sparse_dot @@ -761,8 +767,12 @@ def _init_counters(self, n_classes, n_features): self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64) def __sklearn_tags__(self): - more_tags = {"poor_score": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + if is_classifier(self):
+ tags.classifier_tags.poor_score = True + if is_regressor(self): + tags.regressor_tags.poor_score = True + return tags class MultinomialNB(_BaseDiscreteNB): @@ -869,8 +879,9 @@ def __init__( ) def __sklearn_tags__(self): - more_tags = {"requires_positive_X": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + return tags def _count(self, X, Y): """Count and smooth feature occurrences.""" @@ -1016,8 +1027,9 @@ def __init__( self.norm = norm def __sklearn_tags__(self): - more_tags = {"requires_positive_X": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + return tags def _count(self, X, Y): """Count feature occurrences.""" @@ -1419,8 +1431,9 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): return super().partial_fit(X, y, classes, sample_weight=sample_weight) def __sklearn_tags__(self): - more_tags = {"requires_positive_X": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + return tags def _check_X(self, X): """Validate X, used only in predict* methods.""" diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index eca11bef5b2bc..5755416cc4c70 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -439,15 +439,14 @@ def fit(self, X, y=None): return self def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_methods_subset_invariance": ( - "fails for the decision_function method" - ), - "check_methods_sample_order_invariance": ( - "fails for the score_samples method" - ), - }, - "preserves_dtype": [np.float64, np.float32], + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_methods_subset_invariance": ( + "fails for the decision_function method" + ), + "check_methods_sample_order_invariance": ( + "fails for the 
score_samples method" + ), } - return {**super().__sklearn_tags__(), **more_tags} + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 66dd15a375f4c..4d06890b0a297 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -570,8 +570,9 @@ def inverse_transform(self, X): return X def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags @validate_params( @@ -1109,8 +1110,10 @@ def inverse_transform(self, X, copy=None): return X def __sklearn_tags__(self): - more_tags = {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): @@ -1338,8 +1341,9 @@ def inverse_transform(self, X): return X def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags @validate_params( @@ -1710,8 +1714,9 @@ def inverse_transform(self, X): return X def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags @validate_params( @@ -2108,8 +2113,10 @@ def transform(self, X, copy=None): return normalize(X, norm=self.norm, axis=1, copy=False) def __sklearn_tags__(self): - more_tags = {"stateless": True, "array_api_support": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.stateless = True + tags.array_api_support 
= True + return tags @validate_params( @@ -2311,8 +2318,9 @@ def transform(self, X, copy=None): return binarize(X, threshold=self.threshold, copy=False) def __sklearn_tags__(self): - more_tags = {"stateless": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.stateless = True + return tags class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): @@ -2470,8 +2478,10 @@ def _n_features_out(self): return self.n_features_in_ def __sklearn_tags__(self): - more_tags = {"pairwise": True, "array_api_support": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = True + tags.array_api_support = True + return tags @validate_params( @@ -2969,8 +2979,9 @@ def inverse_transform(self, X): return self._transform(X, inverse=True) def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags @validate_params( @@ -3541,8 +3552,9 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False): return X def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags @validate_params( diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 3a61503530f23..7300bdf7b239d 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -61,7 +61,7 @@ ) from sklearn.semi_supervised import LabelPropagation, LabelSpreading from sklearn.utils import all_estimators -from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags +from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import ( SkipTest, ignore_warnings, @@ -357,14 +357,29 @@ def test_search_cv(estimator, check, request): ) def 
test_valid_tag_types(estimator): """Check that estimator tags are valid.""" - tags = _safe_tags(estimator) + from dataclasses import fields + + from ..utils._tags import default_tags - for name, tag in tags.items(): - correct_tags = type(_DEFAULT_TAGS[name]) - if name == "_xfail_checks": - # _xfail_checks can be a dictionary - correct_tags = (correct_tags, dict) - assert isinstance(tag, correct_tags) + def check_field_types(tags, defaults): + if tags is None: + return + tags_fields = fields(tags) + for field in tags_fields: + correct_tags = type(getattr(defaults, field.name)) + if field.name == "_xfail_checks": + # _xfail_checks can be a dictionary + correct_tags = (correct_tags, dict) + assert isinstance(getattr(tags, field.name), correct_tags) + + tags = _safe_tags(estimator) + defaults = default_tags(estimator) + check_field_types(tags, defaults) + check_field_types(tags.input_tags, defaults.input_tags) + check_field_types(tags.target_tags, defaults.target_tags) + check_field_types(tags.classifier_tags, defaults.classifier_tags) + check_field_types(tags.regressor_tags, defaults.regressor_tags) + check_field_types(tags.transformer_tags, defaults.transformer_tags) @pytest.mark.parametrize( diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7101a3781723f..abfb836a6ec27 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -176,7 +176,7 @@ def get_n_leaves(self): def _support_missing_values(self, X): return ( not issparse(X) - and self.__sklearn_tags__()["allow_nan"] + and self.__sklearn_tags__().input_tags.allow_nan and self.monotonic_cst is None ) @@ -1072,6 +1072,7 @@ def predict_log_proba(self, X): return proba def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # XXX: nan is only support for dense arrays, but we set this for common test to # pass, specifically: check_estimators_nan_inf allow_nan = self.splitter in ("best", "random") and self.criterion in { @@ -1079,8 +1080,9 @@ def __sklearn_tags__(self): 
"log_loss", "entropy", } - more_tags = {"multilabel": True, "allow_nan": allow_nan} - return {**super().__sklearn_tags__(), **more_tags} + tags.classifier_tags.multi_label = True + tags.input_tags.allow_nan = allow_nan + return tags class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): @@ -1404,6 +1406,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # XXX: nan is only support for dense arrays, but we set this for common test to # pass, specifically: check_estimators_nan_inf allow_nan = self.splitter in ("best", "random") and self.criterion in { @@ -1411,8 +1414,8 @@ def __sklearn_tags__(self): "friedman_mse", "poisson", } - more_tags = {"allow_nan": allow_nan} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.allow_nan = allow_nan + return tags class ExtraTreeClassifier(DecisionTreeClassifier): @@ -1689,6 +1692,7 @@ def __init__( ) def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # XXX: nan is only supported for dense arrays, but we set this for the # common test to pass, specifically: check_estimators_nan_inf allow_nan = self.splitter == "random" and self.criterion in { @@ -1696,8 +1700,9 @@ def __sklearn_tags__(self): "log_loss", "entropy", } - more_tags = {"multilabel": True, "allow_nan": allow_nan} - return {**super().__sklearn_tags__(), **more_tags} + tags.classifier_tags.multi_label = True + tags.input_tags.allow_nan = allow_nan + return tags class ExtraTreeRegressor(DecisionTreeRegressor): @@ -1944,6 +1949,7 @@ def __init__( ) def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # XXX: nan is only supported for dense arrays, but we set this for the # common test to pass, specifically: check_estimators_nan_inf allow_nan = self.splitter == "random" and self.criterion in { @@ -1951,5 +1957,5 @@ def __sklearn_tags__(self): "friedman_mse", "poisson", } - more_tags = {"allow_nan": 
allow_nan} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.allow_nan = allow_nan + return tags diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 6e90577a02ca8..0a35f428ff554 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -27,15 +27,15 @@ class TargetTags: @dataclass class TransformerTags: - preserve_dtype: list[object] = field(default_factory=lambda: [np.float64]) + preserves_dtype: list[object] = field(default_factory=lambda: [np.float64]) @dataclass class ClassifierTags: poor_score: bool = False binary: bool = True - multiclass: bool = True - multilabel: bool = False + multi_class: bool = True + multi_label: bool = False @dataclass @@ -85,7 +85,7 @@ def default_tags(estimator): ) -def _safe_tags(estimator, key=None): +def _safe_tags(estimator): """Safely get estimator tags. :class:`~sklearn.BaseEstimator` provides the estimator tags machinery. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 75bebfde4bf0c..4d39ad5492195 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -65,10 +65,7 @@ from .
import shuffle from ._missing import is_scalar_nan from ._param_validation import Interval -from ._tags import ( - _DEFAULT_TAGS, - _safe_tags, -) +from ._tags import Tags, _safe_tags from ._testing import ( SkipTest, _array_api_for_tests, @@ -112,7 +109,7 @@ def _yield_checks(estimator): # Check that all estimator yield informative messages when # trained on empty datasets - if not tags["no_validation"]: + if not tags.no_validation: yield check_complex_data yield check_dtype_object yield check_estimators_empty_data_messages @@ -121,11 +118,11 @@ def _yield_checks(estimator): # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency - if not tags["allow_nan"] and not tags["no_validation"]: + if not tags.input_tags.allow_nan and not tags.no_validation: # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf - if tags["pairwise"]: + if tags.input_tags.pairwise: # Check that pairwise estimator throws error on non-square input yield check_nonsquare_error @@ -144,7 +141,7 @@ def _yield_checks(estimator): yield check_estimator_get_tags_default_keys yield check_estimator_tags_deprecated - if tags["array_api_support"]: + if tags.array_api_support: for check in _yield_array_api_checks(estimator): yield check @@ -159,21 +156,21 @@ def _yield_classifier_checks(classifier): yield check_classifiers_one_label_sample_weights yield check_classifiers_classes yield check_estimators_partial_fit_n_features - if tags["multioutput"]: + if tags.target_tags.multi_output: yield check_classifier_multioutput # basic consistency testing yield check_classifiers_train yield partial(check_classifiers_train, readonly_memmap=True) yield partial(check_classifiers_train, readonly_memmap=True, X_dtype="float32") yield check_classifiers_regression_target - if tags["multilabel"]: + if tags.classifier_tags.multi_label: yield check_classifiers_multilabel_representation_invariance yield check_classifiers_multilabel_output_format_predict 
yield check_classifiers_multilabel_output_format_predict_proba yield check_classifiers_multilabel_output_format_decision_function - if not tags["no_validation"]: + if not tags.no_validation: yield check_supervised_y_no_nan - if not tags["multioutput_only"]: + if tags.target_tags.single_output or not tags.target_tags.multi_output: yield check_supervised_y_2d - if tags["requires_fit"]: + if tags.requires_fit: yield check_estimators_unfitted @@ -249,14 +246,14 @@ def _yield_transformer_checks(transformer): tags = _safe_tags(transformer) # All transformers should either deal with sparse data or raise an # exception with type TypeError and an intelligible error message - if not tags["no_validation"]: + if not tags.no_validation: yield check_transformer_data_not_an_array # these don't actually fit the data, so don't raise errors yield check_transformer_general - if tags["preserves_dtype"]: - yield check_transformer_preserve_dtypes + if tags.transformer_tags.preserves_dtype: + yield check_transformer_preserves_dtypes yield partial(check_transformer_general, readonly_memmap=True) - if not _safe_tags(transformer, key="stateless"): + if not _safe_tags(transformer).stateless: yield check_transformers_unfitted else: yield check_transformers_unfitted_stateless @@ -507,7 +504,7 @@ def _should_be_skipped_or_marked(estimator, check): check_name = check.func.__name__ if isinstance(check, partial) else check.__name__ - xfail_checks = _safe_tags(estimator, key="_xfail_checks") or {} + xfail_checks = _safe_tags(estimator)._xfail_checks or {} if check_name in xfail_checks: return True, xfail_checks[check_name] @@ -1934,7 +1931,7 @@ def check_estimators_dtypes(name, estimator_orig): getattr(estimator, method)(X_train) -def check_transformer_preserve_dtypes(name, transformer_orig): +def check_transformer_preserves_dtypes(name, transformer_orig): # check that dtype are preserved meaning if input X is of some dtype # X_transformed should be from the same dtype. 
X, y = make_blobs( @@ -4040,18 +4037,14 @@ def check_n_features_in_after_fitting(name, estimator_orig): def check_estimator_get_tags_default_keys(name, estimator_orig): - # check that if __sklearn_tags__ is implemented, it contains all keys from - # _DEFAULT_KEYS + # check that if __sklearn_tags__ is implemented, it's an instance of Tags estimator = clone(estimator_orig) if not hasattr(estimator, "__sklearn_tags__"): return - tags_keys = set(estimator.__sklearn_tags__().keys()) - default_tags_keys = set(_DEFAULT_TAGS.keys()) - assert tags_keys.intersection(default_tags_keys) == default_tags_keys, ( - f"{name}.__sklearn_tags__() is missing entries for the following default tags" - f": {default_tags_keys - tags_keys.intersection(default_tags_keys)}" - ) + assert isinstance( + estimator.__sklearn_tags__(), Tags + ), f"{name}.__sklearn_tags__() must be an instance of Tags" def check_estimator_tags_deprecated(name, estimator_orig): From d93155bc96705645049370a5cc9821b38599a0a9 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 15 Aug 2024 13:57:24 +0200 Subject: [PATCH 17/48] tune more tests --- sklearn/feature_selection/tests/test_rfe.py | 7 +- sklearn/utils/tests/test_estimator_checks.py | 29 ++------- sklearn/utils/tests/test_tags.py | 68 ++++---------------- 3 files changed, 20 insertions(+), 84 deletions(-) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index 3af7312d785fc..98b55366c5853 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -22,7 +22,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC, SVR, LinearSVR from sklearn.utils import check_random_state -from sklearn.utils._tags import _DEFAULT_TAGS +from sklearn.utils._tags import default_tags from sklearn.utils._testing import ignore_warnings from sklearn.utils.fixes import CSR_CONTAINERS @@ -58,8 +58,9 @@ def set_params(self, **params): return self def 
__sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**_DEFAULT_TAGS, **more_tags} + tags = default_tags(self) + tags.input_tags.allow_nan = True + return tags def test_rfe_features_importance(): diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index bd600efd90730..05cbd9275b242 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -30,7 +30,7 @@ from sklearn.svm import SVC, NuSVC from sklearn.utils import _array_api, all_estimators, deprecated from sklearn.utils._param_validation import Interval, StrOptions -from sklearn.utils._tags import _DEFAULT_TAGS +from sklearn.utils._tags import default_tags from sklearn.utils._testing import ( MinimalClassifier, MinimalRegressor, @@ -53,7 +53,6 @@ check_decision_proba_consistency, check_estimator, check_estimator_get_tags_default_keys, - check_estimator_tags_deprecated, check_estimators_unfitted, check_fit_check_is_fitted, check_fit_score_takes_y, @@ -1223,35 +1222,15 @@ def test_non_deterministic_estimator_skip_tests(): class Estimator(est): def __sklearn_tags__(self): - more_tags = {"non_deterministic": True} - return {**_DEFAULT_TAGS, **more_tags} + tags = default_tags(self) + tags.non_deterministic = True + return tags all_tests = list(_yield_all_checks(Estimator())) assert check_methods_sample_order_invariance not in all_tests assert check_methods_subset_invariance not in all_tests -# TODO(1.8) Remove `_more_tags` and `_get_tags` support -def test_check_estimator_tags_deprecated(): - """Check deprecation warnings are raised.""" - - class Estimator: - def _more_tags(self): - return {} # pragma: no cover - - err_msg = r"_more_tags\(\) was deprecated" - with raises(AssertionError, match=err_msg): - check_estimator_tags_deprecated("estimator", Estimator()) - - class Estimator: - def _get_tags(self): - return {} # pragma: no cover - - err_msg = r"_get_tags\(\) was deprecated" - with 
raises(AssertionError, match=err_msg): - check_estimator_tags_deprecated("estimator", Estimator()) - - def test_check_outlier_contamination(): """Check the test for the contamination parameter in the outlier detectors.""" diff --git a/sklearn/utils/tests/test_tags.py b/sklearn/utils/tests/test_tags.py index 8ca7447930246..587430c45fb75 100644 --- a/sklearn/utils/tests/test_tags.py +++ b/sklearn/utils/tests/test_tags.py @@ -1,70 +1,26 @@ -import re - import pytest -from sklearn.base import BaseEstimator -from sklearn.utils._tags import ( - _DEFAULT_TAGS, - _safe_tags, -) +from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin +from sklearn.utils._tags import _safe_tags class NoTagsEstimator: pass -class MoreTagsEstimator: - def _more_tags(self): - return {"allow_nan": True} - - -@pytest.mark.parametrize( - "estimator, err_msg", - [ - (BaseEstimator(), "The key xxx is not defined in __sklearn_tags__"), - (NoTagsEstimator(), "The key xxx is not defined in _DEFAULT_TAGS"), - ], -) -def test_safe_tags_error(estimator, err_msg): - # Check that safe_tags raises error in ambiguous case. 
- with pytest.raises(ValueError, match=err_msg): - _safe_tags(estimator, key="xxx") +class ClassifierEstimator: + _estimator_type = "classifier" -# TODO(1.8) Remove FutureWarning when `_more_tags is not supported -@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( - "estimator, key, expected_results", + "estimator, value", [ - (NoTagsEstimator(), None, _DEFAULT_TAGS), - (NoTagsEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]), - (MoreTagsEstimator(), None, {**_DEFAULT_TAGS, **{"allow_nan": True}}), - (MoreTagsEstimator(), "allow_nan", True), - (BaseEstimator(), None, _DEFAULT_TAGS), - (BaseEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]), - (BaseEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]), + [NoTagsEstimator(), False], + [ClassifierEstimator(), True], + [TransformerMixin(), False], + [RegressorMixin(), True], + [BaseEstimator(), False], ], ) -def test_safe_tags_no_get_tags(estimator, key, expected_results): - # check the behaviour of _safe_tags when an estimator does not implement - # _get_tags - assert _safe_tags(estimator, key=key) == expected_results - - -# TODO(1.8) Remove `_more_tags` and `_get_tags` support -def test_safe_tags_raises_warning(): - """Check safe_tags raises warnings for _more_tags and _get_tags.""" - - class Estimator: - def _more_tags(self): - return {} - - with pytest.warns(FutureWarning, match=re.escape("_more_tags() was deprecated")): - _safe_tags(Estimator()) - - class Estimator: - def _get_tags(self): - return {} - - with pytest.warns(FutureWarning, match=re.escape("_get_tags() was deprecated")): - _safe_tags(Estimator()) +def test_requires_y(estimator, value): + assert _safe_tags(estimator).target_tags.required == value From 41ed2047c75a4bbe91b918d5acdffb49c5e71d71 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 15 Aug 2024 14:11:44 +0200 Subject: [PATCH 18/48] ... 
--- sklearn/decomposition/_dict_learning.py | 16 +++++--- sklearn/dummy.py | 21 +++++----- sklearn/ensemble/_forest.py | 15 ++++--- .../feature_extraction/_dict_vectorizer.py | 6 ++- sklearn/linear_model/_coordinate_descent.py | 41 +++++++++++-------- sklearn/utils/_tags.py | 7 ++-- 6 files changed, 62 insertions(+), 44 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 1ea00b08806fc..699dc1ba42e8a 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1348,8 +1348,10 @@ def transform(self, X, y=None): return super()._transform(X, self.dictionary) def __sklearn_tags__(self): - more_tags = {"requires_fit": False, "preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags @property def n_components_(self): @@ -1705,8 +1707,9 @@ def _n_features_out(self): return self.components_.shape[0] def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): @@ -2302,5 +2305,6 @@ def _n_features_out(self): return self.components_.shape[0] def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 1978514740cb2..6d5da543bf8d2 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -421,15 +421,14 @@ def predict_log_proba(self, X): return [np.log(p) for p in proba] def 
__sklearn_tags__(self): - more_tags = { - "poor_score": True, - "no_validation": True, - "_xfail_checks": { - "check_methods_subset_invariance": "fails for the predict method", - "check_methods_sample_order_invariance": "fails for the predict method", - }, + tags = super().__sklearn_tags__() + tags.classifier_tags.poor_score = True + tags.no_validation = True + tags._xfail_checks = { + "check_methods_subset_invariance": "fails for the predict method", + "check_methods_sample_order_invariance": "fails for the predict method", } - return {**super().__sklearn_tags__(), **more_tags} + return tags def score(self, X, y, sample_weight=None): """Return the mean accuracy on the given test data and labels. @@ -665,8 +664,10 @@ def predict(self, X, return_std=False): return (y, y_std) if return_std else y def __sklearn_tags__(self): - more_tags = {"poor_score": True, "no_validation": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.regressor_tags.poor_score = True + tags.no_validation = True + return tags def score(self, X, y, sample_weight=None): """Return the coefficient of determination R^2 of the prediction. 
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 8bd25cf2ad001..2fea3a9fde18c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -711,11 +711,12 @@ def estimators_samples_(self): return [sample_indices for sample_indices in self._get_estimators_indices()] def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # Only the criterion is required to determine if the tree supports # missing values estimator = type(self.estimator)(criterion=self.criterion) - more_tags = {"allow_nan": _safe_tags(estimator, key="allow_nan")} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.allow_nan = _safe_tags(estimator).input_tags.allow_nan + return tags def _accumulate_prediction(predict, X, out, lock): @@ -997,8 +998,9 @@ def predict_log_proba(self, X): return proba def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): @@ -1162,8 +1164,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.regressor_tags.multi_label = True + return tags class RandomForestClassifier(ForestClassifier): diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 8d13bf371274f..64c9a5704652d 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -448,5 +448,7 @@ def restrict(self, support, indices=False): return self def __sklearn_tags__(self): - more_tags = {"X_types": ["dict"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + 
tags.input_tags.dict = True + tags.input_tags.two_d_array = False + return tags diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index d1a94cb99edd1..80f891bf11e09 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1833,16 +1833,15 @@ def fit(self, X, y, sample_weight=None, **params): return self def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # Note: check_sample_weights_invariance(kind='ones') should work, but # currently we can only mark a whole test as xfail. - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags def get_metadata_routing(self): """Get metadata routing of this object. 
@@ -2082,8 +2081,9 @@ def _is_multitask(self): return False def __sklearn_tags__(self): - more_tags = {"multioutput": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags class ElasticNetCV(RegressorMixin, LinearModelCV): @@ -2322,8 +2322,9 @@ def _is_multitask(self): return False def __sklearn_tags__(self): - more_tags = {"multioutput": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags ############################################################################### @@ -2576,8 +2577,10 @@ def fit(self, X, y): return self def __sklearn_tags__(self): - more_tags = {"multioutput_only": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = True + tags.target_tags.single_output = False + return tags class MultiTaskLasso(MultiTaskElasticNet): @@ -2944,8 +2947,10 @@ def _is_multitask(self): return True def __sklearn_tags__(self): - more_tags = {"multioutput_only": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = True + tags.target_tags.single_output = False + return tags # This is necessary as LinearModelCV now supports sample_weight while # MultiTaskElasticNet does not (yet). @@ -3183,8 +3188,10 @@ def _is_multitask(self): return True def __sklearn_tags__(self): - more_tags = {"multioutput_only": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = True + tags.target_tags.single_output = False + return tags # This is necessary as LinearModelCV now supports sample_weight while # MultiTaskElasticNet does not (yet). 
diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 0a35f428ff554..39a2640dc1cf9 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -12,6 +12,7 @@ class InputTags: sparse: bool = False categorical: bool = False string: bool = False + dict: bool = False positive_only: bool = False allow_nan: bool = False pairwise: bool = False @@ -41,6 +42,7 @@ class ClassifierTags: @dataclass class RegressorTags: poor_score: bool = False + multi_label: bool = False @dataclass @@ -74,9 +76,8 @@ def default_tags(estimator): """ from ..base import is_classifier, is_regressor - target_required = ( - True if is_classifier(estimator) or is_regressor(estimator) else False - ) + target_required = is_classifier(estimator) or is_regressor(estimator) + return Tags( target_tags=TargetTags(required=target_required), transformer_tags=TransformerTags() if hasattr(estimator, "transform") else None, From 177d763c8d37a4cf3ae93e6d6695623bccb4bb17 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 11:42:38 +0200 Subject: [PATCH 19/48] a lot more estimators --- sklearn/cluster/_bicluster.py | 23 ++-- sklearn/cluster/_hdbscan/hdbscan.py | 5 +- sklearn/cluster/_spectral.py | 11 +- sklearn/compose/_target.py | 10 +- sklearn/decomposition/_fastica.py | 5 +- sklearn/decomposition/_kernel_pca.py | 9 +- sklearn/decomposition/_lda.py | 7 +- sklearn/decomposition/_nmf.py | 9 +- sklearn/decomposition/_pca.py | 9 +- sklearn/decomposition/_sparse_pca.py | 5 +- sklearn/decomposition/_truncated_svd.py | 5 +- sklearn/discriminant_analysis.py | 5 +- sklearn/ensemble/_base.py | 6 +- .../gradient_boosting.py | 5 +- sklearn/ensemble/_iforest.py | 15 ++- sklearn/feature_extraction/_hash.py | 9 +- sklearn/feature_extraction/image.py | 7 +- sklearn/feature_selection/_from_model.py | 5 +- sklearn/feature_selection/_rfe.py | 11 +- sklearn/feature_selection/_sequential.py | 9 +- .../_univariate_selection.py | 20 ++-- .../feature_selection/_variance_threshold.py | 5 +- 
sklearn/gaussian_process/_gpr.py | 5 +- sklearn/impute/_base.py | 26 ++--- sklearn/isotonic.py | 6 +- sklearn/kernel_approximation.py | 2 +- sklearn/kernel_ridge.py | 5 +- sklearn/linear_model/_glm/glm.py | 7 +- sklearn/linear_model/_least_angle.py | 10 +- sklearn/linear_model/_logistic.py | 13 +-- sklearn/linear_model/_ransac.py | 13 +-- sklearn/linear_model/_ridge.py | 25 +++-- sklearn/linear_model/_stochastic_gradient.py | 42 +++---- sklearn/manifold/_isomap.py | 5 +- sklearn/manifold/_mds.py | 5 +- sklearn/manifold/_spectral_embedding.py | 14 +-- sklearn/manifold/_t_sne.py | 5 +- .../_classification_threshold.py | 22 ++-- sklearn/model_selection/_search.py | 13 +-- .../_search_successive_halving.py | 5 +- sklearn/multiclass.py | 12 +- sklearn/neighbors/_base.py | 7 +- sklearn/neighbors/_classification.py | 10 +- sklearn/neighbors/_graph.py | 18 ++- sklearn/neighbors/_kde.py | 11 +- sklearn/neighbors/_lof.py | 4 - sklearn/neighbors/_nca.py | 5 +- sklearn/neighbors/_regression.py | 5 +- .../neural_network/_multilayer_perceptron.py | 5 +- sklearn/pipeline.py | 29 ++--- sklearn/preprocessing/_data.py | 4 +- sklearn/preprocessing/_encoders.py | 6 +- .../preprocessing/_function_transformer.py | 6 +- sklearn/preprocessing/_label.py | 19 +++- sklearn/preprocessing/_polynomial.py | 15 ++- sklearn/preprocessing/_target_encoder.py | 7 +- sklearn/random_projection.py | 5 +- sklearn/svm/_base.py | 5 +- sklearn/svm/_classes.py | 105 ++++++++---------- sklearn/utils/_tags.py | 11 +- sklearn/utils/estimator_checks.py | 14 +-- 61 files changed, 372 insertions(+), 349 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index e0c3679e5f1f0..abb4f22147208 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -194,19 +194,18 @@ def _k_means(self, data, n_clusters): return centroid, labels def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_estimators_dtypes": "raises nan error", - 
"check_fit2d_1sample": "_scale_normalize fails", - "check_fit2d_1feature": "raises apply_along_axis error", - "check_estimator_sparse_matrix": "does not fail gracefully", - "check_estimator_sparse_array": "does not fail gracefully", - "check_methods_subset_invariance": "empty array passed inside", - "check_dont_overwrite_parameters": "empty array passed inside", - "check_fit2d_predict1d": "empty array passed inside", - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_estimators_dtypes": "raises nan error", + "check_fit2d_1sample": "_scale_normalize fails", + "check_fit2d_1feature": "raises apply_along_axis error", + "check_estimator_sparse_matrix": "does not fail gracefully", + "check_estimator_sparse_array": "does not fail gracefully", + "check_methods_subset_invariance": "empty array passed inside", + "check_dont_overwrite_parameters": "empty array passed inside", + "check_fit2d_predict1d": "empty array passed inside", } - return {**super().__sklearn_tags__(), **more_tags} + return tags class SpectralCoclustering(BaseSpectral): diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index fe5cb9e1ff5cf..4f0c85f1103df 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -1034,5 +1034,6 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): return labels def __sklearn_tags__(self): - more_tags = {"allow_nan": self.metric != "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric != "precomputed" + return tags diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 4035924ec6c31..5d341611dd4a7 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -789,8 +789,9 @@ def fit_predict(self, X, y=None): return super().fit_predict(X, y) def __sklearn_tags__(self): - more_tags = { - "pairwise": self.affinity - in ["precomputed", 
"precomputed_nearest_neighbors"] - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + return tags diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 2f5ee7f08ebe6..2f69e32a24eae 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -354,12 +354,10 @@ def predict(self, X, **predict_params): def __sklearn_tags__(self): regressor = self._get_regressor() - - more_tags = { - "poor_score": True, - "multioutput": _safe_tags(regressor, key="multioutput"), - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.regressor_tags.poor_score = True + tags.target_tags.multi_output = _safe_tags(regressor).target_tags.multi_output + return tags @property def n_features_in_(self): diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index a0c0cd8716c2a..751cfbd24472f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -791,5 +791,6 @@ def _n_features_out(self): return self.components_.shape[0] def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index abb9423e7a44a..7888e976f9b03 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -564,11 +564,10 @@ def inverse_transform(self, X): return np.dot(K, self.dual_coef_) def __sklearn_tags__(self): - more_tags = { - "preserves_dtype": [np.float64, np.float32], - "pairwise": self.kernel == "precomputed", - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + 
tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.input_tags.pairwise = self.kernel == "precomputed" + return tags @property def _n_features_out(self): diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 8b5cf0425f6f1..d718282d3294e 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -547,10 +547,9 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): return def __sklearn_tags__(self): - more_tags = { - "requires_positive_X": True, - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + return tags def _check_non_neg_array(self, X, reset_n_features, whom): """check X format diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 1d7fb753a7d9d..c39171af4b270 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1341,11 +1341,10 @@ def _n_features_out(self): return self.components_.shape[0] def __sklearn_tags__(self): - more_tags = { - "requires_positive_X": True, - "preserves_dtype": [np.float64, np.float32], - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class NMF(_BaseNMF): diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 4a8958a3f2e6d..c10202a51940e 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -847,8 +847,7 @@ def score(self, X, y=None): return float(xp.mean(self.score_samples(X))) def __sklearn_tags__(self): - more_tags = { - "preserves_dtype": [np.float64, np.float32], - "array_api_support": True, - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.array_api_support = True + return tags diff 
--git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 04a3c0073e782..273d1cbcd344e 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -151,8 +151,9 @@ def _n_features_out(self): return self.components_.shape[0] def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class SparsePCA(_BaseSparsePCA): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index aa135ef8c312e..408ef9a9f138a 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -311,8 +311,9 @@ def inverse_transform(self, X): return np.dot(X, self.components_) def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags @property def _n_features_out(self): diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index ea85f890c1c87..39aa1a42940e2 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -756,8 +756,9 @@ def decision_function(self, X): return super().decision_function(X) def __sklearn_tags__(self): - more_tags = {"array_api_support": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.array_api_support = True + return tags class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 232205a5885fb..77cb27dbccb2d 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -292,6 +292,7 @@ def get_params(self, deep=True): return 
super()._get_params("estimators", deep=deep) def __sklearn_tags__(self): + tags = super().__sklearn_tags__() try: allow_nan = all( - _safe_tags(est[1])["allow_nan"] if est[1] != "drop" else True + _safe_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True @@ -302,5 +303,6 @@ def __sklearn_tags__(self): # fail. In this case, we assume that `allow_nan` is False but the parameter # validation will raise an error during `fit`. allow_nan = False - more_tags = {"preserves_dtype": [], "allow_nan": allow_nan} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.allow_nan = allow_nan + tags.transformer_tags.preserves_dtype = [] + return tags diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index fcea463636fec..0693c44803730 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1411,8 +1411,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags @abstractmethod def _get_loss(self, sample_weight): diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 94ae8d63d01d8..41eb1209724f5 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -630,15 +630,14 @@ def _compute_score_samples(self, X, subsample_features): return scores def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - }, - "allow_nan": True, + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + 
tags.input_tags.allow_nan = True + return tags def _average_path_length(n_samples_leaf): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index c290124932d19..45570a523dbbf 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -194,5 +194,10 @@ def transform(self, raw_X): return X def __sklearn_tags__(self): - more_tags = {"X_types": [self.input_type]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + if self.input_type == "string": + tags.input_tags.string = True + elif self.input_type == "dict": + tags.input_tags.dict = True + return tags diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 63725de094328..a13d83b6c1150 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -675,5 +675,8 @@ def transform(self, X): return patches def __sklearn_tags__(self): - more_tags = {"X_types": ["3darray"], "stateless": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.input_tags.three_d_array = True + tags.requires_fit = False + return tags diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 122e3f313aff4..01b6bbef41886 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -519,5 +519,6 @@ def get_metadata_routing(self): return router def __sklearn_tags__(self): - more_tags = {"allow_nan": _safe_tags(self.estimator, key="allow_nan")} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan + return tags diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 4ddf02e0a987e..e5100b3154565 100644 --- 
a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -533,12 +533,11 @@ def predict_log_proba(self, X): return self.estimator_.predict_log_proba(self.transform(X)) def __sklearn_tags__(self): - more_tags = { - "poor_score": True, - "requires_y": True, - "allow_nan": _safe_tags(self.estimator, "allow_nan"), - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.poor_score = True + tags.target_tags.required = True + tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan + return tags def get_metadata_routing(self): """Get metadata routing of this object. diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 0c415c35a6ffc..0046c47c6849e 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -232,7 +232,7 @@ def fit(self, X, y=None, **params): X, accept_sparse="csc", ensure_min_features=2, - ensure_all_finite=not tags.get("allow_nan", True), + ensure_all_finite=not tags.input_tags.allow_nan, ) n_features = X.shape[1] @@ -326,10 +326,9 @@ def _get_support_mask(self): return self.support_ def __sklearn_tags__(self): - more_tags = { - "allow_nan": _safe_tags(self.estimator, key="allow_nan"), - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan + return tags def get_metadata_routing(self): """Get metadata routing of this object. 
diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 68ca27410516a..5d86420a0c64d 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -579,8 +579,9 @@ def _check_params(self, X, y): pass def __sklearn_tags__(self): - more_tags = {"requires_y": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags ###################################################################### @@ -686,8 +687,9 @@ def _get_support_mask(self): return mask def __sklearn_tags__(self): - more_tags = {"requires_y": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.required = False + return tags class SelectKBest(_BaseFilter): @@ -796,8 +798,9 @@ def _get_support_mask(self): return mask def __sklearn_tags__(self): - more_tags = {"requires_y": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.required = False + return tags class SelectFpr(_BaseFilter): @@ -1149,8 +1152,9 @@ def _make_selector(self): return selector def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags def _check_params(self, X, y): self._make_selector()._check_params(X, y) diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index fde47ebec612a..fae60bd164ca6 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -134,5 +134,6 @@ def _get_support_mask(self): return self.variances_ > self.threshold def __sklearn_tags__(self): - more_tags = {"allow_nan": 
True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 7552d90ddf9ef..bb96bed7cea6b 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -665,5 +665,6 @@ def _constrained_optimization(self, obj_func, initial_theta, bounds): return theta_opt, func_min def __sklearn_tags__(self): - more_tags = {"requires_fit": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.requires_fit = False + return tags diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 8ceff1a4f00eb..1ca9bf6f344f8 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -140,8 +140,9 @@ def _concatenate_indicator_feature_names_out(self, names, input_features): return np.concatenate([names, indicator_names]) def __sklearn_tags__(self): - more_tags = {"allow_nan": is_scalar_nan(self.missing_values)} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = is_scalar_nan(self.missing_values) + return tags class SimpleImputer(_BaseImputer): @@ -701,11 +702,11 @@ def inverse_transform(self, X): return X_original def __sklearn_tags__(self): - more_tags = { - "allow_nan": is_pandas_na(self.missing_values) - or is_scalar_nan(self.missing_values) - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = is_pandas_na(self.missing_values) or is_scalar_nan( + self.missing_values + ) + return tags def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
@@ -1089,9 +1090,8 @@ def get_feature_names_out(self, input_features=None): ) def __sklearn_tags__(self): - more_tags = { - "allow_nan": True, - "X_types": ["2darray", "string"], - "preserves_dtype": [], - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.string = True + tags.transformer_tags.preserves_dtype = [] + return tags diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 21d9ba3d220b1..7312fdba7f63d 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -505,5 +505,7 @@ def __setstate__(self, state): self._build_f(self.X_thresholds_, self.y_thresholds_) def __sklearn_tags__(self): - more_tags = {"X_types": ["1darray"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.one_d_array = True + tags.input_tags.two_d_array = False + return tags diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index f74fddef30991..6d5805124c2be 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -820,7 +820,7 @@ def _transform_sparse(X, sample_steps, sample_interval): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.stateless = True + tags.requires_fit = False tags.input_tags.positive_only = True return tags diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 4c49ef72e7c2c..6f69836d63912 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -168,8 +168,9 @@ def _get_kernel(self, X, Y=None): return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params) def __sklearn_tags__(self): - more_tags = {"pairwise": self.kernel == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.kernel == "precomputed" + return tags @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): diff 
--git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0585ea9bc35f1..4459deb42b3cc 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -439,17 +439,18 @@ def score(self, X, y, sample_weight=None): return 1 - (deviance + constant) / (deviance_null + constant) def __sklearn_tags__(self): + tags = super().__sklearn_tags__() try: # Create instance of BaseLoss if fit wasn't called yet. This is necessary as # TweedieRegressor might set the used loss during fit different from # self._base_loss. base_loss = self._get_loss() - more_tags = {"requires_positive_y": not base_loss.in_y_true_range(-1.0)} + tags.target_tags.positive_only = not base_loss.in_y_true_range(-1.0) except (ValueError, AttributeError, TypeError): # This happens when the link or power parameter of TweedieRegressor is # invalid. We fallback on the default tags in that case. - more_tags = {} - return {**super().__sklearn_tags__(), **more_tags} + pass + return tags def _get_loss(self): """This is only necessary because of the link and power arguments of the diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index ee03dc9b89f00..0f6233982f54e 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -1689,8 +1689,9 @@ def __init__( ) def __sklearn_tags__(self): - more_tags = {"multioutput": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, **params): @@ -2212,8 +2213,9 @@ def __init__( self.noise_variance = noise_variance def __sklearn_tags__(self): - more_tags = {"multioutput": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, 
copy_X=None): diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3b3336b10a1b4..96e65788731cb 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2266,14 +2266,13 @@ def get_metadata_routing(self): return router def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags def _get_scorer(self): """Get the scorer based on the scoring method specified. diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 53740f9de3bea..b648e77e6c26b 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -716,11 +716,10 @@ def get_metadata_routing(self): return router def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 517ca93c72365..da5b9744eb96c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1248,8 +1248,9 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, y, sample_weight=sample_weight) def __sklearn_tags__(self): - more_tags = {"array_api_support": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.array_api_support = True + return tags class 
_RidgeClassifierMixin(LinearClassifierMixin): @@ -1336,8 +1337,9 @@ def classes_(self): return self._label_binarizer.classes_ def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): @@ -2882,12 +2884,11 @@ def fit(self, X, y, sample_weight=None, **params): return self def __sklearn_tags__(self): - more_tags = { - "multilabel": True, - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - }, + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index e09202c3995e3..646f2127ef96f 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1374,15 +1374,13 @@ def predict_log_proba(self, X): return np.log(self.predict_proba(X)) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - }, - "preserves_dtype": [np.float64, np.float32], + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class BaseSGDRegressor(RegressorMixin, BaseSGD): @@ -2059,15 +2057,13 @@ def __init__( ) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing 
samples" - ), - }, - "preserves_dtype": [np.float64, np.float32], + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class SGDOneClassSVM(BaseSGD, OutlierMixin): @@ -2640,12 +2636,10 @@ def predict(self, X): return y def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ) - }, - "preserves_dtype": [np.float64, np.float32], + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 208c832673afe..b0b7f4402512e 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -436,5 +436,6 @@ def transform(self, X): return self.kernel_pca_.transform(G_X) def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index b25fb9cb721f8..962086a402bdf 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -567,8 +567,9 @@ def __init__( self.normalized_stress = normalized_stress def __sklearn_tags__(self): - more_tags = {"pairwise": self.dissimilarity == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.dissimilarity == "precomputed" + return tags def fit(self, X, y=None, init=None): """ diff --git a/sklearn/manifold/_spectral_embedding.py 
b/sklearn/manifold/_spectral_embedding.py index 4b3c2d3658388..045c881272e9f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -648,14 +648,12 @@ def __init__( self.n_jobs = n_jobs def __sklearn_tags__(self): - more_tags = { - "pairwise": self.affinity - in [ - "precomputed", - "precomputed_nearest_neighbors", - ] - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + return tags def _get_affinity_matrix(self, X, Y=None): """Calculate the affinity matrix from data diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index a33fee2fa246b..4906c1a463eec 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -1209,5 +1209,6 @@ def _n_features_out(self): return self.embedding_.shape[1] def __sklearn_tags__(self): - more_tags = {"pairwise": self.metric == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.metric == "precomputed" + return tags diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 88712e509d24d..0aa6e04408c79 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -211,18 +211,18 @@ def decision_function(self, X): return self.estimator_.decision_function(X) def __sklearn_tags__(self): - more_tags = { - "binary_only": True, - "_xfail_checks": { - "check_classifiers_train": "Threshold at probability 0.5 does not hold", - "check_sample_weights_invariance": ( - "Due to the cross-validation and sample ordering, removing a sample" - " is not strictly equal to putting is weight to zero. Specific unit" - " tests are added for TunedThresholdClassifierCV specifically." 
- ), - }, + tags = super().__sklearn_tags__() + tags.classifier_tags.binary = True + tags.classifier_tags.multi_class = False + tags._xfail_checks = { + "check_classifiers_train": "Threshold at probability 0.5 does not hold", + "check_sample_weights_invariance": ( + "Due to the cross-validation and sample ordering, removing a sample" + " is not strictly equal to putting is weight to zero. Specific unit" + " tests are added for TunedThresholdClassifierCV specifically." + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class FixedThresholdClassifier(BaseThresholdClassifier): diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3869d75c7f5a2..57b8b53e5f80a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -480,15 +480,14 @@ def _estimator_type(self): return self.estimator._estimator_type def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # allows cross-validation to see 'precomputed' metrics - more_tags = { - "pairwise": _safe_tags(self.estimator, "pairwise"), - "_xfail_checks": { - "check_supervised_y_2d": "DataConversionWarning not caught" - }, - "array_api_support": _safe_tags(self.estimator, "array_api_support"), + tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise + tags._xfail_checks = { + "check_supervised_y_2d": "DataConversionWarning not caught" } - return {**super().__sklearn_tags__(), **more_tags} + tags.array_api_support = _safe_tags(self.estimator).array_api_support + return tags def score(self, X, y=None, **params): """Return the score on the given data, if the estimator has been refit. 
diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index d3ffef4f43031..53e289c333082 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: BSD-3-Clause from abc import abstractmethod -from copy import deepcopy from math import ceil, floor, log from numbers import Integral, Real @@ -372,8 +371,8 @@ def _generate_candidate_params(self): pass def __sklearn_tags__(self): - tags = deepcopy(super().__sklearn_tags__()) - tags["_xfail_checks"].update( + tags = super().__sklearn_tags__() + tags._xfail_checks.update( { "check_fit2d_1sample": ( "Fail during parameter check since min/max resources requires" diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index a402fc7e07a8e..07b8e14fa6ce5 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -591,8 +591,9 @@ def n_classes_(self): def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" - more_tags = {"pairwise": _safe_tags(self.estimator, key="pairwise")} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise + return tags def get_metadata_routing(self): """Get metadata routing of this object. 
@@ -818,7 +819,7 @@ def fit(self, X, y, **fit_params): self.estimators_ = estimators_indices[0] - pairwise = self.__sklearn_tags__()["pairwise"] + pairwise = self.__sklearn_tags__().input_tags.pairwise self.pairwise_indices_ = estimators_indices[1] if pairwise else None return self @@ -991,8 +992,9 @@ def n_classes_(self): def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" - more_tags = {"pairwise": _safe_tags(self.estimator, key="pairwise")} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise + return tags def get_metadata_routing(self): """Get metadata routing of this object. diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index a4e89be208aa8..1d3489bc1b24e 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -469,7 +469,7 @@ def _check_algorithm_metric(self): ) def _fit(self, X, y=None): - if self.__sklearn_tags__()["requires_y"]: + if self.__sklearn_tags__().target_tags.required: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): X, y = self._validate_data( X, y, accept_sparse="csr", multi_output=True, order="C" @@ -690,9 +690,10 @@ def _fit(self, X, y=None): return self def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # For cross-validation routines to split data correctly - more_tags = {"pairwise": self.metric == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.pairwise = self.metric == "precomputed" + return tags class KNeighborsMixin: diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 13195f1a036dc..1843fb21c3634 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -404,8 +404,9 @@ def predict_proba(self, X): return probabilities def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return 
{**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): @@ -833,5 +834,6 @@ def predict_proba(self, X): return probabilities def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 9e2d66a105e29..9a774c1dee514 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -481,12 +481,11 @@ def fit_transform(self, X, y=None): return self.fit(X).transform(X) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_methods_sample_order_invariance": "check is not applicable." - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_methods_sample_order_invariance": "check is not applicable." } - return {**super().__sklearn_tags__(), **more_tags} + return tags class RadiusNeighborsTransformer( @@ -712,9 +711,8 @@ def fit_transform(self, X, y=None): return self.fit(X).transform(X) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_methods_sample_order_invariance": "check is not applicable." - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_methods_sample_order_invariance": "check is not applicable." 
} - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f7ed408d7aaf1..465aec6ed5d8c 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -359,11 +359,8 @@ def sample(self, n_samples=1, random_state=None): return data[i] + X * correction[:, np.newaxis] def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "sample_weight must have positive values" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_sample_weights_invariance": "sample_weight must have positive values" } - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index ece83fec3778b..c05a4f60773b0 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -516,7 +516,3 @@ def _local_reachability_density(self, distances_X, neighbors_indices): # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10) - - def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 1b6c58d13d1cf..a254989fde78d 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -520,8 +520,9 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): return sign * loss, sign * gradient.ravel() def __sklearn_tags__(self): - more_tags = {"requires_y": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags @property def _n_features_out(self): diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index cf56349175b55..545e6b875988a 100644 --- a/sklearn/neighbors/_regression.py +++ 
b/sklearn/neighbors/_regression.py @@ -192,9 +192,10 @@ def __init__( self.weights = weights def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # For cross-validation routines to split data correctly - more_tags = {"pairwise": self.metric == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.pairwise = self.metric == "precomputed" + return tags @_fit_context( # KNeighborsRegressor.metric is not validated yet diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 448cd8ad1e5c7..f46ed6fd4372f 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -1253,8 +1253,9 @@ def predict_proba(self, X): return y_pred def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index df31ad6e9aaa2..cc70767192cbe 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1015,34 +1015,35 @@ def classes_(self): return self.steps[-1][1].classes_ def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_dont_overwrite_parameters": ( - "Pipeline changes the `steps` parameter, which it shouldn't." - "Therefore this test is x-fail until we fix this." - ), - "check_estimators_overwrite_params": ( - "Pipeline changes the `steps` parameter, which it shouldn't." - "Therefore this test is x-fail until we fix this." - ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_dont_overwrite_parameters": ( + "Pipeline changes the `steps` parameter, which it shouldn't." + "Therefore this test is x-fail until we fix this." 
+ ), + "check_estimators_overwrite_params": ( + "Pipeline changes the `steps` parameter, which it shouldn't." + "Therefore this test is x-fail until we fix this." + ), } try: - more_tags["pairwise"] = _safe_tags(self.steps[0][1], "pairwise") + tags.input_tags.pairwise = _safe_tags(self.steps[0][1]).input_tags.pairwise except (ValueError, AttributeError, TypeError): # This happens when the `steps` is not a list of (name, estimator) # tuples and `fit` is not called yet to validate the steps. pass try: - more_tags["multioutput"] = _safe_tags(self.steps[-1][1], "multioutput") + tags.target_tags.multi_output = _safe_tags( + self.steps[-1][1] + ).target_tags.multi_output except (ValueError, AttributeError, TypeError): # This happens when the `steps` is not a list of (name, estimator) # tuples and `fit` is not called yet to validate the steps. pass - return {**super().__sklearn_tags__(), **more_tags} + return tags def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 4d06890b0a297..84b6e8953a237 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2114,7 +2114,7 @@ def transform(self, X, copy=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.stateless = True + tags.requires_fit = False tags.array_api_support = True return tags @@ -2319,7 +2319,7 @@ def transform(self, X, copy=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.stateless = True + tags.requires_fit = False return tags diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 97201dfceecf1..faa9b6a9b3c5a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -451,8 +451,10 @@ def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) def __sklearn_tags__(self): - more_tags = {"X_types": ["2darray", "categorical"], "allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + tags.input_tags.allow_nan = True + return tags class OneHotEncoder(_BaseEncoder): diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 8f7da169fbcab..25491253fdc24 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -388,8 +388,10 @@ def __sklearn_is_fitted__(self): return True def __sklearn_tags__(self): - more_tags = {"no_validation": not self.validate, "stateless": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.no_validation = not self.validate + tags.requires_fit = False + return tags def set_output(self, *, transform=None): """Set output container. 
diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 07b4de9698c4d..b6b2f9e90c813 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -164,8 +164,11 @@ def inverse_transform(self, y): return xp.take(self.classes_, y, axis=0) def __sklearn_tags__(self): - more_tags = {"X_types": ["1dlabels"], "array_api_support": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.two_d_array = False + tags.input_tags.one_d_labels = True + return tags class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): @@ -417,8 +420,10 @@ def inverse_transform(self, Y, threshold=None): return y_inv def __sklearn_tags__(self): - more_tags = {"X_types": ["1dlabels"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.input_tags.one_d_labels = True + return tags @validate_params( @@ -952,5 +957,7 @@ def inverse_transform(self, yt): return [tuple(self.classes_.compress(indicators)) for indicators in yt] def __sklearn_tags__(self): - more_tags = {"X_types": ["2dlabels"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.input_tags.two_d_labels = True + return tags diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index d4260f77a298c..1797a5d4f1821 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -1166,12 +1166,11 @@ def transform(self, X): return XBS[:, indices] def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_estimators_pickle": ( - "Current Scipy implementation of _bsplines does not" - "support const memory views." 
- ), - } + tags = super().__sklearn_tags__() + tags._xfail_checks = { + "check_estimators_pickle": ( + "Current Scipy implementation of _bsplines does not" + "support const memory views." + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index fb9e914582c9c..dc328dc5cf5db 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -529,7 +529,6 @@ def get_feature_names_out(self, input_features=None): return feature_names def __sklearn_tags__(self): - more_tags = { - "requires_y": True, - } - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 07f09591e2f51..06b591bcd8b90 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -457,8 +457,9 @@ def inverse_transform(self, X): return X @ inverse_components.T def __sklearn_tags__(self): - more_tags = {"preserves_dtype": [np.float64, np.float32]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + return tags class GaussianRandomProjection(BaseRandomProjection): diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index acc80d95b3bef..a859b8d4198fa 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -143,9 +143,10 @@ def __init__( self.random_state = random_state def __sklearn_tags__(self): + tags = super().__sklearn_tags__() # Used by cross_val_score. 
- more_tags = {"pairwise": self.kernel == "precomputed"} - return {**super().__sklearn_tags__(), **more_tags} + tags.input_tags.pairwise = self.kernel == "precomputed" + return tags @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index ead06ae61f04d..efcdb74a986ef 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -349,14 +349,13 @@ def fit(self, X, y, sample_weight=None): return self def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class LinearSVR(RegressorMixin, LinearModel): @@ -609,14 +608,13 @@ def fit(self, X, y, sample_weight=None): return self def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class SVC(BaseSVC): @@ -891,14 +889,13 @@ def __init__( ) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class NuSVC(BaseSVC): @@ -1162,21 +1159,20 @@ def __init__( ) def 
__sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_methods_subset_invariance": ( - "fails for the decision_function method" - ), - "check_class_weight_classifiers": "class_weight is ignored.", - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - "check_classifiers_one_label_sample_weights": ( - "specified nu is infeasible for the fit." - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_methods_subset_invariance": ( + "fails for the decision_function method" + ), + "check_class_weight_classifiers": "class_weight is ignored.", + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), + "check_classifiers_one_label_sample_weights": ( + "specified nu is infeasible for the fit." + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class SVR(RegressorMixin, BaseLibSVM): @@ -1371,14 +1367,13 @@ def __init__( ) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class NuSVR(RegressorMixin, BaseLibSVM): @@ -1566,14 +1561,13 @@ def __init__( ) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags class OneClassSVM(OutlierMixin, BaseLibSVM): @@ -1830,11 +1824,10 @@ def predict(self, X): return np.asarray(y, 
dtype=np.intp) def __sklearn_tags__(self): - more_tags = { - "_xfail_checks": { - "check_sample_weights_invariance": ( - "zero sample_weight is not equivalent to removing samples" - ), - } + tags = super().__sklearn_tags__() + tags._xfail_check = { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } - return {**super().__sklearn_tags__(), **more_tags} + return tags diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 39a2640dc1cf9..7a31488052646 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -8,7 +8,11 @@ @dataclass class InputTags: + one_d_array = False two_d_array: bool = True + three_d_array: bool = False + one_d_labels = False + two_d_labels: bool = False sparse: bool = False categorical: bool = False string: bool = False @@ -53,7 +57,6 @@ class Tags: regressor_tags: RegressorTags array_api_support: bool = False no_validation: bool = False - stateless: bool = False non_deterministic: bool = False requires_fit: bool = True _skip_test: bool = False @@ -80,7 +83,11 @@ def default_tags(estimator): return Tags( target_tags=TargetTags(required=target_required), - transformer_tags=TransformerTags() if hasattr(estimator, "transform") else None, + transformer_tags=( + TransformerTags() + if hasattr(estimator, "transform") or hasattr(estimator, "fit_transform") + else None + ), classifier_tags=ClassifierTags() if is_classifier(estimator) else None, regressor_tags=RegressorTags() if is_regressor(estimator) else None, ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b5637df38d0f4..613436cd65a8e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -98,7 +98,7 @@ def _yield_checks(estimator): yield check_sample_weights_pandas_series yield check_sample_weights_not_an_array yield check_sample_weights_list - if not tags["pairwise"]: + if not tags.input_tags.pairwise: # We skip pairwise because the data is not 
pairwise yield check_sample_weights_shape yield check_sample_weights_not_overwritten @@ -227,17 +227,17 @@ def _yield_regressor_checks(regressor): yield partial(check_regressors_train, readonly_memmap=True, X_dtype="float32") yield check_regressor_data_not_an_array yield check_estimators_partial_fit_n_features - if tags["multioutput"]: + if tags.target_tags.multi_output: yield check_regressor_multioutput yield check_regressors_no_decision_function - if not tags["no_validation"] and not tags["multioutput_only"]: + if not tags.no_validation and tags.target_tags.single_output: yield check_supervised_y_2d yield check_supervised_y_no_nan name = regressor.__class__.__name__ if name != "CCA": # check that the regressor handles int input yield check_regressors_int - if tags["requires_fit"]: + if tags.requires_fit: yield check_estimators_unfitted yield check_non_transformer_estimators_n_iter @@ -253,7 +253,7 @@ def _yield_transformer_checks(transformer): if tags.transformer_tags.preserves_dtype: yield check_transformer_preserves_dtypes yield partial(check_transformer_general, readonly_memmap=True) - if not _safe_tags(transformer).stateless: + if _safe_tags(transformer).requires_fit: yield check_transformers_unfitted else: yield check_transformers_unfitted_stateless @@ -1829,7 +1829,7 @@ def _check_transformer(name, transformer_orig, X, y): # raises error on malformed input for transform if ( hasattr(X, "shape") - and not _safe_tags(transformer, key="stateless") + and _safe_tags(transformer).requires_fit and X.ndim == 2 and X.shape[1] > 1 ): @@ -3896,7 +3896,7 @@ def check_fit_check_is_fitted(name, estimator_orig): y = rng.randint(low=0, high=2, size=n_samples) y = _enforce_estimator_tags_y(estimator, y) - if not _safe_tags(estimator).get("stateless", False): + if _safe_tags(estimator).requires_fit: # stateless estimators (such as FunctionTransformer) are always "fit"! 
try: check_is_fitted(estimator) From 8914bf0efba05bef1c2892eebbc27a4c62ab5f0f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 17:37:44 +0200 Subject: [PATCH 20/48] ... --- sklearn/utils/_tags.py | 2 +- sklearn/utils/estimator_checks.py | 70 +++++++++++++++++-------------- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 7a31488052646..ec4ff94bb4e12 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -93,7 +93,7 @@ def default_tags(estimator): ) -def _safe_tags(estimator): +def _safe_tags(estimator) -> Tags: """Safely get estimator tags. :class:`~sklearn.BaseEstimator` provides the estimator tags machinery. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 613436cd65a8e..ce79e2fb4a66a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -302,7 +302,7 @@ def _yield_outliers_checks(estimator): # test outlier detectors can handle non-array data yield check_classifier_data_not_an_array # test if NotFittedError is raised - if _safe_tags(estimator, key="requires_fit"): + if _safe_tags(estimator).requires_fit: yield check_estimators_unfitted yield check_non_transformer_estimators_n_iter @@ -1154,7 +1154,8 @@ def check_sample_weights_not_an_array(name, estimator_orig): X = _NotAnArray(_enforce_estimator_tags_X(estimator_orig, X)) y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = _NotAnArray([1] * 12) - if _safe_tags(estimator, key="multioutput_only"): + tags = _safe_tags(estimator) + if not tags.target_tags.single_output and tags.target_tags.multi_output: y = _NotAnArray(y.data.reshape(-1, 1)) estimator.fit(X, y, sample_weight=weights) @@ -1559,7 +1560,8 @@ def check_methods_sample_order_invariance(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _enforce_estimator_tags_X(estimator_orig, X) y = X[:, 0].astype(np.int64) - if _safe_tags(estimator_orig, key="binary_only"): + 
tags = _safe_tags(estimator_orig) + if tags.classifier_tags.binary and not tags.classifier_tags.multi_class: y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1789,7 +1791,7 @@ def _check_transformer(name, transformer_orig, X, y): X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) - if _safe_tags(transformer_orig, key="non_deterministic"): + if _safe_tags(transformer_orig).non_deterministic: msg = name + " is non deterministic" raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): @@ -1847,7 +1849,7 @@ def _check_transformer(name, transformer_orig, X, y): @ignore_warnings def check_pipeline_consistency(name, estimator_orig): - if _safe_tags(estimator_orig, key="non_deterministic"): + if _safe_tags(estimator_orig).non_deterministic: msg = name + " is non deterministic" raise SkipTest(msg) @@ -1910,6 +1912,10 @@ def check_fit_score_takes_y(name, estimator_orig): @ignore_warnings def check_estimators_dtypes(name, estimator_orig): + from sklearn.feature_extraction.text import CountVectorizer + + if isinstance(estimator_orig, CountVectorizer): + print("aha") rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_32 = _enforce_estimator_tags_X(estimator_orig, X_train_32) @@ -1943,7 +1949,7 @@ def check_transformer_preserves_dtypes(name, transformer_orig): X = StandardScaler().fit_transform(X) X = _enforce_estimator_tags_X(transformer_orig, X) - for dtype in _safe_tags(transformer_orig, key="preserves_dtype"): + for dtype in _safe_tags(transformer_orig).transformer_tags.preserves_dtype: X_cast = X.astype(dtype) transformer = clone(transformer_orig) set_random_state(transformer) @@ -2251,7 +2257,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): pred = clusterer.labels_ assert pred.shape == (n_samples,) assert adjusted_rand_score(pred, y) > 0.4 - if _safe_tags(clusterer, key="non_deterministic"): + if 
_safe_tags(clusterer).non_deterministic: return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -3016,7 +3022,7 @@ def check_classifiers_classes(name, classifier_orig): y_names_binary = np.take(labels_binary, y_binary) problems = [(X_binary, y_binary, y_names_binary)] - if not _safe_tags(classifier_orig, key="binary_only"): + if _safe_tags(classifier_orig).classifier_tags.multi_class: problems.append((X_multiclass, y_multiclass, y_names_multiclass)) for X, y, y_names in problems: @@ -3105,7 +3111,7 @@ def check_regressors_train( # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if not _safe_tags(regressor, key="poor_score"): + if not _safe_tags(regressor).regressor_tags.poor_score: assert regressor.score(X, y_) > 0.5 @@ -3128,10 +3134,11 @@ def check_regressors_no_decision_function(name, regressor_orig): @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig): - if _safe_tags(classifier_orig, key="binary_only"): - problems = [2] - else: + if _safe_tags(classifier_orig).classifier_tags.multi_class: problems = [2, 3] + else: + # binary only + problems = [2] for n_centers in problems: # create a very noisy dataset @@ -3141,7 +3148,7 @@ def check_class_weight_classifiers(name, classifier_orig): ) # can't use gram_if_pairwise() here, setting up gram matrix manually - if _safe_tags(classifier_orig, key="pairwise"): + if _safe_tags(classifier_orig).input_tags.pairwise: X_test = rbf_kernel(X_test, X_train) X_train = rbf_kernel(X_train, X_train) @@ -3167,7 +3174,7 @@ def check_class_weight_classifiers(name, classifier_orig): y_pred = classifier.predict(X_test) # XXX: Generally can use 0.89 here. 
On Windows, LinearSVC gets # 0.88 (Issue #9111) - if not _safe_tags(classifier_orig, key="poor_score"): + if not _safe_tags(classifier_orig).classifier_tags.poor_score: assert np.mean(y_pred == 0) > 0.87 @@ -3525,15 +3532,21 @@ def param_filter(p): def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data - if _safe_tags(estimator, key="requires_positive_y"): + tags = _safe_tags(estimator) + if tags.target_tags.positive_only: # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. y += 1 + abs(y.min()) - if _safe_tags(estimator, key="binary_only") and y.size > 0: + if ( + tags.classifier_tags is not None + and tags.classifier_tags.binary + and not tags.classifier_tags.multi_class + and y.size > 0 + ): y = np.where(y == y.flat[0], y, y.flat[0] + 1) # Estimators in mono_output_task_error raise ValueError if y is of 1-D # Convert into a 2-D y for those estimators. - if _safe_tags(estimator, key="multioutput_only"): + if tags.target_tags.multi_output and not tags.target_tags.single_output: return np.reshape(y, (-1, 1)) return y @@ -3541,14 +3554,14 @@ def _enforce_estimator_tags_y(estimator, y): def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): # Estimators with `1darray` in `X_types` tag only accept # X of shape (`n_samples`,) - if "1darray" in _safe_tags(estimator, key="X_types"): + if _safe_tags(estimator).input_tags.one_d_array: X = X[:, 0] # Estimators with a `requires_positive_X` tag only accept # strictly positive data - if _safe_tags(estimator, key="requires_positive_X"): + if _safe_tags(estimator).input_tags.positive_only: X = X - X.min() - if "categorical" in _safe_tags(estimator, key="X_types"): - dtype = np.float64 if _safe_tags(estimator, key="allow_nan") else np.int32 + if _safe_tags(estimator).input_tags.categorical: + dtype = np.float64 if _safe_tags(estimator).input_tags.allow_nan else np.int32 X = np.round((X - 
X.min())).astype(dtype) if estimator.__class__.__name__ == "SkewedChi2Sampler": @@ -3559,7 +3572,7 @@ def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): # X of shape (`n_samples`, `n_samples`) if _is_pairwise_metric(estimator): X = pairwise_distances(X, metric="euclidean") - elif _safe_tags(estimator, key="pairwise"): + elif _safe_tags(estimator).input_tags.pairwise: X = kernel(X, X) return X @@ -3715,7 +3728,7 @@ def check_classifiers_regression_target(name, estimator_orig): X = _enforce_estimator_tags_X(estimator_orig, X) e = clone(estimator_orig) msg = "Unknown label type: " - if not _safe_tags(e, key="no_validation"): + if not _safe_tags(e).no_validation: with raises(ValueError, match=msg): e.fit(X, y) @@ -4313,6 +4326,7 @@ def check_param_validation(name, estimator_orig): X = rng.uniform(size=(20, 5)) y = rng.randint(0, 2, size=20) y = _enforce_estimator_tags_y(estimator_orig, y) + tags = _safe_tags(estimator_orig) estimator_params = estimator_orig.get_params(deep=False).keys() @@ -4377,10 +4391,7 @@ def check_param_validation(name, estimator_orig): ) with raises(InvalidParameterError, match=match, err_msg=err_msg): - if any( - isinstance(X_type, str) and X_type.endswith("labels") - for X_type in _safe_tags(estimator, key="X_types") - ): + if tags.input_tags.one_d_labels or tags.input_tags.two_d_labels: # The estimator is a label transformer and take only `y` getattr(estimator, method)(y) else: @@ -4415,10 +4426,7 @@ def check_param_validation(name, estimator_orig): ) with raises(InvalidParameterError, match=match, err_msg=err_msg): - if any( - X_type.endswith("labels") - for X_type in _safe_tags(estimator, key="X_types") - ): + if tags.input_tags.one_d_labels or tags.input_tags.two_d_labels: # The estimator is a label transformer and take only `y` getattr(estimator, method)(y) else: From d823d13759dd3a8fe40655279412dbe25e32f704 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 18:27:19 +0200 Subject: [PATCH 21/48] ... 
--- sklearn/feature_selection/_base.py | 2 +- sklearn/utils/estimator_checks.py | 65 ++++++++++++++++-------------- sklearn/utils/metaestimators.py | 2 +- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 9dfa73bf2a783..a6b24672cda38 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -103,7 +103,7 @@ def transform(self, X): X, dtype=None, accept_sparse="csr", - ensure_all_finite=not _safe_tags(self, key="allow_nan"), + ensure_all_finite=not _safe_tags(self).input_tags.allow_nan, cast_to_ndarray=not preserve_X, reset=False, ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ce79e2fb4a66a..d3c8d6ab11af8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -172,7 +172,7 @@ def _yield_classifier_checks(classifier): yield check_supervised_y_no_nan if not tags.target_tags.multi_output and not tags.target_tags.single_output: yield check_supervised_y_2d - if tags["requires_fit"]: + if tags.requires_fit: yield check_estimators_unfitted if "class_weight" in classifier.get_params().keys(): yield check_class_weight_classifiers @@ -324,7 +324,7 @@ def _yield_array_api_checks(estimator): def _yield_all_checks(estimator): name = estimator.__class__.__name__ tags = _safe_tags(estimator) - if tags.input_tags.two_d_array: + if not tags.input_tags.two_d_array: warnings.warn( "Can't test estimator {} which requires input of type {}".format( name, tags.input_tags @@ -1067,7 +1067,7 @@ def _check_estimator_sparse_container(name, estimator_orig, sparse_type): assert pred.shape == (X.shape[0],) if hasattr(estimator, "predict_proba"): probs = estimator.predict_proba(X) - if tags.classifier_tags.binary and not tags.classifier_tags.multiclass: + if tags.classifier_tags.binary and not tags.classifier_tags.multi_class: expected_probs_shape = (X.shape[0], 2) else: expected_probs_shape = 
(X.shape[0], 4) @@ -1342,7 +1342,7 @@ def check_dtype_object(name, estimator_orig): with raises(Exception, match="Unknown label type", may_pass=True): estimator.fit(X, y.astype(object)) - if "string" not in tags["X_types"]: + if not tags.input_tags.string: X[0, 0] = {"foo": "bar"} # This error is raised by: # - `np.asarray` in `check_array` @@ -1561,7 +1561,11 @@ def check_methods_sample_order_invariance(name, estimator_orig): X = _enforce_estimator_tags_X(estimator_orig, X) y = X[:, 0].astype(np.int64) tags = _safe_tags(estimator_orig) - if tags.classifier_tags.binary and not tags.classifier_tags.multi_class: + if ( + tags.classifier_tags is not None + and tags.classifier_tags.binary + and not tags.classifier_tags.multi_class + ): y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -2075,7 +2079,7 @@ def check_estimators_pickle(name, estimator_orig, readonly_memmap=False): tags = _safe_tags(estimator_orig) # include NaN values when the estimator should deal with them - if tags["allow_nan"]: + if tags.input_tags.allow_nan: # set randomly 10 elements to np.nan rng = np.random.RandomState(42) mask = rng.choice(X.size, 10, replace=False) @@ -2175,7 +2179,7 @@ def check_classifier_multioutput(name, estimator): if hasattr(estimator, "predict_proba"): y_prob = estimator.predict_proba(X) - if isinstance(y_prob, list) and not tags["poor_score"]: + if isinstance(y_prob, list) and not tags.classifier_tags.poor_score: for i in range(n_classes): assert y_prob[i].shape == (n_samples, 2), ( "The shape of the probability for multioutput data is" @@ -2186,7 +2190,7 @@ def check_classifier_multioutput(name, estimator): assert_array_equal( np.argmax(y_prob[i], axis=1).astype(int), y_pred[:, i] ) - elif not tags["poor_score"]: + elif not tags.classifier_tags.poor_score: assert y_prob.shape == (n_samples, n_classes), ( "The shape of the probability for multioutput data is" " incorrect. 
Expected {}, got {}.".format( @@ -2384,7 +2388,7 @@ def check_classifiers_train( problems = [(X_b, y_b)] tags = _safe_tags(classifier_orig) - if not tags["binary_only"]: + if tags.classifier_tags.multi_class: problems.append((X_m, y_m)) for X, y in problems: @@ -2397,7 +2401,7 @@ def check_classifiers_train( set_random_state(classifier) # raises error on malformed input for fit - if not tags["no_validation"]: + if not tags.no_validation: with raises( ValueError, err_msg=( @@ -2418,7 +2422,7 @@ def check_classifiers_train( assert y_pred.shape == (n_samples,) # training set performance - if not tags["poor_score"]: + if not tags.classifier_tags.poor_score: assert accuracy_score(y, y_pred) > 0.83 # raises error on malformed input for predict @@ -2432,8 +2436,8 @@ def check_classifiers_train( "fit." ) - if not tags["no_validation"]: - if tags["pairwise"]: + if not tags.no_validation: + if tags.input_tags.pairwise: with raises( ValueError, err_msg=msg_pairwise.format(name, "predict"), @@ -2447,7 +2451,10 @@ def check_classifiers_train( # decision_function agrees with predict decision = classifier.decision_function(X) if n_classes == 2: - if not tags["multioutput_only"]: + if ( + not tags.target_tags.multi_output + or tags.target_tags.single_output + ): assert decision.shape == (n_samples,) else: assert decision.shape == (n_samples, 1) @@ -2458,8 +2465,8 @@ def check_classifiers_train( assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input for decision_function - if not tags["no_validation"]: - if tags["pairwise"]: + if not tags.no_validation: + if tags.input_tags.pairwise: with raises( ValueError, err_msg=msg_pairwise.format(name, "decision_function"), @@ -2481,9 +2488,9 @@ def check_classifiers_train( assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) - if not tags["no_validation"]: + if not tags.no_validation: # 
raises error on malformed input for predict_proba - if tags["pairwise"]: + if tags.input_tags.pairwise: with raises( ValueError, err_msg=msg_pairwise.format(name, "predict_proba"), @@ -2917,7 +2924,7 @@ def check_supervised_y_2d(name, estimator_orig): msg = "expected 1 DataConversionWarning, got: %s" % ", ".join( [str(w_x) for w_x in w] ) - if not tags["multioutput"]: + if not tags.target_tags.multi_output: # check that we warned if we don't support multi-output assert len(w) > 0, msg assert ( @@ -3985,11 +3992,9 @@ def check_n_features_in_after_fitting(name, estimator_orig): # Make sure that n_features_in are checked after fitting tags = _safe_tags(estimator_orig) - is_supported_X_types = ( - "2darray" in tags["X_types"] or "categorical" in tags["X_types"] - ) + is_supported_X_types = tags.input_tags.two_d_array or tags.input_tags.categorical - if not is_supported_X_types or tags["no_validation"]: + if not is_supported_X_types or tags.no_validation: return rng = np.random.RandomState(0) @@ -4080,11 +4085,9 @@ def check_dataframe_column_names_consistency(name, estimator_orig): ) tags = _safe_tags(estimator_orig) - is_supported_X_types = ( - "2darray" in tags["X_types"] or "categorical" in tags["X_types"] - ) + is_supported_X_types = tags.input_tags.two_d_array or tags.input_tags.categorical - if not is_supported_X_types or tags["no_validation"]: + if not is_supported_X_types or tags.no_validation: return rng = np.random.RandomState(0) @@ -4216,7 +4219,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_transformer_get_feature_names_out(name, transformer_orig): tags = transformer_orig.__sklearn_tags__() - if "2darray" not in tags["X_types"] or tags["no_validation"]: + if "2darray" not in tags["X_types"] or tags.no_validation: return X, y = make_blobs( @@ -4271,7 +4274,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): ) tags = transformer_orig.__sklearn_tags__() - if "2darray" not in tags["X_types"] or 
tags["no_validation"]: + if "2darray" not in tags["X_types"] or tags.no_validation: return X, y = make_blobs( @@ -4437,7 +4440,7 @@ def check_set_output_transform(name, transformer_orig): # Check transformer.set_output with the default configuration does not # change the transform output. tags = _safe_tags(transformer_orig) - if "2darray" not in tags["X_types"] or tags["no_validation"]: + if "2darray" not in tags["X_types"] or tags.no_validation: return rng = np.random.RandomState(0) @@ -4625,7 +4628,7 @@ def _check_set_output_transform_dataframe( """ # Check transformer.set_output configures the output of transform="pandas". tags = _safe_tags(transformer_orig) - if "2darray" not in tags["X_types"] or tags["no_validation"]: + if "2darray" not in tags["X_types"] or tags.no_validation: return rng = np.random.RandomState(0) diff --git a/sklearn/utils/metaestimators.py b/sklearn/utils/metaestimators.py index f962acb48f74e..6da13f35c70eb 100644 --- a/sklearn/utils/metaestimators.py +++ b/sklearn/utils/metaestimators.py @@ -139,7 +139,7 @@ def _safe_split(estimator, X, y, indices, train_indices=None): Indexed targets. """ - if _safe_tags(estimator, key="pairwise"): + if _safe_tags(estimator).input_tags.pairwise: if not hasattr(X, "shape"): raise ValueError( "Precomputed kernels or affinity matrices have " From 220f215b14d87b64c9c047fd97e7fd0cdd89b5e6 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 18:50:06 +0200 Subject: [PATCH 22/48] ... 
--- sklearn/utils/estimator_checks.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d3c8d6ab11af8..c4436c7b6b86a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -170,7 +170,7 @@ def _yield_classifier_checks(classifier): yield check_classifiers_multilabel_output_format_decision_function if not tags.no_validation: yield check_supervised_y_no_nan - if not tags.target_tags.multi_output and not tags.target_tags.single_output: + if not tags.target_tags.multi_output: yield check_supervised_y_2d if tags.requires_fit: yield check_estimators_unfitted @@ -1916,10 +1916,6 @@ def check_fit_score_takes_y(name, estimator_orig): @ignore_warnings def check_estimators_dtypes(name, estimator_orig): - from sklearn.feature_extraction.text import CountVectorizer - - if isinstance(estimator_orig, CountVectorizer): - print("aha") rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_32 = _enforce_estimator_tags_X(estimator_orig, X_train_32) @@ -4067,12 +4063,10 @@ def check_estimator_get_tags_default_keys(name, estimator_orig): def check_estimator_tags_deprecated(name, estimator_orig): assert not hasattr(estimator_orig, "_more_tags"), ( - "_more_tags() was deprecated in 1.6 support will be removed in 1.8. " - "Please use __sklearn_tags__ instead.", + "_more_tags() was removed in 1.6. " "Please use __sklearn_tags__ instead.", ) assert not hasattr(estimator_orig, "_get_tags"), ( - "_get_tags() was deprecated in 1.6 support will be removed in 1.8. " - "Please use __sklearn_tags__ instead." + "_get_tags() was removed in 1.6. " "Please use __sklearn_tags__ instead." 
) @@ -4219,7 +4213,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_transformer_get_feature_names_out(name, transformer_orig): tags = transformer_orig.__sklearn_tags__() - if "2darray" not in tags["X_types"] or tags.no_validation: + if tags.input_tags.two_d_array or tags.no_validation: return X, y = make_blobs( @@ -4274,7 +4268,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): ) tags = transformer_orig.__sklearn_tags__() - if "2darray" not in tags["X_types"] or tags.no_validation: + if not tags.input_tags.two_d_array or tags.no_validation: return X, y = make_blobs( @@ -4440,7 +4434,7 @@ def check_set_output_transform(name, transformer_orig): # Check transformer.set_output with the default configuration does not # change the transform output. tags = _safe_tags(transformer_orig) - if "2darray" not in tags["X_types"] or tags.no_validation: + if not tags.input_tags.two_d_array or tags.no_validation: return rng = np.random.RandomState(0) @@ -4628,7 +4622,7 @@ def _check_set_output_transform_dataframe( """ # Check transformer.set_output configures the output of transform="pandas". 
tags = _safe_tags(transformer_orig) - if "2darray" not in tags["X_types"] or tags.no_validation: + if not tags.input_tags.two_d_array or tags.no_validation: return rng = np.random.RandomState(0) From 64a55a4c9a418c999e7d00c77b618f5dd522506c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 19:11:32 +0200 Subject: [PATCH 23/48] rename back test name --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c4436c7b6b86a..58a9bf2702d66 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -251,7 +251,7 @@ def _yield_transformer_checks(transformer): # these don't actually fit the data, so don't raise errors yield check_transformer_general if tags.transformer_tags.preserves_dtype: - yield check_transformer_preserves_dtypes + yield check_transformer_preserve_dtypes yield partial(check_transformer_general, readonly_memmap=True) if _safe_tags(transformer).requires_fit: yield check_transformers_unfitted @@ -1937,7 +1937,7 @@ def check_estimators_dtypes(name, estimator_orig): getattr(estimator, method)(X_train) -def check_transformer_preserves_dtypes(name, transformer_orig): +def check_transformer_preserve_dtypes(name, transformer_orig): # check that dtype are preserved meaning if input X is of some dtype # X_transformed should be from the same dtype. X, y = make_blobs( From 3be6c6e937f6c1051843dcc3454bbabbb97b1ab7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 19:29:43 +0200 Subject: [PATCH 24/48] ... 
--- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 58a9bf2702d66..011ba63c0de90 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -139,7 +139,7 @@ def _yield_checks(estimator): yield partial(check_estimators_pickle, readonly_memmap=True) yield check_estimator_get_tags_default_keys - yield check_estimator_tags_deprecated + # yield check_estimator_tags_deprecated if tags.array_api_support: for check in _yield_array_api_checks(estimator): @@ -170,7 +170,7 @@ def _yield_classifier_checks(classifier): yield check_classifiers_multilabel_output_format_decision_function if not tags.no_validation: yield check_supervised_y_no_nan - if not tags.target_tags.multi_output: + if tags.target_tags.multi_output and tags.target_tags.single_output: yield check_supervised_y_2d if tags.requires_fit: yield check_estimators_unfitted From 042b16b80afd3a60c90fe4072898f5d3c0a1b78e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 19:48:01 +0200 Subject: [PATCH 25/48] ... --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 011ba63c0de90..5b1a47dc5ef85 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -170,7 +170,7 @@ def _yield_classifier_checks(classifier): yield check_classifiers_multilabel_output_format_decision_function if not tags.no_validation: yield check_supervised_y_no_nan - if tags.target_tags.multi_output and tags.target_tags.single_output: + if tags.target_tags.single_output: yield check_supervised_y_2d if tags.requires_fit: yield check_estimators_unfitted From bff06b158bc2eee5a962ada5e1eae1a1b7149088 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 19 Aug 2024 20:28:05 +0200 Subject: [PATCH 26/48] ... 
--- sklearn/svm/_classes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index efcdb74a986ef..74c95884583ac 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -350,7 +350,7 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), @@ -609,7 +609,7 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), @@ -890,7 +890,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), @@ -1160,7 +1160,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_methods_subset_invariance": ( "fails for the decision_function method" ), @@ -1368,7 +1368,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), @@ -1562,7 +1562,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), @@ -1825,7 +1825,7 @@ def predict(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags._xfail_check = { + tags._xfail_checks = { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to 
removing samples" ), From 3285fe2c4c07e86b951842428a4d5ff2c4ccb377 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 11:23:02 +0200 Subject: [PATCH 27/48] ... --- sklearn/linear_model/_ridge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index da5b9744eb96c..c70f9e788e1de 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -2254,6 +2254,12 @@ def _score(self, *, predictions, y, n_y, scorer, score_params): return _score + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Required since this is neither a RegressorMixin nor a ClassifierMixin + tags.target_tags.required = True + return tags + class _BaseRidgeCV(LinearModel): _parameter_constraints: dict = { From 9dbd5006fb19b3eb5c934397b2264f7cc6a2e099 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 11:39:31 +0200 Subject: [PATCH 28/48] ... --- sklearn/ensemble/_base.py | 2 +- sklearn/model_selection/_search.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 77cb27dbccb2d..0ee3a5a82e1fa 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -295,7 +295,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() try: allow_nan = all( - _safe_tags(est[1])["allow_nan"] if est[1] != "drop" else True + _safe_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True for est in self.estimators ) except Exception: diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 57b8b53e5f80a..d99fff811543c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -484,7 +484,8 @@ def __sklearn_tags__(self): # allows cross-validation to see 'precomputed' metrics tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise tags._xfail_checks = { - "check_supervised_y_2d": 
"DataConversionWarning not caught" + "check_supervised_y_2d": "DataConversionWarning not caught", + "check_requires_y_none": "Doesn't fail gracefully", } tags.array_api_support = _safe_tags(self.estimator).array_api_support return tags From 244c1dcef6274381f9634110c176889a8e57cc6c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 12:42:58 +0200 Subject: [PATCH 29/48] ... --- sklearn/multioutput.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 39f7eb14997a2..9e7b391ac0ff6 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -1103,8 +1103,8 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() # FIXME tags._skip_test = True - tags.classifier_tags.single_output = False - tags.classifier_tags.multi_output = True + tags.target_tags.single_output = False + tags.target_tags.multi_output = True return tags From d42b4baeffaea0d547e3f59607ba9626fa69eb83 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 12:52:18 +0200 Subject: [PATCH 30/48] ... --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5b1a47dc5ef85..4f4ee23abacce 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4213,7 +4213,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_transformer_get_feature_names_out(name, transformer_orig): tags = transformer_orig.__sklearn_tags__() - if tags.input_tags.two_d_array or tags.no_validation: + if not tags.input_tags.two_d_array or tags.no_validation: return X, y = make_blobs( From 6c47f588e2eb7cb9d8fe526b4bb4082b09268c43 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 13:44:32 +0200 Subject: [PATCH 31/48] ... 
--- sklearn/ensemble/tests/test_bagging.py | 2 +- sklearn/feature_selection/_rfe.py | 5 +- .../tests/test_from_model.py | 19 +-- sklearn/impute/tests/test_knn.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- sklearn/model_selection/tests/test_search.py | 16 ++- sklearn/preprocessing/tests/test_data.py | 2 +- sklearn/preprocessing/tests/test_encoders.py | 2 +- sklearn/tests/test_base.py | 25 ++-- sklearn/tests/test_docstring_parameters.py | 6 +- sklearn/tests/test_multiclass.py | 4 +- sklearn/tests/test_pipeline.py | 2 +- sklearn/utils/_mocking.py | 12 +- sklearn/utils/_tags.py | 133 ++++++++++++++++++ sklearn/utils/tests/test_estimator_checks.py | 46 ++---- 15 files changed, 206 insertions(+), 72 deletions(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index c2e4cb33a542c..44f28792a717e 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -941,7 +941,7 @@ def fit(self, X, y): ) def test_bagging_allow_nan_tag(bagging, expected_allow_nan): """Check that bagging inherits allow_nan tag.""" - assert bagging.__sklearn_tags__()["allow_nan"] == expected_allow_nan + assert bagging.__sklearn_tags__().input_tags.allow_nan == expected_allow_nan @pytest.mark.parametrize( diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index e5100b3154565..326f2b6def368 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -534,7 +534,10 @@ def predict_log_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.classifier_tags.poor_score = True + if tags.classifier_tags is not None: + tags.classifier_tags.poor_score = True + if tags.regressor_tags is not None: + tags.regressor_tags.poor_score = True tags.target_tags.required = True tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan return tags diff --git a/sklearn/feature_selection/tests/test_from_model.py 
b/sklearn/feature_selection/tests/test_from_model.py index a7a1d02dce165..8008b8c028085 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -36,20 +36,23 @@ class NaNTag(BaseEstimator): def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags class NoNaNTag(BaseEstimator): def __sklearn_tags__(self): - more_tags = {"allow_nan": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags class NaNTagRandomForest(RandomForestClassifier): def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags iris = datasets.load_iris() @@ -561,11 +564,11 @@ def test_transform_accepts_nan_inf(): def test_allow_nan_tag_comes_from_estimator(): allow_nan_est = NaNTag() model = SelectFromModel(estimator=allow_nan_est) - assert model.__sklearn_tags__()["allow_nan"] is True + assert model.__sklearn_tags__().input_tags.allow_nan is True no_nan_est = NoNaNTag() model = SelectFromModel(estimator=no_nan_est) - assert model.__sklearn_tags__()["allow_nan"] is False + assert model.__sklearn_tags__().input_tags.allow_nan is False def _pca_importances(pca_estimator): diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py index 61e1dacb146b2..34244d628600f 100644 --- a/sklearn/impute/tests/test_knn.py +++ b/sklearn/impute/tests/test_knn.py @@ -567,4 +567,4 @@ def test_knn_imputer_distance_weighted_not_enough_neighbors(na, working_memory): @pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)]) def test_knn_tags(na, allow_nan): knn = KNNImputer(missing_values=na) - assert knn.__sklearn_tags__()["allow_nan"] == allow_nan + 
assert knn.__sklearn_tags__().input_tags.allow_nan == allow_nan diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index e16493b5f548b..cb052860dd756 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -923,7 +923,7 @@ def test_tweedie_score(regression_data, power, link): ], ) def test_tags(estimator, value): - assert estimator.__sklearn_tags__()["requires_positive_y"] is value + assert estimator.__sklearn_tags__().target_tags.positive_only is value def test_linalg_warning_with_newton_solver(global_random_seed): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6d2e9ff753b1c..e7637be8d654b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2276,13 +2276,14 @@ def test_search_cv_pairwise_property_delegated_to_base_estimator(pairwise): class TestEstimator(BaseEstimator): def __sklearn_tags__(self): - more_tags = {"pairwise": pairwise} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = pairwise + return tags est = TestEstimator() attr_message = "BaseSearchCV pairwise tag must match estimator" cv = GridSearchCV(est, {"n_neighbors": [10]}) - assert pairwise == cv.__sklearn_tags__()["pairwise"], attr_message + assert pairwise == cv.__sklearn_tags__().input_tags.pairwise, attr_message def test_search_cv__pairwise_property_delegated_to_base_estimator(): @@ -2299,8 +2300,9 @@ def __init__(self, pairwise=True): self.pairwise = pairwise def __sklearn_tags__(self): - more_tags = {"pairwise": self.pairwise} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.pairwise + return tags est = EstimatorPairwise() attr_message = "BaseSearchCV _pairwise property must match estimator" @@ -2308,7 +2310,9 @@ def 
__sklearn_tags__(self): for _pairwise_setting in [True, False]: est.set_params(pairwise=_pairwise_setting) cv = GridSearchCV(est, {"n_neighbors": [10]}) - assert _pairwise_setting == cv.__sklearn_tags__()["pairwise"], attr_message + assert ( + _pairwise_setting == cv.__sklearn_tags__().input_tags.pairwise + ), attr_message def test_search_cv_pairwise_property_equivalence_of_precomputed(): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 8c58570697042..72850228cb880 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2109,7 +2109,7 @@ def test_cv_pipeline_precomputed(): pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) # did the pipeline set the pairwise attribute? - assert pipeline.__sklearn_tags__()["pairwise"] + assert pipeline.__sklearn_tags__().input_tags.pairwise # test cross-validation, score should be almost perfect # NB: this test is pretty vacuous -- it's mainly to test integration diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 1ce771b667dff..7d7db00136323 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -869,7 +869,7 @@ def test_categories(density, drop): @pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): - assert "categorical" in Encoder().__sklearn_tags__()["X_types"] + assert Encoder().__sklearn_tags__().input_tags.categorical @pytest.mark.parametrize( diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index add3a0f35918c..5d88043c2fb2e 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -59,20 +59,23 @@ def __init__(self, a=None, b=None): class NaNTag(BaseEstimator): def __sklearn_tags__(self): - more_tags = {"allow_nan": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = 
super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags class NoNaNTag(BaseEstimator): def __sklearn_tags__(self): - more_tags = {"allow_nan": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags class OverrideTag(NaNTag): def __sklearn_tags__(self): - more_tags = {"allow_nan": False} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags class DiamondOverwriteTag(NaNTag, NoNaNTag): @@ -627,17 +630,17 @@ def test_tag_inheritance(): nan_tag_est = NaNTag() no_nan_tag_est = NoNaNTag() - assert nan_tag_est.__sklearn_tags__()["allow_nan"] - assert not no_nan_tag_est.__sklearn_tags__()["allow_nan"] + assert nan_tag_est.__sklearn_tags__().input_tags.allow_nan + assert not no_nan_tag_est.__sklearn_tags__().input_tags.allow_nan redefine_tags_est = OverrideTag() - assert not redefine_tags_est.__sklearn_tags__()["allow_nan"] + assert not redefine_tags_est.__sklearn_tags__().input_tags.allow_nan diamond_tag_est = DiamondOverwriteTag() - assert diamond_tag_est.__sklearn_tags__()["allow_nan"] + assert diamond_tag_est.__sklearn_tags__().input_tags.allow_nan inherit_diamond_tag_est = InheritDiamondOverwriteTag() - assert inherit_diamond_tag_est.__sklearn_tags__()["allow_nan"] + assert inherit_diamond_tag_est.__sklearn_tags__().input_tags.allow_nan def test_raises_on_get_params_non_attribute(): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 9a65bd74e1755..b8449d999455a 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -265,11 +265,11 @@ def test_fit_docstring_attributes(name, Estimator): y = _enforce_estimator_tags_y(est, y) X = _enforce_estimator_tags_X(est, X) - if "1dlabels" in est.__sklearn_tags__()["X_types"]: + if est.__sklearn_tags__().input_tags.one_d_labels: 
est.fit(y) - elif "2dlabels" in est.__sklearn_tags__()["X_types"]: + elif est.__sklearn_tags__().input_tags.two_d_labels: est.fit(np.c_[y, y]) - elif "3darray" in est.__sklearn_tags__()["X_types"]: + elif est.__sklearn_tags__().input_tags.three_d_array: est.fit(X[np.newaxis, ...], y) else: est.fit(X, y) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index e851b00954921..566b8f535c9cb 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -843,10 +843,10 @@ def test_pairwise_tag(MultiClassClassifier): clf_notprecomputed = svm.SVC() ovr_false = MultiClassClassifier(clf_notprecomputed) - assert not ovr_false.__sklearn_tags__()["pairwise"] + assert not ovr_false.__sklearn_tags__().input_tags.pairwise ovr_true = MultiClassClassifier(clf_precomputed) - assert ovr_true.__sklearn_tags__()["pairwise"] + assert ovr_true.__sklearn_tags__().input_tags.pairwise @pytest.mark.parametrize( diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index ec3b38c7262af..8891fa9189979 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1616,7 +1616,7 @@ def test_pipeline_get_tags_none(passthrough): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/18815 pipe = make_pipeline(passthrough, SVC()) - assert not pipe.__sklearn_tags__()["pairwise"] + assert not pipe.__sklearn_tags__().input_tags.pairwise # FIXME: Replace this test with a full `check_estimator` once we have API only diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 3ce37cce6a5be..a07ca429e8d74 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -336,8 +336,11 @@ def score(self, X=None, Y=None): return score def __sklearn_tags__(self): - more_tags = {"_skip_test": True, "X_types": ["1dlabel"]} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags._skip_test = True + tags.input_tags.two_d_array 
= False + tags.input_tags.one_d_labels = True + return tags # Deactivate key validation for CheckingClassifier because we want to be able to @@ -370,8 +373,9 @@ def predict_proba(self, X): return self.est.predict_proba(X) def __sklearn_tags__(self): - more_tags = {"_skip_test": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags._skip_test = True + return tags def _check_response(method): diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index ec4ff94bb4e12..ff8023133a70b 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -8,6 +8,48 @@ @dataclass class InputTags: + """Tags for the input data. + + Parameters + ---------- + one_d_array : bool + Whether the input can be a 1D array. + + two_d_array : bool + Whether the input can be a 2D array. + + three_d_array : bool + Whether the input can be a 3D array. + + one_d_labels : bool + Whether the input is a 1D labels(y). + + two_d_labels : bool + Whether the input is a 2D labels(y). + + sparse : bool + Whether the input can be a sparse matrix. + + categorical : bool + Whether the input can be categorical. + + string : bool + Whether the input can be an array-like of strings. + + dict : bool + Whether the input can be a dictionary. + + positive_only : bool + Whether the input has to be positive. + + allow_nan : bool + Whether the input can contain NaNs. + + pairwise : bool + Whether the input is in the form of a calculated pairwise distances or computed + kernel values. + """ + one_d_array = False two_d_array: bool = True three_d_array: bool = False @@ -24,6 +66,23 @@ class InputTags: @dataclass class TargetTags: + """Tags for the target data. + + Parameters + ---------- + required : bool + Whether the target is required. + + positive_only : bool + Whether the target has to be positive. + + multi_output : bool + Whether the target can be multi-output. + + single_output : bool + Whether the target can be single-output. 
+ """ + required: bool positive_only: bool = False multi_output: bool = False @@ -32,11 +91,36 @@ class TargetTags: @dataclass class TransformerTags: + """Tags for the transformer. + + Parameters + ---------- + preserves_dtype : list[object] + The data types that the transformer preserves. + """ + preserves_dtype: list[object] = field(default_factory=lambda: [np.float64]) @dataclass class ClassifierTags: + """Tags for the classifier. + + Parameters + ---------- + poor_score : bool + Whether the classifier can have a poor score in tests. + + binary : bool + Whether the classifier can handle binary classification. + + multi_class : bool + Whether the classifier can handle multi-class classification. + + multi_label : bool + Whether the classifier can handle multi-label classification. + """ + poor_score: bool = False binary: bool = True multi_class: bool = True @@ -45,12 +129,61 @@ class ClassifierTags: @dataclass class RegressorTags: + """Tags for the regressor. + + Parameters + ---------- + poor_score : bool + Whether the regressor can have a poor score in tests. + + multi_label : bool + Whether the regressor can handle multi-label regression. + """ + poor_score: bool = False multi_label: bool = False @dataclass class Tags: + """Tags for the estimator. + + Parameters + ---------- + target_tags : TargetTags + The target(y) tags. + + transformer_tags : TransformerTags + The transformer tags. + + classifier_tags : ClassifierTags + The classifier tags. + + regressor_tags : RegressorTags + The regressor tags. + + array_api_support : bool + Whether the estimator supports array API supporting input. + + no_validation : bool + Whether the estimator does not validate input. + + non_deterministic : bool + Whether the estimator is non-deterministic. + + requires_fit : bool + Whether the estimator requires fitting before other methods can be called. + + _skip_test : bool + Whether the estimator should be skipped in tests. 
+ + _xfail_checks : dict[str, str] + Checks that should be xfailed. + + input_tags : InputTags + The input data(X) tags. + """ + target_tags: TargetTags transformer_tags: TransformerTags classifier_tags: ClassifierTags diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 05cbd9275b242..f10d1b56a2690 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -52,7 +52,6 @@ check_dataframe_column_names_consistency, check_decision_proba_consistency, check_estimator, - check_estimator_get_tags_default_keys, check_estimators_unfitted, check_fit_check_is_fitted, check_fit_score_takes_y, @@ -457,14 +456,9 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): class TaggedBinaryClassifier(UntaggedBinaryClassifier): # Toy classifier that only supports binary classification. def __sklearn_tags__(self): - more_tags = {"binary_only": True} - return {**super().__sklearn_tags__(), **more_tags} - - -class EstimatorMissingDefaultTags(BaseEstimator): - def __sklearn_tags__(self): - tags = super().__sklearn_tags__().copy() - del tags["allow_nan"] + tags = super().__sklearn_tags__() + tags.classifier_tags.binary = True + tags.classifier_tags.multi_class = False return tags @@ -476,8 +470,9 @@ def fit(self, X, y): return super().fit(X, y) def __sklearn_tags__(self): - more_tags = {"requires_positive_X": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + return tags class RequiresPositiveYRegressor(LinearRegression): @@ -488,8 +483,9 @@ def fit(self, X, y): return super().fit(X, y) def __sklearn_tags__(self): - more_tags = {"requires_positive_y": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.positive_only = True + return tags class PoorScoreLogisticRegression(LogisticRegression): @@ -497,8 +493,9 @@ def 
decision_function(self, X): return super().decision_function(X) + 1 def __sklearn_tags__(self): - more_tags = {"poor_score": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.poor_score = True + return tags class PartialFitChecksName(BaseEstimator): @@ -852,20 +849,6 @@ def test_check_regressor_data_not_an_array(): ) -def test_check_estimator_get_tags_default_keys(): - estimator = EstimatorMissingDefaultTags() - err_msg = ( - r"EstimatorMissingDefaultTags.__sklearn_tags__\(\) is missing entries" - r" for the following default tags: {'allow_nan'}" - ) - with raises(AssertionError, match=err_msg): - check_estimator_get_tags_default_keys(estimator.__class__.__name__, estimator) - - # noop check when _get_tags is not available - estimator = MinimalTransformer() - check_estimator_get_tags_default_keys(estimator.__class__.__name__, estimator) - - def test_check_dataframe_column_names_consistency(): err_msg = "Estimator does not have a feature_names_in_" with raises(ValueError, match=err_msg): @@ -890,8 +873,9 @@ def fit(self, X, y): return self def __sklearn_tags__(self): - more_tags = {"multilabel": True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags def test_check_classifiers_multilabel_output_format_predict(): From d9e730586a755f76e5025c2c47d18cf36d47b087 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 13:45:49 +0200 Subject: [PATCH 32/48] ... 
--- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4f4ee23abacce..3cb0b51d9a7c6 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -139,7 +139,7 @@ def _yield_checks(estimator): yield partial(check_estimators_pickle, readonly_memmap=True) yield check_estimator_get_tags_default_keys - # yield check_estimator_tags_deprecated + yield check_estimator_tags_renamed if tags.array_api_support: for check in _yield_array_api_checks(estimator): @@ -4061,7 +4061,7 @@ def check_estimator_get_tags_default_keys(name, estimator_orig): ), f"{name}.__sklearn_tags__() must be an instance of Tags" -def check_estimator_tags_deprecated(name, estimator_orig): +def check_estimator_tags_renamed(name, estimator_orig): assert not hasattr(estimator_orig, "_more_tags"), ( "_more_tags() was removed in 1.6. " "Please use __sklearn_tags__ instead.", ) From c164d71ec1da07aff8f979f994f66a100f34ff0c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 14:54:41 +0200 Subject: [PATCH 33/48] self review --- sklearn/base.py | 6 ++++++ sklearn/cross_decomposition/_pls.py | 6 ++---- sklearn/decomposition/_lda.py | 1 + sklearn/manifold/tests/test_spectral_embedding.py | 2 +- sklearn/naive_bayes.py | 6 ++---- sklearn/utils/estimator_checks.py | 4 ++-- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 165176ffc6ac6..6655633ef7d2c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -881,6 +881,12 @@ def fit_predict(self, X, y=None, **kwargs): self.fit(X, **kwargs) return self.labels_ + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + if tags.transformer_tags is not None: + tags.transformer_tags.preserves_dtype = [] + return tags + class BiclusterMixin: """Mixin class for all bicluster estimators in scikit-learn. 
diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 0d95a9e2f8343..8ef3192ab60eb 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -19,8 +19,6 @@ RegressorMixin, TransformerMixin, _fit_context, - is_classifier, - is_regressor, ) from ..exceptions import ConvergenceWarning from ..utils import check_array, check_consistent_length @@ -555,9 +553,9 @@ def fit_transform(self, X, y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - if is_classifier(self): + if tags.classifier_tags is not None: tags.classifier_tags.poor_score = True - if is_regressor(self): + if tags.regressor_tags is not None: tags.regressor_tags.poor_score = True tags.target_tags.required = False return tags diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index d718282d3294e..782ab2b211052 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -549,6 +549,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.positive_only = True + tags.transformer_tags.preserves_dtype = [np.float32, np.float64] return tags def _check_non_neg_array(self, X, reset_n_features, whom): diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 922d3ae981119..6dec35123f9cc 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -444,7 +444,7 @@ def test_spectral_embedding_preserves_dtype(eigen_solver, dtype): attribute and transformed data. Ideally, this test should be covered by the common test - `check_transformer_preserves_dtypes`. However, this test only run + `check_transformer_preserve_dtypes`. However, this test only run with transformers implementing `transform` while `SpectralEmbedding` implements only `fit_transform`. 
""" diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 142669f140232..d3a072c3fa1be 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -18,8 +18,6 @@ BaseEstimator, ClassifierMixin, _fit_context, - is_classifier, - is_regressor, ) from .preprocessing import LabelBinarizer, binarize, label_binarize from .utils._param_validation import Interval @@ -768,9 +766,9 @@ def _init_counters(self, n_classes, n_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - if is_classifier(self): + if tags.classifier_tags is not None: tags.classifier_tags.poor_score = True - if is_regressor(self): + if tags.regressor_tags is not None: tags.regressor_tags.poor_score = True return tags diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3cb0b51d9a7c6..6d27daf98cf47 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4212,7 +4212,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_transformer_get_feature_names_out(name, transformer_orig): - tags = transformer_orig.__sklearn_tags__() + tags = _safe_tags(transformer_orig) if not tags.input_tags.two_d_array or tags.no_validation: return @@ -4267,7 +4267,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): "pandas is not installed: not checking column name consistency for pandas" ) - tags = transformer_orig.__sklearn_tags__() + tags = _safe_tags(transformer_orig) if not tags.input_tags.two_d_array or tags.no_validation: return From bc2f53ce55687ccdfe33200b34a9d0f9ed4a67da Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 15:01:47 +0200 Subject: [PATCH 34/48] codecov review --- sklearn/cross_decomposition/_pls.py | 5 +---- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/naive_bayes.py | 5 +---- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 
8ef3192ab60eb..a57fb7c6fa882 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -553,10 +553,7 @@ def fit_transform(self, X, y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - if tags.classifier_tags is not None: - tags.classifier_tags.poor_score = True - if tags.regressor_tags is not None: - tags.regressor_tags.poor_score = True + tags.regressor_tags.poor_score = True tags.target_tags.required = False return tags diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4459deb42b3cc..05b3f9037e781 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -449,7 +449,7 @@ def __sklearn_tags__(self): except (ValueError, AttributeError, TypeError): # This happens when the link or power parameter of TweedieRegressor is # invalid. We fallback on the default tags in that case. - pass + pass # pragma: no cover return tags def _get_loss(self): diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d3a072c3fa1be..e95cf1e98f5b0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -766,10 +766,7 @@ def _init_counters(self, n_classes, n_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - if tags.classifier_tags is not None: - tags.classifier_tags.poor_score = True - if tags.regressor_tags is not None: - tags.regressor_tags.poor_score = True + tags.classifier_tags.poor_score = True return tags From d8bb6a33e3b7c4f4d674e246c5eb013240a960aa Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 16:22:24 +0200 Subject: [PATCH 35/48] docs and API --- doc/developers/develop.rst | 158 +++----------- sklearn/compose/_target.py | 4 +- sklearn/ensemble/_bagging.py | 6 +- sklearn/ensemble/_base.py | 4 +- sklearn/ensemble/_forest.py | 4 +- sklearn/feature_selection/_base.py | 6 +- sklearn/feature_selection/_from_model.py | 4 +- sklearn/feature_selection/_rfe.py | 4 +- 
sklearn/feature_selection/_sequential.py | 4 +- .../_classification_threshold.py | 1 - sklearn/model_selection/_search.py | 6 +- sklearn/multiclass.py | 6 +- sklearn/pipeline.py | 6 +- sklearn/tests/test_common.py | 4 +- sklearn/utils/__init__.py | 4 + sklearn/utils/_tags.py | 204 ++++++++++++------ sklearn/utils/estimator_checks.py | 97 ++++----- sklearn/utils/metaestimators.py | 4 +- sklearn/utils/tests/test_estimator_checks.py | 1 - sklearn/utils/tests/test_tags.py | 4 +- 20 files changed, 247 insertions(+), 284 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 5c02715d028b1..fa1246a4bfc62 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -523,138 +523,44 @@ Estimator Tags The estimator tags are experimental and the API is subject to change. -Scikit-learn introduced estimator tags in version 0.21. These are annotations -of estimators that allow programmatic inspection of their capabilities, such as -sparse matrix support, supported output types and supported methods. The -estimator tags are a dictionary returned by the method ``__sklearn_tags__()``. These +.. note:: + + Scikit-learn introduced estimator tags in version 0.21 as a + private API and mostly used in tests. However, these tags expanded + over time and many third party developers also need to use + them. Therefore in version 1.6 the API for the tags was revamped + and exposed as public API. + +The estimator tags are annotations of estimators that allow +programmatic inspection of their capabilities, such as sparse matrix +support, supported output types and supported methods. The estimator +tags are an instance of :class:`~sklearn.utils.Tags` returned by the +method :meth:`~sklearn.base.BaseEstimator.__sklearn_tags__()`. These tags are used in the common checks run by the -:func:`~sklearn.utils.estimator_checks.check_estimator` function and the -:func:`~sklearn.utils.estimator_checks.parametrize_with_checks` decorator.
-Tags determine which checks to run and what input data is appropriate. Tags -can depend on estimator parameters or even system architecture and can in -general only be determined at runtime. - -The current set of estimator tags are: - -allow_nan (default=False) - whether the estimator supports data with missing values encoded as np.nan - -array_api_support (default=False) - whether the estimator supports Array API compatible inputs. - -binary_only (default=False) - whether estimator supports binary classification but lacks multi-class - classification support. - -multilabel (default=False) - whether the estimator supports multilabel output - -multioutput (default=False) - whether a regressor supports multi-target outputs or a classifier supports - multi-class multi-output. - -multioutput_only (default=False) - whether estimator supports only multi-output classification or regression. - -no_validation (default=False) - whether the estimator skips input-validation. This is only meant for - stateless and dummy transformers! - -non_deterministic (default=False) - whether the estimator is not deterministic given a fixed ``random_state`` - -pairwise (default=False) - This boolean attribute indicates whether the data (`X`) :term:`fit` and - similar methods consists of pairwise measures over samples rather than a - feature representation for each sample. It is usually `True` where an - estimator has a `metric` or `affinity` or `kernel` parameter with value - 'precomputed'. Its primary purpose is to support a :term:`meta-estimator` - or a cross validation procedure that extracts a sub-sample of data intended - for a pairwise estimator, where the data needs to be indexed on both axes. - Specifically, this tag is used by - `sklearn.utils.metaestimators._safe_split` to slice rows and - columns. - -preserves_dtype (default=``[np.float64]``) - applies only on transformers. 
It corresponds to the data types which will - be preserved such that `X_trans.dtype` is the same as `X.dtype` after - calling `transformer.transform(X)`. If this list is empty, then the - transformer is not expected to preserve the data type. The first value in - the list is considered as the default data type, corresponding to the data - type of the output when the input data type is not going to be preserved. - -poor_score (default=False) - whether the estimator fails to provide a "reasonable" test-set score, which - currently for regression is an R2 of 0.5 on ``make_regression(n_samples=200, - n_features=10, n_informative=1, bias=5.0, noise=20, random_state=42)``, and - for classification an accuracy of 0.83 on - ``make_blobs(n_samples=300, random_state=0)``. These datasets and values - are based on current estimators in sklearn and might be replaced by - something more systematic. - -requires_fit (default=True) - whether the estimator requires to be fitted before calling one of - `transform`, `predict`, `predict_proba`, or `decision_function`. - -requires_positive_X (default=False) - whether the estimator requires positive X. - -requires_y (default=False) - whether the estimator requires y to be passed to `fit`, `fit_predict` or - `fit_transform` methods. The tag is True for estimators inheriting from - `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`. - -requires_positive_y (default=False) - whether the estimator requires a positive y (only applicable for regression). - -_skip_test (default=False) - whether to skip common tests entirely. Don't use this unless you have a - *very good* reason. - -_xfail_checks (default=False) - dictionary ``{check_name: reason}`` of common checks that will be marked - as `XFAIL` for pytest, when using - :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`. 
These - checks will be simply ignored and not run by - :func:`~sklearn.utils.estimator_checks.check_estimator`, but a - `SkipTestWarning` will be raised. - Don't use this unless there is a *very good* reason for your estimator - not to pass the check. - Also note that the usage of this tag is highly subject to change because - we are trying to make it more flexible: be prepared for breaking changes - in the future. - -stateless (default=False) - whether the estimator needs access to data for fitting. Even though an - estimator is stateless, it might still need a call to ``fit`` for - initialization. - -X_types (default=['2darray']) - Supported input types for X as list of strings. Tests are currently only - run if '2darray' is contained in the list, signifying that the estimator - takes continuous 2d numpy arrays as input. The default value is - ['2darray']. Other possible types are ``'string'``, ``'sparse'``, - ``'categorical'``, ``dict``, ``'1dlabels'`` and ``'2dlabels'``. The goal is - that in the future the supported input type will determine the data used - during testing, in particular for ``'string'``, ``'sparse'`` and - ``'categorical'`` data. For now, the test for sparse data do not make use - of the ``'sparse'`` tag. - -It is unlikely that the default values for each tag will suit the needs of your -specific estimator. Additional tags can be created or default tags can be -overridden by defining a `__sklearn_tags__()` method which returns a dict with the -desired overridden tags or new tags. For example:: +:func:`~sklearn.utils.estimator_checks.check_estimator` function and +the :func:`~sklearn.utils.estimator_checks.parametrize_with_checks` +decorator. Tags determine which checks to run and what input data is +appropriate. Tags can depend on estimator parameters or even system +architecture and can in general only be determined at runtime and +are therefore instance attributes rather than class attributes. 
See +:class:`~sklearn.utils.Tags` for more information about individual +tags. + +It is unlikely that the default values for each tag will suit the +needs of your specific estimator. You can change the default values by +defining a `__sklearn_tags__()` method which returns the new values +for your estimator's tags. For example:: class MyMultiOutputEstimator(BaseEstimator): def __sklearn_tags__(self): - more_tags = {'multioutput_only': True, - 'non_deterministic': True} - return {**super().__sklearn_tags__(), **more_tags} + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + tags.non_deterministic = True + return tags -`__sklearn_tags__()` should return all tags in the dictionary. Note however that -**all tags must be present in the dict**. If any of the keys documented above is -not present in the output of `__sklearn_tags__()`, an error will occur. +You can create a new subclass of :class:`~sklearn.utils.Tags` if you wish +to add new tags to the existing set. In addition to the tags, estimators also need to declare any non-optional parameters to ``__init__`` in the ``_required_parameters`` class attribute, diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 2f69e32a24eae..167887640b7d5 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -17,7 +17,7 @@ process_routing, ) from ..utils._param_validation import HasMethods -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.validation import check_is_fitted __all__ = ["TransformedTargetRegressor"] @@ -356,7 +356,7 @@ def __sklearn_tags__(self): regressor = self._get_regressor() tags = super().__sklearn_tags__() tags.regressor_tags.poor_score = True - tags.target_tags.multi_output = _safe_tags(regressor).target_tags.multi_output + tags.target_tags.multi_output = get_tags(regressor).target_tags.multi_output return tags @property diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 
af76cc38a7574..f19e836a10cc3 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -24,7 +24,7 @@ ) from ..utils._mask import indices_to_mask from ..utils._param_validation import HasMethods, Interval, RealNotInt -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, @@ -640,9 +640,7 @@ def _get_estimator(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.allow_nan = _safe_tags( - self._get_estimator() - ).input_tags.allow_nan + tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan return tags diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 0ee3a5a82e1fa..1ba0b3bd005f1 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -11,7 +11,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor from ..utils import Bunch, check_random_state -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils._user_interface import _print_elapsed_time from ..utils.metadata_routing import _routing_enabled from ..utils.metaestimators import _BaseComposition @@ -295,7 +295,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() try: allow_nan = all( - _safe_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True + get_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True for est in self.estimators ) except Exception: diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 2fea3a9fde18c..2cbed44d596b0 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -66,7 +66,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree._tree import DOUBLE, DTYPE from ..utils import check_random_state, compute_sample_weight from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags 
import _safe_tags +from ..utils._tags import get_tags from ..utils.multiclass import check_classification_targets, type_of_target from ..utils.parallel import Parallel, delayed from ..utils.validation import ( @@ -715,7 +715,7 @@ def __sklearn_tags__(self): # Only the criterion is required to determine if the tree supports # missing values estimator = type(self.estimator)(criterion=self.criterion) - tags.input_tags.allow_nan = _safe_tags(estimator).input_tags.allow_nan + tags.input_tags.allow_nan = get_tags(estimator).input_tags.allow_nan return tags diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index a6b24672cda38..39b99f7b55404 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -13,7 +13,7 @@ from ..base import TransformerMixin from ..utils import _safe_indexing, check_array, safe_sqr from ..utils._set_output import _get_output_config -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.validation import _check_feature_names_in, _is_pandas_df, check_is_fitted @@ -97,13 +97,13 @@ def transform(self, X): output_config_dense = _get_output_config("transform", estimator=self)["dense"] preserve_X = output_config_dense != "default" and _is_pandas_df(X) - # note: we use _safe_tags instead of __sklearn_tags__ because this is a + # note: we use get_tags instead of __sklearn_tags__ because this is a # public Mixin. 
X = self._validate_data( X, dtype=None, accept_sparse="csr", - ensure_all_finite=not _safe_tags(self).input_tags.allow_nan, + ensure_all_finite=not get_tags(self).input_tags.allow_nan, cast_to_ndarray=not preserve_X, reset=False, ) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 01b6bbef41886..37aab009a63e4 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -9,7 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone from ..exceptions import NotFittedError from ..utils._param_validation import HasMethods, Interval, Options -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, @@ -520,5 +520,5 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan return tags diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 326f2b6def368..85c949c19767d 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -22,7 +22,7 @@ process_routing, ) from ..utils._param_validation import HasMethods, Interval, RealNotInt -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.metaestimators import _safe_split, available_if from ..utils.parallel import Parallel, delayed from ..utils.validation import ( @@ -539,7 +539,7 @@ def __sklearn_tags__(self): if tags.regressor_tags is not None: tags.regressor_tags.poor_score = True tags.target_tags.required = True - tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan return tags def get_metadata_routing(self): diff --git 
a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 0046c47c6849e..6cb0af516283f 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -20,7 +20,7 @@ process_routing, ) from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.validation import check_is_fitted from ._base import SelectorMixin @@ -327,7 +327,7 @@ def _get_support_mask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.allow_nan = _safe_tags(self.estimator).input_tags.allow_nan + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan return tags def get_metadata_routing(self): diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 0aa6e04408c79..26a94baa33f15 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -212,7 +212,6 @@ def decision_function(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.classifier_tags.binary = True tags.classifier_tags.multi_class = False tags._xfail_checks = { "check_classifiers_train": "Threshold at probability 0.5 does not hold", diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d99fff811543c..2d8ca76f355e7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -31,7 +31,7 @@ from ..utils import Bunch, check_random_state from ..utils._estimator_html_repr import _VisualBlock from ..utils._param_validation import HasMethods, Interval, StrOptions -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ..utils.deprecation import _deprecate_Xt_in_inverse_transform from ..utils.metadata_routing import ( MetadataRouter, @@ -482,12 +482,12 @@ def 
_estimator_type(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() # allows cross-validation to see 'precomputed' metrics - tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise + tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise tags._xfail_checks = { "check_supervised_y_2d": "DataConversionWarning not caught", "check_requires_y_none": "Doesn't fail gracefully", } - tags.array_api_support = _safe_tags(self.estimator).array_api_support + tags.array_api_support = get_tags(self.estimator).array_api_support return tags def score(self, X, y=None, **params): diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 07b8e14fa6ce5..49e74b48e93f5 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -50,7 +50,7 @@ from .preprocessing import LabelBinarizer from .utils import check_random_state from .utils._param_validation import HasMethods, Interval -from .utils._tags import _safe_tags +from .utils._tags import get_tags from .utils.metadata_routing import ( MetadataRouter, MethodMapping, @@ -592,7 +592,7 @@ def n_classes_(self): def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" tags = super().__sklearn_tags__() - tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise + tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise return tags def get_metadata_routing(self): @@ -993,7 +993,7 @@ def n_classes_(self): def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" tags = super().__sklearn_tags__() - tags.input_tags.pairwise = _safe_tags(self.estimator).input_tags.pairwise + tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise return tags def get_metadata_routing(self): diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index cc70767192cbe..a7ef4f2a7b6c0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -20,7 +20,7 @@ 
_get_container_adapter, _safe_set_output, ) -from .utils._tags import _safe_tags +from .utils._tags import get_tags from .utils._user_interface import _print_elapsed_time from .utils.deprecation import _deprecate_Xt_in_inverse_transform from .utils.metadata_routing import ( @@ -1028,14 +1028,14 @@ def __sklearn_tags__(self): } try: - tags.input_tags.pairwise = _safe_tags(self.steps[0][1]).input_tags.pairwise + tags.input_tags.pairwise = get_tags(self.steps[0][1]).input_tags.pairwise except (ValueError, AttributeError, TypeError): # This happens when the `steps` is not a list of (name, estimator) # tuples and `fit` is not called yet to validate the steps. pass try: - tags.target_tags.multi_output = _safe_tags( + tags.target_tags.multi_output = get_tags( self.steps[-1][1] ).target_tags.multi_output except (ValueError, AttributeError, TypeError): diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 7300bdf7b239d..d6da7073114d7 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -61,7 +61,7 @@ ) from sklearn.semi_supervised import LabelPropagation, LabelSpreading from sklearn.utils import all_estimators -from sklearn.utils._tags import _safe_tags +from sklearn.utils._tags import get_tags from sklearn.utils._testing import ( SkipTest, ignore_warnings, @@ -372,7 +372,7 @@ def check_field_types(tags, defaults): correct_tags = (correct_tags, dict) assert isinstance(getattr(tags, field.name), correct_tags) - tags = _safe_tags(estimator) + tags = get_tags(estimator) defaults = default_tags(estimator) check_field_types(tags, defaults) check_field_types(tags.input_tags, defaults.input_tags) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index cb06d90572c7e..b86fb72bb6ee2 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,6 +26,7 @@ shuffle, ) from ._mask import safe_mask +from ._tags import Tags, default_tags, get_tags from .class_weight import compute_class_weight, 
compute_sample_weight from .deprecation import deprecated from .discovery import all_estimators @@ -83,6 +84,9 @@ class parallel_backend(_joblib.parallel_backend): "safe_mask", "gen_batches", "gen_even_slices", + "Tags", + "default_tags", + "get_tags", ] diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index ff8023133a70b..97aa4a2591fef 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -1,10 +1,12 @@ -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause +from __future__ import annotations from dataclasses import dataclass, field import numpy as np +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + @dataclass class InputTags: @@ -12,42 +14,53 @@ class InputTags: Parameters ---------- - one_d_array : bool + one_d_array : bool (default=False) Whether the input can be a 1D array. - two_d_array : bool - Whether the input can be a 2D array. + two_d_array : bool (default=True) + Whether the input can be a 2D array. Note that most common + tests currently run only if this flag is set to ``True``. - three_d_array : bool + three_d_array : bool (default=False) Whether the input can be a 3D array. - one_d_labels : bool + one_d_labels : bool (default=False) Whether the input is a 1D labels(y). - two_d_labels : bool + two_d_labels : bool (default=False) Whether the input is a 2D labels(y). - sparse : bool + sparse : bool (default=False) Whether the input can be a sparse matrix. - categorical : bool + categorical : bool (default=False) Whether the input can be categorical. - string : bool + string : bool (default=False) Whether the input can be an array-like of strings. - dict : bool + dict : bool (default=False) Whether the input can be a dictionary. - positive_only : bool - Whether the input has to be positive. - - allow_nan : bool - Whether the input can contain NaNs. - - pairwise : bool - Whether the input is in the form of a calculated pairwise distances or computed - kernel values. 
+ positive_only : bool (default=False) + Whether the estimator requires positive X. + + allow_nan : bool (default=False) + Whether the estimator supports data with missing values encoded as np.nan + + pairwise : bool (default=False) + This boolean attribute indicates whether the data (`X`) + passed to :term:`fit` and similar methods consists of pairwise measures + over samples rather than a feature representation for each + sample. It is usually `True` where an estimator has a + `metric` or `affinity` or `kernel` parameter with value + 'precomputed'. Its primary purpose is to support a + :term:`meta-estimator` or a cross validation procedure that + extracts a sub-sample of data intended for a pairwise + estimator, where the data needs to be indexed on both axes. + Specifically, this tag is used by + `sklearn.utils.metaestimators._safe_split` to slice rows and + columns. """ one_d_array = False @@ -71,16 +84,22 @@ class TargetTags: Parameters ---------- required : bool - Whether the target is required. - - positive_only : bool - Whether the target has to be positive. - - multi_output : bool - Whether the target can be multi-output. - - single_output : bool - Whether the target can be single-output. + Whether the estimator requires y to be passed to `fit`, + `fit_predict` or `fit_transform` methods. The tag is ``True`` + for estimators inheriting from `~sklearn.base.RegressorMixin` + and `~sklearn.base.ClassifierMixin`. + + positive_only : bool (default=False) + Whether the estimator requires a positive y (only applicable + for regression). + + multi_output : bool (default=False) + Whether a regressor supports multi-target outputs or a classifier supports + multi-class multi-output. + + single_output : bool (default=True) + Whether the target can be single-output. This can be ``False`` if the + estimator supports only multi-output cases.
""" required: bool @@ -95,8 +114,15 @@ class TransformerTags: Parameters ---------- - preserves_dtype : list[object] - The data types that the transformer preserves. + preserves_dtype : list[object] (default=[np.float64]) + Applies only on transformers. It corresponds to the data types + which will be preserved such that `X_trans.dtype` is the same + as `X.dtype` after calling `transformer.transform(X)`. If this + list is empty, then the transformer is not expected to + preserve the data type. The first value in the list is + considered as the default data type, corresponding to the data + type of the output when the input data type is not going to be + preserved. """ preserves_dtype: list[object] = field(default_factory=lambda: [np.float64]) @@ -108,21 +134,24 @@ class ClassifierTags: Parameters ---------- - poor_score : bool - Whether the classifier can have a poor score in tests. - - binary : bool - Whether the classifier can handle binary classification. - - multi_class : bool - Whether the classifier can handle multi-class classification. - - multi_label : bool - Whether the classifier can handle multi-label classification. + poor_score : bool (default=False) + Whether the estimator fails to provide a "reasonable" test-set + score, which currently for classification is an accuracy of + 0.83 on ``make_blobs(n_samples=300, random_state=0)``. The + datasets and values are based on current estimators in sklearn + and might be replaced by something more systematic. + + multi_class : bool (default=True) + Whether the classifier can handle multi-class + classification. Note that all classifiers support binary + classification. Therefore this flag indicates whether the + classifier is a binary-classifier-only or not. + + multi_label : bool (default=False) + Whether the classifier supports multi-label output. 
""" poor_score: bool = False - binary: bool = True multi_class: bool = True multi_label: bool = False @@ -133,11 +162,16 @@ class RegressorTags: Parameters ---------- - poor_score : bool - Whether the regressor can have a poor score in tests. - - multi_label : bool - Whether the regressor can handle multi-label regression. + poor_score : bool (default=False) + Whether the estimator fails to provide a "reasonable" test-set + score, which currently for regression is an R2 of 0.5 on + ``make_regression(n_samples=200, n_features=10, + n_informative=1, bias=5.0, noise=20, random_state=42)``. The + dataset and values are based on current estimators in sklearn + and might be replaced by something more systematic. + + multi_label : bool (default=False) + Whether the regressor supports multilabel output. """ poor_score: bool = False @@ -162,32 +196,44 @@ class Tags: regressor_tags : RegressorTags The regressor tags. - array_api_support : bool - Whether the estimator supports array API supporting input. + array_api_support : bool (default=False) + Whether the estimator supports Array API compatible inputs. - no_validation : bool - Whether the estimator does not validate input. + no_validation : bool (default=False) + Whether the estimator skips input-validation. This is only meant for + stateless and dummy transformers! - non_deterministic : bool - Whether the estimator is non-deterministic. + non_deterministic : bool (default=False) + Whether the estimator is not deterministic given a fixed ``random_state``. - requires_fit : bool - Whether the estimator requires fitting before other methods can be called. + requires_fit : bool (default=True) + Whether the estimator requires to be fitted before calling one of + `transform`, `predict`, `predict_proba`, or `decision_function`. - _skip_test : bool - Whether the estimator should be skipped in tests. + _skip_test : bool (default=False) + Whether to skip common tests entirely. Don't use this unless + you have a *very good* reason. 
- _xfail_checks : dict[str, str] - Checks that should be xfailed. + _xfail_checks : dict[str, str] (default={}) + Dictionary ``{check_name: reason}`` of common checks that will + be marked as `XFAIL` for pytest, when using + :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`. These + checks will be simply ignored and not run by + :func:`~sklearn.utils.estimator_checks.check_estimator`, but a + `SkipTestWarning` will be raised. Don't use this unless there + is a *very good* reason for your estimator not to pass the + check. Also note that the usage of this tag is highly subject + to change because we are trying to make it more flexible: be + prepared for breaking changes in the future. input_tags : InputTags The input data(X) tags. """ target_tags: TargetTags - transformer_tags: TransformerTags - classifier_tags: ClassifierTags - regressor_tags: RegressorTags + transformer_tags: TransformerTags | None + classifier_tags: ClassifierTags | None + regressor_tags: RegressorTags | None array_api_support: bool = False no_validation: bool = False non_deterministic: bool = False @@ -197,9 +243,24 @@ class Tags: input_tags: InputTags = field(default_factory=InputTags) -def default_tags(estimator): +def default_tags(estimator) -> Tags: """Get the default tags for an estimator. + This ignores any ``__sklearn_tags__`` method that the estimator may have. + + If the estimator is a classifier or a regressor, ``target_tags.required`` + will be set to ``True``, otherwise it will be set to ``False``. + + ``transformer_tags`` will be set to ``TransformerTags()`` if the estimator + has a ``transform`` or ``fit_transform`` method, otherwise it will be set + to ``None``. + + ``classifier_tags`` will be set to ``ClassifierTags()`` if the estimator is + a classifier, otherwise it will be set to ``None``. + + ``regressor_tags`` will be set to ``RegressorTags()`` if the estimator is a + regressor, otherwise it will be set to ``None``. 
+ Parameters ---------- estimator : estimator object @@ -226,17 +287,18 @@ def default_tags(estimator): ) -def _safe_tags(estimator) -> Tags: - """Safely get estimator tags. +def get_tags(estimator) -> Tags: + """Get estimator tags. :class:`~sklearn.BaseEstimator` provides the estimator tags machinery. However, if an estimator does not inherit from this base class, we should fall-back to the default tags. For scikit-learn built-in estimators, we should still rely on - `self.__sklearn_tags__()`. `_safe_tags(est)` should be used when we are not sure - where `est` comes from: typically `_safe_tags(self.estimator)` where - `self` is a meta-estimator, or in the common checks. + `self.__sklearn_tags__()`. `get_tags(est)` should be used when we + are not sure where `est` comes from: typically + `get_tags(self.estimator)` where `self` is a meta-estimator, or in + the common checks. Parameters ---------- diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6d27daf98cf47..c5fd0e40c37d9 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -65,7 +65,7 @@ from . 
import shuffle from ._missing import is_scalar_nan from ._param_validation import Interval -from ._tags import Tags, _safe_tags +from ._tags import Tags, get_tags from ._testing import ( SkipTest, _array_api_for_tests, @@ -89,7 +89,7 @@ def _yield_checks(estimator): name = estimator.__class__.__name__ - tags = _safe_tags(estimator) + tags = get_tags(estimator) yield check_no_attributes_set_in_init yield check_estimators_dtypes @@ -147,7 +147,7 @@ def _yield_checks(estimator): def _yield_classifier_checks(classifier): - tags = _safe_tags(classifier) + tags = get_tags(classifier) # test classifiers can handle non-array data and pandas objects yield check_classifier_data_not_an_array @@ -218,7 +218,7 @@ def check_supervised_y_no_nan(name, estimator_orig): def _yield_regressor_checks(regressor): - tags = _safe_tags(regressor) + tags = get_tags(regressor) # TODO: test with intercept # TODO: test with multiple responses # basic testing @@ -243,7 +243,7 @@ def _yield_regressor_checks(regressor): def _yield_transformer_checks(transformer): - tags = _safe_tags(transformer) + tags = get_tags(transformer) # All transformers should either deal with sparse data or raise an # exception with type TypeError and an intelligible error message if not tags.no_validation: @@ -253,7 +253,7 @@ def _yield_transformer_checks(transformer): if tags.transformer_tags.preserves_dtype: yield check_transformer_preserve_dtypes yield partial(check_transformer_general, readonly_memmap=True) - if _safe_tags(transformer).requires_fit: + if get_tags(transformer).requires_fit: yield check_transformers_unfitted else: yield check_transformers_unfitted_stateless @@ -302,7 +302,7 @@ def _yield_outliers_checks(estimator): # test outlier detectors can handle non-array data yield check_classifier_data_not_an_array # test if NotFittedError is raised - if _safe_tags(estimator).requires_fit: + if get_tags(estimator).requires_fit: yield check_estimators_unfitted yield check_non_transformer_estimators_n_iter @@ 
-323,7 +323,7 @@ def _yield_array_api_checks(estimator): def _yield_all_checks(estimator): name = estimator.__class__.__name__ - tags = _safe_tags(estimator) + tags = get_tags(estimator) if not tags.input_tags.two_d_array: warnings.warn( "Can't test estimator {} which requires input of type {}".format( @@ -504,7 +504,7 @@ def _should_be_skipped_or_marked(estimator, check): check_name = check.func.__name__ if isinstance(check, partial) else check.__name__ - xfail_checks = _safe_tags(estimator)._xfail_checks or {} + xfail_checks = get_tags(estimator)._xfail_checks or {} if check_name in xfail_checks: return True, xfail_checks[check_name] @@ -1030,7 +1030,7 @@ def _check_estimator_sparse_container(name, estimator_orig, sparse_type): with ignore_warnings(category=FutureWarning): estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) for matrix_format, X in _generate_sparse_data(sparse_type(X)): # catch deprecation warnings with ignore_warnings(category=FutureWarning): @@ -1067,7 +1067,7 @@ def _check_estimator_sparse_container(name, estimator_orig, sparse_type): assert pred.shape == (X.shape[0],) if hasattr(estimator, "predict_proba"): probs = estimator.predict_proba(X) - if tags.classifier_tags.binary and not tags.classifier_tags.multi_class: + if not tags.classifier_tags.multi_class: expected_probs_shape = (X.shape[0], 2) else: expected_probs_shape = (X.shape[0], 4) @@ -1111,8 +1111,8 @@ def check_sample_weights_pandas_series(name, estimator_orig): y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) if ( - not _safe_tags(estimator).target_tags.single_output - and _safe_tags(estimator).target_tags.multi_output + not get_tags(estimator).target_tags.single_output + and get_tags(estimator).target_tags.multi_output ): y = pd.DataFrame(y, copy=False) try: @@ -1154,7 +1154,7 @@ def check_sample_weights_not_an_array(name, estimator_orig): X = 
_NotAnArray(_enforce_estimator_tags_X(estimator_orig, X)) y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = _NotAnArray([1] * 12) - tags = _safe_tags(estimator) + tags = get_tags(estimator) if not tags.target_tags.single_output and tags.target_tags.multi_output: y = _NotAnArray(y.data.reshape(-1, 1)) estimator.fit(X, y, sample_weight=weights) @@ -1327,7 +1327,7 @@ def check_dtype_object(name, estimator_orig): rng = np.random.RandomState(0) X = _enforce_estimator_tags_X(estimator_orig, rng.uniform(size=(40, 10))) X = X.astype(object) - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) y = (X[:, 0] * 4).astype(int) estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1560,12 +1560,8 @@ def check_methods_sample_order_invariance(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _enforce_estimator_tags_X(estimator_orig, X) y = X[:, 0].astype(np.int64) - tags = _safe_tags(estimator_orig) - if ( - tags.classifier_tags is not None - and tags.classifier_tags.binary - and not tags.classifier_tags.multi_class - ): + tags = get_tags(estimator_orig) + if tags.classifier_tags is not None and not tags.classifier_tags.multi_class: y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1795,7 +1791,7 @@ def _check_transformer(name, transformer_orig, X, y): X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) - if _safe_tags(transformer_orig).non_deterministic: + if get_tags(transformer_orig).non_deterministic: msg = name + " is non deterministic" raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): @@ -1835,7 +1831,7 @@ def _check_transformer(name, transformer_orig, X, y): # raises error on malformed input for transform if ( hasattr(X, "shape") - and _safe_tags(transformer).requires_fit + and get_tags(transformer).requires_fit and X.ndim == 2 and X.shape[1] > 1 ): @@ -1853,7 +1849,7 @@ def _check_transformer(name, 
transformer_orig, X, y): @ignore_warnings def check_pipeline_consistency(name, estimator_orig): - if _safe_tags(estimator_orig).non_deterministic: + if get_tags(estimator_orig).non_deterministic: msg = name + " is non deterministic" raise SkipTest(msg) @@ -1949,7 +1945,7 @@ def check_transformer_preserve_dtypes(name, transformer_orig): X = StandardScaler().fit_transform(X) X = _enforce_estimator_tags_X(transformer_orig, X) - for dtype in _safe_tags(transformer_orig).transformer_tags.preserves_dtype: + for dtype in get_tags(transformer_orig).transformer_tags.preserves_dtype: X_cast = X.astype(dtype) transformer = clone(transformer_orig) set_random_state(transformer) @@ -2073,7 +2069,7 @@ def check_estimators_pickle(name, estimator_orig, readonly_memmap=False): X = _enforce_estimator_tags_X(estimator_orig, X, kernel=rbf_kernel) - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) # include NaN values when the estimator should deal with them if tags.input_tags.allow_nan: # set randomly 10 elements to np.nan @@ -2144,7 +2140,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_classifier_multioutput(name, estimator): n_samples, n_labels, n_classes = 42, 5, 3 - tags = _safe_tags(estimator) + tags = get_tags(estimator) estimator = clone(estimator) X, y = make_multilabel_classification( random_state=42, n_samples=n_samples, n_labels=n_labels, n_classes=n_classes @@ -2257,7 +2253,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): pred = clusterer.labels_ assert pred.shape == (n_samples,) assert adjusted_rand_score(pred, y) > 0.4 - if _safe_tags(clusterer).non_deterministic: + if get_tags(clusterer).non_deterministic: return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -2383,7 +2379,7 @@ def check_classifiers_train( X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) problems = [(X_b, y_b)] - tags = 
_safe_tags(classifier_orig) + tags = get_tags(classifier_orig) if tags.classifier_tags.multi_class: problems.append((X_m, y_m)) @@ -2897,7 +2893,7 @@ def check_estimators_unfitted(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig): - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) rnd = np.random.RandomState(0) n_samples = 30 X = _enforce_estimator_tags_X(estimator_orig, rnd.uniform(size=(n_samples, 3))) @@ -3025,7 +3021,7 @@ def check_classifiers_classes(name, classifier_orig): y_names_binary = np.take(labels_binary, y_binary) problems = [(X_binary, y_binary, y_names_binary)] - if _safe_tags(classifier_orig).classifier_tags.multi_class: + if get_tags(classifier_orig).classifier_tags.multi_class: problems.append((X_multiclass, y_multiclass, y_names_multiclass)) for X, y, y_names in problems: @@ -3114,7 +3110,7 @@ def check_regressors_train( # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if not _safe_tags(regressor).regressor_tags.poor_score: + if not get_tags(regressor).regressor_tags.poor_score: assert regressor.score(X, y_) > 0.5 @@ -3137,7 +3133,7 @@ def check_regressors_no_decision_function(name, regressor_orig): @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig): - if _safe_tags(classifier_orig).classifier_tags.multi_class: + if get_tags(classifier_orig).classifier_tags.multi_class: problems = [2, 3] else: # binary only @@ -3151,7 +3147,7 @@ def check_class_weight_classifiers(name, classifier_orig): ) # can't use gram_if_pairwise() here, setting up gram matrix manually - if _safe_tags(classifier_orig).input_tags.pairwise: + if get_tags(classifier_orig).input_tags.pairwise: X_test = rbf_kernel(X_test, X_train) X_train = rbf_kernel(X_train, X_train) @@ -3177,7 +3173,7 @@ def check_class_weight_classifiers(name, classifier_orig): y_pred = 
classifier.predict(X_test) # XXX: Generally can use 0.89 here. On Windows, LinearSVC gets # 0.88 (Issue #9111) - if not _safe_tags(classifier_orig).classifier_tags.poor_score: + if not get_tags(classifier_orig).classifier_tags.poor_score: assert np.mean(y_pred == 0) > 0.87 @@ -3535,14 +3531,13 @@ def param_filter(p): def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data - tags = _safe_tags(estimator) + tags = get_tags(estimator) if tags.target_tags.positive_only: # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. y += 1 + abs(y.min()) if ( tags.classifier_tags is not None - and tags.classifier_tags.binary and not tags.classifier_tags.multi_class and y.size > 0 ): @@ -3557,14 +3552,14 @@ def _enforce_estimator_tags_y(estimator, y): def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): # Estimators with `1darray` in `X_types` tag only accept # X of shape (`n_samples`,) - if _safe_tags(estimator).input_tags.one_d_array: + if get_tags(estimator).input_tags.one_d_array: X = X[:, 0] # Estimators with a `requires_positive_X` tag only accept # strictly positive data - if _safe_tags(estimator).input_tags.positive_only: + if get_tags(estimator).input_tags.positive_only: X = X - X.min() - if _safe_tags(estimator).input_tags.categorical: - dtype = np.float64 if _safe_tags(estimator).input_tags.allow_nan else np.int32 + if get_tags(estimator).input_tags.categorical: + dtype = np.float64 if get_tags(estimator).input_tags.allow_nan else np.int32 X = np.round((X - X.min())).astype(dtype) if estimator.__class__.__name__ == "SkewedChi2Sampler": @@ -3575,7 +3570,7 @@ def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): # X of shape (`n_samples`, `n_samples`) if _is_pairwise_metric(estimator): X = pairwise_distances(X, metric="euclidean") - elif _safe_tags(estimator).input_tags.pairwise: + elif get_tags(estimator).input_tags.pairwise: X 
= kernel(X, X) return X @@ -3731,7 +3726,7 @@ def check_classifiers_regression_target(name, estimator_orig): X = _enforce_estimator_tags_X(estimator_orig, X) e = clone(estimator_orig) msg = "Unknown label type: " - if not _safe_tags(e).no_validation: + if not get_tags(e).no_validation: with raises(ValueError, match=msg): e.fit(X, y) @@ -3912,7 +3907,7 @@ def check_fit_check_is_fitted(name, estimator_orig): y = rng.randint(low=0, high=2, size=n_samples) y = _enforce_estimator_tags_y(estimator, y) - if _safe_tags(estimator).requires_fit: + if get_tags(estimator).requires_fit: # stateless estimators (such as FunctionTransformer) are always "fit"! try: check_is_fitted(estimator) @@ -3986,7 +3981,7 @@ def check_requires_y_none(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_n_features_in_after_fitting(name, estimator_orig): # Make sure that n_features_in are checked after fitting - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) is_supported_X_types = tags.input_tags.two_d_array or tags.input_tags.categorical @@ -4078,7 +4073,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): "pandas is not installed: not checking column name consistency for pandas" ) - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) is_supported_X_types = tags.input_tags.two_d_array or tags.input_tags.categorical if not is_supported_X_types or tags.no_validation: @@ -4212,7 +4207,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_transformer_get_feature_names_out(name, transformer_orig): - tags = _safe_tags(transformer_orig) + tags = get_tags(transformer_orig) if not tags.input_tags.two_d_array or tags.no_validation: return @@ -4267,7 +4262,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): "pandas is not installed: not checking column name consistency for pandas" ) - tags = _safe_tags(transformer_orig) + tags = get_tags(transformer_orig) if not 
tags.input_tags.two_d_array or tags.no_validation: return @@ -4323,7 +4318,7 @@ def check_param_validation(name, estimator_orig): X = rng.uniform(size=(20, 5)) y = rng.randint(0, 2, size=20) y = _enforce_estimator_tags_y(estimator_orig, y) - tags = _safe_tags(estimator_orig) + tags = get_tags(estimator_orig) estimator_params = estimator_orig.get_params(deep=False).keys() @@ -4433,7 +4428,7 @@ def check_param_validation(name, estimator_orig): def check_set_output_transform(name, transformer_orig): # Check transformer.set_output with the default configuration does not # change the transform output. - tags = _safe_tags(transformer_orig) + tags = get_tags(transformer_orig) if not tags.input_tags.two_d_array or tags.no_validation: return @@ -4621,7 +4616,7 @@ def _check_set_output_transform_dataframe( or a global context by using the `with config_context(...)` """ # Check transformer.set_output configures the output of transform="pandas". - tags = _safe_tags(transformer_orig) + tags = get_tags(transformer_orig) if not tags.input_tags.two_d_array or tags.no_validation: return diff --git a/sklearn/utils/metaestimators.py b/sklearn/utils/metaestimators.py index 6da13f35c70eb..dced64f2fe392 100644 --- a/sklearn/utils/metaestimators.py +++ b/sklearn/utils/metaestimators.py @@ -11,7 +11,7 @@ from ..base import BaseEstimator from ..utils import _safe_indexing -from ..utils._tags import _safe_tags +from ..utils._tags import get_tags from ._available_if import available_if __all__ = ["available_if"] @@ -139,7 +139,7 @@ def _safe_split(estimator, X, y, indices, train_indices=None): Indexed targets. 
""" - if _safe_tags(estimator).input_tags.pairwise: + if get_tags(estimator).input_tags.pairwise: if not hasattr(X, "shape"): raise ValueError( "Precomputed kernels or affinity matrices have " diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index f10d1b56a2690..800cd6cf80409 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -457,7 +457,6 @@ class TaggedBinaryClassifier(UntaggedBinaryClassifier): # Toy classifier that only supports binary classification. def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.classifier_tags.binary = True tags.classifier_tags.multi_class = False return tags diff --git a/sklearn/utils/tests/test_tags.py b/sklearn/utils/tests/test_tags.py index 587430c45fb75..4f4e1928db26f 100644 --- a/sklearn/utils/tests/test_tags.py +++ b/sklearn/utils/tests/test_tags.py @@ -1,7 +1,7 @@ import pytest from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin -from sklearn.utils._tags import _safe_tags +from sklearn.utils._tags import get_tags class NoTagsEstimator: @@ -23,4 +23,4 @@ class ClassifierEstimator: ], ) def test_requires_y(estimator, value): - assert _safe_tags(estimator).target_tags.required == value + assert get_tags(estimator).target_tags.required == value From b9dada7998b1a402c12d68a62b855965e38816b2 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2024 16:24:41 +0200 Subject: [PATCH 36/48] changelog --- doc/whats_new/v1.6.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index a6a936ac4ccd4..138620ebfd560 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -26,7 +26,7 @@ Changes impacting many modules ------------------------------ - |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. - :pr:`22606` by `Thomas Fan`_. 
+ :pr:`22606` by `Thomas Fan`_ and :pr:`29677` by `Adrin Jalali`_. Support for Array API From 79aefe0a1c7c14115e70917069ea2b784ea13508 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 23 Aug 2024 12:54:29 +0200 Subject: [PATCH 37/48] Update sklearn/utils/_tags.py Co-authored-by: Omar Salman --- sklearn/utils/_tags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 97aa4a2591fef..13c72186d6514 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -63,7 +63,7 @@ class InputTags: columns. """ - one_d_array = False + one_d_array: bool = False two_d_array: bool = True three_d_array: bool = False one_d_labels = False From 43ee0fbf5a449d260142d00affeb3026be398ef1 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 23 Aug 2024 12:54:36 +0200 Subject: [PATCH 38/48] Update sklearn/utils/_tags.py Co-authored-by: Omar Salman --- sklearn/utils/_tags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 13c72186d6514..493bd723e55c5 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -46,7 +46,7 @@ class InputTags: Whether the estimator requires positive X. allow_nan : bool (default=False) - Whether the estimator supports data with missing values encoded as np.nan + Whether the estimator supports data with missing values encoded as np.nan. 
pairwise : bool (default=False) This boolean attribute indicates whether the data (`X`) From 2283851c992f74faec4438e5c689ce2bea6297cc Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 23 Aug 2024 12:54:45 +0200 Subject: [PATCH 39/48] Update sklearn/utils/_tags.py Co-authored-by: Omar Salman --- sklearn/utils/_tags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 493bd723e55c5..4d114a671f115 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -49,7 +49,7 @@ class InputTags: Whether the estimator supports data with missing values encoded as np.nan. pairwise : bool (default=False) - This boolean attribute indicates whether the data (`X`) + This boolean attribute indicates whether the data (`X`), :term:`fit` and similar methods consists of pairwise measures over samples rather than a feature representation for each sample. It is usually `True` where an estimator has a From b740987948aae9f514dee9d9ec4a5c925f6285c5 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 27 Aug 2024 10:45:42 +0200 Subject: [PATCH 40/48] Omar's comments --- sklearn/feature_extraction/text.py | 1 - sklearn/utils/estimator_checks.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index feeb49496ed38..98875362f1cde 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1711,7 +1711,6 @@ def transform(self, X, copy=True): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.two_d_array = True tags.input_tags.sparse = True # FIXME: np.float16 could be preserved if _inplace_csr_row_normalize_l2 # accepted it. 
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c5fd0e40c37d9..738dd34759f89 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2443,10 +2443,7 @@ def check_classifiers_train( # decision_function agrees with predict decision = classifier.decision_function(X) if n_classes == 2: - if ( - not tags.target_tags.multi_output - or tags.target_tags.single_output - ): + if tags.target_tags.single_output: assert decision.shape == (n_samples,) else: assert decision.shape == (n_samples, 1) From 789c52ab8dbd283c968f5155b4cc2d6740bf1204 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 27 Aug 2024 13:34:09 +0200 Subject: [PATCH 41/48] Update sklearn/multioutput.py Co-authored-by: Omar Salman --- sklearn/multioutput.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 9e7b391ac0ff6..05867f7318fdc 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -1255,7 +1255,6 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - # FIXME tags.target_tags.single_output = False tags.target_tags.multi_output = True return tags From 07b4bfd4c64846e4a6fd8aac87c471640af0f001 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 30 Aug 2024 14:56:18 +0200 Subject: [PATCH 42/48] Update sklearn/utils/_tags.py Co-authored-by: Guillaume Lemaitre --- sklearn/utils/_tags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 4d114a671f115..fe97ae01866dd 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -66,7 +66,7 @@ class InputTags: one_d_array: bool = False two_d_array: bool = True three_d_array: bool = False - one_d_labels = False + one_d_labels: bool = False two_d_labels: bool = False sparse: bool = False categorical: bool = False From 635ce30710182affcd1391be00badf888a2cf96c Mon Sep 17 00:00:00 2001 From: adrinjalali 
Date: Fri, 30 Aug 2024 15:12:30 +0200 Subject: [PATCH 43/48] fix tab/space --- doc/developers/develop.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index fa1246a4bfc62..cfc30ed3b374b 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -554,10 +554,10 @@ for your estimator's tags. For example:: class MyMultiOutputEstimator(BaseEstimator): def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.target_tags.single_output = False - tags.non_deterministic = True - return tags + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + tags.non_deterministic = True + return tags You can create a new subclass of :class:`~sklearn.utils.Tags` if you wish to add new tags to the existing set. From 241559d6506c1af354d9b6b53883e5fd83619b2d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 2 Sep 2024 16:35:29 +0200 Subject: [PATCH 44/48] Add tags to docs --- doc/api_reference.py | 8 ++++++++ sklearn/utils/__init__.py | 16 +++++++++++++++- sklearn/utils/_tags.py | 10 +++++----- sklearn/utils/tests/test_tags.py | 1 + 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/doc/api_reference.py b/doc/api_reference.py index 39eac12c1c6cf..102ff191aea88 100644 --- a/doc/api_reference.py +++ b/doc/api_reference.py @@ -1159,6 +1159,14 @@ def _get_submodule(module_name, submodule_name): "safe_mask", "safe_sqr", "shuffle", + "Tags", + "InputTags", + "TargetTags", + "ClassifierTags", + "RegressorTags", + "TransformerTags", + "default_tags", + "get_tags", ], }, { diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index b86fb72bb6ee2..1a7a43fdbc01f 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,7 +26,16 @@ shuffle, ) from ._mask import safe_mask -from ._tags import Tags, default_tags, get_tags +from ._tags import ( + ClassifierTags, + InputTags, + RegressorTags, + Tags, + TargetTags, + 
TransformerTags, + default_tags, + get_tags, +) from .class_weight import compute_class_weight, compute_sample_weight from .deprecation import deprecated from .discovery import all_estimators @@ -85,6 +94,11 @@ class parallel_backend(_joblib.parallel_backend): "gen_batches", "gen_even_slices", "Tags", + "InputTags", + "TargetTags", + "ClassifierTags", + "RegressorTags", + "TransformerTags", "default_tags", "get_tags", ] diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index fe97ae01866dd..fbdb54219845e 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -184,16 +184,16 @@ class Tags: Parameters ---------- - target_tags : TargetTags + target_tags : :class:`TargetTags` The target(y) tags. - transformer_tags : TransformerTags + transformer_tags : :class:`TransformerTags` The transformer tags. - classifier_tags : ClassifierTags + classifier_tags : :class:`ClassifierTags` The classifier tags. - regressor_tags : RegressorTags + regressor_tags : :class:`RegressorTags` The regressor tags. array_api_support : bool (default=False) @@ -226,7 +226,7 @@ class Tags: to change because we are trying to make it more flexible: be prepared for breaking changes in the future. - input_tags : InputTags + input_tags : :class:`InputTags` The input data(X) tags. """ diff --git a/sklearn/utils/tests/test_tags.py b/sklearn/utils/tests/test_tags.py index 4f4e1928db26f..a6dab5078e5ac 100644 --- a/sklearn/utils/tests/test_tags.py +++ b/sklearn/utils/tests/test_tags.py @@ -9,6 +9,7 @@ class NoTagsEstimator: class ClassifierEstimator: + # This is to test whether not inheriting from mixins works. 
_estimator_type = "classifier" From 9c504dde337d9c2320b5b8f5713c08785067d228 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 2 Sep 2024 16:57:32 +0200 Subject: [PATCH 45/48] preserves_dtype is not a list of str --- sklearn/cluster/_birch.py | 2 +- sklearn/cluster/_bisect_k_means.py | 2 +- sklearn/decomposition/_dict_learning.py | 6 +++--- sklearn/decomposition/_fastica.py | 2 +- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/decomposition/_lda.py | 2 +- sklearn/decomposition/_nmf.py | 2 +- sklearn/decomposition/_pca.py | 2 +- sklearn/decomposition/_sparse_pca.py | 2 +- sklearn/decomposition/_truncated_svd.py | 2 +- sklearn/feature_extraction/text.py | 2 +- sklearn/feature_selection/_univariate_selection.py | 2 +- sklearn/kernel_approximation.py | 6 +++--- sklearn/manifold/_isomap.py | 2 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/preprocessing/_data.py | 2 +- sklearn/random_projection.py | 2 +- sklearn/utils/estimator_checks.py | 4 +++- 18 files changed, 24 insertions(+), 22 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 54e1bb64b64f6..a802b27ea9d39 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -737,5 +737,5 @@ def _global_clustering(self, X=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index 23945bc77552b..dd7c6d11db51a 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -529,5 +529,5 @@ def _predict_recursive(self, X, sample_weight, cluster_node): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git 
a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 699dc1ba42e8a..dabb5734f00f7 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1350,7 +1350,7 @@ def transform(self, X, y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.requires_fit = False - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @property @@ -1708,7 +1708,7 @@ def _n_features_out(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @@ -2306,5 +2306,5 @@ def _n_features_out(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 751cfbd24472f..2e196f6de7d4f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -792,5 +792,5 @@ def _n_features_out(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 7888e976f9b03..25646edf98e10 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -565,7 +565,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] tags.input_tags.pairwise = self.kernel == 
"precomputed" return tags diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 782ab2b211052..92052ed233240 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -549,7 +549,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.positive_only = True - tags.transformer_tags.preserves_dtype = [np.float32, np.float64] + tags.transformer_tags.preserves_dtype = ["float32", "float64"] return tags def _check_non_neg_array(self, X, reset_n_features, whom): diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index c39171af4b270..cc741e103db1e 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1343,7 +1343,7 @@ def _n_features_out(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.positive_only = True - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index c10202a51940e..4fe2d38cea74c 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -848,6 +848,6 @@ def score(self, X, y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] tags.array_api_support = True return tags diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 273d1cbcd344e..1feb700cb2945 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -152,7 +152,7 @@ def _n_features_out(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = 
["float64", "float32"] return tags diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 408ef9a9f138a..bc4f5a47179cc 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -312,7 +312,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @property diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 98875362f1cde..e26db9e94e0dc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1714,7 +1714,7 @@ def __sklearn_tags__(self): tags.input_tags.sparse = True # FIXME: np.float16 could be preserved if _inplace_csr_row_normalize_l2 # accepted it. - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 5d86420a0c64d..27999feda757f 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -1153,7 +1153,7 @@ def _make_selector(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags def _check_params(self, X, y): diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 6d5805124c2be..d6951f8c70395 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -403,7 +403,7 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + 
tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @@ -563,7 +563,7 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @@ -1095,5 +1095,5 @@ def __sklearn_tags__(self): "dtypes are preserved but not at a close enough precision" ) } - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index b0b7f4402512e..ee302bc07b384 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -437,5 +437,5 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 5755416cc4c70..33821a6c2c728 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -448,5 +448,5 @@ def __sklearn_tags__(self): "fails for the score_samples method" ), } - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 6b98e2c924872..063283c972449 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1112,7 +1112,7 @@ def inverse_transform(self, X, copy=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/random_projection.py 
b/sklearn/random_projection.py index 06b591bcd8b90..63c28ddc8b8d8 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -458,7 +458,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.transformer_tags.preserves_dtype = [np.float64, np.float32] + tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 566c5c07d3748..9c189894fe552 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1945,8 +1945,10 @@ def check_transformer_preserve_dtypes(name, transformer_orig): X = StandardScaler().fit_transform(X) X = _enforce_estimator_tags_X(transformer_orig, X) + dtype_map = {"float64": np.float64, "float32": np.float32} + for dtype in get_tags(transformer_orig).transformer_tags.preserves_dtype: - X_cast = X.astype(dtype) + X_cast = X.astype(dtype_map[dtype]) transformer = clone(transformer_orig) set_random_state(transformer) X_trans1 = transformer.fit_transform(X_cast, y) From 99486dbfd6c330ed8892cf0bc6fde410dc2e1df7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 3 Sep 2024 12:37:06 +0200 Subject: [PATCH 46/48] add missing required change in _tags.py --- sklearn/utils/_tags.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index fbdb54219845e..490f2bcf48baa 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -2,8 +2,6 @@ from dataclasses import dataclass, field -import numpy as np - # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause @@ -125,7 +123,7 @@ class TransformerTags: preserved. 
""" - preserves_dtype: list[object] = field(default_factory=lambda: [np.float64]) + preserves_dtype: list[str] = field(default_factory=lambda: ["float64"]) @dataclass From 6ac2b7c9d0648b3ad36a906ade6feef8a1f1076e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 4 Sep 2024 13:14:35 +0200 Subject: [PATCH 47/48] Most Guillaume's comments --- doc/glossary.rst | 3 +- doc/whats_new/v1.6.rst | 1 + sklearn/preprocessing/_label.py | 6 +- sklearn/tests/test_docstring_parameters.py | 4 +- sklearn/utils/_mocking.py | 2 +- sklearn/utils/_tags.py | 91 +++++++++++----------- sklearn/utils/estimator_checks.py | 7 +- 7 files changed, 57 insertions(+), 57 deletions(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 84a628b0f716d..d2df0d959a9c0 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -407,8 +407,7 @@ General Concepts likelihoods. estimator tags - A proposed feature (e.g. :issue:`8022`) by which the capabilities of an - estimator are described through a set of semantic tags. This would + Estimator tags describe certain capabilities of an estimator. This would enable some runtime behaviors based on estimator inspection, but it also allows each estimator to be tested for appropriate invariances while being excepted from other :term:`common tests`. diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index fc60563a95077..d4d373e9bf280 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -26,6 +26,7 @@ Changes impacting many modules ------------------------------ - |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators. + More details in :ref:`estimator_tags`. :pr:`22606` by `Thomas Fan`_ and :pr:`29677` by `Adrin Jalali`_. 
diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index b6b2f9e90c813..345d55556459b 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -167,7 +167,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.array_api_support = True tags.input_tags.two_d_array = False - tags.input_tags.one_d_labels = True + tags.target_tags.one_d_labels = True return tags @@ -422,7 +422,7 @@ def inverse_transform(self, Y, threshold=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.two_d_array = False - tags.input_tags.one_d_labels = True + tags.target_tags.one_d_labels = True return tags @@ -959,5 +959,5 @@ def inverse_transform(self, yt): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.two_d_array = False - tags.input_tags.two_d_labels = True + tags.target_tags.two_d_labels = True return tags diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index b8449d999455a..25381856b08a0 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -265,9 +265,9 @@ def test_fit_docstring_attributes(name, Estimator): y = _enforce_estimator_tags_y(est, y) X = _enforce_estimator_tags_X(est, X) - if est.__sklearn_tags__().input_tags.one_d_labels: + if est.__sklearn_tags__().target_tags.one_d_labels: est.fit(y) - elif est.__sklearn_tags__().input_tags.two_d_labels: + elif est.__sklearn_tags__().target_tags.two_d_labels: est.fit(np.c_[y, y]) elif est.__sklearn_tags__().input_tags.three_d_array: est.fit(X[np.newaxis, ...], y) diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index a07ca429e8d74..5e9973f373d9d 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -339,7 +339,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags._skip_test = True tags.input_tags.two_d_array = False - tags.input_tags.one_d_labels = 
True + tags.target_tags.one_d_labels = True return tags diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py index 490f2bcf48baa..c800254b8b4e4 100644 --- a/sklearn/utils/_tags.py +++ b/sklearn/utils/_tags.py @@ -12,41 +12,35 @@ class InputTags: Parameters ---------- - one_d_array : bool (default=False) + one_d_array : bool, default=False Whether the input can be a 1D array. - two_d_array : bool (default=True) + two_d_array : bool, default=True Whether the input can be a 2D array. Note that most common tests currently run only if this flag is set to ``True``. - three_d_array : bool (default=False) + three_d_array : bool, default=False Whether the input can be a 3D array. - one_d_labels : bool (default=False) - Whether the input is a 1D labels(y). - - two_d_labels : bool (default=False) - Whether the input is a 2D labels(y). - - sparse : bool (default=False) + sparse : bool, default=False Whether the input can be a sparse matrix. - categorical : bool (default=False) + categorical : bool, default=False Whether the input can be categorical. - string : bool (default=False) + string : bool, default=False Whether the input can be an array-like of strings. - dict : bool (default=False) + dict : bool, default=False Whether the input can be a dictionary. - positive_only : bool (default=False) + positive_only : bool, default=False Whether the estimator requires positive X. - allow_nan : bool (default=False) - Whether the estimator supports data with missing values encoded as np.nan. + allow_nan : bool, default=False + Whether the estimator supports data with missing values encoded as `np.nan`. 
- pairwise : bool (default=False) + pairwise : bool, default=False This boolean attribute indicates whether the data (`X`), :term:`fit` and similar methods consists of pairwise measures over samples rather than a feature representation for each @@ -64,8 +58,6 @@ class InputTags: one_d_array: bool = False two_d_array: bool = True three_d_array: bool = False - one_d_labels: bool = False - two_d_labels: bool = False sparse: bool = False categorical: bool = False string: bool = False @@ -87,20 +79,28 @@ class TargetTags: for estimators inheriting from `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`. - positive_only : bool (default=False) + one_d_labels : bool, default=False + Whether the input is a 1D labels (y). + + two_d_labels : bool, default=False + Whether the input is a 2D labels (y). + + positive_only : bool, default=False Whether the estimator requires a positive y (only applicable for regression). - multi_output : bool (default=False) + multi_output : bool, default=False Whether a regressor supports multi-target outputs or a classifier supports multi-class multi-output. - single_output : bool (default=True) + single_output : bool, default=True Whether the target can be single-output. This can be ``False`` if the estimator supports only multi-output cases. """ required: bool + one_d_labels: bool = False + two_d_labels: bool = False positive_only: bool = False multi_output: bool = False single_output: bool = True @@ -112,7 +112,7 @@ class TransformerTags: Parameters ---------- - preserves_dtype : list[object] (default=[np.float64]) + preserves_dtype : list[str], default=["float64"] Applies only on transformers. It corresponds to the data types which will be preserved such that `X_trans.dtype` is the same as `X.dtype` after calling `transformer.transform(X)`. 
If this @@ -132,20 +132,20 @@ class ClassifierTags: Parameters ---------- - poor_score : bool (default=False) + poor_score : bool, default=False Whether the estimator fails to provide a "reasonable" test-set score, which currently for classification is an accuracy of 0.83 on ``make_blobs(n_samples=300, random_state=0)``. The - datasets and values are based on current estimators in sklearn + datasets and values are based on current estimators in scikit-learn and might be replaced by something more systematic. - multi_class : bool (default=True) + multi_class : bool, default=True Whether the classifier can handle multi-class classification. Note that all classifiers support binary classification. Therefore this flag indicates whether the classifier is a binary-classifier-only or not. - multi_label : bool (default=False) + multi_label : bool, default=False Whether the classifier supports multi-label output. """ @@ -160,15 +160,15 @@ class RegressorTags: Parameters ---------- - poor_score : bool (default=False) + poor_score : bool, default=False Whether the estimator fails to provide a "reasonable" test-set score, which currently for regression is an R2 of 0.5 on ``make_regression(n_samples=200, n_features=10, n_informative=1, bias=5.0, noise=20, random_state=42)``. The - dataset and values are based on current estimators in sklearn + dataset and values are based on current estimators in scikit-learn and might be replaced by something more systematic. - multi_label : bool (default=False) + multi_label : bool, default=False Whether the regressor supports multilabel output. """ @@ -185,34 +185,34 @@ class Tags: target_tags : :class:`TargetTags` The target(y) tags. - transformer_tags : :class:`TransformerTags` + transformer_tags : :class:`TransformerTags` or None The transformer tags. - classifier_tags : :class:`ClassifierTags` + classifier_tags : :class:`ClassifierTags` or None The classifier tags. 
- regressor_tags : :class:`RegressorTags` + regressor_tags : :class:`RegressorTags` or None The regressor tags. - array_api_support : bool (default=False) + array_api_support : bool, default=False Whether the estimator supports Array API compatible inputs. - no_validation : bool (default=False) + no_validation : bool, default=False Whether the estimator skips input-validation. This is only meant for stateless and dummy transformers! - non_deterministic : bool (default=False) + non_deterministic : bool, default=False Whether the estimator is not deterministic given a fixed ``random_state``. - requires_fit : bool (default=True) + requires_fit : bool, default=True Whether the estimator requires to be fitted before calling one of `transform`, `predict`, `predict_proba`, or `decision_function`. - _skip_test : bool (default=False) + _skip_test : bool, default=False Whether to skip common tests entirely. Don't use this unless you have a *very good* reason. - _xfail_checks : dict[str, str] (default={}) + _xfail_checks : dict[str, str], default={} Dictionary ``{check_name: reason}`` of common checks that will be marked as `XFAIL` for pytest, when using :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`. These @@ -249,15 +249,16 @@ def default_tags(estimator) -> Tags: If the estimator is a classifier or a regressor, ``target_tags.required`` will be set to ``True``, otherwise it will be set to ``False``. - ``transformer_tags`` will be set to ``TransformerTags()`` if the estimator - has a ``transform`` or ``fit_transform`` method, otherwise it will be set + ``transformer_tags`` will be set to :class:`~.sklearn.utils. TransformerTags` if the + estimator has a ``transform`` or ``fit_transform`` method, otherwise it will be set to ``None``. - ``classifier_tags`` will be set to ``ClassifierTags()`` if the estimator is + ``classifier_tags`` will be set to :class:`~.sklearn.utils.ClassifierTags` if the + estimator is a classifier, otherwise it will be set to ``None``. 
a classifier, otherwise it will be set to ``None``. - ``regressor_tags`` will be set to ``RegressorTags()`` if the estimator is a - regressor, otherwise it will be set to ``None``. + ``regressor_tags`` will be set to :class:`~.sklearn.utils.RegressorTags` if the + estimator is a regressor, otherwise it will be set to ``None``. Parameters ---------- @@ -305,7 +306,7 @@ def get_tags(estimator) -> Tags: Returns ------- - tags : Tags + tags : :class:`~.sklearn.utils.Tags` The estimator tags. """ if hasattr(estimator, "__sklearn_tags__"): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e2cf479e00974..0e4aa34e1db37 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3144,8 +3144,7 @@ def check_regressors_no_decision_function(name, regressor_orig): def check_class_weight_classifiers(name, classifier_orig): if get_tags(classifier_orig).classifier_tags.multi_class: problems = [2, 3] - else: - # binary only + else: # binary classification only problems = [2] for n_centers in problems: @@ -4392,7 +4391,7 @@ def check_param_validation(name, estimator_orig): ) with raises(InvalidParameterError, match=match, err_msg=err_msg): - if tags.input_tags.one_d_labels or tags.input_tags.two_d_labels: + if tags.target_tags.one_d_labels or tags.target_tags.two_d_labels: # The estimator is a label transformer and take only `y` getattr(estimator, method)(y) else: @@ -4427,7 +4426,7 @@ def check_param_validation(name, estimator_orig): ) with raises(InvalidParameterError, match=match, err_msg=err_msg): - if tags.input_tags.one_d_labels or tags.input_tags.two_d_labels: + if tags.target_tags.one_d_labels or tags.target_tags.two_d_labels: # The estimator is a label transformer and take only `y` getattr(estimator, method)(y) else: From 88af8964bc0aa8f8a1410c5e1207e54bbe8cb635 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 4 Sep 2024 13:24:42 +0200 Subject: [PATCH 48/48] remove dtype map --- 
sklearn/utils/estimator_checks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e7a2a7c9a5d38..22557c45cdead 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1782,10 +1782,8 @@ def check_transformer_preserve_dtypes(name, transformer_orig): X = StandardScaler().fit_transform(X) X = _enforce_estimator_tags_X(transformer_orig, X) - dtype_map = {"float64": np.float64, "float32": np.float32} - for dtype in get_tags(transformer_orig).transformer_tags.preserves_dtype: - X_cast = X.astype(dtype_map[dtype]) + X_cast = X.astype(dtype) transformer = clone(transformer_orig) set_random_state(transformer) X_trans1 = transformer.fit_transform(X_cast, y)