From a896f2ea0c496c7d55d2f84b20f84c638b228705 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 31 Oct 2024 17:42:54 +0100 Subject: [PATCH 01/28] add input_tags.sparse and test --- sklearn/calibration.py | 11 +++--- sklearn/cluster/_affinity_propagation.py | 1 + sklearn/cluster/_bicluster.py | 1 + sklearn/cluster/_birch.py | 1 + sklearn/cluster/_bisect_k_means.py | 1 + sklearn/cluster/_dbscan.py | 1 + sklearn/cluster/_hdbscan/hdbscan.py | 1 + sklearn/cluster/_kmeans.py | 1 + sklearn/cluster/_spectral.py | 1 + sklearn/compose/_target.py | 1 + sklearn/decomposition/_incremental_pca.py | 5 +++ sklearn/decomposition/_kernel_pca.py | 1 + sklearn/decomposition/_lda.py | 1 + sklearn/decomposition/_nmf.py | 1 + sklearn/decomposition/_pca.py | 1 + sklearn/decomposition/_truncated_svd.py | 1 + sklearn/dummy.py | 2 ++ sklearn/ensemble/_bagging.py | 1 + sklearn/ensemble/_base.py | 6 ++++ sklearn/ensemble/_forest.py | 3 ++ sklearn/ensemble/_gb.py | 2 ++ sklearn/ensemble/_weight_boosting.py | 2 ++ sklearn/feature_selection/_from_model.py | 1 + sklearn/feature_selection/_rfe.py | 1 + sklearn/feature_selection/_sequential.py | 1 + .../_univariate_selection.py | 1 + .../feature_selection/_variance_threshold.py | 1 + sklearn/impute/_base.py | 2 ++ sklearn/kernel_approximation.py | 8 +++++ sklearn/kernel_ridge.py | 1 + sklearn/linear_model/_base.py | 5 +++ sklearn/linear_model/_coordinate_descent.py | 8 +++++ sklearn/linear_model/_glm/glm.py | 1 + sklearn/linear_model/_huber.py | 5 +++ sklearn/linear_model/_logistic.py | 6 ++++ sklearn/linear_model/_quantile.py | 5 +++ sklearn/linear_model/_ransac.py | 1 + sklearn/linear_model/_ridge.py | 13 +++++++ sklearn/linear_model/_stochastic_gradient.py | 11 ++++++ sklearn/manifold/_isomap.py | 1 + sklearn/manifold/_spectral_embedding.py | 1 + .../_classification_threshold.py | 3 +- sklearn/model_selection/_search.py | 1 + sklearn/multiclass.py | 7 ++++ sklearn/multioutput.py | 8 ++++- sklearn/naive_bayes.py | 8 +++++ sklearn/neighbors/_base.py | 1 + sklearn/neighbors/_nearest_centroid.py | 5 +++ .../neural_network/_multilayer_perceptron.py | 5 +++ sklearn/neural_network/_rbm.py | 1 + sklearn/pipeline.py | 1 + sklearn/preprocessing/_data.py | 5 +++ .../preprocessing/_function_transformer.py | 1 + sklearn/preprocessing/_polynomial.py | 5 +++ sklearn/random_projection.py | 1 + sklearn/semi_supervised/_label_propagation.py | 5 +++ sklearn/semi_supervised/_self_training.py | 3 +- sklearn/svm/_classes.py | 7 ++++ sklearn/tree/_classes.py | 2 ++ sklearn/utils/estimator_checks.py | 36 +++++++++++++++++++ 60 files changed, 216 insertions(+), 8 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 93035fef52b45..19e6516c30096 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -27,11 +27,7 @@ from .model_selection import LeaveOneOut, check_cv, cross_val_predict from .preprocessing import LabelEncoder, label_binarize from .svm import LinearSVC -from .utils import ( - _safe_indexing, - column_or_1d, - indexable, -) +from .utils import _safe_indexing, column_or_1d, get_tags, indexable from .utils._param_validation import ( HasMethods, Interval, @@ -540,6 +536,11 @@ def get_metadata_routing(self): ) return router + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self._get_estimator()).input_tags.sparse + return tags + def _fit_classifier_calibrator_pair( estimator, diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 677421974bdc0..76b5f27a21af4 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -483,6 +483,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.pairwise = self.affinity == "precomputed" + tags.input_tags.sparse = True return tags @_fit_context(prefer_skip_nested_validation=True) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 08cd63b58cbaa..16818b98c703b 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -195,6 +195,7 @@ def _k_means(self, data, n_clusters): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags._xfail_checks = { "check_estimators_dtypes": "raises nan error", "check_fit2d_1sample": "_scale_normalize fails", diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 3e5f9d10a79e8..4d8abb43513dc 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -742,4 +742,5 @@ def _global_clustering(self, X=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True return tags diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index 3c9ccdcf06414..77e24adbf8084 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -538,5 +538,6 @@ def _predict_recursive(self, X, sample_weight, cluster_node): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 7764bff94582f..d79c4f286d76d 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -473,4 +473,5 @@ def fit_predict(self, X, y=None, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.pairwise = self.metric == "precomputed" + tags.input_tags.sparse = True return tags diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 8bf402a5081c9..0607bbb23ba51 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -1003,5 +1003,6 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.allow_nan = self.metric != "precomputed" return tags diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 80958f8c845a2..654df8d69a0e4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1179,6 +1179,7 @@ def score(self, X, y=None, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index ebfeccee677a9..6d1dcd093e803 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -794,6 +794,7 @@ def fit_predict(self, X, y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.pairwise = self.affinity in [ "precomputed", "precomputed_nearest_neighbors", diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index d90ee17d13f49..86fc6294878b9 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -348,6 +348,7 @@ def __sklearn_tags__(self): regressor = self._get_regressor() tags = super().__sklearn_tags__() tags.regressor_tags.poor_score = True + tags.input_tags.sparse = get_tags(regressor).input_tags.sparse tags.target_tags.multi_output = get_tags(regressor).target_tags.multi_output return tags diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index b2caf81aa9793..35a894416f39a 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -418,3 +418,8 @@ def transform(self, X): return np.vstack(output) else: return super().transform(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index d9757c7845be1..37ff77c8d7c64 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -566,6 +566,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float64", "float32"] tags.input_tags.pairwise = self.kernel == "precomputed" return tags diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 875c6e25fbb10..4580ff073bca5 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -549,6 +549,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.positive_only = True + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float32", "float64"] return tags diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 6be97f2223fb5..dc21e389f6849 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1331,6 +1331,7 @@ def _n_features_out(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.positive_only = True + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 24cb1649c5fee..ba5ca4dd4c680 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -851,4 +851,5 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.transformer_tags.preserves_dtype = ["float64", "float32"] tags.array_api_support = True + tags.input_tags.sparse = True return tags diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index b87a53684c140..b77882f5da78d 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -312,6 +312,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 6332ff43cd482..aa07ca78810b0 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -423,6 +423,7 @@ def predict_log_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.classifier_tags.poor_score = True tags.no_validation = True tags._xfail_checks = { @@ -666,6 +667,7 @@ def predict(self, X, return_std=False): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.regressor_tags.poor_score = True tags.no_validation = True return tags diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 423fc0ec6449a..b3b7590ce7fb1 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -642,6 +642,7 @@ def _get_estimator(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self._get_estimator()).input_tags.sparse tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 2789dd234294e..3545c4e802d7c 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -292,11 +292,17 @@ def __sklearn_tags__(self): get_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True for est in self.estimators ) + sparse = all( + get_tags(est[1]).input_tags.sparse if est[1] != "drop" else True + for est in self.estimators + ) except Exception: # If `estimators` does not comply with our API (list of tuples) then it will # fail. In this case, we assume that `allow_nan` is False but the parameter # validation will raise an error during `fit`. allow_nan = False + sparse = False tags.input_tags.allow_nan = allow_nan + tags.input_tags.sparse = sparse tags.transformer_tags.preserves_dtype = [] return tags diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7c7663864ad92..126addb0b5f9c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1003,6 +1003,7 @@ def predict_log_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.classifier_tags.multi_label = True + tags.input_tags.sparse = True return tags @@ -1169,6 +1170,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.regressor_tags.multi_label = True + tags.input_tags.sparse = True return tags @@ -3016,6 +3018,7 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 8f85f2f7aa3cd..dca3d65b87925 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1727,6 +1727,7 @@ def staged_predict_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: investigate failure see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( @@ -2194,6 +2195,7 @@ def apply(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: investigate failure see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 7780230b046cb..66363808455bf 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -860,6 +860,7 @@ def predict_log_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( @@ -1179,6 +1180,7 @@ def staged_predict(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index d5476e3f06abf..850505dbf6b92 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -519,5 +519,6 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan return tags diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 0282facf9fd31..6823759774c30 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -533,6 +533,7 @@ def __sklearn_tags__(self): if tags.regressor_tags is not None: tags.regressor_tags.poor_score = True tags.target_tags.required = True + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan return tags diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index ac5f13fd00e7d..676106c4a19ca 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -329,6 +329,7 @@ def _get_support_mask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags def get_metadata_routing(self): diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 7933818a6a19b..996d5423995d2 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -581,6 +581,7 @@ def _check_params(self, X, y): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.target_tags.required = True + tags.input_tags.sparse = True return tags diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 1aab9080b964d..f26d70ecf8f82 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -137,4 +137,5 @@ def _get_support_mask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True + tags.input_tags.sparse = True return tags diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index faf1f9e23b678..7a8f2cc4483e2 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -739,6 +739,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.allow_nan = is_pandas_na(self.missing_values) or is_scalar_nan( self.missing_values ) @@ -1130,5 +1131,6 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True tags.input_tags.string = True + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = [] return tags diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 96f9b7e9d4778..dd16d2d5711ee 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -235,6 +235,11 @@ def transform(self, X): return data_sketch + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class RBFSampler(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Approximate a RBF kernel feature map using random Fourier features. @@ -404,6 +409,7 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @@ -826,6 +832,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.requires_fit = False tags.input_tags.positive_only = True + tags.input_tags.sparse = True return tags @@ -1094,6 +1101,7 @@ def _get_kernel_params(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags._xfail_checks = { "check_transformer_preserves_dtypes": ( "dtypes are preserved but not at a close enough precision" diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 983b463508c5b..29e744647acc9 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -169,6 +169,7 @@ def _get_kernel(self, X, Y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.pairwise = self.kernel == "precomputed" return tags diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 3bb3b8b7626d8..7be8188981455 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -687,6 +687,11 @@ def rmatvec(b): self._set_intercept(X_offset, y_offset, X_scale) return self + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + def _check_precomputed_gram_matrix( X, precompute, X_offset, X_scale, rtol=None, atol=1e-5 diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 2dbb83c82fbaa..b8430fe00b862 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1149,6 +1149,11 @@ def _decision_function(self, X): else: return super()._decision_function(X) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + ############################################################################### # Lasso model @@ -2078,6 +2083,7 @@ def _is_multitask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.target_tags.multi_output = False return tags @@ -2359,6 +2365,7 @@ def _is_multitask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.target_tags.multi_output = False return tags @@ -2654,6 +2661,7 @@ def fit(self, X, y): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = False tags.target_tags.multi_output = True tags.target_tags.single_output = False return tags diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 093a813f60550..fc31f9825d2e5 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -442,6 +442,7 @@ def score(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True try: # Create instance of BaseLoss if fit wasn't called yet. This is necessary as # TweedieRegressor might set the used loss during fit different from diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 9e41cc4eae3b5..81fdfa51ead51 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -351,3 +351,8 @@ def fit(self, X, y, sample_weight=None): residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_) self.outliers_ = residual > self.scale_ * self.epsilon return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index fe5ee918066fa..014fae4287108 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1459,6 +1459,7 @@ def predict_log_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags._xfail_checks.update( { "check_non_transformer_estimators_n_iter": ( @@ -2285,3 +2286,8 @@ def _get_scorer(self): """ scoring = self.scoring or "accuracy" return get_scorer(scoring) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 883a41558f2f7..446d232958e8d 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -294,3 +294,8 @@ def fit(self, X, y, sample_weight=None): self.coef_ = params self.intercept_ = 0.0 return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 8b5b34317f5eb..18f061622cf54 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -723,6 +723,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 2b7b3708354e3..913f3a6cecce4 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1251,6 +1251,10 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.array_api_support = True + reject_sparse = (self.solver == "svd") or ( + self.solver == "cholesky" and self.fit_intercept + ) + tags.input_tags.sparse = not reject_sparse tags._xfail_checks.update( { "check_non_transformer_estimators_n_iter": ( @@ -1577,6 +1581,10 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + reject_sparse = (self.solver == "svd") or ( + self.solver == "cholesky" and self.fit_intercept + ) + tags.input_tags.sparse = not reject_sparse tags._xfail_checks.update( { "check_non_transformer_estimators_n_iter": ( @@ -2550,6 +2558,11 @@ def _get_scorer(self): def cv_values_(self): return self.cv_results_ + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """Ridge regression with built-in cross-validation. diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 4d924a1ad00a6..d86ef9e1ef000 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -941,6 +941,11 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): sample_weight=sample_weight, ) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class SGDClassifier(BaseSGDClassifier): """Linear classifiers (SVM, logistic regression, etc.) with SGD training. @@ -1782,6 +1787,11 @@ def _fit_regressor( else: self.intercept_ = np.atleast_1d(intercept) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class SGDRegressor(BaseSGDRegressor): """Linear model fitted by minimizing a regularized empirical loss with SGD. @@ -2656,6 +2666,7 @@ def predict(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index ee302bc07b384..90154470c18a4 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -438,4 +438,5 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True return tags diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index ebd5d7c5b651b..d3d45ec0773c3 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -650,6 +650,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.pairwise = self.affinity in [ "precomputed", "precomputed_nearest_neighbors", diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index 56bc26299a442..2790cf8f5a4ad 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py @@ -22,7 +22,7 @@ _CurveScorer, _threshold_scores_to_class_labels, ) -from ..utils import _safe_indexing +from ..utils import _safe_indexing, get_tags from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._response import _get_response_values_binary from ..utils.metadata_routing import ( @@ -212,6 +212,7 @@ def decision_function(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.classifier_tags.multi_class = False + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse tags._xfail_checks = { "check_classifiers_train": "Threshold at probability 0.5 does not hold", "check_sample_weight_equivalence": ( diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 2935f7ce2465c..b7670d7c304d0 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -483,6 +483,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() # allows cross-validation to see 'precomputed' metrics tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse tags._xfail_checks = { "check_supervised_y_2d": "DataConversionWarning not caught", "check_requires_y_none": "Doesn't fail gracefully", diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index dca055ecbfb4a..1ddb36ca4fa8f 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -601,6 +601,7 @@ def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" tags = super().__sklearn_tags__() tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags def get_metadata_routing(self): @@ -1004,6 +1005,7 @@ def __sklearn_tags__(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" tags = super().__sklearn_tags__() tags.input_tags.pairwise = get_tags(self.estimator).input_tags.pairwise + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags def get_metadata_routing(self): @@ -1276,3 +1278,8 @@ def get_metadata_routing(self): method_mapping=MethodMapping().add(caller="fit", callee="fit"), ) return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index ebcd73e95d881..38b6eb4a7e0ec 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -25,7 +25,7 @@ is_classifier, ) from .model_selection import cross_val_predict -from .utils import Bunch, check_random_state +from .utils import Bunch, check_random_state, get_tags from .utils._param_validation import HasMethods, StrOptions from .utils._response import _get_response_values from .utils._user_interface import _print_elapsed_time @@ -311,6 +311,7 @@ def predict(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse tags.target_tags.single_output = False tags.target_tags.multi_output = True return tags @@ -829,6 +830,11 @@ def predict(self, X): """ return self._get_predictions(X, output_method="predict") + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.base_estimator).input_tags.sparse + return tags + class ClassifierChain(MetaEstimatorMixin, ClassifierMixin, _BaseChain): """A multi-label model that arranges binary classifiers into a chain. diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index fa99448f9d347..62cf52c4c2e49 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -880,6 +880,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.positive_only = True return tags @@ -1028,6 +1029,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.positive_only = True return tags @@ -1227,6 +1229,11 @@ def _joint_log_likelihood(self, X): return jll + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class CategoricalNB(_BaseDiscreteNB): """Naive Bayes classifier for categorical features. @@ -1432,6 +1439,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = False tags.input_tags.positive_only = True # TODO: fix sample_weight handling of this estimator, see meta-issue #16298 tags._xfail_checks = { diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1925e0dbc758c..20d16bfa57878 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -693,6 +693,7 @@ def _fit(self, X, y=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # For cross-validation routines to split data correctly tags.input_tags.pairwise = self.metric == "precomputed" return tags diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index cb8d1dbf7107f..d52f6d128d052 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -332,3 +332,8 @@ def _check_euclidean_metric(self): predict_log_proba = available_if(_check_euclidean_metric)( DiscriminantAnalysisPredictionMixin.predict_log_proba ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 196203ce46763..47805857b5154 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -771,6 +771,11 @@ def _score_with_function(self, X, y, score_function): return score_function(y, y_pred) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron classifier. diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 49848e9f982cc..019fc3af83ef0 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -448,5 +448,6 @@ def __sklearn_tags__(self): "fails for the score_samples method" ), } + tags.input_tags.sparse = True tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 90a62d9e4e8ab..3c7b85c8abc9a 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1060,6 +1060,7 @@ def __sklearn_tags__(self): try: tags.input_tags.pairwise = get_tags(self.steps[0][1]).input_tags.pairwise + tags.input_tags.sparse = get_tags(self.steps[0][1]).input_tags.sparse except (ValueError, AttributeError, TypeError): # This happens when the `steps` is not a list of (name, estimator) # tuples and `fit` is not called yet to validate the steps. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 74ea7431a5d72..8d428fe50c7f8 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1130,6 +1130,7 @@ def inverse_transform(self, X, copy=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True + tags.input_tags.sparse = not self.with_mean tags.transformer_tags.preserves_dtype = ["float64", "float32"] return tags @@ -1363,6 +1364,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True + tags.input_tags.sparse = True return tags @@ -2136,6 +2138,7 @@ def transform(self, X, copy=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.requires_fit = False tags.array_api_support = True return tags @@ -2343,6 +2346,7 @@ def transform(self, X, copy=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.requires_fit = False + tags.input_tags.sparse = True return tags @@ -3009,6 +3013,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.input_tags.allow_nan = True return tags diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 02379273e302e..54166fdacb2ad 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -394,6 +394,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.no_validation = not self.validate tags.requires_fit = False + tags.input_tags.sparse = True return tags def set_output(self, *, transform=None): diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 5a3239f113024..7ec9953680331 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -585,6 +585,11 @@ def transform(self, X): XP = Xout return XP + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class SplineTransformer(TransformerMixin, BaseEstimator): """Generate univariate B-spline bases for features. diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 804bd1088d70a..bec365616da7b 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -463,6 +463,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True return tags diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index a2e25277cf450..3eaee712b93a4 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -336,6 +336,11 @@ def fit(self, X, y): self.transduction_ = transduction.ravel() return self + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class LabelPropagation(BaseLabelPropagation): """Label Propagation classifier. diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 3e1709adaa267..32eb36c83b4c0 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -5,7 +5,7 @@ import numpy as np from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone -from ..utils import Bunch, safe_mask +from ..utils import Bunch, get_tags, safe_mask from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions from ..utils.metadata_routing import ( MetadataRouter, @@ -635,6 +635,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse tags._xfail_checks.update( {"check_non_transformer_estimators_n_iter": "n_iter_ can be 0."} ) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index f4e4aa118c069..91e1a06d035e7 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -351,6 +351,7 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test when _dual=True, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( @@ -615,6 +616,7 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: replace by a statistical test, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( @@ -902,6 +904,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" tags._xfail_checks = { # TODO: fix sample_weight handling of this estimator when probability=False # TODO: replace by a statistical test when probability=True @@ -1177,6 +1180,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" tags._xfail_checks = { "check_methods_subset_invariance": ( "fails for the decision_function method" @@ -1388,6 +1392,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" # TODO: fix sample_weight handling of this estimator, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( @@ -1583,6 +1588,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" # TODO: fix sample_weight handling of this estimator, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( @@ -1850,6 +1856,7 @@ def predict(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True # TODO: fix sample_weight handling of this estimator, see meta-issue #16298 tags._xfail_checks = { "check_sample_weight_equivalence": ( diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 93246a1376e85..f94dd70aefe1c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1100,6 +1100,7 @@ def __sklearn_tags__(self): } tags.classifier_tags.multi_label = True tags.input_tags.allow_nan = allow_nan + tags.input_tags.sparse = True return tags @@ -1442,6 +1443,7 @@ def __sklearn_tags__(self): "poisson", } tags.input_tags.allow_nan = allow_nan + tags.input_tags.sparse = True return tags diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 728fd71844118..1b6e7e79e4b7e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -136,6 +136,7 @@ def _yield_checks(estimator): if hasattr(estimator, "sparsify"): yield check_sparsify_coefficients + yield check_estimator_sparse_tag yield check_estimator_sparse_array yield check_estimator_sparse_matrix @@ -871,6 +872,41 @@ def check_array_api_input_and_values( ) +def check_estimator_sparse_tag(name, estimator_orig): + if SPARSE_ARRAY_PRESENT: + sparse_container = sparse.csr_array + else: + sparse_container = sparse.csr_matrix + estimator = clone(estimator_orig) + + rng = np.random.RandomState(0) + n_samples = 15 if name == "SpectralCoclustering" else 40 + X = rng.uniform(size=(n_samples, 3)) + X[X < 0.6] = 0 + y = rng.randint(0, 3, size=n_samples) + X = _enforce_estimator_tags_X(estimator, X) + y = _enforce_estimator_tags_y(estimator, y) + X = sparse_container(X) + + tags = get_tags(estimator) + if tags.input_tags.sparse: + estimator.fit(X, y) # should pass + return + else: + err_msg = ( + f"Estimator {name} has input_tags.sparse=False " + "but didn't raise an error when fitted on sparse data." + ) + with raises( + (TypeError, ValueError), + match=["sparse", "Sparse"], + may_pass=False, + err_msg=err_msg, + ): + estimator.fit(X, y) # should fail + return + + def _check_estimator_sparse_container(name, estimator_orig, sparse_type): rng = np.random.RandomState(0) X = rng.uniform(size=(40, 3)) From b0e605d127cb4f6c8d15cb0e177740104bcb3d2f Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 4 Nov 2024 10:19:13 +0100 Subject: [PATCH 02/28] fix LinearRegression tag --- sklearn/linear_model/_base.py | 2 +- sklearn/utils/_test_common/instance_generator.py | 3 +++ sklearn/utils/tests/test_estimator_checks.py | 7 +++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 7be8188981455..bb71cbe9ed550 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -689,7 +689,7 @@ def rmatvec(b): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True + tags.input_tags.sparse = not self.positive return tags diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 846c132aa0feb..d60c5786a3025 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -533,6 +533,9 @@ "check_dict_unchanged": dict(batch_size=10, max_iter=5, n_components=1) }, LinearDiscriminantAnalysis: {"check_dict_unchanged": dict(n_components=1)}, + LinearRegression: { + "check_estimator_sparse_tag": [dict(positive=False), dict(positive=True)] + }, LocallyLinearEmbedding: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, LogisticRegression: { "check_sample_weight_equivalence": [ diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 29611a853938f..7d604eb0c61c1 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -495,7 +495,8 @@ def __sklearn_tags__(self): class RequiresPositiveXRegressor(LinearRegression): def fit(self, X, y): - X, y = validate_data(self, X, y, multi_output=True) + # reject sparse X to be able to call (X < 0).any() + X, y = validate_data(self, X, y, accept_sparse=False, multi_output=True) if (X < 0).any(): raise ValueError("negative X values not supported!") return super().fit(X, y) @@ -503,12 +504,14 @@ def fit(self, X, y): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.positive_only = True + # reject sparse X to be able to call (X < 0).any() + tags.input_tags.sparse = False return tags class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = validate_data(self, X, y, multi_output=True) + X, y = validate_data(self, X, y, accept_sparse=True, multi_output=True) if (y <= 0).any(): raise ValueError("negative y values not supported!") return super().fit(X, y) From 6c72527502bf89c716cf9cf24020416b486fc28e Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 4 Nov 2024 10:34:08 +0100 Subject: [PATCH 03/28] changelog --- doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst b/doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst new file mode 100644 index 0000000000000..46b5dfa96267a --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst @@ -0,0 +1,4 @@ +- :func:`utils.estimator_checks.check_estimator_sparse_tag` ensures that + the estimator tag `input_tags.sparse` is consistent with its `fit` + method (acceting sparse input `X` or raising an error). + By :user:`Antoine Baker ` From c3065930002b5ef8a36a1c6f45a40a7fe47180c5 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 4 Nov 2024 16:13:02 +0100 Subject: [PATCH 04/28] changelog --- .../sklearn.utils/{30187.api.rst => 30187.fix.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/whats_new/upcoming_changes/sklearn.utils/{30187.api.rst => 30187.fix.rst} (100%) diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst b/doc/whats_new/upcoming_changes/sklearn.utils/30187.fix.rst similarity index 100% rename from doc/whats_new/upcoming_changes/sklearn.utils/30187.api.rst rename to doc/whats_new/upcoming_changes/sklearn.utils/30187.fix.rst From 6aadc95b3eec13df3ac7793712ed7fb79caa1939 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 14 Nov 2024 16:49:32 +0100 Subject: [PATCH 05/28] fix column transformer tag --- sklearn/compose/_column_transformer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f9f6419310a6d..c73c5befa6eb2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -29,6 +29,7 @@ _get_output_config, _safe_set_output, ) +from ..utils._tags import get_tags from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, @@ -1322,6 +1323,10 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = all( + get_tags(trans).input_tags.sparse if trans != "drop" else True + for name, trans, _ in self.transformers + ) tags._xfail_checks = { "check_estimators_empty_data_messages": "FIXME", "check_estimators_nan_inf": "FIXME", From 7979fa9a8b74b9bdadbebd4c4ea9f004d94886a2 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 10:40:44 +0100 Subject: [PATCH 06/28] change error message --- sklearn/utils/estimator_checks.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1b6e7e79e4b7e..f56095eecb819 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -891,11 +891,10 @@ def check_estimator_sparse_tag(name, estimator_orig): tags = get_tags(estimator) if tags.input_tags.sparse: estimator.fit(X, y) # should pass - return else: err_msg = ( - f"Estimator {name} has input_tags.sparse=False " - "but didn't raise an error when fitted on sparse data." + f"Estimator {name} with input_tags.sparse=False doesn't " + "fail gracefully when fitted on sparse data." ) with raises( (TypeError, ValueError), @@ -904,7 +903,6 @@ def check_estimator_sparse_tag(name, estimator_orig): err_msg=err_msg, ): estimator.fit(X, y) # should fail - return def _check_estimator_sparse_container(name, estimator_orig, sparse_type): From 6d7c2b1a07bb4bc4134841258ffcfad70a9ff62a Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 14:57:09 +0100 Subject: [PATCH 07/28] changelog --- doc/whats_new/upcoming_changes/changed-models/30187.fix.rst | 3 +++ .../sklearn.utils/{30187.fix.rst => 30187.enhancement.rst} | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 doc/whats_new/upcoming_changes/changed-models/30187.fix.rst rename doc/whats_new/upcoming_changes/sklearn.utils/{30187.fix.rst => 30187.enhancement.rst} (72%) diff --git a/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst b/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst new file mode 100644 index 0000000000000..0b662db8f8091 --- /dev/null +++ b/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst @@ -0,0 +1,3 @@ +- The `tags.input_tags.sparse` flag was corrected for a + majority of estimators. + By :user:`Antoine Baker ` diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/30187.fix.rst b/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst similarity index 72% rename from doc/whats_new/upcoming_changes/sklearn.utils/30187.fix.rst rename to doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst index 46b5dfa96267a..0a7bf5b06e8df 100644 --- a/doc/whats_new/upcoming_changes/sklearn.utils/30187.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst @@ -1,4 +1,4 @@ - :func:`utils.estimator_checks.check_estimator_sparse_tag` ensures that the estimator tag `input_tags.sparse` is consistent with its `fit` - method (acceting sparse input `X` or raising an error). + method (acceting sparse input `X` or raising the appropriate error). By :user:`Antoine Baker ` From 7005797170f23441585cff0a7e184ef6e911c3c1 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 18:19:44 +0100 Subject: [PATCH 08/28] fix passthrough sparse tag --- sklearn/compose/_column_transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index ba914ce7821f6..33a0b1bea2a10 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1319,8 +1319,9 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.sparse = all( - get_tags(trans).input_tags.sparse if trans != "drop" else True + get_tags(trans).input_tags.sparse for name, trans, _ in self.transformers + if trans not in {"passthrough", "drop"} ) return tags From 6962aa9344465c604bf5566cc98e7a50bb370794 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 18:34:28 +0100 Subject: [PATCH 09/28] fix SelfTrainingClassifier --- sklearn/semi_supervised/_self_training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index b28a6bd88305e..2f7391cb4f76f 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -616,5 +616,6 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + if self.estimator is not None: + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags From 2668fea2ff907a4dbc54692ac7efefdf02da6944 Mon Sep 17 00:00:00 2001 From: antoinebaker Date: Fri, 15 Nov 2024 19:53:40 +0100 Subject: [PATCH 10/28] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- .../upcoming_changes/changed-models/30187.fix.rst | 3 +-- .../sklearn.utils/30187.enhancement.rst | 2 +- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/decomposition/_pca.py | 2 +- sklearn/preprocessing/_function_transformer.py | 2 +- sklearn/utils/estimator_checks.py | 12 +++++++++++- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst b/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst index 0b662db8f8091..001b8840d9a7b 100644 --- a/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst +++ b/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst @@ -1,3 +1,2 @@ -- The `tags.input_tags.sparse` flag was corrected for a - majority of estimators. +- The `tags.input_tags.sparse` flag was corrected for a majority of estimators. By :user:`Antoine Baker ` diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst index 0a7bf5b06e8df..de75f70cb552e 100644 --- a/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst +++ b/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst @@ -1,4 +1,4 @@ - :func:`utils.estimator_checks.check_estimator_sparse_tag` ensures that the estimator tag `input_tags.sparse` is consistent with its `fit` - method (acceting sparse input `X` or raising the appropriate error). + method (accepting sparse input `X` or raising the appropriate error). By :user:`Antoine Baker ` diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 76b5f27a21af4..e5cb501984762 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -483,7 +483,7 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.pairwise = self.affinity == "precomputed" - tags.input_tags.sparse = True + tags.input_tags.sparse = self.affinity != "precomputed" return tags @_fit_context(prefer_skip_nested_validation=True) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index ba5ca4dd4c680..405f9e8c92c85 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -851,5 +851,5 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.transformer_tags.preserves_dtype = ["float64", "float32"] tags.array_api_support = True - tags.input_tags.sparse = True + tags.input_tags.sparse = self.svd_solver in ("auto", "arpack", "covariance_eigh") return tags diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 54166fdacb2ad..df84f5333357c 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -394,7 +394,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.no_validation = not self.validate tags.requires_fit = False - tags.input_tags.sparse = True + tags.input_tags.sparse = not self.validate or (self.validate and self.accept_sparse) return tags def set_output(self, *, transform=None): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c963040f1ff55..3ec4a795c2f5c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1194,6 +1194,7 @@ def check_array_api_input_and_values( def check_estimator_sparse_tag(name, estimator_orig): + """Check that estimator tag related with accepting sparse data is properly set.""" if SPARSE_ARRAY_PRESENT: sparse_container = sparse.csr_array else: @@ -1211,7 +1212,16 @@ def check_estimator_sparse_tag(name, estimator_orig): tags = get_tags(estimator) if tags.input_tags.sparse: - estimator.fit(X, y) # should pass + try: + estimator.fit(X, y) # should pass + except Exception as e: + raise AssertionError( + f"Estimator {name} raised an exception: {e}. The tag " + "self.input_tags.sparse_tag might not be consistent with the " + "estimator's ability to handle sparse data (i.e. controlled by the " + "parameter `accept_sparse` in `validate_data` or `check_array` " + f"functions). Got input_tags.sparse={tags.input_tags.sparse}." + ) else: err_msg = ( f"Estimator {name} with input_tags.sparse=False doesn't " From 705c41443f22b0e535859909d711b6cc410c5276 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 19:55:46 +0100 Subject: [PATCH 11/28] add suggestions --- sklearn/decomposition/_incremental_pca.py | 1 + sklearn/ensemble/_gb.py | 15 +++++-------- sklearn/ensemble/_weight_boosting.py | 15 +++++-------- sklearn/linear_model/_coordinate_descent.py | 7 ++++-- sklearn/naive_bayes.py | 8 +------ sklearn/semi_supervised/_self_training.py | 10 ++++++--- sklearn/svm/_base.py | 6 +++++ sklearn/svm/_classes.py | 25 --------------------- sklearn/tree/_classes.py | 7 ++++-- 9 files changed, 35 insertions(+), 59 deletions(-) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index a8ea2dcd558aa..da617ef8fa787 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -421,5 +421,6 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + # Beware that fit accepts sparse data but partial_fit doesn't tags.input_tags.sparse = True return tags diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 34282db589ff9..fded8a535413d 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1117,6 +1117,11 @@ def apply(self, X): return leaves + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): """Gradient Boosting for classification. @@ -1725,11 +1730,6 @@ def staged_predict_proba(self, X): "loss=%r does not support predict_proba" % self.loss ) from e - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = True - return tags - class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): """Gradient Boosting for regression. @@ -2190,8 +2190,3 @@ def apply(self, X): leaves = super().apply(X) leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0]) return leaves - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = True - return tags diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 065c7204b0a91..8503c4fdb8ae7 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -312,6 +312,11 @@ def feature_importances_(self): "feature_importances_ attribute" ) from e + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + def _samme_proba(estimator, n_classes, X): """Calculate algorithm 4, step 2, equation c) of Zhu et al [1]. @@ -858,11 +863,6 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = True - return tags - class AdaBoostRegressor(_RoutingNotSupportedMixin, RegressorMixin, BaseWeightBoosting): """An AdaBoost regressor. @@ -1171,8 +1171,3 @@ def staged_predict(self, X): for i, _ in enumerate(self.estimators_, 1): yield self._get_median_predict(X, limit=i) - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = True - return tags diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b8430fe00b862..cfc2a1832b887 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1869,6 +1869,11 @@ def get_metadata_routing(self): ) return router + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class LassoCV(RegressorMixin, LinearModelCV): """Lasso linear model with iterative fitting along a regularization path. @@ -2083,7 +2088,6 @@ def _is_multitask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True tags.target_tags.multi_output = False return tags @@ -2365,7 +2369,6 @@ def _is_multitask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True tags.target_tags.multi_output = False return tags diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 3418e99e17c3c..0bb2daab25d0b 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -771,6 +771,7 @@ def _init_counters(self, n_classes, n_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = True tags.classifier_tags.poor_score = True return tags @@ -880,7 +881,6 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True tags.input_tags.positive_only = True return tags @@ -1029,7 +1029,6 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True tags.input_tags.positive_only = True return tags @@ -1229,11 +1228,6 @@ def _joint_log_likelihood(self, X): return jll - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = True - return tags - class CategoricalNB(_BaseDiscreteNB): """Naive Bayes classifier for categorical features. diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 2f7391cb4f76f..da5dd85d159d4 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -4,9 +4,13 @@ import numpy as np -from sklearn.base import ClassifierMixin - -from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) from ..utils import Bunch, get_tags, safe_mask from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions from ..utils.metadata_routing import ( diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 3e5024364df5c..f5b35f39a7daf 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -147,6 +147,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() # Used by cross_val_score. tags.input_tags.pairwise = self.kernel == "precomputed" + tags.input_tags.sparse = self.kernel != "precomputed" return tags @_fit_context(prefer_skip_nested_validation=True) @@ -999,6 +1000,11 @@ def probB_(self): """ return self._probB + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" + return tags + def _get_liblinear_solver_type(multi_class, penalty, loss, dual): """Find the liblinear magic number for the solver. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index f29f9baa0f139..0eb49a8c0832c 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -887,11 +887,6 @@ def __init__( random_state=random_state, ) - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = self.kernel != "precomputed" - return tags - class NuSVC(BaseSVC): """Nu-Support Vector Classification. @@ -1155,11 +1150,6 @@ def __init__( random_state=random_state, ) - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = self.kernel != "precomputed" - return tags - class SVR(RegressorMixin, BaseLibSVM): """Epsilon-Support Vector Regression. @@ -1354,11 +1344,6 @@ def __init__( random_state=None, ) - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = self.kernel != "precomputed" - return tags - class NuSVR(RegressorMixin, BaseLibSVM): """Nu Support Vector Regression. @@ -1546,11 +1531,6 @@ def __init__( random_state=None, ) - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = self.kernel != "precomputed" - return tags - class OneClassSVM(OutlierMixin, BaseLibSVM): """Unsupervised Outlier Detection. @@ -1807,8 +1787,3 @@ def predict(self, X): """ y = super().predict(X) return np.asarray(y, dtype=np.intp) - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.sparse = True - return tags diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f94dd70aefe1c..646aa7fb034c4 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -690,6 +690,11 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + # ============================================================================= # Public estimators @@ -1100,7 +1105,6 @@ def __sklearn_tags__(self): } tags.classifier_tags.multi_label = True tags.input_tags.allow_nan = allow_nan - tags.input_tags.sparse = True return tags @@ -1443,7 +1447,6 @@ def __sklearn_tags__(self): "poisson", } tags.input_tags.allow_nan = allow_nan - tags.input_tags.sparse = True return tags From 5178539d5f995c305636e55de6ce14ba8db71a7a Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 20:11:25 +0100 Subject: [PATCH 12/28] fix multitask --- sklearn/linear_model/_coordinate_descent.py | 2 +- sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index cfc2a1832b887..938331bd7f23f 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1871,7 +1871,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True + tags.input_tags.sparse = not self._is_multitask() return tags diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3ec4a795c2f5c..5eafe9ab7836a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1217,7 +1217,7 @@ def check_estimator_sparse_tag(name, estimator_orig): except Exception as e: raise AssertionError( f"Estimator {name} raised an exception: {e}. The tag " - "self.input_tags.sparse_tag might not be consistent with the " + "self.input_tags.sparse might not be consistent with the " "estimator's ability to handle sparse data (i.e. controlled by the " "parameter `accept_sparse` in `validate_data` or `check_array` " f"functions). Got input_tags.sparse={tags.input_tags.sparse}." From e5a4458c184a8ce6eac49f24d4a8e3041b290fac Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 15 Nov 2024 20:19:44 +0100 Subject: [PATCH 13/28] black formatting --- sklearn/decomposition/_pca.py | 6 +++++- sklearn/preprocessing/_function_transformer.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 405f9e8c92c85..f8882a7a6b5d6 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -851,5 +851,9 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.transformer_tags.preserves_dtype = ["float64", "float32"] tags.array_api_support = True - tags.input_tags.sparse = self.svd_solver in ("auto", "arpack", "covariance_eigh") + tags.input_tags.sparse = self.svd_solver in ( + "auto", + "arpack", + "covariance_eigh", + ) return tags diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index df84f5333357c..e4bb8e55ad62f 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -394,7 +394,9 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.no_validation = not self.validate tags.requires_fit = False - tags.input_tags.sparse = not self.validate or (self.validate and self.accept_sparse) + tags.input_tags.sparse = not self.validate or ( + self.validate and self.accept_sparse + ) return tags def set_output(self, *, transform=None): From d6f277fd74eaaf66aec257029ec7fd57bc6a00ec Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 18 Nov 2024 11:56:24 +0100 Subject: [PATCH 14/28] add meta test --- sklearn/utils/estimator_checks.py | 35 ++++++++++++------ sklearn/utils/tests/test_estimator_checks.py | 37 ++++++++++++++++++++ 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5eafe9ab7836a..3699a582e1cfa 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1193,6 +1193,12 @@ def check_array_api_input_and_values( ) +def _is_sparse_input_error(e): + if not (isinstance(e, TypeError) or isinstance(e, ValueError)): + return False + return True if re.search("[Ss]parse", str(e)) else False + + def check_estimator_sparse_tag(name, estimator_orig): """Check that estimator tag related with accepting sparse data is properly set.""" if SPARSE_ARRAY_PRESENT: @@ -1223,17 +1229,26 @@ def check_estimator_sparse_tag(name, estimator_orig): f"functions). Got input_tags.sparse={tags.input_tags.sparse}." ) else: - err_msg = ( - f"Estimator {name} with input_tags.sparse=False doesn't " - "fail gracefully when fitted on sparse data." + try: + estimator.fit(X, y) # should fail with appropriate error + except Exception as e: + if _is_sparse_input_error(e): + return + else: + raise AssertionError( + f"Estimator {name} raised an exception: {e}. " + "The estimator failed when fitted on sparse data in accordance " + f"with its tag self.input_tags.sparse={tags.input_tags.sparse} " + "but didn't raise the appropriate error : error message should " + "state explicitly that sparse input is not supported if this is " + "not the case, e.g. by using check_array(X, accept_sparse=False)." + ) + raise AssertionError( + f"Estimator {name} didn't fail when fitted on sparse data " + "but should have according to its tag " + f"self.input_tags.sparse={tags.input_tags.sparse}. " + f"The tag is inconsistent and must be fixed." ) - with raises( - (TypeError, ValueError), - match=["sparse", "Sparse"], - may_pass=False, - err_msg=err_msg, - ): - estimator.fit(X, y) # should fail def _check_estimator_sparse_container(name, estimator_orig, sparse_type): diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 7329277722dfb..d98a1ec1f4b92 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -72,6 +72,7 @@ check_estimator_repr, check_estimator_sparse_array, check_estimator_sparse_matrix, + check_estimator_sparse_tag, check_estimator_tags_renamed, check_estimators_nan_inf, check_estimators_overwrite_params, @@ -847,6 +848,42 @@ def test_check_outlier_corruption(): check_outlier_corruption(1, 2, decision) +def test_check_estimator_sparse_tag(): + """Test that check_estimator_sparse_tag raises error when sparse tag is + misaligned.""" + + class EstimatorWithSparseConfig(BaseEstimator): + def __init__(self, tag_sparse, accept_sparse): + self.tag_sparse = tag_sparse + self.accept_sparse = accept_sparse + + def fit(self, X, y=None): + validate_data(self, X, y, accept_sparse=self.accept_sparse) + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.tag_sparse + return tags + + test_cases = [ + {"tag_sparse": True, "accept_sparse": True, "error_type": None}, + {"tag_sparse": False, "accept_sparse": False, "error_type": None}, + {"tag_sparse": False, "accept_sparse": True, "error_type": AssertionError}, + {"tag_sparse": True, "accept_sparse": False, "error_type": AssertionError}, + ] + + for test_case in test_cases: + estimator = EstimatorWithSparseConfig( + test_case["tag_sparse"], test_case["accept_sparse"] + ) + if test_case["error_type"] is None: + check_estimator_sparse_tag(estimator.__class__.__name__, estimator) + else: + with raises(test_case["error_type"]): + check_estimator_sparse_tag(estimator.__class__.__name__, estimator) + + def test_check_estimator_transformer_no_mixin(): # check that TransformerMixin is not required for transformer tests to run # but it fails since the tag is not set From 0bcc765a503865df41f77a57497a268fa2b3cee5 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 18 Nov 2024 14:31:36 +0100 Subject: [PATCH 15/28] add feature union --- sklearn/pipeline.py | 9 +++++++++ sklearn/utils/_test_common/instance_generator.py | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index f8422ae934d28..edb65e8f55a84 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -2114,6 +2114,15 @@ def get_metadata_routing(self): return router + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = all( + get_tags(trans).input_tags.sparse + for name, trans in self.transformer_list + if trans not in {"passthrough", "drop"} + ) + return tags + def make_union(*transformers, n_jobs=None, verbose=False): """Construct a :class:`FeatureUnion` from the given transformers. diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py index 7eeb0c848c508..a2e43308d0d0f 100644 --- a/sklearn/utils/_test_common/instance_generator.py +++ b/sklearn/utils/_test_common/instance_generator.py @@ -528,6 +528,18 @@ FactorAnalysis: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, FastICA: {"check_dict_unchanged": dict(max_iter=5, n_components=1)}, FeatureAgglomeration: {"check_dict_unchanged": dict(n_clusters=1)}, + FeatureUnion: { + "check_estimator_sparse_tag": [ + dict(transformer_list=[("trans1", StandardScaler())]), + dict( + transformer_list=[ + ("trans1", StandardScaler(with_mean=False)), + ("trans2", "drop"), + ("trans3", "passthrough"), + ] + ), + ] + }, GammaRegressor: { "check_sample_weight_equivalence": [ dict(solver="newton-cholesky"), From d862de777e31b77c81cde419377e585c5cfa178a Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 18 Nov 2024 14:48:24 +0100 Subject: [PATCH 16/28] check function transformer --- sklearn/preprocessing/_function_transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index e4bb8e55ad62f..3fc33c59e76bd 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -394,9 +394,7 @@ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.no_validation = not self.validate tags.requires_fit = False - tags.input_tags.sparse = not self.validate or ( - self.validate and self.accept_sparse - ) + tags.input_tags.sparse = not self.validate or self.accept_sparse return tags def set_output(self, *, transform=None): From 21ec5858049616fb8dfb0a0370fffaeeb979e966 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 20 Nov 2024 17:45:59 +0100 Subject: [PATCH 17/28] catch invalid transformers list --- sklearn/compose/_column_transformer.py | 17 ++++++++++++----- sklearn/pipeline.py | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 33a0b1bea2a10..f347266ae204a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1318,11 +1318,18 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = all( - get_tags(trans).input_tags.sparse - for name, trans, _ in self.transformers - if trans not in {"passthrough", "drop"} - ) + try: + sparse = all( + get_tags(trans).input_tags.sparse + for name, trans, _ in self.transformers + if trans not in {"passthrough", "drop"} + ) + except Exception: + # If `transformers` does not comply with our API (list of tuples) + # then it will fail. In this case, we assume that `sparse` is False + # but the parameter validation will raise an error during `fit`. + sparse = False + tags.input_tags.sparse = sparse return tags diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index edb65e8f55a84..e0d0f8184edb9 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -2116,11 +2116,18 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = all( - get_tags(trans).input_tags.sparse - for name, trans in self.transformer_list - if trans not in {"passthrough", "drop"} - ) + try: + sparse = all( + get_tags(trans).input_tags.sparse + for name, trans in self.transformer_list + if trans not in {"passthrough", "drop"} + ) + except Exception: + # If `transformer_list` does not comply with our API (list of tuples) + # then it will fail. In this case, we assume that `sparse` is False + # but the parameter validation will raise an error during `fit`. + sparse = False + tags.input_tags.sparse = sparse return tags From c6538342ed2057ffa2af810af42728bdaf7a09f4 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Tue, 26 Nov 2024 12:09:29 +0100 Subject: [PATCH 18/28] add todo --- sklearn/semi_supervised/_self_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index da5dd85d159d4..d19b57d033040 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -620,6 +620,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + # TODO(1.8): remove together with base_estimator if self.estimator is not None: tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags From d9f4de33dcb63aae5c0c0b60fbe852a5805e6409 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Tue, 26 Nov 2024 12:09:55 +0100 Subject: [PATCH 19/28] remove outer function --- sklearn/utils/estimator_checks.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1d6f27b0a1b65..3c26116b717be 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1197,12 +1197,6 @@ def check_array_api_input_and_values( ) -def _is_sparse_input_error(e): - if not (isinstance(e, TypeError) or isinstance(e, ValueError)): - return False - return True if re.search("[Ss]parse", str(e)) else False - - def check_estimator_sparse_tag(name, estimator_orig): """Check that estimator tag related with accepting sparse data is properly set.""" if SPARSE_ARRAY_PRESENT: @@ -1233,20 +1227,23 @@ def check_estimator_sparse_tag(name, estimator_orig): f"functions). Got input_tags.sparse={tags.input_tags.sparse}." ) else: + err_msg = ( + f"Estimator {name} raised an exception. " + "The estimator failed when fitted on sparse data in accordance " + f"with its tag self.input_tags.sparse={tags.input_tags.sparse} " + "but didn't raise the appropriate error : error message should " + "state explicitly that sparse input is not supported if this is " + "not the case, e.g. by using check_array(X, accept_sparse=False)." + ) try: estimator.fit(X, y) # should fail with appropriate error - except Exception as e: - if _is_sparse_input_error(e): + except (ValueError, TypeError) as e: + if re.search("[Ss]parse", str(e)): + # Got the right error type and mentioning sparse issue return - else: - raise AssertionError( - f"Estimator {name} raised an exception: {e}. " - "The estimator failed when fitted on sparse data in accordance " - f"with its tag self.input_tags.sparse={tags.input_tags.sparse} " - "but didn't raise the appropriate error : error message should " - "state explicitly that sparse input is not supported if this is " - "not the case, e.g. by using check_array(X, accept_sparse=False)." - ) + raise AssertionError(err_msg) from e + except Exception as e: + raise AssertionError(err_msg) from e raise AssertionError( f"Estimator {name} didn't fail when fitted on sparse data " "but should have according to its tag " From a9fb7d711ce902565cc7c4fd0953a87feb3d1c17 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Tue, 26 Nov 2024 18:05:22 +0100 Subject: [PATCH 20/28] change pipeline tag --- sklearn/pipeline.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e0d0f8184edb9..70b08d28192d9 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1224,7 +1224,15 @@ def __sklearn_tags__(self): tags.input_tags.pairwise = get_tags( self.steps[0][1] ).input_tags.pairwise - tags.input_tags.sparse = get_tags(self.steps[0][1]).input_tags.sparse + # WARNING: the sparse tag can be incorrect. + # Some Pipelines accepting sparse data are wrongly tagged sparse=False. + # For example Pipeline([PCA(), estimator]) accepts sparse data + # even if the estimator doesn't as PCA outputs a dense array. + tags.input_tags.sparse = all( + get_tags(step).input_tags.sparse + for name, step in self.steps + if step != "passthrough" + ) except (ValueError, AttributeError, TypeError): # This happens when the `steps` is not a list of (name, estimator) # tuples and `fit` is not called yet to validate the steps. From e85f94a94da301d54a99a66e31087d8f5b58b7a2 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 29 Nov 2024 16:16:01 +0100 Subject: [PATCH 21/28] tag RobustScaler --- sklearn/preprocessing/_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 8d428fe50c7f8..f0d1defe61ca9 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1739,6 +1739,7 @@ def inverse_transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.input_tags.sparse = not self.with_centering tags.input_tags.allow_nan = True return tags From 8f2f3dbfaabdd1a2a68741753093a2b03d321def Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 29 Nov 2024 16:30:17 +0100 Subject: [PATCH 22/28] tag RANSAC --- sklearn/linear_model/_ransac.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index a5fa36aa4c468..99b52e5840e32 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -15,7 +15,7 @@ clone, ) from ..exceptions import ConvergenceWarning -from ..utils import check_consistent_length, check_random_state +from ..utils import check_consistent_length, check_random_state, get_tags from ..utils._bunch import Bunch from ..utils._param_validation import ( HasMethods, @@ -724,5 +724,8 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = True + if self.estimator is None: + tags.input_tags.sparse = True + else: + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags From 3ac38e10c2d0af50261842691e0251a49b932c74 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Fri, 29 Nov 2024 16:51:01 +0100 Subject: [PATCH 23/28] multi_output in LinearModelCV --- sklearn/linear_model/_coordinate_descent.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 938331bd7f23f..b98cf08925910 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1871,7 +1871,9 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.sparse = not self._is_multitask() + multitask = self._is_multitask() + tags.input_tags.sparse = not multitask + tags.target_tags.multi_output = multitask return tags @@ -2086,11 +2088,6 @@ def _get_estimator(self): def _is_multitask(self): return False - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.target_tags.multi_output = False - return tags - def fit(self, X, y, sample_weight=None, **params): """Fit Lasso model with coordinate descent. @@ -2367,11 +2364,6 @@ def _get_estimator(self): def _is_multitask(self): return False - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.target_tags.multi_output = False - return tags - def fit(self, X, y, sample_weight=None, **params): """Fit ElasticNet model with coordinate descent. @@ -3035,7 +3027,6 @@ def _is_multitask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.target_tags.multi_output = True tags.target_tags.single_output = False return tags @@ -3276,7 +3267,6 @@ def _is_multitask(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.target_tags.multi_output = True tags.target_tags.single_output = False return tags From 425b4736dab567a29613155dc02bdd9fdd6f4ebf Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Mon, 9 Dec 2024 10:47:09 +0100 Subject: [PATCH 24/28] raise from exception --- sklearn/utils/estimator_checks.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 66333be57ff56..4ffc3143508b3 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1220,19 +1220,20 @@ def check_estimator_sparse_tag(name, estimator_orig): try: estimator.fit(X, y) # should pass except Exception as e: - raise AssertionError( - f"Estimator {name} raised an exception: {e}. The tag " - "self.input_tags.sparse might not be consistent with the " - "estimator's ability to handle sparse data (i.e. controlled by the " - "parameter `accept_sparse` in `validate_data` or `check_array` " - f"functions). Got input_tags.sparse={tags.input_tags.sparse}." + err_msg = ( + f"Estimator {name} raised an exception. " + f"The tag self.input_tags.sparse={tags.input_tags.sparse} " + "might not be consistent with the estimator's ability to " + "handle sparse data (i.e. controlled by the parameter `accept_sparse`" + " in `validate_data` or `check_array` functions)." ) + raise AssertionError(err_msg) from e else: err_msg = ( f"Estimator {name} raised an exception. " "The estimator failed when fitted on sparse data in accordance " f"with its tag self.input_tags.sparse={tags.input_tags.sparse} " - "but didn't raise the appropriate error : error message should " + "but didn't raise the appropriate error: error message should " "state explicitly that sparse input is not supported if this is " "not the case, e.g. by using check_array(X, accept_sparse=False)." ) From 17ccb72608450b537f19bfb84a560c0e66988efa Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 11 Dec 2024 11:08:24 +0100 Subject: [PATCH 25/28] no cover --- sklearn/compose/_column_transformer.py | 5 ++--- sklearn/ensemble/_base.py | 13 +++++-------- sklearn/pipeline.py | 5 ++--- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f347266ae204a..e088f534707d2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1319,7 +1319,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() try: - sparse = all( + tags.input_tags.sparse = all( get_tags(trans).input_tags.sparse for name, trans, _ in self.transformers if trans not in {"passthrough", "drop"} @@ -1328,8 +1328,7 @@ def __sklearn_tags__(self): # If `transformers` does not comply with our API (list of tuples) # then it will fail. In this case, we assume that `sparse` is False # but the parameter validation will raise an error during `fit`. - sparse = False - tags.input_tags.sparse = sparse + pass # pragma: no cover return tags diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 5a51e8e015727..db5a0944a72c3 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -288,20 +288,17 @@ def get_params(self, deep=True): def __sklearn_tags__(self): tags = super().__sklearn_tags__() try: - allow_nan = all( + tags.input_tags.allow_nan = all( get_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True for est in self.estimators ) - sparse = all( + tags.input_tags.sparse = all( get_tags(est[1]).input_tags.sparse if est[1] != "drop" else True for est in self.estimators ) except Exception: # If `estimators` does not comply with our API (list of tuples) then it will - # fail. In this case, we assume that `allow_nan` is False but the parameter - # validation will raise an error during `fit`. - allow_nan = False - sparse = False - tags.input_tags.allow_nan = allow_nan - tags.input_tags.sparse = sparse + # fail. In this case, we assume that `allow_nan` and `sparse` are False but + # the parameter validation will raise an error during `fit`. + pass # pragma: no cover return tags diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 70b08d28192d9..6b750297e9f13 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -2125,7 +2125,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() try: - sparse = all( + tags.input_tags.sparse = all( get_tags(trans).input_tags.sparse for name, trans in self.transformer_list if trans not in {"passthrough", "drop"} @@ -2134,8 +2134,7 @@ def __sklearn_tags__(self): # If `transformer_list` does not comply with our API (list of tuples) # then it will fail. In this case, we assume that `sparse` is False # but the parameter validation will raise an error during `fit`. - sparse = False - tags.input_tags.sparse = sparse + pass # pragma: no cover return tags From 8429e8f55d9a2b134b49ab5fa9d9e2521e419e67 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 11 Dec 2024 11:26:54 +0100 Subject: [PATCH 26/28] test raise inappropriate error --- sklearn/utils/tests/test_estimator_checks.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 5e23fbe1620f7..b805bc1209f0c 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -854,11 +854,14 @@ def test_check_estimator_sparse_tag(): misaligned.""" class EstimatorWithSparseConfig(BaseEstimator): - def __init__(self, tag_sparse, accept_sparse): + def __init__(self, tag_sparse, accept_sparse, fit_error=None): self.tag_sparse = tag_sparse self.accept_sparse = accept_sparse + self.fit_error = fit_error def fit(self, X, y=None): + if self.fit_error: + raise self.fit_error validate_data(self, X, y, accept_sparse=self.accept_sparse) return self @@ -876,7 +879,8 @@ def __sklearn_tags__(self): for test_case in test_cases: estimator = EstimatorWithSparseConfig( - test_case["tag_sparse"], test_case["accept_sparse"] + test_case["tag_sparse"], + test_case["accept_sparse"], ) if test_case["error_type"] is None: check_estimator_sparse_tag(estimator.__class__.__name__, estimator) @@ -884,6 +888,13 @@ def __sklearn_tags__(self): with raises(test_case["error_type"]): check_estimator_sparse_tag(estimator.__class__.__name__, estimator) + # estimator `tag_sparse=accept_sparse=False` fails on sparse data + # but does not raise the appropriate error + for fit_error in [TypeError("unexpected error"), KeyError("other error")]: + estimator = EstimatorWithSparseConfig(False, False, fit_error) + with raises(AssertionError): + check_estimator_sparse_tag(estimator.__class__.__name__, estimator) + def test_check_estimator_transformer_no_mixin(): # check that TransformerMixin is not required for transformer tests to run From 97d700dde492640717a5ef1ba3882d6099ec267b Mon Sep 17 00:00:00 2001 From: antoinebaker Date: Thu, 12 Dec 2024 17:46:26 +0100 Subject: [PATCH 27/28] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/_ridge.py | 5 ++--- sklearn/semi_supervised/_self_training.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 99b52e5840e32..90dc6d6bc5e70 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -725,7 +725,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() if self.estimator is None: - tags.input_tags.sparse = True + tags.input_tags.sparse = True # default estimator is LinearRegression else: tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index b7bd5d417877a..1646c08e25d31 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1251,10 +1251,9 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.array_api_support = True - reject_sparse = (self.solver == "svd") or ( - self.solver == "cholesky" and self.fit_intercept + tags.input_tags.sparse = (self.solver != "svd") and ( + self.solver != "cholesky" or not self.fit_intercept ) - tags.input_tags.sparse = not reject_sparse return tags diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index d19b57d033040..4b469a2e9f8d8 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -620,7 +620,7 @@ def get_metadata_routing(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - # TODO(1.8): remove together with base_estimator + # TODO(1.8): remove the condition check together with base_estimator if self.estimator is not None: tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse return tags From 66c3bd9e223bb6cb2fce070860673eee092139d8 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 12 Dec 2024 17:50:13 +0100 Subject: [PATCH 28/28] suggestions from code review --- sklearn/ensemble/_forest.py | 1 + sklearn/linear_model/_ridge.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ef9e6c0cdb437..f8dad7d6edd9a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1001,6 +1001,7 @@ def predict_log_proba(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True tags.input_tags.sparse = True return tags diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1646c08e25d31..9a94ba1caec1c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1573,10 +1573,9 @@ def fit(self, X, y, sample_weight=None): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - reject_sparse = (self.solver == "svd") or ( - self.solver == "cholesky" and self.fit_intercept + tags.input_tags.sparse = (self.solver != "svd") and ( + self.solver != "cholesky" or not self.fit_intercept ) - tags.input_tags.sparse = not reject_sparse return tags