scikit-learn · glemaitre · Jan 2, 2025 · Oct 31, 2024 · Nov 4, 2024 · Nov 4, 2024
diff --git a/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst b/doc/whats_new/upcoming_changes/changed-models/30187.fix.rst
@@ -0,0 +1,2 @@
+- The `tags.input_tags.sparse` flag was corrected for a majority of estimators.
+  By :user:`Antoine Baker <antoinebaker>`
diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.utils/30187.enhancement.rst
@@ -0,0 +1,4 @@
+- :func:`utils.estimator_checks.check_estimator_sparse_tag` ensures that
+  the estimator tag `input_tags.sparse` is consistent with its `fit`
+  method (accepting sparse input `X` or raising the appropriate error).
+  By :user:`Antoine Baker <antoinebaker>`
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
@@ -28,11 +28,7 @@
 from .model_selection import LeaveOneOut, check_cv, cross_val_predict
 from .preprocessing import LabelEncoder, label_binarize
 from .svm import LinearSVC
-from .utils import (
-    _safe_indexing,
-    column_or_1d,
-    indexable,
-)
+from .utils import _safe_indexing, column_or_1d, get_tags, indexable
 from .utils._param_validation import (
     HasMethods,
     Hidden,
@@ -554,6 +550,11 @@ def get_metadata_routing(self):
         )
         return router
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = get_tags(self._get_estimator()).input_tags.sparse
+        return tags
+
 
 def _fit_classifier_calibrator_pair(
     estimator,

diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py
@@ -483,6 +483,7 @@ def __init__(
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.pairwise = self.affinity == "precomputed"
+        tags.input_tags.sparse = self.affinity != "precomputed"
         return tags
 
     @_fit_context(prefer_skip_nested_validation=True)

diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py
@@ -193,6 +193,11 @@ def _k_means(self, data, n_clusters):
         labels = model.labels_
         return centroid, labels
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
+
 
 class SpectralCoclustering(BaseSpectral):
     """Spectral Co-Clustering algorithm (Dhillon, 2001).

diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py
@@ -742,4 +742,5 @@ def _global_clustering(self, X=None):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
+        tags.input_tags.sparse = True
         return tags
diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py
@@ -538,5 +538,6 @@ def _predict_recursive(self, X, sample_weight, cluster_node):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         return tags
diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
@@ -473,4 +473,5 @@ def fit_predict(self, X, y=None, sample_weight=None):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.pairwise = self.metric == "precomputed"
+        tags.input_tags.sparse = True
         return tags
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -999,5 +999,6 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.input_tags.allow_nan = self.metric != "precomputed"
         return tags
diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
@@ -1177,6 +1177,11 @@ def score(self, X, y=None, sample_weight=None):
         )
         return -scores
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
+
 
 class KMeans(_BaseKMeans):
     """K-Means clustering.

diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py
@@ -794,6 +794,7 @@ def fit_predict(self, X, y=None):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.input_tags.pairwise = self.affinity in [
             "precomputed",
             "precomputed_nearest_neighbors",

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -29,6 +29,7 @@
     _get_output_config,
     _safe_set_output,
 )
+from ..utils._tags import get_tags
 from ..utils.metadata_routing import (
     MetadataRouter,
     MethodMapping,
@@ -1315,6 +1316,21 @@
 
         return router
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        try:
+            tags.input_tags.sparse = all(
+                get_tags(trans).input_tags.sparse
+                for name, trans, _ in self.transformers
+                if trans not in {"passthrough", "drop"}
+            )
+        except Exception:
+            # `transformers` is malformed (e.g. not a list of 3-tuples), so the
+            # tag cannot be derived: keep the inherited `sparse` value and let
+            # parameter validation report the problem during `fit`.
+            pass  # pragma: no cover
+        return tags
+
 
 def _check_X(X):
     """Use check_array only when necessary, e.g. on lists and other non-array-likes."""

diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
@@ -348,6 +348,7 @@ def __sklearn_tags__(self):
         regressor = self._get_regressor()
         tags = super().__sklearn_tags__()
         tags.regressor_tags.poor_score = True
+        tags.input_tags.sparse = get_tags(regressor).input_tags.sparse
         tags.target_tags.multi_output = get_tags(regressor).target_tags.multi_output
         return tags
 

diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py
@@ -418,3 +418,9 @@ def transform(self, X):
             return np.vstack(output)
         else:
             return super().transform(X)
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        # Beware that fit accepts sparse data but partial_fit doesn't
+        tags.input_tags.sparse = True
+        return tags
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
@@ -566,6 +566,7 @@ def inverse_transform(self, X):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         tags.input_tags.pairwise = self.kernel == "precomputed"
         return tags

diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py
@@ -549,6 +549,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.positive_only = True
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float32", "float64"]
         return tags
 

diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py
@@ -1331,6 +1331,7 @@ def _n_features_out(self):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.positive_only = True
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         return tags
 

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
@@ -851,4 +851,9 @@ def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         tags.array_api_support = True
+        tags.input_tags.sparse = self.svd_solver in (
+            "auto",
+            "arpack",
+            "covariance_eigh",
+        )
         return tags
diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py
@@ -312,6 +312,7 @@ def inverse_transform(self, X):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         return tags
 

diff --git a/sklearn/dummy.py b/sklearn/dummy.py
@@ -423,6 +423,7 @@ def predict_log_proba(self, X):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.classifier_tags.poor_score = True
         tags.no_validation = True
         return tags
@@ -662,6 +663,7 @@ def predict(self, X, return_std=False):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.regressor_tags.poor_score = True
         tags.no_validation = True
         return tags

diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
@@ -627,6 +627,7 @@ def _get_estimator(self):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = get_tags(self._get_estimator()).input_tags.sparse
         tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan
         return tags
 

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
@@ -288,14 +288,17 @@ def get_params(self, deep=True):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         try:
-            allow_nan = all(
+            tags.input_tags.allow_nan = all(
                 get_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True
                 for est in self.estimators
             )
+            tags.input_tags.sparse = all(
+                get_tags(est[1]).input_tags.sparse if est[1] != "drop" else True
+                for est in self.estimators
+            )
         except Exception:
             # If `estimators` does not comply with our API (list of tuples) then it will
-            # fail. In this case, we assume that `allow_nan` is False but the parameter
-            # validation will raise an error during `fit`.
-            allow_nan = False
-        tags.input_tags.allow_nan = allow_nan
+            # fail. In this case, any tag not yet set keeps its inherited value, but
+            # the parameter validation will raise an error during `fit`.
+            pass  # pragma: no cover
         return tags
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
@@ -1002,6 +1002,7 @@ def predict_log_proba(self, X):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.classifier_tags.multi_label = True
+        tags.input_tags.sparse = True
         return tags
 
 
@@ -1165,6 +1166,11 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
 
         return averaged_predictions
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
+
 
 class RandomForestClassifier(ForestClassifier):
     """
@@ -2987,3 +2993,8 @@ def transform(self, X):
         """
         check_is_fitted(self)
         return self.one_hot_encoder_.transform(self.apply(X))
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py
@@ -1117,6 +1117,11 @@ def apply(self, X):
 
         return leaves
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
+
 
 class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     """Gradient Boosting for classification.

diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py
@@ -312,6 +312,11 @@ def feature_importances_(self):
                 "feature_importances_ attribute"
             ) from e
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
+
 
 def _samme_proba(estimator, n_classes, X):
     """Calculate algorithm 4, step 2, equation c) of Zhu et al [1].

diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py
@@ -501,5 +501,6 @@ def get_metadata_routing(self):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse
         tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan
         return tags
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
@@ -521,6 +521,7 @@ def __sklearn_tags__(self):
         if tags.regressor_tags is not None:
             tags.regressor_tags.poor_score = True
         tags.target_tags.required = True
+        tags.input_tags.sparse = sub_estimator_tags.input_tags.sparse
         tags.input_tags.allow_nan = sub_estimator_tags.input_tags.allow_nan
         return tags
 

diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py
@@ -329,6 +329,7 @@ def _get_support_mask(self):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan
+        tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse
         return tags
 
     def get_metadata_routing(self):

diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py
@@ -581,6 +581,7 @@ def _check_params(self, X, y):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.target_tags.required = True
+        tags.input_tags.sparse = True
         return tags
 
 

diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py
@@ -137,4 +137,5 @@ def _get_support_mask(self):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.allow_nan = True
+        tags.input_tags.sparse = True
         return tags
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
@@ -739,6 +739,7 @@ def inverse_transform(self, X):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.input_tags.allow_nan = is_pandas_na(self.missing_values) or is_scalar_nan(
             self.missing_values
         )
@@ -1130,5 +1131,6 @@ def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.allow_nan = True
         tags.input_tags.string = True
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = []
         return tags
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
@@ -235,6 +235,11 @@ def transform(self, X):
 
         return data_sketch
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
+
 
 class RBFSampler(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
     """Approximate a RBF kernel feature map using random Fourier features.
@@ -404,6 +409,7 @@ def transform(self, X):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         return tags
 
@@ -826,6 +832,7 @@ def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.requires_fit = False
         tags.input_tags.positive_only = True
+        tags.input_tags.sparse = True
         return tags
 
 
@@ -1094,5 +1101,6 @@ def _get_kernel_params(self):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.transformer_tags.preserves_dtype = ["float64", "float32"]
         return tags
diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py
@@ -169,6 +169,7 @@ def _get_kernel(self, X, Y=None):
 
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
         tags.input_tags.pairwise = self.kernel == "precomputed"
         return tags
 

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
@@ -687,6 +687,11 @@ def rmatvec(b):
         self._set_intercept(X_offset, y_offset, X_scale)
         return self
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = not self.positive
+        return tags
+
 
 def _check_precomputed_gram_matrix(
     X, precompute, X_offset, X_scale, rtol=None, atol=1e-5
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		- The `tags.input_tags.sparse` flag was corrected for a majority of estimators.
		By :user:`Antoine Baker <antoinebaker>`