scikit-learn · adam2392 · Oct 29, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 24, 2024
diff --git a/doc/whats_new/upcoming_changes/metadata-routing/29920.fix.rst b/doc/whats_new/upcoming_changes/metadata-routing/29920.fix.rst
@@ -0,0 +1,3 @@
+- Many method arguments which shouldn't be included in the routing mechanism are
+  now excluded and the `set_{method}_request` methods are not generated for them.
+  By `Adrin Jalali`_
diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py
@@ -12,6 +12,8 @@
 import numpy as np
 from scipy import linalg
 
+from sklearn.utils import metadata_routing
+
 from .. import config_context
 from ..base import BaseEstimator, _fit_context
 from ..metrics.pairwise import pairwise_distances
@@ -181,6 +183,9 @@ class EmpiricalCovariance(BaseEstimator):
     array([0.0622..., 0.0193...])
     """
 
+    # X_test should have been called X
+    __metadata_request__score = {"X_test": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "store_precision": ["boolean"],
         "assume_centered": ["boolean"],

diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py
@@ -8,6 +8,8 @@
 import numpy as np
 from scipy import linalg, sparse
 
+from sklearn.utils import metadata_routing
+
 from ..base import _fit_context
 from ..utils import gen_batches
 from ..utils._param_validation import Interval
@@ -184,6 +186,8 @@ class IncrementalPCA(_BasePCA):
     (1797, 7)
     """
 
+    __metadata_request__partial_fit = {"check_input": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "n_components": [Interval(Integral, 1, None, closed="left"), None],
         "whiten": ["boolean"],

diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py
@@ -9,6 +9,8 @@
 import numpy as np
 import scipy.sparse as sp
 
+from sklearn.utils import metadata_routing
+
 from ..base import BaseEstimator, TransformerMixin, _fit_context
 from ..utils import check_array
 from ..utils.validation import check_is_fitted
@@ -91,6 +93,9 @@ class DictVectorizer(TransformerMixin, BaseEstimator):
     array([[0., 0., 4.]])
     """
 
+    # This isn't something that people should be routing / using in a pipeline.
+    __metadata_request__inverse_transform = {"dict_type": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "dtype": "no_validation",  # validation delegated to numpy,
         "separator": [str],

diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py
@@ -7,6 +7,8 @@
 import numpy as np
 import scipy.sparse as sp
 
+from sklearn.utils import metadata_routing
+
 from ..base import BaseEstimator, TransformerMixin, _fit_context
 from ..utils._param_validation import Interval, StrOptions
 from ._hashing_fast import transform as _hashing_transform
@@ -104,6 +106,9 @@ class FeatureHasher(TransformerMixin, BaseEstimator):
            [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  1.]])
     """
 
+    # raw_X should have been called X
+    __metadata_request__transform = {"raw_X": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")],
         "input_type": [StrOptions({"dict", "pair", "string"})],

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -16,6 +16,8 @@
 import numpy as np
 import scipy.sparse as sp
 
+from sklearn.utils import metadata_routing
+
 from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
 from ..exceptions import NotFittedError
 from ..preprocessing import normalize
@@ -1118,6 +1120,11 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
      [0 0 1 0 1 0 1 0 0 0 0 0 1]]
     """
 
+    # raw_documents should not be in the routing mechanism. It should have been
+    # called X in the first place.
+    __metadata_request__fit = {"raw_documents": metadata_routing.UNUSED}
+    __metadata_request__transform = {"raw_documents": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "input": [StrOptions({"filename", "file", "content"})],
         "encoding": [str],

diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
@@ -11,6 +11,8 @@
 from scipy import interpolate, optimize
 from scipy.stats import spearmanr
 
+from sklearn.utils import metadata_routing
+
 from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique
 from .base import BaseEstimator, RegressorMixin, TransformerMixin, _fit_context
 from .utils import check_array, check_consistent_length
@@ -272,6 +274,10 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator):
     array([1.8628..., 3.7256...])
     """
 
+    # T should have been called X
+    __metadata_request__predict = {"T": metadata_routing.UNUSED}
+    __metadata_request__transform = {"T": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "y_min": [Interval(Real, None, None, closed="both"), None],
         "y_max": [Interval(Real, None, None, closed="both"), None],

diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py
@@ -12,6 +12,8 @@
 from joblib import effective_n_jobs
 from scipy import sparse
 
+from sklearn.utils import metadata_routing
+
 from ..base import MultiOutputMixin, RegressorMixin, _fit_context
 from ..model_selection import check_cv
 from ..utils import Bunch, check_array, check_scalar
@@ -875,6 +877,10 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel):
     [1.451...]
     """
 
+    # "check_input" is used for optimisation and isn't something to be passed
+    # around in a pipeline.
+    __metadata_request__fit = {"check_input": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "alpha": [Interval(Real, 0, None, closed="left")],
         "l1_ratio": [Interval(Real, 0, 1, closed="both")],

diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
@@ -9,6 +9,8 @@
 from scipy import optimize, sparse, stats
 from scipy.special import boxcox, inv_boxcox
 
+from sklearn.utils import metadata_routing
+
 from ..base import (
     BaseEstimator,
     ClassNamePrefixFeaturesOutMixin,
@@ -2422,6 +2424,10 @@ class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEsti
            [ -5., -14.,  19.]])
     """
 
+    # X is called K in these methods.
+    __metadata_request__transform = {"K": metadata_routing.UNUSED}
+    __metadata_request__fit = {"K": metadata_routing.UNUSED}
+
     def fit(self, K, y=None):
         """Fit KernelCenterer.
 

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
@@ -15,6 +15,8 @@
 import numpy as np
 from scipy.sparse import issparse
 
+from sklearn.utils import metadata_routing
+
 from ..base import (
     BaseEstimator,
     ClassifierMixin,
@@ -93,6 +95,10 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
     Use derived classes instead.
     """
 
+    # "check_input" is used for optimisation and isn't something to be passed
+    # around in a pipeline.
+    __metadata_request__predict = {"check_input": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         "splitter": [StrOptions({"best", "random"})],
         "max_depth": [Interval(Integral, 1, None, closed="left"), None],
@@ -935,6 +941,11 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
             0.93...,  0.93...,  1.     ,  0.93...,  1.      ])
     """
 
+    # "check_input" is used for optimisation and isn't something to be passed
+    # around in a pipeline.
+    __metadata_request__predict_proba = {"check_input": metadata_routing.UNUSED}
+    __metadata_request__fit = {"check_input": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         **BaseDecisionTree._parameter_constraints,
         "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)],
@@ -1312,6 +1323,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
            0.16...,  0.11..., -0.73..., -0.30..., -0.00...])
     """
 
+    # "check_input" is used for optimisation and isn't something to be passed
+    # around in a pipeline.
+    __metadata_request__fit = {"check_input": metadata_routing.UNUSED}
+
     _parameter_constraints: dict = {
         **BaseDecisionTree._parameter_constraints,
         "criterion": [