diff --git a/doc/whats_new/upcoming_changes/metadata-routing/29920.fix.rst b/doc/whats_new/upcoming_changes/metadata-routing/29920.fix.rst new file mode 100644 index 0000000000000..a15a66ce6c74f --- /dev/null +++ b/doc/whats_new/upcoming_changes/metadata-routing/29920.fix.rst @@ -0,0 +1,3 @@ +- Many method arguments which shouldn't be included in the routing mechanism are + now excluded and the `set_{method}_request` methods are not generated for them. + By `Adrin Jalali`_ diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index fc3d1dc07f10d..955046fa37d4b 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -12,6 +12,8 @@ import numpy as np from scipy import linalg +from sklearn.utils import metadata_routing + from .. import config_context from ..base import BaseEstimator, _fit_context from ..metrics.pairwise import pairwise_distances @@ -181,6 +183,9 @@ class EmpiricalCovariance(BaseEstimator): array([0.0622..., 0.0193...]) """ + # X_test should have been called X + __metadata_request__score = {"X_test": metadata_routing.UNUSED} + _parameter_constraints: dict = { "store_precision": ["boolean"], "assume_centered": ["boolean"], diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index fa442101839cd..b2caf81aa9793 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -8,6 +8,8 @@ import numpy as np from scipy import linalg, sparse +from sklearn.utils import metadata_routing + from ..base import _fit_context from ..utils import gen_batches from ..utils._param_validation import Interval @@ -184,6 +186,8 @@ class IncrementalPCA(_BasePCA): (1797, 7) """ + __metadata_request__partial_fit = {"check_input": metadata_routing.UNUSED} + _parameter_constraints: dict = { "n_components": [Interval(Integral, 1, None, closed="left"), None], "whiten": ["boolean"], diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 64c9a5704652d..a754b92824585 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -9,6 +9,8 @@ import numpy as np import scipy.sparse as sp +from sklearn.utils import metadata_routing + from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils import check_array from ..utils.validation import check_is_fitted @@ -91,6 +93,9 @@ class DictVectorizer(TransformerMixin, BaseEstimator): array([[0., 0., 4.]]) """ + # This isn't something that people should be routing / using in a pipeline. + __metadata_request__inverse_transform = {"dict_type": metadata_routing.UNUSED} + _parameter_constraints: dict = { "dtype": "no_validation", # validation delegated to numpy, "separator": [str], diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 45570a523dbbf..ac0bed3110c4e 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,6 +7,8 @@ import numpy as np import scipy.sparse as sp +from sklearn.utils import metadata_routing + from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils._param_validation import Interval, StrOptions from ._hashing_fast import transform as _hashing_transform @@ -104,6 +106,9 @@ class FeatureHasher(TransformerMixin, BaseEstimator): [ 0., -1., 0., 0., 0., 0., 0., 1.]]) """ + # raw_X should have been called X + __metadata_request__transform = {"raw_X": metadata_routing.UNUSED} + _parameter_constraints: dict = { "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")], "input_type": [StrOptions({"dict", "pair", "string"})], diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2f21b3ccbe254..e1bdfd5a7dee5 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -16,6 +16,8 @@ import numpy as np import scipy.sparse as sp +from sklearn.utils import metadata_routing + from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context from ..exceptions import NotFittedError from ..preprocessing import normalize @@ -1118,6 +1120,11 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [0 0 1 0 1 0 1 0 0 0 0 0 1]] """ + # raw_documents should not be in the routing mechanism. It should have been + # called X in the first place. + __metadata_request__fit = {"raw_documents": metadata_routing.UNUSED} + __metadata_request__transform = {"raw_documents": metadata_routing.UNUSED} + _parameter_constraints: dict = { "input": [StrOptions({"filename", "file", "content"})], "encoding": [str], diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 7312fdba7f63d..fb47ca1dde68f 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,6 +11,8 @@ from scipy import interpolate, optimize from scipy.stats import spearmanr +from sklearn.utils import metadata_routing + from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique from .base import BaseEstimator, RegressorMixin, TransformerMixin, _fit_context from .utils import check_array, check_consistent_length @@ -272,6 +274,10 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): array([1.8628..., 3.7256...]) """ + # T should have been called X + __metadata_request__predict = {"T": metadata_routing.UNUSED} + __metadata_request__transform = {"T": metadata_routing.UNUSED} + _parameter_constraints: dict = { "y_min": [Interval(Real, None, None, closed="both"), None], "y_max": [Interval(Real, None, None, closed="both"), None], diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b13535bab512d..2dbb83c82fbaa 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -12,6 +12,8 @@ from joblib import effective_n_jobs from scipy import sparse +from sklearn.utils import metadata_routing + from ..base import MultiOutputMixin, RegressorMixin, _fit_context from ..model_selection import check_cv from ..utils import Bunch, check_array, check_scalar @@ -875,6 +877,10 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): [1.451...] """ + # "check_input" is used for optimisation and isn't something to be passed + # around in a pipeline. + __metadata_request__fit = {"check_input": metadata_routing.UNUSED} + _parameter_constraints: dict = { "alpha": [Interval(Real, 0, None, closed="left")], "l1_ratio": [Interval(Real, 0, 1, closed="both")], diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 007e5b0029f23..74ea7431a5d72 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -9,6 +9,8 @@ from scipy import optimize, sparse, stats from scipy.special import boxcox, inv_boxcox +from sklearn.utils import metadata_routing + from ..base import ( BaseEstimator, ClassNamePrefixFeaturesOutMixin, @@ -2422,6 +2424,10 @@ class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEsti [ -5., -14., 19.]]) """ + # X is called K in these methods. + __metadata_request__transform = {"K": metadata_routing.UNUSED} + __metadata_request__fit = {"K": metadata_routing.UNUSED} + def fit(self, K, y=None): """Fit KernelCenterer. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 885c210a0b343..93246a1376e85 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,6 +15,8 @@ import numpy as np from scipy.sparse import issparse +from sklearn.utils import metadata_routing + from ..base import ( BaseEstimator, ClassifierMixin, @@ -93,6 +95,10 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): Use derived classes instead. """ + # "check_input" is used for optimisation and isn't something to be passed + # around in a pipeline. + __metadata_request__predict = {"check_input": metadata_routing.UNUSED} + _parameter_constraints: dict = { "splitter": [StrOptions({"best", "random"})], "max_depth": [Interval(Integral, 1, None, closed="left"), None], @@ -935,6 +941,11 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): 0.93..., 0.93..., 1. , 0.93..., 1. ]) """ + # "check_input" is used for optimisation and isn't something to be passed + # around in a pipeline. + __metadata_request__predict_proba = {"check_input": metadata_routing.UNUSED} + __metadata_request__fit = {"check_input": metadata_routing.UNUSED} + _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], @@ -1312,6 +1323,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ + # "check_input" is used for optimisation and isn't something to be passed + # around in a pipeline. + __metadata_request__fit = {"check_input": metadata_routing.UNUSED} + _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, "criterion": [