From 55a4bfd6046095f21952b74835463c7e4b5cbea1 Mon Sep 17 00:00:00 2001 From: windiana42 Date: Tue, 28 Mar 2023 19:40:56 +0200 Subject: [PATCH 1/3] #23112: fix documentation last step is not cached In compose.rst and pipeline.py there are three places where pipeline caching is explained. An extra sentence was added stating that, currently, the last step will never be cached. In one place it is mentioned that this might change in the future. --- doc/modules/compose.rst | 3 ++- sklearn/pipeline.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 4a61b5ec5f118..04eb0cb234628 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -198,7 +198,8 @@ after calling ``fit``. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical. A typical example is the case of a grid search in which the transformers can be fitted only once and reused for -each configuration. +each configuration. Currently, the last step will never be cached. This might +change in the future. The parameter ``memory`` is needed in order to cache the transformers. ``memory`` can be either a string containing the directory where to cache the diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 94d9465d7f819..4949f1767e7ac 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -80,11 +80,11 @@ class Pipeline(_BaseComposition): estimator. memory : str or object with the joblib.Memory interface, default=None - Used to cache the fitted transformers of the pipeline. By default, - no caching is performed. If a string is given, it is the path to - the caching directory. Enabling caching triggers a clone of - the transformers before fitting. Therefore, the transformer - instance given to the pipeline cannot be inspected + Used to cache the fitted transformers of the pipeline. Currently, + the last step will never be cached. 
By default, no caching is performed. + If a string is given, it is the path to the caching directory. Enabling + caching triggers a clone of the transformers before fitting. Therefore, + the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. @@ -858,11 +858,11 @@ def make_pipeline(*steps, memory=None, verbose=False): List of the scikit-learn estimators that are chained together. memory : str or object with the joblib.Memory interface, default=None - Used to cache the fitted transformers of the pipeline. By default, - no caching is performed. If a string is given, it is the path to - the caching directory. Enabling caching triggers a clone of - the transformers before fitting. Therefore, the transformer - instance given to the pipeline cannot be inspected + Used to cache the fitted transformers of the pipeline. Currently, + the last step will never be cached. By default, no caching is performed. + If a string is given, it is the path to the caching directory. Enabling + caching triggers a clone of the transformers before fitting. Therefore, + the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. From 307a2bd63300943faaa113fc3ad60f86ad43ac8f Mon Sep 17 00:00:00 2001 From: windiana42 <61181806+windiana42@users.noreply.github.com> Date: Tue, 28 Mar 2023 20:12:08 +0200 Subject: [PATCH 2/3] Update doc/modules/compose.rst No reference to future changes needed. 
Co-authored-by: Guillaume Lemaitre --- doc/modules/compose.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 04eb0cb234628..f5e82496848b4 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -198,8 +198,7 @@ after calling ``fit``. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical. A typical example is the case of a grid search in which the transformers can be fitted only once and reused for -each configuration. Currently, the last step will never be cached. This might -change in the future. +each configuration. Currently, the last step will never be cached. The parameter ``memory`` is needed in order to cache the transformers. ``memory`` can be either a string containing the directory where to cache the From 099a7e31500fbe309416cd8d1784b261dc04abfe Mon Sep 17 00:00:00 2001 From: windiana42 Date: Tue, 28 Mar 2023 20:17:33 +0200 Subject: [PATCH 3/3] suggestion: emphasize on last step transformer Implement the suggestion to emphasize that the last step is not cached even if it is a transformer. --- doc/modules/compose.rst | 2 +- sklearn/pipeline.py | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index f5e82496848b4..5bcee9550b968 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -198,7 +198,7 @@ after calling ``fit``. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical. A typical example is the case of a grid search in which the transformers can be fitted only once and reused for -each configuration. Currently, the last step will never be cached. +each configuration. The last step will never be cached, even if it is a transformer. The parameter ``memory`` is needed in order to cache the transformers. 
``memory`` can be either a string containing the directory where to cache the diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 4949f1767e7ac..a604bb6fc6e6e 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -80,13 +80,13 @@ class Pipeline(_BaseComposition): estimator. memory : str or object with the joblib.Memory interface, default=None - Used to cache the fitted transformers of the pipeline. Currently, - the last step will never be cached. By default, no caching is performed. - If a string is given, it is the path to the caching directory. Enabling - caching triggers a clone of the transformers before fitting. Therefore, - the transformer instance given to the pipeline cannot be inspected - directly. Use the attribute ``named_steps`` or ``steps`` to - inspect estimators within the pipeline. Caching the + Used to cache the fitted transformers of the pipeline. The last step + will never be cached, even if it is a transformer. By default, no + caching is performed. If a string is given, it is the path to the + caching directory. Enabling caching triggers a clone of the transformers + before fitting. Therefore, the transformer instance given to the + pipeline cannot be inspected directly. Use the attribute ``named_steps`` + or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. verbose : bool, default=False @@ -858,13 +858,13 @@ def make_pipeline(*steps, memory=None, verbose=False): List of the scikit-learn estimators that are chained together. memory : str or object with the joblib.Memory interface, default=None - Used to cache the fitted transformers of the pipeline. Currently, - the last step will never be cached. By default, no caching is performed. - If a string is given, it is the path to the caching directory. Enabling - caching triggers a clone of the transformers before fitting. 
Therefore, - the transformer instance given to the pipeline cannot be inspected - directly. Use the attribute ``named_steps`` or ``steps`` to - inspect estimators within the pipeline. Caching the + Used to cache the fitted transformers of the pipeline. The last step + will never be cached, even if it is a transformer. By default, no + caching is performed. If a string is given, it is the path to the + caching directory. Enabling caching triggers a clone of the transformers + before fitting. Therefore, the transformer instance given to the + pipeline cannot be inspected directly. Use the attribute ``named_steps`` + or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. verbose : bool, default=False