diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst
index 232b3ed72bbda..456d2f6142847 100644
--- a/doc/modules/pipeline.rst
+++ b/doc/modules/pipeline.rst
@@ -47,7 +47,7 @@ is an estimator object::
     >>> pipe # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
     Pipeline(memory=None,
              steps=[('reduce_dim', PCA(copy=True,...)),
-                    ('clf', SVC(C=1.0,...))])
+                    ('clf', SVC(C=1.0,...))], verbose=False)
 
 The utility function :func:`make_pipeline` is a shorthand
 for constructing pipelines;
@@ -62,7 +62,7 @@ filling in the names automatically::
              steps=[('binarizer', Binarizer(copy=True, threshold=0.0)),
                     ('multinomialnb', MultinomialNB(alpha=1.0,
                                                     class_prior=None,
-                                                    fit_prior=True))])
+                                                    fit_prior=True))], verbose=False)
 
 The estimators of a pipeline are stored as a list in the ``steps`` attribute::
 
@@ -82,7 +82,8 @@ Parameters of the estimators in the pipeline can be accessed using the
     >>> pipe.set_params(clf__C=10) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
     Pipeline(memory=None,
              steps=[('reduce_dim', PCA(copy=True, iterated_power='auto',...)),
-                    ('clf', SVC(C=10, cache_size=200, class_weight=None,...))])
+                    ('clf', SVC(C=10, cache_size=200, class_weight=None,...))],
+             verbose=False)
 
 Attributes of named_steps map to keys, enabling tab completion in interactive environments::
 
@@ -160,7 +161,7 @@ object::
     >>> pipe # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
     Pipeline(...,
              steps=[('reduce_dim', PCA(copy=True,...)),
-                    ('clf', SVC(C=1.0,...))])
+                    ('clf', SVC(C=1.0,...))], verbose=False)
     >>> # Clear the cache directory when you don't need it anymore
     >>> rmtree(cachedir)
 
@@ -177,7 +178,7 @@ object::
     >>> pipe.fit(digits.data, digits.target)
     ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
     Pipeline(memory=None,
-             steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
+             steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))], verbose=False)
     >>> # The pca instance can be inspected directly
     >>> print(pca1.components_) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
     [[ -1.77484909e-19 ... 4.07058917e-18]]
@@ -199,7 +200,7 @@ object::
    >>> cached_pipe.fit(digits.data, digits.target)
    ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    Pipeline(memory=...,
-            steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
+            steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))], verbose=False)
    >>> print(cached_pipe.named_steps['reduce_dim'].components_)
    ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    [[ -1.77484909e-19 ... 4.07058917e-18]]
@@ -253,7 +254,7 @@ and ``value`` is an estimator object::
     FeatureUnion(n_jobs=1,
                  transformer_list=[('linear_pca', PCA(copy=True,...)),
                                    ('kernel_pca', KernelPCA(alpha=1.0,...))],
-                 transformer_weights=None)
+                 transformer_weights=None, verbose=False)
 
 
 Like pipelines, feature unions have a shorthand constructor called
@@ -268,7 +269,7 @@ and ignored by setting to ``None``::
     FeatureUnion(n_jobs=1,
                  transformer_list=[('linear_pca', PCA(copy=True,...)),
                                    ('kernel_pca', None)],
-                 transformer_weights=None)
+                 transformer_weights=None, verbose=False)
 
 .. topic:: Examples:
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 0ca707ce2cbbf..981e7ae95788c 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -48,6 +48,18 @@ Model evaluation and meta-estimators
 - A scorer based on :func:`metrics.brier_score_loss` is also available.
   :issue:`9521` by :user:`Hanmin Qin `.
 
+Miscellaneous
+
+- Added optional parameter ``verbose`` in :class:`pipeline.Pipeline` and
+  :class:`pipeline.FeatureUnion` for showing progress and timing of each
+  step. :issue:`8568` by :user:`Karan Desai `.
+
+- Added optional parameter ``verbose`` in :func:`pipeline.make_pipeline`
+  and :func:`pipeline.make_union`, exposing the same functionality as the
+  corresponding classes. :issue:`9668` by
+  :user:`Baze Petrushev ` and :user:`Karan Desai `.
+
+
 Bug fixes
 .........
 
@@ -5754,3 +5766,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Neeraj Gangwar: http://neerajgangwar.in
 .. _Arthur Mensch: https://amensch.fr
+.. _Karan Desai: https://www.github.com/karandesai-96
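The changelog entries above amount to one new keyword argument on the four
public pipeline constructors. A minimal usage sketch (the estimators are
illustrative)::

    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC

    # verbose defaults to False, preserving the old silent behaviour;
    # verbose=True prints one timed line per step while fitting.
    pipe = Pipeline([('reduce_dim', PCA()), ('clf', SVC())], verbose=True)
    pipe = make_pipeline(PCA(), SVC(), verbose=True)
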
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 7646459da3936..5544890d95e8d 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -22,7 +22,8 @@
 import scipy.sparse as sp
 
 from .base import is_classifier, clone
-from .utils import indexable, check_random_state, safe_indexing
+from .utils import (indexable, check_random_state, safe_indexing,
+                    message_with_time)
 from .utils.validation import (_is_arraylike, _num_samples,
                                column_or_1d)
 from .utils.multiclass import type_of_target
@@ -1700,8 +1701,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
     if verbose > 2:
         msg += ", score=%f" % test_score
     if verbose > 1:
-        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
-        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
+        print(message_with_time('CV', msg, scoring_time))
 
     ret = [train_score] if return_train_score else []
     ret.extend([test_score, _num_samples(X_test), scoring_time])
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 773f70fb7dba2..a5a3a50dae443 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -20,10 +20,11 @@
 import scipy.sparse as sp
 
 from ..base import is_classifier, clone
-from ..utils import indexable, check_random_state, safe_indexing
+from ..utils import (indexable, check_random_state, safe_indexing,
+                     message_with_time)
 from ..utils.validation import _is_arraylike, _num_samples
 from ..utils.metaestimators import _safe_split
-from ..externals.joblib import Parallel, delayed, logger
+from ..externals.joblib import Parallel, delayed
 from ..externals.six.moves import zip
 from ..metrics.scorer import check_scoring, _check_multimetric_scoring
 from ..exceptions import FitFailedWarning
@@ -480,8 +481,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
             msg += ", score=%s" % test_scores
     if verbose > 1:
         total_time = score_time + fit_time
-        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
-        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
+        print(message_with_time('CV', msg, total_time))
 
     ret = [train_scores, test_scores] if return_train_score else [test_scores]
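Both ``_fit_and_score`` call sites now delegate formatting of the ``[CV]``
progress line to ``message_with_time``, added to ``sklearn/utils/__init__.py``
at the end of this patch. Based on that helper's definition, output of roughly
this shape is expected; the fold message and timing here are illustrative, and
the dot padding comes from the fixed 68-character budget in the helper::

    >>> from sklearn.utils import message_with_time
    >>> print(message_with_time('CV', 'C=1.0, score=0.98', 0.1))  # doctest: +SKIP
    [CV] ................................ C=1.0, score=0.98, total=   0.1s
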
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 66da9dffeb066..1a4e4d24c6e62 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -10,6 +10,7 @@
 # License: BSD
 
 from collections import defaultdict
+import time
 
 import numpy as np
 from scipy import sparse
@@ -18,7 +19,7 @@
 from .externals.joblib import Parallel, delayed
 from .externals import six
 from .utils.metaestimators import if_delegate_has_method
-from .utils import Bunch
+from .utils import Bunch, message_with_time
 from .utils.validation import check_memory
 from .utils.metaestimators import _BaseComposition
 
@@ -62,6 +63,9 @@ class Pipeline(_BaseComposition):
         inspect estimators within the pipeline. Caching the
         transformers is advantageous when fitting is time consuming.
 
+    verbose : boolean, optional
+        If True, print the time elapsed while fitting each step.
+
     Attributes
     ----------
     named_steps : bunch object, a dictionary with attribute access
@@ -89,7 +93,7 @@ class Pipeline(_BaseComposition):
     ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
     Pipeline(memory=None,
              steps=[('anova', SelectKBest(...)),
-                    ('svc', SVC(...))])
+                    ('svc', SVC(...))], verbose=False)
     >>> prediction = anova_svm.predict(X)
     >>> anova_svm.score(X, y)                        # doctest: +ELLIPSIS
     0.829...
@@ -109,11 +113,12 @@
 
     # BaseEstimator interface
 
-    def __init__(self, steps, memory=None):
+    def __init__(self, steps, memory=None, verbose=False):
         # shallow copy of steps
         self.steps = list(steps)
         self._validate_steps()
         self.memory = memory
+        self.verbose = verbose
 
     def get_params(self, deep=True):
         """Get parameters for this estimator.
@@ -168,6 +173,13 @@ def _validate_steps(self):
                                 "'%s' (type %s) doesn't"
                                 % (estimator, type(estimator)))
 
+    def _print_final_step(self, final_step_time_elapsed, time_elapsed_so_far):
+        message = '(step %d of %d) %s' % (
+            len(self.steps), len(self.steps), self.steps[-1][0])
+        print(message_with_time('Pipeline', message, final_step_time_elapsed))
+        print(message_with_time(
+            'Pipeline', 'Total time elapsed', time_elapsed_so_far))
+
     @property
     def _estimator_type(self):
         return self.steps[-1][1]._estimator_type
@@ -196,7 +208,10 @@ def _fit(self, X, y=None, **fit_params):
                 step, param = pname.split('__', 1)
                 fit_params_steps[step][param] = pval
         Xt = X
+        # Keep a record of time elapsed
+        time_elapsed_so_far = 0
         for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
+            step_start_time = time.time()
             if transformer is None:
                 pass
             else:
@@ -214,9 +229,17 @@
                 # transformer. This is necessary when loading the transformer
                 # from the cache.
                 self.steps[step_idx] = (name, fitted_transformer)
+
+            step_time_elapsed = time.time() - step_start_time
+            time_elapsed_so_far += step_time_elapsed
+            # Logging time elapsed for current step to stdout
+            if self.verbose:
+                message = '(step %d of %d) %s' % (
+                    step_idx + 1, len(self.steps), name)
+                print(message_with_time('Pipeline', message, step_time_elapsed))
         if self._final_estimator is None:
-            return Xt, {}
-        return Xt, fit_params_steps[self.steps[-1][0]]
+            return Xt, {}, time_elapsed_so_far
+        return Xt, fit_params_steps[self.steps[-1][0]], time_elapsed_so_far
 
     def fit(self, X, y=None, **fit_params):
         """Fit the model
@@ -244,9 +267,15 @@
         self : Pipeline
             This estimator
         """
-        Xt, fit_params = self._fit(X, y, **fit_params)
+        Xt, fit_params, time_elapsed_so_far = self._fit(X, y, **fit_params)
+        final_step_start_time = time.time()
         if self._final_estimator is not None:
             self._final_estimator.fit(Xt, y, **fit_params)
+        final_step_time_elapsed = time.time() - final_step_start_time
+        time_elapsed_so_far += final_step_time_elapsed
+        if self.verbose:
+            self._print_final_step(final_step_time_elapsed,
+                                   time_elapsed_so_far)
         return self
 
     def fit_transform(self, X, y=None, **fit_params):
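Taken together, ``_fit`` times each transformer and ``fit`` times the final
estimator, so fitting a two-step pipeline with ``verbose=True`` should print
lines of the following shape on stdout (step names taken from the docs above;
timings and dot padding are illustrative)::

    [Pipeline] ................... (step 1 of 2) reduce_dim, total=   0.1s
    [Pipeline] .......................... (step 2 of 2) clf, total=   0.2s
    [Pipeline] ......................... Total time elapsed, total=   0.3s
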
@@ -277,13 +306,25 @@
         Transformed samples
         """
         last_step = self._final_estimator
-        Xt, fit_params = self._fit(X, y, **fit_params)
-        if hasattr(last_step, 'fit_transform'):
-            return last_step.fit_transform(Xt, y, **fit_params)
-        elif last_step is None:
+        Xt, fit_params, time_elapsed_so_far = self._fit(X, y, **fit_params)
+        final_step_start_time = time.time()
+        if last_step is None:
+            if self.verbose:
+                message = 'Step %s is NoneType' % (self.steps[-1][0],)
+                print(message_with_time('Pipeline', message, 0))
+                print(message_with_time(
+                    'Pipeline', 'Total time elapsed', time_elapsed_so_far))
             return Xt
+        elif hasattr(last_step, 'fit_transform'):
+            Xt = last_step.fit_transform(Xt, y, **fit_params)
         else:
-            return last_step.fit(Xt, y, **fit_params).transform(Xt)
+            Xt = last_step.fit(Xt, y, **fit_params).transform(Xt)
+        final_step_time_elapsed = time.time() - final_step_start_time
+        time_elapsed_so_far += final_step_time_elapsed
+        if self.verbose:
+            self._print_final_step(final_step_time_elapsed,
+                                   time_elapsed_so_far)
+        return Xt
 
     @if_delegate_has_method(delegate='_final_estimator')
     def predict(self, X):
@@ -332,8 +373,15 @@ def fit_predict(self, X, y=None, **fit_params):
         -------
         y_pred : array-like
         """
-        Xt, fit_params = self._fit(X, y, **fit_params)
-        return self.steps[-1][-1].fit_predict(Xt, y, **fit_params)
+        Xt, fit_params, time_elapsed_so_far = self._fit(X, y, **fit_params)
+        final_step_start_time = time.time()
+        y_pred = self.steps[-1][-1].fit_predict(Xt, y, **fit_params)
+        final_step_time_elapsed = time.time() - final_step_start_time
+        time_elapsed_so_far += final_step_time_elapsed
+        if self.verbose:
+            self._print_final_step(final_step_time_elapsed,
+                                   time_elapsed_so_far)
+        return y_pred
 
     @if_delegate_has_method(delegate='_final_estimator')
     def predict_proba(self, X):
@@ -538,6 +586,9 @@ def make_pipeline(*steps, **kwargs):
         inspect estimators within the pipeline. Caching the
         transformers is advantageous when fitting is time consuming.
 
+    verbose : boolean, optional
+        If True, print the time elapsed while fitting each step.
+
     Examples
     --------
     >>> from sklearn.naive_bayes import GaussianNB
@@ -547,21 +598,30 @@
     Pipeline(memory=None,
              steps=[('standardscaler',
                      StandardScaler(copy=True, with_mean=True, with_std=True)),
-                    ('gaussiannb', GaussianNB(priors=None))])
+                    ('gaussiannb', GaussianNB(priors=None))], verbose=False)
 
     Returns
     -------
     p : Pipeline
     """
     memory = kwargs.pop('memory', None)
+    verbose = kwargs.pop('verbose', False)
     if kwargs:
         raise TypeError('Unknown keyword arguments: "{}"'
                         .format(list(kwargs.keys())[0]))
-    return Pipeline(_name_estimators(steps), memory=memory)
+    return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose)
 
 
-def _fit_one_transformer(transformer, X, y):
-    return transformer.fit(X, y)
+def _fit_one_transformer(transformer, X, y, verbose=False, idx=None,
+                         total_steps=None, name=None):
+    # idx, total_steps and name are not required when verbosity is disabled
+    step_start_time = time.time()
+    transformer = transformer.fit(X, y)
+    step_time_elapsed = time.time() - step_start_time
+    if verbose:
+        message = '(step %d of %d) %s' % (idx + 1, total_steps, name)
+        print(message_with_time('FeatureUnion', message, step_time_elapsed))
+    return transformer
 
 
 def _transform_one(transformer, weight, X):
@@ -572,12 +632,18 @@
     res = transformer.transform(X)
     if weight is None:
         return res
     return res * weight
 
 
-def _fit_transform_one(transformer, weight, X, y,
-                       **fit_params):
+def _fit_transform_one(transformer, weight, X, y, verbose=False, idx=None,
+                       total_steps=None, name=None, **fit_params):
+    # idx, total_steps and name are not required when verbosity is disabled
+    step_start_time = time.time()
     if hasattr(transformer, 'fit_transform'):
         res = transformer.fit_transform(X, y, **fit_params)
     else:
         res = transformer.fit(X, y, **fit_params).transform(X)
+    step_time_elapsed = time.time() - step_start_time
+    if verbose:
+        message = '(step %d of %d) %s' % (idx + 1, total_steps, name)
+        print(message_with_time('FeatureUnion', message, step_time_elapsed))
     # if we have a weight for this transformer, multiply output
     if weight is None:
         return res, transformer
@@ -611,11 +677,17 @@ class FeatureUnion(_BaseComposition, TransformerMixin):
         Multiplicative weights for features per transformer.
         Keys are transformer names, values the weights.
 
+    verbose : boolean, optional
+        If True, print the time elapsed while fitting each transformer.
+
     """
-    def __init__(self, transformer_list, n_jobs=1, transformer_weights=None):
+
+    def __init__(self, transformer_list, n_jobs=1, transformer_weights=None,
+                 verbose=False):
        self.transformer_list = list(transformer_list)
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
+       self.verbose = verbose
        self._validate_transformers()
 
     def get_params(self, deep=True):
@@ -705,9 +777,19 @@ def fit(self, X, y=None):
             This estimator
         """
         self._validate_transformers()
+        all_transformers = list(self._iter())
+        total_steps = len(all_transformers)
+        # Keep a record of time elapsed
+        start_time = time.time()
         transformers = Parallel(n_jobs=self.n_jobs)(
-            delayed(_fit_one_transformer)(trans, X, y)
-            for _, trans, _ in self._iter())
+            delayed(_fit_one_transformer)(transformer, X, y, self.verbose,
+                                          idx, total_steps, name)
+            for idx, (name, transformer, _) in enumerate(all_transformers))
+        time_elapsed = time.time() - start_time
+        if self.verbose:
+            print(message_with_time(
+                'FeatureUnion', 'Total time elapsed', time_elapsed))
+
         self._update_transformer_list(transformers)
         return self
 
@@ -729,10 +811,19 @@ def fit_transform(self, X, y=None, **fit_params):
             sum of n_components (output dimension) over transformers.
         """
         self._validate_transformers()
+        all_transformers = list(self._iter())
+        total_steps = len(all_transformers)
+        # Keep a record of time elapsed
+        start_time = time.time()
         result = Parallel(n_jobs=self.n_jobs)(
-            delayed(_fit_transform_one)(trans, weight, X, y,
+            delayed(_fit_transform_one)(transformer, weight, X, y,
+                                        self.verbose, idx, total_steps, name,
                                         **fit_params)
-            for name, trans, weight in self._iter())
+            for idx, (name, transformer, weight) in enumerate(all_transformers))
+        time_elapsed = time.time() - start_time
+        if self.verbose:
+            print(message_with_time(
+                'FeatureUnion', 'Total time elapsed', time_elapsed))
 
         if not result:
             # All transformers are None
@@ -793,6 +884,9 @@ def make_union(*transformers, **kwargs):
     n_jobs : int, optional
         Number of jobs to run in parallel (default 1).
 
+    verbose : boolean, optional
+        If True, print the time elapsed while fitting each transformer.
+
     Returns
     -------
     f : FeatureUnion
@@ -811,12 +905,14 @@
            TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
                  random_state=None, tol=0.0))],
-           transformer_weights=None)
+           transformer_weights=None, verbose=False)
     """
     n_jobs = kwargs.pop('n_jobs', 1)
+    verbose = kwargs.pop('verbose', False)
     if kwargs:
         # We do not currently support `transformer_weights` as we may want to
         # change its type spec in make_union
         raise TypeError('Unknown keyword arguments: "{}"'
                         .format(list(kwargs.keys())[0]))
-    return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs)
+    return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs,
+                        verbose=verbose)
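The FeatureUnion side mirrors the Pipeline behaviour, with one caveat: the
per-transformer lines are printed inside joblib workers, so with ``n_jobs > 1``
they may interleave. A sketch using the transformers from the ``make_union``
docstring (output shape illustrative)::

    from sklearn.pipeline import make_union
    from sklearn.decomposition import PCA, TruncatedSVD

    union = make_union(PCA(), TruncatedSVD(), verbose=True)
    # While fitting, roughly:
    # [FeatureUnion] ...................... (step 1 of 2) pca, total=   0.0s
    # [FeatureUnion] ............. (step 2 of 2) truncatedsvd, total=   0.0s
    # [FeatureUnion] ..................... Total time elapsed, total=   0.0s
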
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 1165370885d36..76d3fe66634fe 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -573,6 +573,7 @@ def make():
         'memory': None,
         'm2__mult': 2,
         'last__mult': 5,
+        'verbose': False,
     })
 
     pipeline.set_params(m2=None)
@@ -968,3 +969,86 @@ def test_make_pipeline_memory():
     assert_true(pipeline.memory is None)
 
     shutil.rmtree(cachedir)
+
+
+def check_pipeline_verbosity_fit_predict(pipe_method):
+    # Test that the verbosity of pipeline is proper
+    from sklearn.externals.six.moves import cStringIO as StringIO
+    import sys
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+    pipe_method(X=None, y=None, clf__should_succeed=True)
+    verbose_output = sys.stdout
+    sys.stdout = old_stdout
+
+    # check output
+    verbose_output.seek(0)
+    lines = verbose_output.readlines()
+    assert_true('(step 1 of 2) transf' in lines[0])
+    assert_true('(step 2 of 2) clf' in lines[1])
+    assert_true('Total time elapsed' in lines[2])
+    for line in lines:
+        assert_true(line.startswith('[Pipeline]'))
+
+
+def test_pipeline_fit_verbosity():
+    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())],
+                    verbose=True)
+    yield check_pipeline_verbosity_fit_predict, pipe.fit
+    yield check_pipeline_verbosity_fit_predict, pipe.fit_predict
+
+
+def check_pipeline_verbosity_fit_transform(pipe_method, last_was_none=False):
+    # Test that the verbosity of pipeline is proper
+    from sklearn.externals.six.moves import cStringIO as StringIO
+    import sys
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+    pipe_method(X=[[1, 2, 3], [4, 5, 6]], y=[[7], [8]])
+    verbose_output = sys.stdout
+    sys.stdout = old_stdout
+
+    # check output
+    verbose_output.seek(0)
+    lines = verbose_output.readlines()
+    assert_true('(step 1 of 2) mult1' in lines[0])
+    assert_true(lines[0].startswith('[Pipeline]'))
+    if last_was_none:
+        assert_true('Step mult2 is NoneType' in lines[1])
+    else:
+        assert_true('(step 2 of 2) mult2' in lines[1])
+    assert_true('Total time elapsed' in lines[2])
+
+
+def test_pipeline_verbosity_fit_transform():
+    pipe = Pipeline([('mult1', Mult(mult=1)), ('mult2', Mult(mult=2))],
+                    verbose=True)
+    yield check_pipeline_verbosity_fit_transform, pipe.fit_transform
+    pipe = Pipeline([('mult1', Mult(mult=1)), ('mult2', None)],
+                    verbose=True)
+    yield check_pipeline_verbosity_fit_transform, pipe.fit_transform, True
+
+
+def check_feature_union_verbosity(feature_union_method):
+    # Test that the verbosity of feature union is proper
+    from sklearn.externals.six.moves import cStringIO as StringIO
+    import sys
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+    feature_union_method(X=[[1, 2, 3], [4, 5, 6]], y=[[7], [8]])
+    verbose_output = sys.stdout
+    sys.stdout = old_stdout
+
+    # check output
+    verbose_output.seek(0)
+    lines = verbose_output.readlines()
+    assert_true('(step 1 of 2) mult1' in lines[0])
+    assert_true('(step 2 of 2) mult2' in lines[1])
+    assert_true('Total time elapsed' in lines[2])
+    assert_true(lines[0].startswith('[FeatureUnion]'))
+    assert_true(lines[1].startswith('[FeatureUnion]'))
+    assert_true(lines[2].startswith('[FeatureUnion]'))
+
+
+def test_feature_union_verbosity():
+    union = FeatureUnion([('mult1', Mult(mult=1)), ('mult2', Mult(mult=2))],
+                         verbose=True)
+    yield check_feature_union_verbosity, union.fit
+    yield check_feature_union_verbosity, union.fit_transform
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 4b2665cdd4f77..f4c2a1791663f 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -14,7 +14,7 @@
                          check_consistent_length, check_X_y, indexable,
                          check_symmetric)
 from .class_weight import compute_class_weight, compute_sample_weight
-from ..externals.joblib import cpu_count
+from ..externals.joblib import cpu_count, logger
 from ..exceptions import DataConversionWarning
 from .deprecation import deprecated
 
@@ -506,3 +506,23 @@ def indices_to_mask(indices, mask_length):
     mask[indices] = True
 
     return mask
+
+
+def message_with_time(source, message, time_):
+    """Create a one-line message for logging purposes.
+
+    Parameters
+    ----------
+    source : str
+        String indicating the source or the reference of the message
+
+    message : str
+        Short message
+
+    time_ : float
+        Time in seconds
+    """
+    start_message = '[%s]' % (source,)
+    end_message = "%s, total=%s" % (message, logger.short_format_time(time_))
+    dots_len = (68 - len(start_message) - len(end_message))
+    return ("%s %s %s" % (start_message, dots_len * '.', end_message))
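
For reference, the helper budgets 68 characters for the source tag, the dots
and the message, so callers get roughly aligned output regardless of the tag;
the exact time string comes from joblib's ``short_format_time``. A sketch of
the expected return value (dot count and spacing approximate)::

    >>> from sklearn.utils import message_with_time
    >>> message_with_time('Pipeline', '(step 1 of 2) pca', 4.6)  # doctest: +SKIP
    '[Pipeline] .......................... (step 1 of 2) pca, total=   4.6s'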