From d3ce2adf64cde0997fa28e86763c2f1b4597415c Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Mon, 10 Jul 2017 16:58:18 -0500
Subject: [PATCH 1/7] Use a def as it is pickle-able

---
 doc/modules/model_evaluation.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index dee5865bdd33e..8985a5ee87958 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -242,10 +242,10 @@ permitted and will require a wrapper to return a single metric::
     >>> # A sample toy binary classification dataset
     >>> X, y = datasets.make_classification(n_classes=2, random_state=0)
     >>> svm = LinearSVC(random_state=0)
-    >>> tp = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[0, 0]
-    >>> tn = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[0, 0]
-    >>> fp = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[1, 0]
-    >>> fn = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[0, 1]
+    >>> def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
+    >>> def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
+    >>> def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
+    >>> def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
     >>> scoring = {'tp' : make_scorer(tp), 'tn' : make_scorer(tn),
     ...            'fp' : make_scorer(fp), 'fn' : make_scorer(fn)}
     >>> cv_results = cross_validate(svm.fit(X, y), X, y, scoring=scoring)

From 54d5341ecb79e550786b1aea57b8ab93d438bf16 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Mon, 10 Jul 2017 17:03:43 -0500
Subject: [PATCH 2/7] Address Joel's comments

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 8985a5ee87958..52d4f976f474a 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -249,7 +249,7 @@ permitted and will require a wrapper to return a single metric::
     >>> scoring = {'tp' : make_scorer(tp), 'tn' : make_scorer(tn),
     ...            'fp' : make_scorer(fp), 'fn' : make_scorer(fn)}
     >>> cv_results = cross_validate(svm.fit(X, y), X, y, scoring=scoring)
-    >>> # Getting the test set false positive scores
+    >>> # Getting the test set true positive scores
     >>> print(cv_results['test_tp'])  # doctest: +NORMALIZE_WHITESPACE
     [12 13 15]
     >>> # Getting the test set false negative scores
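A standalone note on the motivation for PATCH 1/7: the stdlib pickle serializes a function by its module-qualified name, so a module-level def round-trips where a lambda bound to a name does not, and that is what matters once scorers are dispatched to joblib workers. The snippet below is an illustrative sketch, not part of the series; the toy tp merely stands in for the confusion-matrix lookup used in the docs.

    import pickle


    def tp(y_true, y_pred):
        # Stand-in for confusion_matrix(y_true, y_pred)[0, 0] in the docs.
        return sum(1 for t, p in zip(y_true, y_pred) if t == p == 0)

    tp_lambda = lambda y_true, y_pred: tp(y_true, y_pred)

    pickle.dumps(tp)  # fine: a def is looked up by name when unpickling
    try:
        pickle.dumps(tp_lambda)
    except (pickle.PicklingError, AttributeError):
        print("lambda scorers cannot be pickled")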
From 2e5b72901efb97eafde272b702e20efc836045d5 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Sat, 15 Jul 2017 17:48:13 -0500
Subject: [PATCH 3/7] Try memoizing predict / decision_function calls

---
 sklearn/model_selection/_validation.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 1e5ea29740c00..897a5439d4123 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -26,6 +26,7 @@
 from ..externals.joblib import Parallel, delayed, logger
 from ..externals.six.moves import zip
 from ..metrics.scorer import check_scoring, _check_multimetric_scoring
+from ..metrics.scorer import _passthrough_scorer
 from ..exceptions import FitFailedWarning
 from ._split import check_cv
 from ..preprocessing import LabelEncoder
@@ -521,10 +522,34 @@ def _score(estimator, X_test, y_test, scorer, is_multimetric=False):
     return score
 
 
+class _MemoizedPredictEstimator:
+    def __init__(self, estimator):
+        self.estimator = estimator
+    def fit(self, X, y):
+        self.estimator.fit(X, y)
+    @if_delegate_has_method(delegate='estimator')
+    def predict(self, X):
+        if not hasattr(self, '_predictions'):
+            self._predictions = self.estimator.predict(X)
+        return self._predictions
+    @if_delegate_has_method(delegate='estimator')
+    def decision_function(self, X):
+        if not hasattr(self, '_decisions'):
+            self._decisions = self.estimator.decision_function(X)
+        return self._decisions
+
+
 def _multimetric_score(estimator, X_test, y_test, scorers):
     """Return a dict of score for multimetric scoring"""
     scores = {}
 
+    # Try wrapping the estimator in _MemoizedPredictEstimator if we don't use
+    # the pass-through scorer
+    uses_score_method = any([scorer is _passthrough_scorer
+                             for _, scorer in scorers.items()])
+    if not uses_score_method:
+        estimator = _MemoizedPredictEstimator(estimator)
+
     for name, scorer in scorers.items():
         if y_test is None:
             score = scorer(estimator, X_test)
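Before the test added in the next patch, here is a stripped-down sketch of the caching idea in PATCH 3/7, independent of scikit-learn internals. MemoizedPredict and CountingEstimator are hypothetical stand-ins for _MemoizedPredictEstimator and the test estimator; note that, exactly as in the patch, the cache is keyed only on attribute existence, so it is only safe while every scorer receives the same X_test, which holds within a single _multimetric_score call.

    class MemoizedPredict(object):
        # Hypothetical stand-in for _MemoizedPredictEstimator.
        def __init__(self, estimator):
            self.estimator = estimator

        def predict(self, X):
            # Compute once, replay for every later scorer; the cache never
            # inspects X, so it assumes a fixed test set.
            if not hasattr(self, '_predictions'):
                self._predictions = self.estimator.predict(X)
            return self._predictions


    class CountingEstimator(object):
        # Hypothetical estimator that counts its predict() calls.
        def __init__(self):
            self.n_predict_calls = 0

        def predict(self, X):
            self.n_predict_calls += 1
            return [0] * len(X)


    inner = CountingEstimator()
    wrapped = MemoizedPredict(inner)
    wrapped.predict([1, 2, 3])
    wrapped.predict([1, 2, 3])
    assert inner.n_predict_calls == 1  # second call is served from the cache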
From 18d3549582d19d55280d92ef05b43cd727b00b0e Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Sun, 16 Jul 2017 11:36:58 -0500
Subject: [PATCH 4/7] TST memoizing of the predictions when non-default scoring is used

---
 sklearn/model_selection/_validation.py       |  1 +
 .../model_selection/tests/test_validation.py | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 897a5439d4123..02de823afb21b 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -23,6 +23,7 @@
 from ..utils import indexable, check_random_state, safe_indexing
 from ..utils.validation import _is_arraylike, _num_samples
 from ..utils.metaestimators import _safe_split
+from ..utils.metaestimators import if_delegate_has_method
 from ..externals.joblib import Parallel, delayed, logger
 from ..externals.six.moves import zip
 from ..metrics.scorer import check_scoring, _check_multimetric_scoring

diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
index dedb77026c544..a51be9c3cb6dd 100644
--- a/sklearn/model_selection/tests/test_validation.py
+++ b/sklearn/model_selection/tests/test_validation.py
@@ -38,6 +38,7 @@
 from sklearn.model_selection import learning_curve
 from sklearn.model_selection import validation_curve
 from sklearn.model_selection._validation import _check_is_permutation
+from sklearn.model_selection._validation import _multimetric_score
 from sklearn.datasets import make_regression
 from sklearn.datasets import load_boston
@@ -50,6 +51,7 @@
 from sklearn.metrics import precision_score
 from sklearn.metrics import r2_score
 from sklearn.metrics.scorer import check_scoring
+from sklearn.metrics.scorer import _check_multimetric_scoring
 from sklearn.linear_model import Ridge, LogisticRegression
 from sklearn.linear_model import PassiveAggressiveClassifier
@@ -219,6 +221,17 @@ def get_params(self, deep=False):
         return {'a': self.a, 'allow_nd': self.allow_nd}
 
 
+class CountCallPredictedEstimator:
+    def __init__(self):
+        self._n_predict_calls = 0
+        self._rng = np.random.RandomState(0)
+    def fit(self, X, y):
+        return self
+    def predict(self, X):
+        self._n_predict_calls += 1
+        return self._rng.randint(0, 2, size=X.shape[0])
+
+
 # XXX: use 2D array, since 1D X is being detected as a single sample in
 # check_consistent_length
 X = np.ones((10, 2))
@@ -1299,3 +1312,14 @@ def test_permutation_test_score_pandas():
     check_series = lambda x: isinstance(x, TargetType)
     clf = CheckingClassifier(check_X=check_df, check_y=check_series)
     permutation_test_score(clf, X_df, y_ser)
+
+
+def test_multiscore_memoizing():
+    # Check if memoizing works as expected in _multimetric_score
+    X, y = make_classification(n_samples=1000, random_state=0)
+    estimator = CountCallPredictedEstimator()
+    scorers, _ = _check_multimetric_scoring(estimator,
+                                            ['neg_mean_squared_error',
+                                             'neg_median_absolute_error'])
+    _multimetric_score(estimator, X, y, scorers=scorers)
+    assert estimator._n_predict_calls == 1

From e45b1541f15f8607fecfd56babe464ee9825a08c Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Wed, 19 Jul 2017 14:02:02 -0500
Subject: [PATCH 5/7] Add predict_proba, predict_log_proba and wrap all estimators

---
 sklearn/model_selection/_validation.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 02de823afb21b..e32be6be3eb14 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -526,30 +526,42 @@ def _score(estimator, X_test, y_test, scorer, is_multimetric=False):
 class _MemoizedPredictEstimator:
     def __init__(self, estimator):
         self.estimator = estimator
+
     def fit(self, X, y):
         self.estimator.fit(X, y)
+
     @if_delegate_has_method(delegate='estimator')
     def predict(self, X):
         if not hasattr(self, '_predictions'):
             self._predictions = self.estimator.predict(X)
         return self._predictions
+
     @if_delegate_has_method(delegate='estimator')
     def decision_function(self, X):
         if not hasattr(self, '_decisions'):
             self._decisions = self.estimator.decision_function(X)
         return self._decisions
 
+    @if_delegate_has_method(delegate='estimator')
+    def predict_proba(self, X):
+        if not hasattr(self, '_probs'):
+            self._probs = self.estimator.predict_proba(X)
+        return self._probs
+
+    @if_delegate_has_method(delegate='estimator')
+    def predict_log_proba(self, X):
+        if not hasattr(self, '_log_probs'):
+            self._log_probs = self.estimator.predict_log_proba(X)
+        return self._log_probs
+
 
 def _multimetric_score(estimator, X_test, y_test, scorers):
     """Return a dict of score for
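A note on why every wrapper method in PATCH 5/7 carries @if_delegate_has_method: scorers choose between predict, predict_proba and decision_function by probing the estimator with hasattr, so a wrapper that unconditionally defined predict_proba would fool that probe for estimators that lack it. The sketch below is a hypothetical, minimal reimplementation of just the hasattr behaviour; sklearn.utils.metaestimators.if_delegate_has_method is a more featureful descriptor, used here only by analogy.

    class _DelegateHasMethod(object):
        # Descriptor that hides the method unless the delegate provides it.
        def __init__(self, delegate, fn):
            self.delegate = delegate
            self.fn = fn

        def __get__(self, obj, objtype=None):
            if obj is None:
                return self
            # Raises AttributeError (making hasattr() False) when the
            # wrapped estimator lacks the method.
            getattr(getattr(obj, self.delegate), self.fn.__name__)
            return self.fn.__get__(obj, objtype)


    def if_delegate_has_method(delegate):
        def decorate(fn):
            return _DelegateHasMethod(delegate, fn)
        return decorate


    class Wrapper(object):
        def __init__(self, estimator):
            self.estimator = estimator

        @if_delegate_has_method(delegate='estimator')
        def predict_proba(self, X):
            return self.estimator.predict_proba(X)


    class NoProba(object):
        pass


    assert not hasattr(Wrapper(NoProba()), 'predict_proba')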
multimetric scoring""" scores = {} - # Try wrapping the estimator in _MemoizedPredictEstimator if we don't use - # the pass-through scorer - uses_score_method = any([scorer is _passthrough_scorer - for _, scorer in scorers.items()]) - if not uses_score_method: - estimator = _MemoizedPredictEstimator(estimator) + # Try wrapping the estimator in _MemoizedPredictEstimator + # If the estimator has a score, wrapping it will not do any harm + estimator = _MemoizedPredictEstimator(estimator) for name, scorer in scorers.items(): if y_test is None: From b6b3994ebfc002af0ffe45e12defead388722557 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 24 Jul 2017 12:29:17 +0200 Subject: [PATCH 6/7] Fix travis --- sklearn/model_selection/_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e32be6be3eb14..1e8327330e107 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -554,6 +554,10 @@ def predict_log_proba(self, X): self._log_probs = self.estimator.predict_log_proba(X) return self._log_probs + @if_delegate_has_method(delegate='estimator') + def score(self, *args, **kwargs): + return self.estimator.score(*args, **kwargs) + def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" From 7c0ed5480de9030854a39f7d74554d093a85c80b Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 24 Jul 2017 14:07:21 +0200 Subject: [PATCH 7/7] Undo the changes in model_evaluation.rst --- doc/modules/model_evaluation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 52d4f976f474a..dee5865bdd33e 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -242,14 +242,14 @@ permitted and will require a wrapper to return a single metric:: >>> # A sample toy binary classification dataset >>> X, y = datasets.make_classification(n_classes=2, random_state=0) >>> svm = LinearSVC(random_state=0) - >>> def tp(y_true, y_pred): confusion_matrix(y_true, y_pred)[0, 0] - >>> def tn(y_true, y_pred): confusion_matrix(y_true, y_pred)[0, 0] - >>> def fp(y_true, y_pred): confusion_matrix(y_true, y_pred)[1, 0] - >>> def fn(y_true, y_pred): confusion_matrix(y_true, y_pred)[0, 1] + >>> tp = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[0, 0] + >>> tn = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[0, 0] + >>> fp = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[1, 0] + >>> fn = lambda y_true, y_pred: confusion_matrix(y_true, y_pred)[0, 1] >>> scoring = {'tp' : make_scorer(tp), 'tn' : make_scorer(tn), ... 'fp' : make_scorer(fp), 'fn' : make_scorer(fn)} >>> cv_results = cross_validate(svm.fit(X, y), X, y, scoring=scoring) - >>> # Getting the test set true positive scores + >>> # Getting the test set false positive scores >>> print(cv_results['test_tp']) # doctest: +NORMALIZE_WHITESPACE [12 13 15] >>> # Getting the test set false negative scores