From 994ca848370af7e1a5eb5900791803048b6fb1df Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 5 Jul 2019 09:25:36 -0400 Subject: [PATCH 01/11] ENH Uses class based caching --- sklearn/model_selection/_validation.py | 32 +++++++++++++++++++ .../model_selection/tests/test_validation.py | 24 ++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index c5cf9e8ee59b6..65836e8376ea2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -596,6 +596,38 @@ def _score(estimator, X_test, y_test, scorer, is_multimetric=False): return score +class _CacheEstimator: + """Cache predict, predict_proba, decision_function, and score + of an estimator + """ + def __init__(self, estimator): + self.estimator = estimator + self.cache = {} + + def _call_func(*args, name=None, **kwargs): + try: + return self.cache[name] + except KeyError: + func = getattr(self.estimator, name) + result = func(*args, **kwargs) + self.cache[name] = result + return result + + func_names = ['predict', 'predict_proba', 'decision_function', 'score'] + for func_name in func_names: + # only add when estimator defines func_name + if hasattr(estimator, func_name): + func = partial(_call_func, name=func_name) + setattr(self, func_name, func) + + def __getattr__(self, name): + return getattr(self.estimator, name) + + @property + def __class__(self): + return self.estimator.__class__ + + def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" scores = {} diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 6fa2e4fee5ed7..34dbabe061276 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1718,3 +1718,27 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) + + +@pytest.mark.parametrize("func_name", + ["predict", "predict_proba", + "decision_function", "score"]) +def test_cached_estimator(func_name): + mock_est = Mock() + mock_est.my_attribute = "hello" + mock_func = getattr(mock_est, func_name) + mock_func.return_value = 42 + + cached_est = _CacheEstimator(mock_est) + + # call func twice + func = getattr(cached_est, func_name) + assert func() == 42 + assert func() == 42 + assert cached_est.cache[func_name] == 42 + + assert cached_est.my_attribute == "hello" + assert isinstance(cached_est, Mock) + + # only called once + assert mock_func.call_count == 1 From ae8d7a4597482eefa6f5c2f0917648120413cbe8 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 5 Jul 2019 09:30:13 -0400 Subject: [PATCH 02/11] BUG Fixes imports --- sklearn/model_selection/_validation.py | 6 ++++-- sklearn/model_selection/tests/test_validation.py | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 65836e8376ea2..1bf12443bf1a2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -14,6 +14,7 @@ import numbers import time from traceback import format_exception_only +from functools import partial import numpy as np import scipy.sparse as sp @@ -632,11 +633,12 @@ def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" scores = {} + cached_estimator = _CacheEstimator(estimator) for name, scorer in scorers.items(): if y_test is None: - score = scorer(estimator, X_test) + score = scorer(cached_estimator, X_test) else: - score = scorer(estimator, X_test, y_test) + score = scorer(cached_estimator, X_test, y_test) if hasattr(score, 'item'): try: diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 34dbabe061276..7c10e67238593 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -5,6 +5,7 @@ import tempfile import os from time import sleep +from unittest.mock import Mock import pytest import numpy as np @@ -76,6 +77,7 @@ from sklearn.model_selection.tests.common import OneTimeSplitter from sklearn.model_selection import GridSearchCV +from sklearn.model_selection._validation import _CacheEstimator try: From c26f94e50bf440d84739e8895127fbb59d2279b1 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 5 Jul 2019 11:27:21 -0400 Subject: [PATCH 03/11] ENH Only caches when used with make_scorer --- sklearn/model_selection/_validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 1bf12443bf1a2..32be53d9e0bc0 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -25,7 +25,8 @@ _message_with_time) from ..utils.validation import _is_arraylike, _num_samples from ..utils.metaestimators import _safe_split -from ..metrics.scorer import check_scoring, _check_multimetric_scoring +from ..metrics.scorer import (check_scoring, _check_multimetric_scoring, + _BaseScorer) from ..exceptions import FitFailedWarning from ._split import check_cv from ..preprocessing import LabelEncoder @@ -633,12 +634,13 @@ def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" scores = {} - cached_estimator = _CacheEstimator(estimator) + if all(isinstance(scorer, _BaseScorer) for scorer in scorers.values()): + estimator = _CacheEstimator(estimator) for name, scorer in scorers.items(): if y_test is None: - score = scorer(cached_estimator, X_test) + score = scorer(estimator, X_test) else: - score = scorer(cached_estimator, X_test, y_test) + score = scorer(estimator, X_test, y_test) if hasattr(score, 'item'): try: From 6c343bf29c197f12728d906a39cba89d5081dc06 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 5 Jul 2019 11:28:46 -0400 Subject: [PATCH 04/11] DOC Adds whats_new --- doc/whats_new/v0.22.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f2962df4a74b8..07583e4491c3a 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -80,6 +80,9 @@ Changelog plot model scalability (see learning_curve example). :pr:`13938` by :user:`Hadrien Reboul `. +- |Efficiency| Multimetric scoring now caches predictions to avoid repeated + calls. :pr:`14261` by `Thomas Fan`_. + :mod:`sklearn.pipeline` ....................... From 9e7b059ff29a2b6cca202e116ee87f6089b08827 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 5 Jul 2019 11:30:34 -0400 Subject: [PATCH 05/11] CLN Removes __class__ hack --- sklearn/model_selection/_validation.py | 4 ---- sklearn/model_selection/tests/test_validation.py | 1 - 2 files changed, 5 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 32be53d9e0bc0..aac5dc748ed29 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -625,10 +625,6 @@ def _call_func(*args, name=None, **kwargs): def __getattr__(self, name): return getattr(self.estimator, name) - @property - def __class__(self): - return self.estimator.__class__ - def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 7c10e67238593..6059289f36f8f 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1740,7 +1740,6 @@ def test_cached_estimator(func_name): assert cached_est.cache[func_name] == 42 assert cached_est.my_attribute == "hello" - assert isinstance(cached_est, Mock) # only called once assert mock_func.call_count == 1 From 62e85ee7770d2291c69545a94923dd2a321bd1b7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 5 Jul 2019 11:32:34 -0400 Subject: [PATCH 06/11] CLN Removes attribute passthrough --- sklearn/model_selection/_validation.py | 3 --- sklearn/model_selection/tests/test_validation.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index aac5dc748ed29..69f379f83349c 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -622,9 +622,6 @@ def _call_func(*args, name=None, **kwargs): func = partial(_call_func, name=func_name) setattr(self, func_name, func) - def __getattr__(self, name): - return getattr(self.estimator, name) - def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 6059289f36f8f..3636a87eb18fb 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1727,7 +1727,6 @@ def two_params_scorer(estimator, X_test): "decision_function", "score"]) def test_cached_estimator(func_name): mock_est = Mock() - mock_est.my_attribute = "hello" mock_func = getattr(mock_est, func_name) mock_func.return_value = 42 @@ -1739,7 +1738,5 @@ def test_cached_estimator(func_name): assert func() == 42 assert cached_est.cache[func_name] == 42 - assert cached_est.my_attribute == "hello" - # only called once assert mock_func.call_count == 1 From d5a00af0ffd0b80aa775d088dd1ec5bfce38d7b5 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 6 Jul 2019 09:23:38 -0400 Subject: [PATCH 07/11] TST Uses context manager --- sklearn/model_selection/_validation.py | 56 ++++++++++--------- .../model_selection/tests/test_validation.py | 25 ++++----- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 69f379f83349c..388e380b9e90b 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -14,6 +14,7 @@ import numbers import time from traceback import format_exception_only +from contextlib import contextmanager from functools import partial import numpy as np @@ -598,42 +599,43 @@ def _score(estimator, X_test, y_test, scorer, is_multimetric=False): return score -class _CacheEstimator: - """Cache predict, predict_proba, decision_function, and score - of an estimator - """ - def __init__(self, estimator): - self.estimator = estimator - self.cache = {} +@contextmanager +def _cache_estimator(estimator): - def _call_func(*args, name=None, **kwargs): - try: - return self.cache[name] - except KeyError: - func = getattr(self.estimator, name) - result = func(*args, **kwargs) - self.cache[name] = result - return result + def _call_func(*args, name=None, func=None, cache=None, **kwargs): + try: + return cache[name] + except KeyError: + result = func(*args, **kwargs) + cache[name] = result + return result + + cache = {} + names = ['predict', 'predict_proba', 'decision_function', 'score'] + cache_funcs = {name: getattr(estimator, name) for name in names if + hasattr(estimator, name)} - func_names = ['predict', 'predict_proba', 'decision_function', 'score'] - for func_name in func_names: - # only add when estimator defines func_name - if hasattr(estimator, func_name): - func = partial(_call_func, name=func_name) - setattr(self, func_name, func) + # patch methods + for name, func in cache_funcs.items(): + setattr(estimator, name, + partial(_call_func, name=name, func=func, cache=cache)) + yield estimator + + # place methods back + for name, func in cache_funcs.items(): + setattr(estimator, name, func) def _multimetric_score(estimator, X_test, y_test, scorers): """Return a dict of score for multimetric scoring""" scores = {} - if all(isinstance(scorer, _BaseScorer) for scorer in scorers.values()): - estimator = _CacheEstimator(estimator) for name, scorer in scorers.items(): - if y_test is None: - score = scorer(estimator, X_test) - else: - score = scorer(estimator, X_test, y_test) + with _cache_estimator(estimator) as cached_estimator: + if y_test is None: + score = scorer(cached_estimator, X_test) + else: + score = scorer(cached_estimator, X_test, y_test) if hasattr(score, 'item'): try: diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 3636a87eb18fb..39d9ba55c976f 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -77,8 +77,7 @@ from sklearn.model_selection.tests.common import OneTimeSplitter from sklearn.model_selection import GridSearchCV -from sklearn.model_selection._validation import _CacheEstimator - +from sklearn.model_selection._validation import _cache_estimator try: WindowsError @@ -1722,21 +1721,19 @@ def two_params_scorer(estimator, X_test): _score, *fit_and_score_args) -@pytest.mark.parametrize("func_name", - ["predict", "predict_proba", - "decision_function", "score"]) +@pytest.mark.parametrize('func_name', + ['predict', 'predict_proba', + 'decision_function', 'score']) def test_cached_estimator(func_name): mock_est = Mock() mock_func = getattr(mock_est, func_name) mock_func.return_value = 42 - cached_est = _CacheEstimator(mock_est) - - # call func twice - func = getattr(cached_est, func_name) - assert func() == 42 - assert func() == 42 - assert cached_est.cache[func_name] == 42 + with _cache_estimator(mock_est) as cached_est: + # call func twice + func = getattr(cached_est, func_name) + assert func() == 42 + assert func() == 42 - # only called once - assert mock_func.call_count == 1 + # only called once + assert mock_func.call_count == 1 From f0471662bb5534f7c065a99768242a89de91c69e Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 6 Jul 2019 09:27:03 -0400 Subject: [PATCH 08/11] CLN Adds id of data --- sklearn/model_selection/_validation.py | 6 +++--- sklearn/model_selection/tests/test_validation.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 388e380b9e90b..ae4e71c5f50b4 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -602,12 +602,12 @@ def _score(estimator, X_test, y_test, scorer, is_multimetric=False): @contextmanager def _cache_estimator(estimator): - def _call_func(*args, name=None, func=None, cache=None, **kwargs): + def _call_func(X, *args, name=None, func=None, cache=None, **kwargs): try: - return cache[name] + return cache[name, id(X)] except KeyError: result = func(*args, **kwargs) - cache[name] = result + cache[name, id(X)] = result return result cache = {} diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 39d9ba55c976f..2c383f13a706d 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1728,12 +1728,13 @@ def test_cached_estimator(func_name): mock_est = Mock() mock_func = getattr(mock_est, func_name) mock_func.return_value = 42 + X = np.array([[1]]) with _cache_estimator(mock_est) as cached_est: # call func twice func = getattr(cached_est, func_name) - assert func() == 42 - assert func() == 42 + assert func(X) == 42 + assert func(X) == 42 # only called once assert mock_func.call_count == 1 From 2b761a3de5b2d3974a48360d7700d0731d5b225f Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 6 Jul 2019 12:02:07 -0400 Subject: [PATCH 09/11] BUG Pass X as well --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ae4e71c5f50b4..6c37df7077887 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -606,7 +606,7 @@ def _call_func(X, *args, name=None, func=None, cache=None, **kwargs): try: return cache[name, id(X)] except KeyError: - result = func(*args, **kwargs) + result = func(X, *args, **kwargs) cache[name, id(X)] = result return result From 9733db97ae6fbaf3fda8e10279cee0b79fd880b1 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 6 Jul 2019 20:34:52 -0400 Subject: [PATCH 10/11] BUG Adds setter to properties --- sklearn/ensemble/voting.py | 4 ++++ sklearn/linear_model/stochastic_gradient.py | 8 ++++++++ sklearn/neighbors/lof.py | 8 ++++++++ sklearn/neural_network/multilayer_perceptron.py | 8 ++++++++ sklearn/svm/base.py | 8 ++++++++ 5 files changed, 36 insertions(+) diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 0b01340d4f1af..a028d35d138e6 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -337,6 +337,10 @@ def predict_proba(self): """ return self._predict_proba + @predict_proba.setter + def predict_proba(self, func): + self._predict_proba = func + def transform(self, X): """Return class labels or probabilities for X for each estimator. diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index 625bdb5bdc3f9..9ac2b7f9241df 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -988,6 +988,10 @@ def predict_proba(self): self._check_proba() return self._predict_proba + @predict_proba.setter + def predict_proba(self, func): + self._predict_proba = func + def _predict_proba(self, X): check_is_fitted(self, "t_") @@ -1056,6 +1060,10 @@ def predict_log_proba(self): self._check_proba() return self._predict_log_proba + @predict_log_proba.setter + def predict_log_proba(self, func): + self._predict_log_proba = func + def _predict_log_proba(self, X): return np.log(self.predict_proba(X)) diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index a58997502be91..2fc40fdceafc4 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -296,6 +296,10 @@ def predict(self): return self._predict + @predict.setter + def predict(self, func): + self._predict = func + def _predict(self, X=None): """Predict the labels (1 inlier, -1 outlier) of X according to LOF. @@ -363,6 +367,10 @@ def decision_function(self): return self._decision_function + @decision_function.setter + def decision_function(self, func): + self._decision_function = func + def _decision_function(self, X): """Shifted opposite of the Local Outlier Factor of X. diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index e5325ecda69f0..c8b2fc5f8e489 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -664,6 +664,10 @@ def partial_fit(self): % self.solver) return self._partial_fit + @partial_fit.setter + def partial_fit(self, func): + self._partial_fit = func + def _partial_fit(self, X, y): return self._fit(X, y, incremental=True) @@ -1040,6 +1044,10 @@ def partial_fit(self): % self.solver) return self._partial_fit + @partial_fit.setter + def partial_fit(self, func): + self._partial_fit = func + def _partial_fit(self, X, y, classes=None): if _check_partial_fit_first_call(self, classes): self._label_binarizer = LabelBinarizer() diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 4a50ee479f030..aaeda264821ed 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -614,6 +614,10 @@ def predict_proba(self): self._check_proba() return self._predict_proba + @predict_proba.setter + def predict_proba(self, func): + self._predict_proba = func + def _predict_proba(self, X): X = self._validate_for_predict(X) if self.probA_.size == 0 or self.probB_.size == 0: @@ -653,6 +657,10 @@ def predict_log_proba(self): self._check_proba() return self._predict_log_proba + @predict_proba.setter + def predict_log_proba(self, func): + self._predict_log_proba = func + def _predict_log_proba(self, X): return np.log(self.predict_proba(X)) From 900baf29f76ef4243c8b59718d1db92522172f98 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sun, 7 Jul 2019 11:08:33 -0400 Subject: [PATCH 11/11] BUG Fixes property --- sklearn/svm/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index aaeda264821ed..428c6d96b5787 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -657,7 +657,7 @@ def predict_log_proba(self): self._check_proba() return self._predict_log_proba - @predict_proba.setter + @predict_log_proba.setter def predict_log_proba(self, func): self._predict_log_proba = func