From b908947c01a3c1e08767dc8fc4f8dd9fc9b86194 Mon Sep 17 00:00:00 2001 From: Ekaterina Krivich Date: Mon, 3 Oct 2016 11:36:26 +0300 Subject: [PATCH 1/3] Add test for __dict__ for estimator checks check that "predict", "transform", "decision_function" or "predict_proba" methods do not change the state of __dict__ of any estimator see #7297 --- sklearn/decomposition/nmf.py | 1 - sklearn/utils/estimator_checks.py | 44 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 43a368050e10c..2cb1863803099 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1106,7 +1106,6 @@ def transform(self, X): nls_max_iter=self.nls_max_iter, sparseness=self.sparseness, beta=self.beta, eta=self.eta) - self.n_iter_ = n_iter_ return W def inverse_transform(self, W): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6c58cea6032c9..6291e54c4676c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -30,6 +30,7 @@ from sklearn.utils.testing import SkipTest from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns +from sklearn.utils.testing import assert_dict_equal from sklearn.base import (clone, ClassifierMixin, RegressorMixin, @@ -230,6 +231,7 @@ def _yield_all_checks(name, Estimator): yield check_fit1d_1feature yield check_fit1d_1sample yield check_get_params_invariance + yield check_dict_unchanged def check_estimator(Estimator): @@ -409,6 +411,48 @@ def check_dtype_object(name, Estimator): @ignore_warnings +def check_dict_unchanged(name, Estimator): + # this estimator raises + # ValueError: Found array with 0 feature(s) (shape=(23, 0)) + # while a minimum of 1 is required. + # error + if name in ['SpectralCoclustering']: + return + rnd = np.random.RandomState(0) + if name in ['RANSACRegressor']: + X = 3 * rnd.uniform(size=(20, 3)) + else: + X = 2 * rnd.uniform(size=(20, 3)) + + y = X[:, 0].astype(np.int) + y = multioutput_estimator_convert_y_2d(name, y) + estimator = Estimator() + set_testing_parameters(estimator) + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = 1 + + if hasattr(estimator, "n_best"): + estimator.n_best = 1 + + set_random_state(estimator, 1) + + # should be just `estimator.fit(X, y)` + # after merging #6141 + if name in ['SpectralBiclustering']: + estimator.fit(X) + else: + estimator.fit(X, y) + for method in ["predict", "transform", "decision_function", + "predict_proba"]: + if hasattr(estimator, method): + dict_before = estimator.__dict__.copy() + getattr(estimator, method)(X) + assert_dict_equal(estimator.__dict__, dict_before) + + def check_fit2d_predict1d(name, Estimator): # check by fitting a 2d array and prediting with a 1d array rnd = np.random.RandomState(0) From 018dbea672e632b4d524b6a4997fbf91bc4a8252 Mon Sep 17 00:00:00 2001 From: Ekaterina Krivich Date: Fri, 28 Oct 2016 11:20:34 +0300 Subject: [PATCH 2/3] Add a test to test_check_estimator that shows that check_estimator fails on an estimator that violates this --- sklearn/utils/estimator_checks.py | 5 ++++- sklearn/utils/tests/test_estimator_checks.py | 21 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6291e54c4676c..c7d84836bd02b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -450,7 +450,10 @@ def check_dict_unchanged(name, Estimator): if hasattr(estimator, method): dict_before = estimator.__dict__.copy() getattr(estimator, method)(X) - assert_dict_equal(estimator.__dict__, dict_before) + assert_dict_equal(estimator.__dict__, dict_before, + ('Estimator changes __dict__ during' + 'predict, transform, decision_function' + ' or predict_proba methods')) def check_fit2d_predict1d(name, Estimator): diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index be4a3c4f5b5d6..100b583385204 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -28,6 +28,20 @@ def predict(self, X): return np.ones(X.shape[0]) +class ChangesDict(BaseEstimator): + def __init__(self): + self.key = 0 + + def fit(self, X, y=None): + X, y = check_X_y(X, y) + return self + + def predict(self, X): + X = check_array(X) + self.key = 1000 + return np.ones(X.shape[0]) + + class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): X, y = check_X_y(X, y) @@ -75,6 +89,13 @@ def test_check_estimator(): # check that predict does input validation (doesn't accept dicts in input) msg = "Estimator doesn't check for NaN and inf in predict" assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict) + # check that estimator state does not change + # at transform/predict/predict_proba time + msg = ('Estimator changes __dict__ during' + 'predict, transform, decision_function' + ' or predict_proba methods') + assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict) + # check for sparse matrix input handling name = NoSparseClassifier.__name__ msg = "Estimator " + name + " doesn't seem to fail gracefully on sparse data" From 64adf7336f1e21102e5c0b2c0abfef7c57809c01 Mon Sep 17 00:00:00 2001 From: Ekaterina Krivich Date: Tue, 1 Nov 2016 11:41:46 +0300 Subject: [PATCH 3/3] Fixed bug where NMF's n_iter_ attribute was set by calls to transform --- doc/whats_new.rst | 9 ++++++++- sklearn/decomposition/nmf.py | 21 -------------------- sklearn/utils/estimator_checks.py | 4 +--- sklearn/utils/tests/test_estimator_checks.py | 4 +--- 4 files changed, 10 insertions(+), 28 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index f02169a56b004..bdd81ff104996 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -65,10 +65,14 @@ Enhancements - Added ``sample_weight`` parameter to :meth:`pipeline.Pipeline.score`. :issue:`7723` by `Mikhail Korobov`_. + - ``check_estimator`` now attempts to ensure that methods transform, predict, etc. + do not set attributes on the estimator. + :issue:`7533` by `Ekaterina Krivich`_. + Bug fixes ......... - - Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not + - Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not exactly implement Benjamini-Hochberg procedure. It formerly may have selected fewer features than it should. :issue:`7490` by `Peng Meng`_. @@ -88,6 +92,9 @@ Bug fixes - Tree splitting criterion classes' cloning/pickling is now memory safe :issue:`7680` by `Ibraim Ganiev`_. + - Fixed a bug where :class:`decomposition.NMF` sets its ``n_iters_`` + attribute in `transform()`. :issue:`7553` by `Ekaterina Krivich`_. + .. _changes_0_18_1: Version 0.18.1 diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 2cb1863803099..29707ac94cf70 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1016,14 +1016,6 @@ def fit_transform(self, X, y=None, W=None, H=None): H : array-like, shape (n_components, n_features) If init='custom', it is used as initial guess for the solution. - Attributes - ---------- - components_ : array-like, shape (n_components, n_features) - Factorization matrix, sometimes called 'dictionary'. - - n_iter_ : int - Actual number of iterations for the transform. - Returns ------- W: array, shape (n_samples, n_components) @@ -1061,14 +1053,6 @@ def fit(self, X, y=None, **params): X: {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed - Attributes - ---------- - components_ : array-like, shape (n_components, n_features) - Factorization matrix, sometimes called 'dictionary'. - - n_iter_ : int - Actual number of iterations for the transform. - Returns ------- self @@ -1084,11 +1068,6 @@ def transform(self, X): X: {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be transformed by the model - Attributes - ---------- - n_iter_ : int - Actual number of iterations for the transform. - Returns ------- W: array, shape (n_samples, n_components) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c7d84836bd02b..5c1b9eca5f221 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -451,9 +451,7 @@ def check_dict_unchanged(name, Estimator): dict_before = estimator.__dict__.copy() getattr(estimator, method)(X) assert_dict_equal(estimator.__dict__, dict_before, - ('Estimator changes __dict__ during' - 'predict, transform, decision_function' - ' or predict_proba methods')) + 'Estimator changes __dict__ during %s' % method) def check_fit2d_predict1d(name, Estimator): diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 100b583385204..de7e1e2f2ac1b 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -91,9 +91,7 @@ def test_check_estimator(): assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict) # check that estimator state does not change # at transform/predict/predict_proba time - msg = ('Estimator changes __dict__ during' - 'predict, transform, decision_function' - ' or predict_proba methods') + msg = 'Estimator changes __dict__ during predict' assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict) # check for sparse matrix input handling