diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index aaf174906f960..02e4850cfc4d0 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -30,7 +30,8 @@
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.testing import assert_warns
-
+from sklearn.utils.testing import assert_same_model
+from sklearn.utils.testing import assert_not_same_model
 from sklearn.base import (clone, ClassifierMixin, RegressorMixin,
                           TransformerMixin, ClusterMixin, BaseEstimator)

@@ -76,6 +77,7 @@


 def _yield_non_meta_checks(name, Estimator):
+    """
     yield check_estimators_dtypes
     yield check_fit_score_takes_y
     yield check_dtype_object
@@ -107,6 +109,9 @@ def _yield_non_meta_checks(name, Estimator):
     # Test that estimators can be pickled, and once pickled
     # give the same answer as before.
     yield check_estimators_pickle
+    """
+    if name not in ('SpectralEmbedding',):
+        yield check_estimator_fit_reset


 def _yield_classifier_checks(name, Classifier):
@@ -199,6 +204,7 @@ def _yield_clustering_checks(name, Clusterer):
 def _yield_all_checks(name, Estimator):
     for check in _yield_non_meta_checks(name, Estimator):
         yield check
+    """
     if issubclass(Estimator, ClassifierMixin):
         for check in _yield_classifier_checks(name, Estimator):
             yield check
@@ -217,6 +223,7 @@ def _yield_all_checks(name, Estimator):
     yield check_fit2d_1feature
     yield check_fit1d_1feature
     yield check_fit1d_1sample
+    """


 def check_estimator(Estimator):
@@ -1553,3 +1560,45 @@ def check_classifiers_regression_target(name, Estimator):
     e = Estimator()
     msg = 'Unknown label type: '
     assert_raises_regex(ValueError, msg, e.fit, X, y)
+
+
+@ignore_warnings
+def check_estimator_fit_reset(name, Estimator):
+    # Check that a second call to fit() fully resets the model, i.e. that
+    # refitting an already-fitted estimator on new data yields the same
+    # model as fitting a freshly constructed estimator on that data.
+    X1, y1 = make_blobs(n_samples=50, n_features=2, center_box=(-200, -150),
+                        centers=2, random_state=0)
+    X2, y2 = make_blobs(n_samples=50, n_features=2, center_box=(200, 150),
+                        centers=2, random_state=1)
+    X3, y3 = make_blobs(n_samples=50, n_features=2, center_box=(-200, 150),
+                        centers=3, random_state=2)
+    X4, y4 = make_blobs(n_samples=50, n_features=5, center_box=(-200, -150),
+                        centers=2, random_state=0)
+    X5, y5 = make_blobs(n_samples=50, n_features=5, center_box=(200, 150),
+                        centers=2, random_state=1)
+    X6, y6 = make_blobs(n_samples=50, n_features=5, center_box=(-200, 150),
+                        centers=3, random_state=2)
+
+    # Some estimators work only on non-negative inputs
+    if name in ('AdditiveChi2Sampler', 'SkewedChi2Sampler', 'NMF',
+                'MultinomialNB', 'ProjectedGradientNMF',):
+        X1, X2, X3, X4, X5, X6 = map(lambda X: X - X.min(),
+                                     (X1, X2, X3, X4, X5, X6))
+
+    y1, y2, y3, y4, y5, y6 = map(multioutput_estimator_convert_y_2d,
+                                 (name,) * 6, (y1, y2, y3, y4, y5, y6))
+    estimator_1 = Estimator()
+    estimator_2 = Estimator()
+
+    set_testing_parameters(estimator_1)
+    set_testing_parameters(estimator_2)
+
+    set_random_state(estimator_1)
+    set_random_state(estimator_2)
+
+    assert_not_same_model(X3, estimator_1.fit(X1, y1), estimator_2.fit(X2, y2))
+    assert_same_model(X3, estimator_1.fit(X2, y2), estimator_2)
+    assert_same_model(X2, estimator_1.fit(X1, y1), estimator_2.fit(X1, y1))
+
+    # Repeat the same checks on data with 5 features
+    assert_not_same_model(X6, estimator_1.fit(X4, y4), estimator_2.fit(X5, y5))
+    assert_same_model(X6, estimator_1.fit(X5, y5), estimator_2)
+    assert_same_model(X5, estimator_1.fit(X4, y4), estimator_2.fit(X4, y4))
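For reference, a minimal sketch of how the new check could be exercised on a
single estimator outside the common-test machinery (illustrative only, not
part of the patch, and assuming the patch is applied; LogisticRegression is
an arbitrary example):

    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.estimator_checks import check_estimator_fit_reset

    # Fits the estimator on several disjoint datasets (first with 2, then
    # with 5 features) and asserts that each refit ignores any previously
    # seen data.
    check_estimator_fit_reset('LogisticRegression', LogisticRegression)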
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 1035c4e7b9a2b..c67df100292da 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -19,8 +19,11 @@
 import platform
 import struct

-import scipy as sp
+import scipy
 import scipy.io
+import scipy.sparse as sp
+import numpy as np
+
 from functools import wraps
 from operator import itemgetter
 try:
@@ -71,10 +74,11 @@
 __all__ = ["assert_equal", "assert_not_equal", "assert_raises",
            "assert_raises_regexp", "raises", "with_setup", "assert_true",
            "assert_false", "assert_almost_equal", "assert_array_equal",
-           "assert_array_almost_equal", "assert_array_less",
-           "assert_less", "assert_less_equal",
-           "assert_greater", "assert_greater_equal",
-           "assert_approx_equal"]
+           "assert_allclose", "assert_array_almost_equal",
+           "assert_array_less", "assert_less", "assert_less_equal",
+           "assert_greater", "assert_greater_equal", "assert_same_model",
+           "assert_not_same_model", "assert_fitted_attributes_almost_equal",
+           "assert_approx_equal", "assert_safe_sparse_allclose"]


 try:
@@ -383,20 +387,83 @@ def __exit__(self, *exc_info):
 assert_greater = _assert_greater


+def assert_safe_sparse_allclose(val1, val2, rtol=1e-7, atol=0, msg=None):
+    """Check that two objects are equal up to the given tolerance.
+
+    The objects can be scalars, strings, lists, tuples, dicts, ndarrays or
+    sparse matrices, nested arbitrarily. Sparse matrices may be compared
+    across formats, but all other types must match exactly.
+    """
+    if msg is None:
+        msg = ("The objects val1,\n%s\nand val2,\n%s\nare not all close"
+               % (val1, val2))
+
+    if isinstance(val1, str) and isinstance(val2, str):
+        assert_true(val1 == val2, msg=msg)
+
+    elif np.isscalar(val1) and np.isscalar(val2):
+        assert_allclose(val1, val2, rtol=rtol, atol=atol, err_msg=msg)
+
+    # To allow mixed formats for sparse matrices alone
+    elif type(val1) is not type(val2) and not (
+            sp.issparse(val1) and sp.issparse(val2)):
+        assert False, msg
+
+    elif not isinstance(val1, (list, tuple, np.ndarray, sp.spmatrix, dict)):
+        raise ValueError("The objects\n%s\nand\n%s\nare neither scalar "
+                         "nor array-like." % (val1, val2))
+
+    # list/tuple/dict (of list/tuple/dict...) of ndarrays/spmatrices/scalars
+    elif isinstance(val1, (tuple, list, dict)):
+        if isinstance(val1, dict):
+            # Compare the items in a deterministic, key-sorted order
+            val1 = tuple(sorted(val1.items()))
+            val2 = tuple(sorted(val2.items()))
+        if len(val1) == 0 and len(val2) == 0:
+            pass  # both empty, hence trivially equal
+        elif len(val1) != len(val2):
+            assert False, msg
+        # nested lists/tuples - [array([5, 6]), array([5, ])] and [[1, 3], ]
+        # Or ['str',] and ['str',]
+        elif isinstance(val1[0], (tuple, list, np.ndarray, sp.spmatrix, str)):
+            # Compare them recursively
+            for i, val1_i in enumerate(val1):
+                assert_safe_sparse_allclose(val1_i, val2[i],
+                                            rtol=rtol, atol=atol, msg=msg)
+        # Compare the lists using np.allclose, if they are neither nested
+        # nor contain strings
+        else:
+            assert_allclose(val1, val2, rtol=rtol, atol=atol, err_msg=msg)
+
+    # scipy sparse matrix
+    elif sp.issparse(val1) or sp.issparse(val2):
+        # NOTE: as with np.allclose, this check is asymmetric in val1 and
+        # val2 because the relative tolerance is scaled by abs(val2)
+        if val1.shape != val2.shape:
+            assert False, msg
+
+        diff = abs(val1 - val2) - rtol * abs(val2)
+        assert (diff > atol).nnz == 0, msg
+
+    # numpy ndarray
+    elif isinstance(val1, np.ndarray):
+        if val1.shape != val2.shape:
+            assert False, msg
+        assert_allclose(val1, val2, rtol=rtol, atol=atol, err_msg=msg)
+    else:
+        assert False, msg
+
+
 def _assert_allclose(actual, desired, rtol=1e-7, atol=0,
                      err_msg='', verbose=True):
     actual, desired = np.asanyarray(actual), np.asanyarray(desired)
     if np.allclose(actual, desired, rtol=rtol, atol=atol):
         return
-    msg = ('Array not equal to tolerance rtol=%g, atol=%g: '
-           'actual %s, desired %s') % (rtol, atol, actual, desired)
-    raise AssertionError(msg)
+    if err_msg == '':
+        err_msg = ('Array not equal to tolerance rtol=%g, atol=%g: '
+                   'actual %s, desired %s') % (rtol, atol, actual, desired)
+    raise AssertionError(err_msg)


 if hasattr(np.testing, 'assert_allclose'):
     assert_allclose = np.testing.assert_allclose
 else:
     assert_allclose = _assert_allclose


 def assert_raise_message(exceptions, message, function, *args, **kwargs):
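As an illustrative usage sketch for the helper above (not part of the patch,
and assuming it is applied): sparse inputs may mix formats, containers are
compared recursively, and closeness follows the same asymmetric criterion as
np.allclose, abs(val1 - val2) <= atol + rtol * abs(val2):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.testing import assert_safe_sparse_allclose

    dense = np.array([[1.0, 0.0], [0.0, 2.0]])
    # Mixed sparse formats (COO vs CSR) holding the same values are close.
    assert_safe_sparse_allclose(sp.coo_matrix(dense), sp.csr_matrix(dense))

    # Containers are recursed into; 1.05 vs 1.0 passes only because
    # rtol=0.1 tolerates a 10% relative difference.
    assert_safe_sparse_allclose({'coef': [np.array([1.05]), 'tag']},
                                {'coef': [np.array([1.0]), 'tag']}, rtol=0.1)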
@@ -433,6 +500,162 @@ def assert_raise_message(exceptions, message, function, *args, **kwargs):
+def _assert_same_model_method(method, X, estimator1, estimator2, msg=None):
+    method_err = '%r\n\nhas %s, but\n\n%r\n\ndoes not.'
+    # If the method is present in only one of the models, consider the
+    # models different
+    if hasattr(estimator1, method) and not hasattr(estimator2, method):
+        raise AssertionError(method_err % (estimator1, method, estimator2))
+    if hasattr(estimator2, method) and not hasattr(estimator1, method):
+        raise AssertionError(method_err % (estimator2, method, estimator1))
+
+    if not hasattr(estimator1, method):
+        return
+
+    # Check that method(X) returns the same result for both models.
+    res1 = getattr(estimator1, method)(X)
+    res2 = getattr(estimator2, method)(X)
+    if msg is None:
+        msg = ("Models are not equal.\n\nThe %s method returned different "
+               "results:\n\n%s\n\nfor:\n\n%s\n\nand:\n\n%s\n\nfor:\n\n%s."
+               % (method, res1, estimator1, res2, estimator2))
+    assert_safe_sparse_allclose(res1, res2, msg=msg)
+
+
+def assert_same_model(X, estimator1, estimator2, msg=None):
+    """Helper function to check that two fitted models are identical.
+
+    The check is done by comparing the outputs of the methods ``predict``,
+    ``transform``, ``decision_function`` and ``predict_proba``, provided
+    they exist in both models. If any of those methods exists in only one
+    of the models, the models are considered different.
+
+    If the outputs of all the available methods listed above are close for
+    both models, the attributes of the models that end with ``_`` are also
+    compared, to ascertain the similarity of the models.
+
+    If the models are different, an AssertionError with the given error
+    message is raised.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Input data, for the fitted models, used for comparing them.
+
+    estimator1 : An estimator object.
+        The first fitted model to be compared.
+
+    estimator2 : An estimator object.
+        The second fitted model to be compared.
+
+    msg : str
+        The error message to be used while raising the AssertionError if
+        the models are different.
+
+    Notes
+    -----
+    This check is not exhaustive since all attributes of the model are
+    assumed to end with ``_``. If that is not the case, it could lead to
+    false positives.
+    """
+    _assert_same_model_method('predict', X, estimator1, estimator2, msg)
+    _assert_same_model_method('transform', X, estimator1, estimator2, msg)
+    _assert_same_model_method('decision_function',
+                              X, estimator1, estimator2, msg)
+    _assert_same_model_method('predict_proba', X, estimator1, estimator2, msg)
+    assert_fitted_attributes_almost_equal(estimator1, estimator2)
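A usage sketch for this pair of helpers (illustrative only, not part of the
patch; KMeans and the data are arbitrary choices that expose both predictions
and fitted attributes such as cluster_centers_ and labels_):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.utils.testing import assert_same_model

    X = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0], [10.1, 9.9]])

    # Two KMeans models fitted with the same seed on the same data must
    # agree on predict(X), transform(X) and all attributes ending in '_'.
    km1 = KMeans(n_clusters=2, random_state=0).fit(X)
    km2 = KMeans(n_clusters=2, random_state=0).fit(X)
    assert_same_model(X, km1, km2)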
+
+
+def assert_not_same_model(X, estimator1, estimator2, msg=None):
+    """Helper function to check that two fitted models are different.
+
+    The check is done by comparing the outputs of the methods ``predict``,
+    ``transform``, ``decision_function`` and ``predict_proba``, provided
+    they exist in both models. If any of those methods exists in only one
+    of the models, the models are considered different.
+
+    If the outputs of all the available methods listed above are close for
+    both models, the attributes of the models that end with ``_`` are also
+    compared, to ascertain the similarity of the models.
+
+    If the models are similar, an AssertionError with the given error
+    message is raised.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Input data, for the fitted models, used for comparing them.
+
+    estimator1 : An estimator object.
+        The first fitted model to be compared.
+
+    estimator2 : An estimator object.
+        The second fitted model to be compared.
+
+    msg : str
+        The error message to be used while raising the AssertionError if
+        the models are similar.
+
+    Notes
+    -----
+    This check is not exhaustive since all attributes of the model are
+    assumed to end with ``_``. If that is not the case, it could lead to
+    false negatives.
+    """
+    try:
+        assert_same_model(X, estimator1, estimator2)
+    except AssertionError:
+        return
+    raise AssertionError(msg)
+
+
+def assert_fitted_attributes_almost_equal(estimator1, estimator2, msg=None):
+    """Helper function to check that the fitted models' attributes are close.
+
+    The check is done by comparing the attributes from both models that
+    end in ``_``.
+
+    If the fitted models' attributes are different, an AssertionError with
+    the given error message is raised.
+
+    Parameters
+    ----------
+    estimator1 : An estimator object.
+        The first fitted model whose attributes are to be compared.
+
+    estimator2 : An estimator object.
+        The second fitted model whose attributes are to be compared.
+
+    msg : str
+        The error message to be used while raising the AssertionError if
+        the fitted models' attributes are different.
+
+    Notes
+    -----
+    This check is not exhaustive since all attributes of the model are
+    assumed to end with ``_``. If that is not the case, it could lead to
+    false positives.
+    """
+    est1_dict, est2_dict = estimator1.__dict__, estimator2.__dict__
+    assert_array_equal(sorted(est1_dict.keys()), sorted(est2_dict.keys()),
+                       "The attributes of both the estimators do not match.")
+
+    non_attributes = ("estimators_", "estimator_", "tree_", "base_estimator_",
+                      "random_state_", "root_", "label_binarizer_", "loss_")
+    non_attr_suffixes = ("leaf_",)
+
+    for attr in est1_dict:
+        val1, val2 = est1_dict[attr], est2_dict[attr]
+
+        # Consider keys that end in ``_`` only as attributes.
+        if (attr.endswith('_') and attr not in non_attributes and
+                not attr.endswith(non_attr_suffixes)):
+            attr_msg = msg
+            if attr_msg is None:
+                attr_msg = ("Attributes do not match.\nThe attribute %s in "
+                            "estimator1,\n\n%r\n\nis %r and in estimator2,"
+                            "\n\n%r\n\nis %r.\n" % (attr, estimator1, val1,
+                                                    estimator2, val2))
+            assert_safe_sparse_allclose(val1, val2, msg=attr_msg)
+
+
 def fake_mldata(columns_dict, dataname, matfile, ordering=None):
     """Create a fake mldata data set.
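A small illustrative sketch of the attribute-level check on its own (not part
of the patch; LinearSVC and the toy data are arbitrary):

    import numpy as np
    from sklearn.svm import LinearSVC
    from sklearn.utils.testing import (assert_fitted_attributes_almost_equal,
                                       assert_raises)

    X = np.array([[-2.0, -1.0], [-1.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
    y = np.array([0, 0, 1, 1])

    # Same data and seed: coef_, intercept_ and classes_ all match.
    clf1 = LinearSVC(random_state=0).fit(X, y)
    clf2 = LinearSVC(random_state=0).fit(X, y)
    assert_fitted_attributes_almost_equal(clf1, clf2)

    # Rescaled training data: the fitted attributes no longer match.
    clf3 = LinearSVC(random_state=0).fit(X * 3.0, y)
    assert_raises(AssertionError, assert_fitted_attributes_almost_equal,
                  clf1, clf3)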
@@ -465,7 +688,7 @@ def fake_mldata(columns_dict, dataname, matfile, ordering=None):
     ordering = sorted(list(datasets.keys()))
     # NOTE: setting up this array is tricky, because of the way Matlab
     # re-packages 1D arrays
-    datasets['mldata_descr_ordering'] = sp.empty((1, len(ordering)),
+    datasets['mldata_descr_ordering'] = np.empty((1, len(ordering)),
                                                  dtype='object')
     for i, name in enumerate(ordering):
         datasets['mldata_descr_ordering'][0, i] = name
diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py
index ea76333a6eafc..00ba84d942b06 100644
--- a/sklearn/utils/tests/test_testing.py
+++ b/sklearn/utils/tests/test_testing.py
@@ -1,9 +1,11 @@
 import warnings
 import unittest
 import sys
+import numpy as np
+from scipy import sparse as sp
+from numpy.testing import assert_allclose

 from nose.tools import assert_raises
-
 from sklearn.utils.testing import (
     _assert_less,
     _assert_greater,
@@ -14,10 +16,16 @@
     assert_equal,
     set_random_state,
     assert_raise_message,
-    ignore_warnings)
-
+    ignore_warnings,
+    assert_safe_sparse_allclose,
+    assert_same_model,
+    assert_not_same_model)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.datasets import make_blobs
+from sklearn.svm import LinearSVC
+from sklearn.cluster import KMeans

 try:
     from nose.tools import assert_less
@@ -189,10 +197,114 @@ def context_manager_no_user_multiple_warning():
     assert_warns(DeprecationWarning,
                  context_manager_no_user_multiple_warning)


+def test_assert_safe_sparse_allclose():
+    # Test scalars
+    x = 1e-3
+    y = 1e-9
+    assert_safe_sparse_allclose(x, y, atol=1)
+    assert_raises(AssertionError, assert_safe_sparse_allclose, x, y)
+
+    # Test sparse matrices (mixed formats are allowed)
+    a = sp.coo_matrix(np.array([x, y, x, y]))
+    b = sp.csr_matrix(np.array([x, y, x, x]))
+    assert_safe_sparse_allclose(a, b, atol=1)
+    assert_raises(AssertionError, assert_safe_sparse_allclose, a, b)
+
+    b[0, 3] = y * (1 + 1e-8)
+    assert_safe_sparse_allclose(a, b)
+    assert_raises(AssertionError, assert_safe_sparse_allclose, a, b,
+                  rtol=1e-9)
+
+    assert_safe_sparse_allclose([np.array([(6, 6)])], [np.array([(10, 10)])],
+                                rtol=0.5)
+    assert_raises(AssertionError, assert_safe_sparse_allclose,
+                  [np.array([(6, 6)])], [np.array([(10, 10)])])
+
+    # Test nested lists of scalars
+    assert_safe_sparse_allclose([(['a', 'bcd'], ['a'])],
+                                [(['a', 'bcd'], ['a'])])
+    assert_raises(AssertionError, assert_safe_sparse_allclose,
+                  [(['a', 'bcd'], ['a'])], [(['a', 'bcd'], ['a', 'a'])])
+    assert_raises(AssertionError, assert_safe_sparse_allclose,
+                  [(['a', 'bcd'], ['a'])], [(['a', 'bcd'], ['b'])])
+
+    # Test dicts
+    assert_safe_sparse_allclose({}, {})
+    assert_safe_sparse_allclose({'a': 'a'}, {'a': 'a'})
+    dict_1 = {'a': {'b': {'arr': np.array([1, 2, 3]), 'str': 'str',
+                          'int': 9}}}
+    dict_2 = {'a': {'b': {'arr': np.array([1, 2, 3]), 'str': 'str',
+                          'int': 9}}}
+    assert_safe_sparse_allclose(dict_1, dict_2)
+    dict_1['a']['b']['arr'] = np.array([2, 2, 3])
+    assert_safe_sparse_allclose(dict_1, dict_2, atol=1)
+    assert_raises(AssertionError, assert_safe_sparse_allclose, dict_1, dict_2)
+
+    # Test nested lists of dicts of spmatrices and ndarrays
+    dict_1['a']['b']['arr1'] = [a, np.array([3, 4.])]
+    assert_raises(AssertionError, assert_safe_sparse_allclose, dict_1, dict_2,
+                  atol=1)
+    dict_2['a']['b']['arr1'] = [b, np.array([3, 4.])]
+    assert_safe_sparse_allclose(dict_1, dict_2, atol=1)
+    assert_raises(AssertionError, assert_safe_sparse_allclose, dict_1, dict_2)
+
+    # Test string comparison
+    assert_safe_sparse_allclose('a', 'a')
+    assert_safe_sparse_allclose('abcdl', 'abcdl')
+    assert_raises(AssertionError, assert_safe_sparse_allclose, 'a', 'b')
+    assert_raises(AssertionError, assert_safe_sparse_allclose, 'aa', 'b')
+
+    # Test numeric comparisons
+    assert_safe_sparse_allclose(6, np.float64(6))
+    assert_safe_sparse_allclose(6, 6.0)
+    assert_safe_sparse_allclose(7, 7.0)
+    assert_safe_sparse_allclose(5, np.int32(5))
+
+
+def test_assert_same_not_same_model():
+    X1, y1 = make_blobs(n_samples=200, n_features=5, center_box=(-200, -150),
+                        centers=2, random_state=0)
+    X2, y2 = make_blobs(n_samples=100, n_features=5, center_box=(-1, 1),
+                        centers=3, random_state=1)
+    X3, y3 = make_blobs(n_samples=50, n_features=5, center_box=(-100, -50),
+                        centers=4, random_state=2)
+
+    # Check both a non-transductive and a transductive algorithm; the
+    # transductive one also exercises the
+    # assert_fitted_attributes_almost_equal helper.
+    for Estimator in (LinearSVC, KMeans):
+        assert_same_model(X3, Estimator(random_state=0).fit(X1, y1),
+                          Estimator(random_state=0).fit(X1, y1))
+        assert_raises(AssertionError, assert_not_same_model, X3,
+                      Estimator(random_state=0).fit(X1, y1),
+                      Estimator(random_state=0).fit(X1, y1))
+        assert_raises(AssertionError, assert_same_model, X3,
+                      Estimator(random_state=0).fit(X1, y1),
+                      Estimator(random_state=0).fit(X2, y2))
+        assert_not_same_model(X3, Estimator(random_state=0).fit(X1, y1),
+                              Estimator(random_state=0).fit(X2, y2))
+
+
+def test_qda_same_model():
+    # Non-regression test to make sure the rotations_ attribute (a list of
+    # per-class arrays) is correctly compared
+    X = np.array([[0, 0], [-2, -2], [-2, -1], [-1, -1], [-1, -2],
+                  [1, 3], [1, 2], [2, 1], [2, 2]])
+    y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2])
+    X1 = np.array([[-3, -1], [-2, 0], [-1, 0], [-11, 0], [0, 0], [1, 0],
+                   [1, 5], [2, 0], [3, 4]])
+    y1 = np.array([1, 1, 1, 1, 2, 2, 2, 2, 2])
+    X2 = np.array([[-1, -3], [0, -2], [0, -1], [0, -5], [0, 0], [10, 1],
+                   [0, 11], [0, 22], [0, 33]])
+
+    clf1 = QuadraticDiscriminantAnalysis().fit(X, y)
+    clf2 = QuadraticDiscriminantAnalysis().fit(X, y)
+    assert_same_model(X1, clf1, clf2)
+
+    clf3 = QuadraticDiscriminantAnalysis().fit(X1, y1)
+    assert_not_same_model(X2, clf1, clf3)
+
+
 # This class is inspired from numpy 1.7 with an alteration to check
 # the reset warning filters after calls to assert_warns.
 # This assert_warns behavior is specific to scikit-learn because
-#`clean_warning_registry()` is called internally by assert_warns
+# `clean_warning_registry()` is called internally by assert_warns
 # and clears all previous filters.
 class TestWarns(unittest.TestCase):
     def test_warn(self):