From cd63accc05cd841d20d82cf55f4b63b2fa3b90df Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Fri, 6 Feb 2015 13:26:42 +0100
Subject: [PATCH] Add test that score takes y, fix KMeans, FIX pipeline
 compatibility of clustering algorithms!

---
 doc/developers/index.rst                 | 12 ++--
 doc/whats_new.rst                        |  3 +
 sklearn/cluster/affinity_propagation_.py |  2 +-
 sklearn/cluster/dbscan_.py               |  4 +-
 sklearn/cluster/hierarchical.py          |  2 +-
 sklearn/cluster/k_means_.py              |  4 +-
 sklearn/cluster/mean_shift_.py           |  2 +-
 sklearn/cluster/spectral.py              |  2 +-
 sklearn/decomposition/dict_learning.py   | 11 ++--
 sklearn/decomposition/incremental_pca.py |  2 +-
 sklearn/neural_network/rbm.py            |  4 +-
 sklearn/tests/test_common.py             |  5 ++
 sklearn/utils/estimator_checks.py        | 82 ++++++++++++++++--------
 13 files changed, 86 insertions(+), 49 deletions(-)

diff --git a/doc/developers/index.rst b/doc/developers/index.rst
index 09f06f749e855..134d41d0136ff 100644
--- a/doc/developers/index.rst
+++ b/doc/developers/index.rst
@@ -716,8 +716,11 @@ is not met, an exception of type ``ValueError`` should be raised.
 ``y`` might be ignored in the case of unsupervised learning. However, to
 make it possible to use the estimator as part of a pipeline that can
 mix both supervised and unsupervised transformers, even unsupervised
-estimators are kindly asked to accept a ``y=None`` keyword argument in
+estimators need to accept a ``y=None`` keyword argument in
 the second position that is just ignored by the estimator.
+For the same reason, ``fit_predict``, ``fit_transform``, ``score``
+and ``partial_fit`` methods need to accept a ``y`` argument in
+the second position if they are implemented.
 The method should return the object (``self``). This pattern is useful
 to be able to implement quick one liners in an IPython session such as::

@@ -857,9 +860,10 @@ last step, it needs to provide a ``fit`` or ``fit_transform`` function.
 To be able to evaluate the pipeline on any data but the training set, it
 also needs to provide a ``transform`` function.
 There are no special requirements for the last step in a pipeline, except that
-it has a ``fit`` function. All ``fit`` and ``fit_transform`` functions must
-take arguments ``X, y``, even if y is not used.
-
+it has a ``fit`` function. All ``fit`` and ``fit_transform`` functions must
+take arguments ``X, y``, even if ``y`` is not used. Similarly, for ``score`` to be
+usable, the last step of the pipeline needs to have a ``score`` function that
+accepts an optional ``y``.

 Working notes
 -------------
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index eec69b22250a0..5ec4591b29ae7 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -176,6 +176,9 @@ Enhancements
    - Parallelized calculation of :func:`pairwise_distances` is now supported
      for scipy metrics and custom callables. By `Joel Nothman`_.

+   - Allow the fitting and scoring of all clustering algorithms in
+     :class:`pipeline.Pipeline`. By `Andreas Müller`_.
+
 Documentation improvements
 ..........................
diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py
index 49ed24df7ff02..3f3745b657101 100644
--- a/sklearn/cluster/affinity_propagation_.py
+++ b/sklearn/cluster/affinity_propagation_.py
@@ -269,7 +269,7 @@ def __init__(self, damping=.5, max_iter=200, convergence_iter=15,
     def _pairwise(self):
         return self.affinity == "precomputed"

-    def fit(self, X):
+    def fit(self, X, y=None):
         """ Create affinity matrix from negative euclidean distances, then
         apply affinity propagation clustering.
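
To make the documented convention concrete, here is a minimal sketch of an
unsupervised estimator that follows the ``y=None`` rule described above. This
is not part of the patch; the class name and its thresholding logic are
illustrative only::

    import numpy as np
    from sklearn.base import BaseEstimator, ClusterMixin

    class ToyClusterer(BaseEstimator, ClusterMixin):
        """Toy estimator: splits samples on the first feature."""

        def __init__(self, threshold=0.0):
            self.threshold = threshold

        def fit(self, X, y=None):
            # ``y`` is accepted in the second position and ignored, so the
            # estimator can sit in a Pipeline next to supervised steps.
            X = np.asarray(X)
            self.labels_ = (X[:, 0] > self.threshold).astype(int)
            return self

        def fit_predict(self, X, y=None):
            # Same convention for fit_predict.
            return self.fit(X, y).labels_
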
diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 6d45dd0506c02..896cf0c20d350 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -189,7 +189,7 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         of the construction and query, as well as the memory required
         to store the tree. The optimal value depends
         on the nature of the problem.
-    
+
     Attributes
     ----------
     core_sample_indices_ : array, shape = [n_core_samples]
@@ -224,7 +224,7 @@ def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
         self.p = p
         self.random_state = random_state

-    def fit(self, X, sample_weight=None):
+    def fit(self, X, y=None, sample_weight=None):
         """Perform DBSCAN clustering from features or distance matrix.

         Parameters
diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py
index 5e834a6c6dbed..7817894e4d7f9 100644
--- a/sklearn/cluster/hierarchical.py
+++ b/sklearn/cluster/hierarchical.py
@@ -683,7 +683,7 @@ def __init__(self, n_clusters=2, affinity="euclidean",
         self.affinity = affinity
         self.pooling_func = pooling_func

-    def fit(self, X):
+    def fit(self, X, y=None):
         """Fit the hierarchical clustering on the data

         Parameters
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index ef11c2183fc0f..c7c537cad1ab9 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -795,7 +795,7 @@ def fit(self, X, y=None):
                 n_jobs=self.n_jobs)
         return self

-    def fit_predict(self, X):
+    def fit_predict(self, X, y=None):
         """Compute cluster centers and predict cluster index for each sample.

         Convenience method; equivalent to calling fit(X) followed by
@@ -864,7 +864,7 @@ def predict(self, X):
         x_squared_norms = row_norms(X, squared=True)
         return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]

-    def score(self, X):
+    def score(self, X, y=None):
         """Opposite of the value of X on the K-means objective.

         Parameters
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
index 3ad1bbf3b029b..220b97ed2e40d 100644
--- a/sklearn/cluster/mean_shift_.py
+++ b/sklearn/cluster/mean_shift_.py
@@ -320,7 +320,7 @@ def __init__(self, bandwidth=None, seeds=None, bin_seeding=False,
         self.cluster_all = cluster_all
         self.min_bin_freq = min_bin_freq

-    def fit(self, X):
+    def fit(self, X, y=None):
         """Perform clustering.

         Parameters
diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py
index 9336dc64f8faa..b1c1834d7b618 100644
--- a/sklearn/cluster/spectral.py
+++ b/sklearn/cluster/spectral.py
@@ -405,7 +405,7 @@ def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
         self.coef0 = coef0
         self.kernel_params = kernel_params

-    def fit(self, X):
+    def fit(self, X, y=None):
         """Creates an affinity matrix for X using the selected affinity,
         then applies spectral clustering to this affinity matrix.
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 74d8dc9c7c6b0..3e58bac5fa827 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -412,7 +412,6 @@ def dict_learning(X, n_components, alpha, max_iter=100, tol=1e-8,
     SparsePCA
     MiniBatchSparsePCA
     """
-
     if method not in ('lars', 'cd'):
         raise ValueError('Coding method %r not supported as a fit algorithm.'
                         % method)
@@ -604,6 +603,8 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
     MiniBatchSparsePCA

     """
+    if n_components is None:
+        n_components = X.shape[1]
     if method not in ('lars', 'cd'):
         raise ValueError('Coding method not supported as a fit algorithm.')
@@ -750,7 +751,7 @@ def transform(self, X, y=None):
             Transformed data

         """
-        check_is_fitted(self, 'components_')
+        check_is_fitted(self, 'components_')

         # XXX : kwargs is not documented
         X = check_array(X)
@@ -1159,13 +1160,9 @@ def fit(self, X, y=None):
         """
         random_state = check_random_state(self.random_state)
         X = check_array(X)
-        if self.n_components is None:
-            n_components = X.shape[1]
-        else:
-            n_components = self.n_components

         U, (A, B), self.n_iter_ = dict_learning_online(
-            X, n_components, self.alpha,
+            X, self.n_components, self.alpha,
             n_iter=self.n_iter, return_code=False,
             method=self.fit_algorithm,
             n_jobs=self.n_jobs, dict_init=self.dict_init,
diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py
index 4991bc7ea3680..9b5ccff00af22 100644
--- a/sklearn/decomposition/incremental_pca.py
+++ b/sklearn/decomposition/incremental_pca.py
@@ -174,7 +174,7 @@ def fit(self, X, y=None):
             self.partial_fit(X[batch])
         return self

-    def partial_fit(self, X):
+    def partial_fit(self, X, y=None):
         """Incremental fit with X. All of X is processed as a single batch.

         Parameters
diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py
index b2a0fb14780b3..baa0a6dd60a48 100644
--- a/sklearn/neural_network/rbm.py
+++ b/sklearn/neural_network/rbm.py
@@ -217,7 +217,7 @@ def gibbs(self, v):

         return v_

-    def partial_fit(self, X):
+    def partial_fit(self, X, y=None):
         """Fit the model to the data X which should contain a partial
         segment of the data.
@@ -301,7 +301,7 @@ def score_samples(self, X):
         returns the log of the logistic function of the difference.
""" check_is_fitted(self, "components_") - + v = check_array(X, accept_sparse='csr') rng = check_random_state(self.random_state) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index ec535eb076f9a..36bb8bf080d54 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -55,7 +55,9 @@ check_regressor_data_not_an_array, check_transformer_data_not_an_array, check_transformer_n_iter, + check_fit_score_takes_y, check_non_transformer_estimators_n_iter, + check_pipeline_consistency, CROSS_DECOMPOSITION) @@ -87,6 +89,9 @@ def test_non_meta_estimators(): estimators = all_estimators(type_filter=['classifier', 'regressor', 'transformer', 'cluster']) for name, Estimator in estimators: + if name not in CROSS_DECOMPOSITION: + yield check_fit_score_takes_y, name, Estimator + yield check_pipeline_consistency, name, Estimator if name not in CROSS_DECOMPOSITION + ['Imputer']: # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf, name, Estimator diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e8a9feab24377..a7344afc23e14 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -22,15 +22,16 @@ from sklearn.utils.testing import assert_greater from sklearn.utils.testing import SkipTest from sklearn.utils.testing import check_skip_travis +from sklearn.utils.testing import ignore_warnings -from sklearn.base import (clone, ClusterMixin, ClassifierMixin, RegressorMixin, - TransformerMixin) +from sklearn.base import clone, ClassifierMixin from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score from sklearn.lda import LDA from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest from sklearn.svm.base import BaseLibSVM +from sklearn.pipeline import make_pipeline from sklearn.utils.validation import DataConversionWarning, NotFittedError from sklearn.cross_validation import train_test_split @@ -44,13 +45,6 @@ CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] -def is_supervised(estimator): - return (isinstance(estimator, ClassifierMixin) - or isinstance(estimator, RegressorMixin) - # transformers can all take a y - or isinstance(estimator, TransformerMixin)) - - def _boston_subset(n_samples=200): global BOSTON if BOSTON is None: @@ -88,6 +82,10 @@ def set_fast_parameters(estimator): # K-Means estimator.set_params(n_init=2) + if estimator.__class__.__name__ == "SelectFdr": + # avoid not selecting any features + estimator.set_params(alpha=.5) + if isinstance(estimator, BaseRandomProjection): # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably @@ -131,10 +129,7 @@ def check_estimator_sparse_data(name, Estimator): set_fast_parameters(estimator) # fit and predict try: - if is_supervised(estimator): - estimator.fit(X, y) - else: - estimator.fit(X) + estimator.fit(X, y) if hasattr(estimator, "predict"): estimator.predict(X) if hasattr(estimator, 'predict_proba'): @@ -252,6 +247,50 @@ def _check_transformer(name, Transformer, X, y): assert_raises(ValueError, transformer.transform, X.T) +@ignore_warnings +def check_pipeline_consistency(name, Estimator): + # check that make_pipeline(est) gives same score as est + X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], + random_state=0, n_features=2, cluster_std=0.1) + X -= X.min() + y = multioutput_estimator_convert_y_2d(name, y) + estimator = 
+    pipeline = make_pipeline(estimator)
+    set_fast_parameters(estimator)
+    set_random_state(estimator)
+    estimator.fit(X, y)
+    pipeline.fit(X, y)
+    funcs = ["score", "fit_transform"]
+    for func_name in funcs:
+        func = getattr(estimator, func_name, None)
+        if func is not None:
+            func_pipeline = getattr(pipeline, func_name)
+            result = func(X, y)
+            result_pipe = func_pipeline(X, y)
+            assert_array_almost_equal(result, result_pipe)
+
+
+@ignore_warnings
+def check_fit_score_takes_y(name, Estimator):
+    # check that all estimators accept an optional y
+    # in fit and score so they can be used in pipelines
+    rnd = np.random.RandomState(0)
+    X = rnd.uniform(size=(10, 3))
+    y = (X[:, 0] * 4).astype(np.int)
+    y = multioutput_estimator_convert_y_2d(name, y)
+    estimator = Estimator()
+    set_fast_parameters(estimator)
+    set_random_state(estimator)
+    funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"]
+
+    for func_name in funcs:
+        func = getattr(estimator, func_name, None)
+        if func is not None:
+            func(X, y)
+            args = inspect.getargspec(func).args
+            assert_true(args[2] in ["y", "Y"])
+
+
 def check_estimators_nan_inf(name, Estimator):
     rnd = np.random.RandomState(0)
     X_train_finite = rnd.uniform(size=(10, 3))
@@ -275,10 +314,7 @@ def check_estimators_nan_inf(name, Estimator):
         set_random_state(estimator, 1)
         # try to fit
         try:
-            if issubclass(Estimator, ClusterMixin):
-                estimator.fit(X_train)
-            else:
-                estimator.fit(X_train, y)
+            estimator.fit(X_train, y)
         except ValueError as e:
             if 'inf' not in repr(e) and 'NaN' not in repr(e):
                 print(error_string_fit, Estimator, e)
@@ -291,12 +327,7 @@ def check_estimators_nan_inf(name, Estimator):
             else:
                 raise AssertionError(error_string_fit, Estimator)
             # actually fit
-            if issubclass(Estimator, ClusterMixin):
-                # All estimators except clustering algorithm
-                # support fitting with (optional) y
-                estimator.fit(X_train_finite)
-            else:
-                estimator.fit(X_train_finite, y)
+            estimator.fit(X_train_finite, y)

             # predict
             if hasattr(estimator, "predict"):
@@ -833,10 +864,7 @@ def check_estimators_overwrite_params(name, Estimator):
     set_random_state(estimator)
     params = estimator.get_params()

-    if is_supervised(estimator):
-        estimator.fit(X, y)
-    else:
-        estimator.fit(X)
+    estimator.fit(X, y)
     new_params = estimator.get_params()
     for k, v in params.items():
         assert_false(np.any(new_params[k] != v),
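
What the patch buys in practice: once every clustering estimator accepts an
optional ``y``, it can serve as the final step of a pipeline and be fit and
scored through it. A small usage sketch, not part of the patch; the data and
the pipeline steps are illustrative only::

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.random.RandomState(0).rand(50, 2)

    # Pipeline.score forwards y (here None) to the last step, and
    # KMeans.score(X, y=None) now accepts it thanks to this patch.
    pipe = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))
    pipe.fit(X)              # y defaults to None throughout
    labels = pipe.predict(X)
    print(pipe.score(X))     # opposite of the K-means inertia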