From 2fd375c2c96a4561277553294456692c5f5f754d Mon Sep 17 00:00:00 2001
From: Arnaud Fouchet
Date: Fri, 13 Mar 2015 11:37:01 +0100
Subject: [PATCH 1/5] add distortion

---
 sklearn/metrics/cluster/distortion.py        | 78 +++++++++++++++++++
 .../metrics/cluster/tests/test_distortion.py | 12 +++
 2 files changed, 90 insertions(+)
 create mode 100644 sklearn/metrics/cluster/distortion.py
 create mode 100644 sklearn/metrics/cluster/tests/test_distortion.py

diff --git a/sklearn/metrics/cluster/distortion.py b/sklearn/metrics/cluster/distortion.py
new file mode 100644
index 0000000000000..335f9f2d0a42f
--- /dev/null
+++ b/sklearn/metrics/cluster/distortion.py
@@ -0,0 +1,78 @@
+from collections import defaultdict
+
+import numpy as np
+from scipy.spatial.distance import cdist
+
+
+def distortion(X, labels, distortion_meth='sqeuclidean', p=2):
+    """
+    Given data and their cluster assignment, compute the distortion D
+
+    Parameters
+    ----------
+    X : numpy array of shape (nb_data, nb_feature)
+    labels : list of int of length nb_data
+    distortion_meth : either a function (X, labels) -> float, or a string
+        naming a scipy.spatial distance, one of
+        ['euclidean', 'minkowski', 'seuclidean', 'sqeuclidean', 'chebyshev',
+         'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard',
+         'braycurtis', 'mahalanobis', 'yule', 'matching', 'dice',
+         'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
+         'sokalsneath', 'canberra', 'wminkowski']
+    p : double
+        The p-norm to apply (for Minkowski, weighted and unweighted)
+
+    Returns
+    -------
+    distortion : float
+    """
+    if isinstance(distortion_meth, str):
+        return distortion_metrics(X, labels, distortion_meth, p)
+    else:
+        return distortion_meth(X, labels)
+
+
+def distortion_metrics(X, labels, metric='sqeuclidean', p=2):
+    """
+    Given data and their cluster assignment, compute the distortion D
+
+    D = \sum_{x \in X} distance(x, c_x) / nb_feature
+
+    with c_x the center of the cluster containing x and distance the
+    distance defined by ``metric``; the sum is normalized by the number
+    of features, as in the implementation below
+
+    Parameters
+    ----------
+    X : numpy array of shape (nb_data, nb_feature)
+    labels : list of int of length nb_data
+    metric : string naming a scipy.spatial distance, one of
+        ['euclidean', 'minkowski', 'seuclidean', 'sqeuclidean', 'chebyshev',
+         'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard',
+         'braycurtis', 'mahalanobis', 'yule', 'matching', 'dice',
+         'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
+         'sokalsneath', 'wminkowski', 'canberra']
+    p : double
+        The p-norm to apply (for Minkowski, weighted and unweighted)
+
+    Returns
+    -------
+    distortion : float
+    """
+    if metric == 'l2':
+        # Translate to something understood by scipy
+        metric = 'euclidean'
+    elif metric in ('l1', 'manhattan'):
+        metric = 'cityblock'
+
+    # Group sample indices by cluster label
+    assignments = defaultdict(list)
+    for i, label in enumerate(labels):
+        assignments[label].append(i)
+
+    inertia = 0.
+    nb_feature = X.shape[1]
+    for points in assignments.values():
+        clu_points = X[points, :]
+        clu_center = np.mean(clu_points, axis=0).reshape(1, nb_feature)
+        inertia += np.sum(cdist(clu_points, clu_center, metric=metric, p=p))
+
+    # Normalize by the number of features (cf. the formula above)
+    return inertia / nb_feature
diff --git a/sklearn/metrics/cluster/tests/test_distortion.py b/sklearn/metrics/cluster/tests/test_distortion.py
new file mode 100644
index 0000000000000..c38db57f69220
--- /dev/null
+++ b/sklearn/metrics/cluster/tests/test_distortion.py
@@ -0,0 +1,12 @@
+import numpy as np
+
+from sklearn.utils.testing import assert_almost_equal
+
+from sklearn.metrics.cluster.distortion import distortion
+
+
+def test_distortion():
+    X = np.array([[0, 0], [2, 2],
+                  [5, 5], [6, 6]])
+    labels = [0, 0, 1, 1]
+    assert_almost_equal(distortion(X, labels), 2.5)

From 733390ae59ca4b5ececb93b1063f005833d4f17a Mon Sep 17 00:00:00 2001
From: Thierry Guillemot
Date: Wed, 29 Jun 2016 15:11:51 +0200
Subject: [PATCH 2/5] [WIP] ENH Add the OptimalNClusterSearch class [ci skip]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sklearn/cluster/__init__.py                  |   4 +-
 sklearn/cluster/optimal_nclusters_search.py  | 385 ++++++++++++++++++
 sklearn/metrics/cluster/distortion.py        |  78 ----
 .../metrics/cluster/tests/test_distortion.py |  12 -
 4 files changed, 388 insertions(+), 91 deletions(-)
 create mode 100644 sklearn/cluster/optimal_nclusters_search.py
 delete mode 100644 sklearn/metrics/cluster/distortion.py
 delete mode 100644 sklearn/metrics/cluster/tests/test_distortion.py

diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py
index c9afcd98f23ce..c5c51e52e6916 100644
--- a/sklearn/cluster/__init__.py
+++ b/sklearn/cluster/__init__.py
@@ -13,6 +13,7 @@
 from .dbscan_ import dbscan, DBSCAN
 from .bicluster import SpectralBiclustering, SpectralCoclustering
 from .birch import Birch
+from .optimal_nclusters_search import OptimalNClusterSearch

 __all__ = ['AffinityPropagation',
            'AgglomerativeClustering',
@@ -33,4 +34,5 @@
            'spectral_clustering',
            'ward_tree',
            'SpectralBiclustering',
-           'SpectralCoclustering']
+           'SpectralCoclustering',
+           'OptimalNClusterSearch']

diff --git a/sklearn/cluster/optimal_nclusters_search.py b/sklearn/cluster/optimal_nclusters_search.py
new file mode 100644
index 0000000000000..91a3745b8df16
--- /dev/null
+++ b/sklearn/cluster/optimal_nclusters_search.py
@@ -0,0 +1,385 @@
+"""Algorithm computing the optimal value of n_clusters."""
+
+# Authors: Thierry Guillemot
+#          Arnaud Fouchet
+
+import warnings
+import numpy as np
+
+from abc import ABCMeta, abstractmethod
+from functools import partial
+from collections import defaultdict
+
+from ..base import clone, BaseEstimator
+from ..mixture.base import BaseMixture
+from ..externals import six
+from ..externals.joblib import Parallel, delayed
+from
..model_selection import ParameterGrid +from ..utils import check_array, check_random_state +from ..utils.fixes import rankdata +from ..utils.metaestimators import if_delegate_has_method +from ..utils.testing import ignore_warnings +from ..utils.validation import check_is_fitted + + +class _NClusterSearchBase(six.with_metaclass(ABCMeta, object)): + def __init__(self, estimator, parameters, n_jobs=1, refit=True, + pre_dispatch='2*n_jobs', verbose=0): + self.estimator = estimator + self.parameters = parameters + self.n_jobs = n_jobs + self.refit = refit + self.pre_dispatch = pre_dispatch + self.verbose = verbose + + def _initialized(self, X, y): + pass + + def _compute_values(self, n_clusters_list): + return np.array(n_clusters_list) + + def _parameter_grid(self, X, y): + # Determinate the name of the property controling then number of clusters + if isinstance(self.estimator, BaseMixture): + # General case + property_name = 'n_components' + elif isinstance(self.estimator, BaseEstimator): + # Mixture case + property_name = 'n_clusters' + else: + raise ValueError("The estimator has to be from the type ...") + + # Compute n_clusters parameters + parameters = self.parameters.copy() + n_clusters_list = parameters.pop(property_name) + self.n_clusters_values, self._index = np.unique( + self._compute_values(n_clusters_list), + return_index=True) + self._index = self._index < len(n_clusters_list) + + for n_clusters in self.n_clusters_values: + for params in ParameterGrid(parameters): + params[property_name] = n_clusters + yield params + + @abstractmethod + def _estimator_fit(self, estimator, X, y, parameters): + pass + + @abstractmethod + def _compute_results(self, X, out): + pass + + @if_delegate_has_method(delegate='estimator') + def predict(self, X): + """Call predict on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict``. + + Parameters + ----------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + """ + check_is_fitted(self, 'best_estimator_') + return self.best_estimator_.predict(X) + + @if_delegate_has_method(delegate='estimator') + def predict_proba(self, X): + """Call predict_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_proba``. + + Parameters + ----------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + """ + check_is_fitted(self, 'best_estimator_') + return self.best_estimator_.predict_proba(X) + + def fit(self, X, y): + self._initialized(X, y) + + base_estimator = clone(self.estimator) + + out = Parallel( + n_jobs=self.n_jobs, verbose=self.verbose, + pre_dispatch=self.pre_dispatch + )(delayed(self._estimator_fit)(clone(base_estimator), X, y, parameters) + for parameters in self._parameter_grid(X, y)) + + self.results_ = self._compute_results(X, out) + ranks = np.asarray(rankdata(-self.results_['score'], method='min'), + dtype=np.int32) + self.best_index_ = np.flatnonzero(ranks == 1)[0] + best_parameters = self.results_['params'][self.best_index_] + + self.results_['rank_score'] = ranks + + # Use one np.MaskedArray and mask all the places where the param is not + # applicable for that candidate. 
Use defaultdict as each candidate may + # not contain all the params + param_results = defaultdict(partial(np.ma.masked_all, + (len(ranks),), dtype=object)) + for cand_i, params in enumerate(self.results_['params']): + for name, value in params.items(): + # An all masked empty array gets created for the key + # `"param_%s" % name` at the first occurence of `name`. + # Setting the value at an index also unmasks that index + param_results["param_%s" % name][cand_i] = value + + self.results_.update(param_results) + + if self.refit: + # fit the best estimator using the entire dataset + # clone first to work around broken estimators + best_estimator = clone(base_estimator).set_params( + **best_parameters).fit(X, y) + self.best_estimator_ = best_estimator + + return self + + @property + def best_params_(self): + check_is_fitted(self, 'results_') + return self.results_['params'][self.best_index_] + + @property + def best_score_(self): + check_is_fitted(self, 'results_') + return self.results_['score'][self.best_index_] + + +# Search for unsupervised metrics +class UnsupervisedMetricSearch(_NClusterSearchBase): + def __init__(self, estimator, parameters, metric): + super(UnsupervisedMetricSearch, self).__init__( + estimator, parameters) + self.metric = metric + + def _estimator_fit(self, estimator, X, y, parameters): + estimator.set_params(**parameters) + try: + score = self.metric(X, estimator.fit_predict(X)) + except ValueError: + # TODO Add a warning for false values + warnings.warn('Put a warning.') + score = np.nan + + return score, parameters + + def _compute_results(self, X, out): + scores, parameters = zip(*out) + return { + 'score': np.array(scores), + 'params': parameters, + } + + +# Scorer for supervised metrics +class StabilitySearch(_NClusterSearchBase): + def __init__(self, estimator, parameters, metric, n_draws=10, + p_samples=.8, random_state=None): + super(StabilitySearch, self).__init__( + estimator, parameters) + self.metric = metric + self.n_draws = n_draws + self.p_samples = p_samples + self.random_state = random_state + + def _initialized(self, X, y): + n_samples, _ = X.shape + rng = check_random_state(self.random_state) + self.data_ = ( + rng.uniform(size=(self.n_draws, 2 * n_samples)) < self.p_samples) + + def _estimator_fit(self, estimator, X, y, parameters): + estimator.set_params(**parameters) + draw_scores = np.empty(self.n_draws) + for l, d in enumerate(self.data_): + p1, p2 = np.split(d, 2) + + labels1 = estimator.fit_predict(X[p1]) + labels2 = estimator.fit_predict(X[p2]) + + draw_scores[l] = self.metric(labels1[p2[p1]], labels2[p1[p2]]) + score = draw_scores.mean() + return score, parameters + + def _compute_results(self, X, out): + scores, parameters = zip(*out) + return { + 'score': np.array(scores), + 'params': parameters, + } + + +# Scorer for distortion jump +class DistortionJumpSearch(_NClusterSearchBase): + def __init__(self, estimator, parameters): + super(DistortionJumpSearch, self).__init__( + estimator, parameters) + + def _compute_values(self, n_clusters_list): + return np.hstack((n_clusters_list, np.array(n_clusters_list) + 1)) + + def _estimator_fit(self, estimator, X, y, parameters): + _, n_features = X.shape + estimator.set_params(**parameters) + distortion = (np.array(estimator.fit(X).score(X)) / + n_features) ** (-n_features / 2) + return distortion, parameters + + def _compute_results(self, X, out): + distortion, parameters = zip(*out) + distortion = np.array(distortion).reshape( + len(self.n_clusters_values), -1) + parameters = 
np.array(parameters).reshape(distortion.shape) + + return { + 'score': (distortion[self._index] - + distortion[1:][self._index[:-1]]), + 'params': parameters[self._index].ravel(), + } + + +# Scorer for Pham +class PhamSearch(_NClusterSearchBase): + def __init__(self, estimator, parameters): + super(PhamSearch, self).__init__( + estimator, parameters) + + def _compute_values(self, n_clusters_list): + return np.hstack((n_clusters_list, np.array(n_clusters_list) - 1)) + + def _estimator_fit(self, estimator, X, y, parameters): + estimator.set_params(**parameters) + if parameters['n_clusters'] == 0: + return np.nan, parameters + return estimator.fit(X).score(X), parameters + + def _compute_results(self, X, out): + _, n_features = X.shape + + scores, parameters = zip(*out) + scores = np.array(scores).reshape(len(self.n_clusters_values), -1) + parameters = np.array(parameters).reshape(scores.shape) + + weights = 1. - np.exp((self.n_clusters_values[self._index] - 2) * + np.log(5. / 6.) + np.log(.75) - + np.log(n_features)) + + with ignore_warnings(category=RuntimeWarning): + scores = np.where( + scores[self._index] != 0., scores[self._index] / + (weights[:, np.newaxis] * scores[:-1][self._index[1:]]), 1.) + + if self.n_clusters_values[0] == 0: + scores[0, :] = 1. + + return { + 'score': -scores.ravel(), + 'params': parameters[self._index].ravel(), + } + + +# Scorer for Gap +class GapSearch(_NClusterSearchBase): + def __init__(self, estimator, parameters, n_draws=10, n_jobs=1, + random_state=None): + super(GapSearch, self).__init__(estimator, parameters, n_jobs) + self.n_draws = n_draws + self.random_state = random_state + + def _initialized(self, X, y): + n_samples, n_features = X.shape + rng = check_random_state(self.random_state) + + # Compute the random data once for all + bb_min, bb_max = np.min(X, 0), np.max(X, 0) + self._data = (rng.uniform(size=(self.n_draws, n_samples, n_features)) * + (bb_max - bb_min) + bb_min) + + def _compute_values(self, n_clusters_list): + return np.hstack((n_clusters_list, np.array(n_clusters_list) + 1)) + + def _estimator_fit(self, estimator, X, y, parameters): + estimator.set_params(**parameters) + + estimated_log_inertia = np.log(-estimator.fit(X).score(X)) + inertia_n_draws = np.empty(self.n_draws) + for t, Xt in enumerate(self._data): + inertia_n_draws[t] = -estimator.fit(Xt).score(Xt) + inertia_n_draws = np.log(inertia_n_draws) + + expected_log_inertia = np.mean(inertia_n_draws) + gap = expected_log_inertia - estimated_log_inertia + safety = (np.std(inertia_n_draws) * + np.sqrt(1. + 1. 
/ self.n_draws))
+
+        return gap, safety, parameters
+
+    def _compute_results(self, X, out):
+        gap, safety, parameters = zip(*out)
+
+        gap = np.array(gap).reshape(len(self.n_clusters_values), -1)
+        safety = np.array(safety).reshape(gap.shape)
+
+        scores = (gap[self._index] - gap[1:][self._index[:-1]] +
+                  safety[1:][self._index[:-1]])
+
+        return {
+            'gap': gap[self._index].ravel(),
+            'safety': safety[self._index].ravel(),
+            'score': scores.ravel(),
+            'params': (
+                np.array(parameters).reshape(gap.shape)[self._index].ravel()),
+        }
+
+
+class OptimalNClusterSearch():
+    def __init__(self, estimator, parameters, fitting_process='auto',
+                 **kwargs):
+        self.estimator = estimator
+        self.parameters = parameters.copy()
+        self.fitting_process = fitting_process
+        self.kwargs = kwargs
+
+    def _check_initial_parameters(self, X):
+        n_samples, _ = X.shape
+
+        X = check_array(X, dtype=[np.float64, np.float32])
+        # if self.n_clusters_range.max() > n_samples:
+        #     raise ValueError("Put a message")
+
+    def _select_fitting_method(self):
+        fitting_method = {
+            'auto': PhamSearch,
+            'pham': PhamSearch,
+            'gap': GapSearch,
+            'distortion_jump': DistortionJumpSearch,
+            'stability': StabilitySearch,
+            'unsupervised': UnsupervisedMetricSearch
+        }
+
+        self.scorer_ = fitting_method[self.fitting_process](
+            estimator=self.estimator, parameters=self.parameters,
+            **self.kwargs)
+
+    def fit(self, X, y=None):
+        self._check_initial_parameters(X)
+
+        self._select_fitting_method()
+        self.scorer_.fit(X, y)
+
+        self.results_ = self.scorer_.results_
+        self.best_estimator_ = self.scorer_.best_estimator_
+
+        return self
diff --git a/sklearn/metrics/cluster/distortion.py b/sklearn/metrics/cluster/distortion.py
deleted file mode 100644
index 335f9f2d0a42f..0000000000000
--- a/sklearn/metrics/cluster/distortion.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from collections import defaultdict
-
-import numpy as np
-from scipy.spatial.distance import cdist
-
-
-def distortion(X, labels, distortion_meth='sqeuclidean', p=2):
-    """
-    Given data and their cluster assignment, compute the distortion D
-
-    Parameters
-    ----------
-    X : numpy array of shape (nb_data, nb_feature)
-    labels : list of int of length nb_data
-    distortion_meth : either a function (X, labels) -> float, or a string
-        naming a scipy.spatial distance, one of
-        ['euclidean', 'minkowski', 'seuclidean', 'sqeuclidean', 'chebyshev',
-         'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard',
-         'braycurtis', 'mahalanobis', 'yule', 'matching', 'dice',
-         'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
-         'sokalsneath', 'canberra', 'wminkowski']
-    p : double
-        The p-norm to apply (for Minkowski, weighted and unweighted)
-
-    Returns
-    -------
-    distortion : float
-    """
-    if isinstance(distortion_meth, str):
-        return distortion_metrics(X, labels, distortion_meth, p)
-    else:
-        return distortion_meth(X, labels)
-
-
-def distortion_metrics(X, labels, metric='sqeuclidean', p=2):
-    """
-    Given data and their cluster assignment, compute the distortion D
-
-    D = \sum_{x \in X} distance(x, c_x) / nb_feature
-
-    with c_x the center of the cluster containing x and distance the
-    distance defined by ``metric``; the sum is normalized by the number
-    of features, as in the implementation below
-
-    Parameters
-    ----------
-    X : numpy array of shape (nb_data, nb_feature)
-    labels : list of int of length nb_data
-    metric : string naming a scipy.spatial distance, one of
-        ['euclidean', 'minkowski', 'seuclidean', 'sqeuclidean', 'chebyshev',
-         'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard',
-         'braycurtis', 'mahalanobis', 'yule', 'matching', 'dice',
-         'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
-         'sokalsneath', 'wminkowski', 'canberra']
-    p : double
-        The p-norm to apply (for Minkowski, weighted and unweighted)
-
-    Returns
-    -------
-    distortion : float
-    """
-    if metric == 'l2':
-        # Translate to something understood by scipy
-        metric = 'euclidean'
-    elif metric in ('l1', 'manhattan'):
-        metric = 'cityblock'
-
-    # Group sample indices by cluster label
-    assignments = defaultdict(list)
-    for i, label in enumerate(labels):
-        assignments[label].append(i)
-
-    inertia = 0.
-    nb_feature = X.shape[1]
-    for points in assignments.values():
-        clu_points = X[points, :]
-        clu_center = np.mean(clu_points, axis=0).reshape(1, nb_feature)
-        inertia += np.sum(cdist(clu_points, clu_center, metric=metric, p=p))
-
-    # Normalize by the number of features (cf. the formula above)
-    return inertia / nb_feature
diff --git a/sklearn/metrics/cluster/tests/test_distortion.py b/sklearn/metrics/cluster/tests/test_distortion.py
deleted file mode 100644
index c38db57f69220..0000000000000
--- a/sklearn/metrics/cluster/tests/test_distortion.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import numpy as np
-
-from sklearn.utils.testing import assert_almost_equal
-
-from sklearn.metrics.cluster.distortion import distortion
-
-
-def test_distortion():
-    X = np.array([[0, 0], [2, 2],
-                  [5, 5], [6, 6]])
-    labels = [0, 0, 1, 1]
-    assert_almost_equal(distortion(X, labels), 2.5)

From 227b30d930a9ecea0438a399c30c8d7862851b55 Mon Sep 17 00:00:00 2001
From: Thierry Guillemot
Date: Wed, 29 Jun 2016 18:06:21 +0200
Subject: [PATCH 3/5] Add plot_choose_n_optimal example [ci skip]

---
 examples/cluster/plot_optimal_n_clusters.py | 73 +++++++++++++++++++++
 sklearn/cluster/optimal_nclusters_search.py | 11 +++-
 2 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 examples/cluster/plot_optimal_n_clusters.py

diff --git a/examples/cluster/plot_optimal_n_clusters.py b/examples/cluster/plot_optimal_n_clusters.py
new file mode 100644
index 0000000000000..30805d4ecb14e
--- /dev/null
+++ b/examples/cluster/plot_optimal_n_clusters.py
@@ -0,0 +1,73 @@
+"""Plot the clusterings selected by several n_clusters selection criteria."""
+
+# Authors: Thierry Guillemot
+
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.cluster import KMeans, OptimalNClusterSearch
+from sklearn.datasets import make_blobs
+from sklearn.metrics import calinski_harabaz_score, fowlkes_mallows_score
+from sklearn.metrics import silhouette_score
+from sklearn.utils import check_random_state
+
+
+n_samples, n_features, random_state = 1000, 2, 1
+parameters = {'n_clusters': np.arange(1, 7)}
+
+rng = check_random_state(random_state)
+datasets = [
+    ('3 clusters', make_blobs(n_samples=n_samples, n_features=2,
+                              random_state=random_state, centers=3)),
+    ('5 clusters', make_blobs(n_samples=n_samples, n_features=2,
+                              random_state=random_state, centers=5)),
+    ('random', (rng.rand(n_samples, n_features),
+                np.zeros(n_samples, dtype=int))),
+]
+
+estimator = KMeans(n_init=10, random_state=0)
+searchers = [
+    ('Silhouette', OptimalNClusterSearch(
+        estimator=estimator, parameters=parameters,
+        fitting_process='unsupervised', metric=silhouette_score)),
+    ('Calinski', OptimalNClusterSearch(
+        estimator=estimator, parameters=parameters,
+        fitting_process='unsupervised', metric=calinski_harabaz_score)),
+    ('Stability', OptimalNClusterSearch(
+        estimator=estimator, parameters=parameters, random_state=0,
+        fitting_process='stability',
metric=fowlkes_mallows_score)), + ('Distortion jump', OptimalNClusterSearch( + estimator=estimator, parameters=parameters, + fitting_process='distortion_jump')), + ('Gap', OptimalNClusterSearch( + estimator=estimator, parameters=parameters, random_state=0, + fitting_process='gap')), + ('Pham', OptimalNClusterSearch( + estimator=estimator, parameters=parameters, fitting_process='pham')), +] + +color = 'bgrcmyk' +plt.figure(figsize=(13, 9.5)) +plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96, wspace=.05, + hspace=.01) +for k, (data_name, data) in enumerate(datasets): + X, _ = data + for l, (search_name, search) in enumerate(searchers): + t0 = time.time() + y = search.fit(X).predict(X) + t1 = time.time() + + colors = np.array([color[k] for k in y]) + plt.subplot(len(datasets), len(searchers), + len(searchers) * k + l + 1) + if k == 0: + plt.title(search_name, size=18) + plt.scatter(X[:, 0], X[:, 1], color=colors, alpha=.6) + plt.xticks(()) + plt.yticks(()) + plt.axis('equal') + plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), + transform=plt.gca().transAxes, size=15, + horizontalalignment='right') +plt.show() diff --git a/sklearn/cluster/optimal_nclusters_search.py b/sklearn/cluster/optimal_nclusters_search.py index 91a3745b8df16..8daf5f476bbbb 100644 --- a/sklearn/cluster/optimal_nclusters_search.py +++ b/sklearn/cluster/optimal_nclusters_search.py @@ -202,6 +202,10 @@ def _initialized(self, X, y): def _estimator_fit(self, estimator, X, y, parameters): estimator.set_params(**parameters) + if parameters['n_clusters'] == 1: + warnings.warn('Put a warning.') + return np.nan, parameters + draw_scores = np.empty(self.n_draws) for l, d in enumerate(self.data_): p1, p2 = np.split(d, 2) @@ -284,6 +288,9 @@ def _compute_results(self, X, out): if self.n_clusters_values[0] == 0: scores[0, :] = 1. + # XXX change that to not modify the score + scores[scores > 0.85] = 1. + return { 'score': -scores.ravel(), 'params': parameters[self._index].ravel(), @@ -333,7 +340,7 @@ def _compute_results(self, X, out): safety = np.array(safety).reshape(gap.shape) scores = (gap[self._index] - gap[1:][self._index[:-1]] + - safety[1:][self._index[:-1]]) + safety[1:][self._index[:-1]]) >= 0 return { 'gap': gap[self._index].ravel(), @@ -382,4 +389,4 @@ def fit(self, X, y=None): self.results_ = self.scorer_.results_ self.best_estimator_ = self.scorer_.best_estimator_ - return self + return self.scorer_ From a0926db6462f15c9bef11ce1fd44093f09802738 Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Mon, 9 Jan 2017 10:36:48 +0100 Subject: [PATCH 4/5] Modify little stuff for testing. 
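
Note: SpectralClustering does not implement ``predict``, so the updated
example reads the training-set assignments from the fitted estimator
instead of calling predict. A minimal sketch of the two access patterns
(assuming the searcher exposes best_estimator_ after fit, as above):

    # KMeans can label new points after the search:
    y = search.fit(X).predict(X)
    # SpectralClustering only labels the data it was fitted on:
    y = search.fit(X).best_estimator_.labels_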
--- examples/cluster/plot_optimal_n_clusters.py | 26 +++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/cluster/plot_optimal_n_clusters.py b/examples/cluster/plot_optimal_n_clusters.py index 30805d4ecb14e..eb9f494d5ac25 100644 --- a/examples/cluster/plot_optimal_n_clusters.py +++ b/examples/cluster/plot_optimal_n_clusters.py @@ -6,8 +6,8 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn.cluster import KMeans, OptimalNClusterSearch -from sklearn.datasets import make_blobs +from sklearn.cluster import KMeans, SpectralClustering, OptimalNClusterSearch +from sklearn.datasets import make_blobs, make_circles, make_moons from sklearn.metrics import calinski_harabaz_score, fowlkes_mallows_score from sklearn.metrics import silhouette_score from sklearn.utils import check_random_state @@ -24,9 +24,11 @@ random_state=random_state, centers=5)), ('random', (rng.rand(n_samples, n_features), np.zeros(n_samples, dtype=int))), + ('circle', make_circles(n_samples=n_samples, factor=.5, noise=.05, shuffle=True, random_state=0)), + ('moon', make_moons(n_samples=n_samples, noise=.05, shuffle=True, random_state=0)), ] -estimator = KMeans(n_init=10, random_state=0) +estimator = SpectralClustering(n_init=10, random_state=0) searchers = [ ('Silhouette', OptimalNClusterSearch( estimator=estimator, parameters=parameters, @@ -37,14 +39,14 @@ ('Stability', OptimalNClusterSearch( estimator=estimator, parameters=parameters, random_state=0, fitting_process='stability', metric=fowlkes_mallows_score)), - ('Distortion jump', OptimalNClusterSearch( - estimator=estimator, parameters=parameters, - fitting_process='distortion_jump')), - ('Gap', OptimalNClusterSearch( - estimator=estimator, parameters=parameters, random_state=0, - fitting_process='gap')), - ('Pham', OptimalNClusterSearch( - estimator=estimator, parameters=parameters, fitting_process='pham')), + # ('Distortion jump', OptimalNClusterSearch( + # estimator=estimator, parameters=parameters, + # fitting_process='distortion_jump')), + # ('Gap', OptimalNClusterSearch( + # estimator=estimator, parameters=parameters, random_state=0, + # fitting_process='gap')), + # ('Pham', OptimalNClusterSearch( + # estimator=estimator, parameters=parameters, fitting_process='pham')), ] color = 'bgrcmyk' @@ -55,7 +57,7 @@ X, _ = data for l, (search_name, search) in enumerate(searchers): t0 = time.time() - y = search.fit(X).predict(X) + y = search.fit(X).best_estimator_.labels_ t1 = time.time() colors = np.array([color[k] for k in y]) From 1868d751067a3fe319c8cce91971caf552c5d382 Mon Sep 17 00:00:00 2001 From: Thierry Guillemot Date: Mon, 9 Jan 2017 18:42:53 +0100 Subject: [PATCH 5/5] [ci-skip] Add doc, unsupervised metric for any clustersearch. 
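
Note: with this patch every searcher accepts an optional ``metric``
callable with signature metric(X, labels); when it is None, the
estimator's own score() method is used instead. A minimal sketch of the
new keyword (mirroring the updated example script below):

    from sklearn.metrics import silhouette_score
    search = OptimalNClusterSearch(
        estimator=estimator, parameters=parameters,
        fitting_process='pham', metric=silhouette_score)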
--- examples/cluster/plot_optimal_n_clusters.py | 38 +- sklearn/cluster/optimal_nclusters_search.py | 618 ++++++++++++++++++-- 2 files changed, 584 insertions(+), 72 deletions(-) diff --git a/examples/cluster/plot_optimal_n_clusters.py b/examples/cluster/plot_optimal_n_clusters.py index eb9f494d5ac25..fc033cfc8a4ae 100644 --- a/examples/cluster/plot_optimal_n_clusters.py +++ b/examples/cluster/plot_optimal_n_clusters.py @@ -14,7 +14,7 @@ n_samples, n_features, random_state = 1000, 2, 1 -parameters = {'n_clusters': np.arange(1, 7)} +parameters = {'n_clusters': np.arange(1, 8)} rng = check_random_state(random_state) datasets = [ @@ -24,29 +24,35 @@ random_state=random_state, centers=5)), ('random', (rng.rand(n_samples, n_features), np.zeros(n_samples, dtype=int))), - ('circle', make_circles(n_samples=n_samples, factor=.5, noise=.05, shuffle=True, random_state=0)), - ('moon', make_moons(n_samples=n_samples, noise=.05, shuffle=True, random_state=0)), + ('circle', make_circles(n_samples=n_samples, factor=.5, noise=.05, + shuffle=True, random_state=random_state)), + ('moon', make_moons(n_samples=n_samples, noise=.05, + shuffle=True, random_state=random_state)), ] -estimator = SpectralClustering(n_init=10, random_state=0) +estimator = SpectralClustering(eigen_solver='arpack', + affinity="nearest_neighbors", + n_init=10, random_state=0) +# estimator = KMeans(n_init=10, random_state=0) searchers = [ - ('Silhouette', OptimalNClusterSearch( - estimator=estimator, parameters=parameters, - fitting_process='unsupervised', metric=silhouette_score)), - ('Calinski', OptimalNClusterSearch( - estimator=estimator, parameters=parameters, - fitting_process='unsupervised', metric=calinski_harabaz_score)), - ('Stability', OptimalNClusterSearch( - estimator=estimator, parameters=parameters, random_state=0, - fitting_process='stability', metric=fowlkes_mallows_score)), + # ('Silhouette', OptimalNClusterSearch( + # estimator=estimator, parameters=parameters, + # fitting_process='unsupervised', metric=silhouette_score)), + # ('Calinski', OptimalNClusterSearch( + # estimator=estimator, parameters=parameters, + # fitting_process='unsupervised', metric=calinski_harabaz_score)), + # ('Stability', OptimalNClusterSearch( + # estimator=estimator, parameters=parameters, random_state=0, + # fitting_process='stability', metric=fowlkes_mallows_score)), # ('Distortion jump', OptimalNClusterSearch( # estimator=estimator, parameters=parameters, # fitting_process='distortion_jump')), # ('Gap', OptimalNClusterSearch( # estimator=estimator, parameters=parameters, random_state=0, - # fitting_process='gap')), - # ('Pham', OptimalNClusterSearch( - # estimator=estimator, parameters=parameters, fitting_process='pham')), + # fitting_process='gap', metric=silhouette_score)), + ('Pham', OptimalNClusterSearch( + estimator=estimator, parameters=parameters, fitting_process='pham', + metric=silhouette_score)), ] color = 'bgrcmyk' diff --git a/sklearn/cluster/optimal_nclusters_search.py b/sklearn/cluster/optimal_nclusters_search.py index 8daf5f476bbbb..1f3ccdc5f17a1 100644 --- a/sklearn/cluster/optimal_nclusters_search.py +++ b/sklearn/cluster/optimal_nclusters_search.py @@ -23,12 +23,16 @@ class _NClusterSearchBase(six.with_metaclass(ABCMeta, object)): - def __init__(self, estimator, parameters, n_jobs=1, refit=True, + """Base class for cluster searchers. + + This abstract class specifies an interface for all cluster searchers and + provides some basic common methods. 
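+
+    Subclasses implement ``_estimator_fit`` (fit one candidate and return
+    its raw values) and ``_compute_results`` (turn those values into
+    scores); ``fit`` then evaluates the whole parameter grid in parallel
+    and ranks the candidates by score.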
+ """ + def __init__(self, estimator, parameters, n_jobs=1, pre_dispatch='2*n_jobs', verbose=0): self.estimator = estimator self.parameters = parameters self.n_jobs = n_jobs - self.refit = refit self.pre_dispatch = pre_dispatch self.verbose = verbose @@ -36,38 +40,64 @@ def _initialized(self, X, y): pass def _compute_values(self, n_clusters_list): + """Define all n_clusters values needed to compute a score.""" return np.array(n_clusters_list) def _parameter_grid(self, X, y): - # Determinate the name of the property controling then number of clusters + """Define the grid used by the cluster searcher.""" + # Define the property name controling the number of clusters if isinstance(self.estimator, BaseMixture): - # General case + # Case for clustering estimator property_name = 'n_components' elif isinstance(self.estimator, BaseEstimator): - # Mixture case + # Case for mixture estimator property_name = 'n_clusters' else: raise ValueError("The estimator has to be from the type ...") - # Compute n_clusters parameters + # Extract number of cluster parameter from the parameter list parameters = self.parameters.copy() n_clusters_list = parameters.pop(property_name) + # Avoid multiple computation of sets of same parameters self.n_clusters_values, self._index = np.unique( self._compute_values(n_clusters_list), return_index=True) self._index = self._index < len(n_clusters_list) + # Grid definition for n_clusters in self.n_clusters_values: for params in ParameterGrid(parameters): params[property_name] = n_clusters yield params + def _compute_score(self, estimator, X): + """Compute the score of the estimator according to self.metric.""" + # XXX Not good to deal with bad values to modify + try: + if self.metric is not None: + # Add a warning for n_cluster = 1 + score = self.metric(X, estimator.fit_predict(X)) + elif hasattr(estimator, 'score'): + score = estimator.fit(X).score(X) + # else: + # raise ValueError( + # "The class %s does not have a score method. Change the " + # "estimator type or give an unsupervised metric to the " + # "``metric`` attribute.") + except ValueError: + # TODO Add a warning for false values + # warnings.warn('Put a warning.') + score = np.nan + return score + @abstractmethod def _estimator_fit(self, estimator, X, y, parameters): + """Fit the estimator and return values used to define a score.""" pass @abstractmethod def _compute_results(self, X, out): + """Compute result from the values computed by _estimator_fit.""" pass @if_delegate_has_method(delegate='estimator') @@ -105,10 +135,25 @@ def predict_proba(self, X): return self.best_estimator_.predict_proba(X) def fit(self, X, y): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape = [n_samples, n_features] + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + """ self._initialized(X, y) base_estimator = clone(self.estimator) + # Fit the estimator for each parameter and compute output used + # by _compute_results to define a score out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch @@ -116,11 +161,11 @@ def fit(self, X, y): for parameters in self._parameter_grid(X, y)) self.results_ = self._compute_results(X, out) + + # XXX Change that ?? 
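+        # rankdata on the negated scores gives rank 1 to the highest score;
+        # with method='min', tied scores all share the lowest rank.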
ranks = np.asarray(rankdata(-self.results_['score'], method='min'), dtype=np.int32) self.best_index_ = np.flatnonzero(ranks == 1)[0] - best_parameters = self.results_['params'][self.best_index_] - self.results_['rank_score'] = ranks # Use one np.MaskedArray and mask all the places where the param is not @@ -136,46 +181,102 @@ def fit(self, X, y): param_results["param_%s" % name][cand_i] = value self.results_.update(param_results) - - if self.refit: - # fit the best estimator using the entire dataset - # clone first to work around broken estimators - best_estimator = clone(base_estimator).set_params( - **best_parameters).fit(X, y) - self.best_estimator_ = best_estimator + self.best_params_ = self.results_['params'][self.best_index_] + self.best_score_ = self.results_['score'][self.best_index_] + + # fit the best estimator + best_estimator = clone(base_estimator).set_params( + **self.best_params_) + if y is not None: + best_estimator.fit(X, y) + else: + best_estimator.fit(X) + self.best_estimator_ = best_estimator return self - @property - def best_params_(self): - check_is_fitted(self, 'results_') - return self.results_['params'][self.best_index_] - - @property - def best_score_(self): - check_is_fitted(self, 'results_') - return self.results_['score'][self.best_index_] - # Search for unsupervised metrics class UnsupervisedMetricSearch(_NClusterSearchBase): - def __init__(self, estimator, parameters, metric): + """Unsupervised metric search on hyper parameters. + + UnsupervisedMetricSearch implements an exhaustive search over specified + parameter values for a clustering estimator using an unsupervised metric + to define the best result. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``metric`` must be passed. + + parameters : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + metric : callable or None, defaults to None + An unsupervised metric function with signature + ``metric(X, y, **kwargs)``. + If ``None``, the ``score`` method of the estimator is used. + + n_jobs : int, default=1 + Number of jobs to run in parallel. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + results_ : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if refit=False. 
+ + best_score_ : float + Score of best_estimator on the left out data. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + """ + def __init__(self, estimator, parameters, metric=None, n_jobs=-1, + pre_dispatch='2*n_jobs', verbose=0): super(UnsupervisedMetricSearch, self).__init__( - estimator, parameters) + estimator, parameters, n_jobs, pre_dispatch, verbose) self.metric = metric def _estimator_fit(self, estimator, X, y, parameters): + """Fit the estimator and return values used to define a score.""" estimator.set_params(**parameters) - try: - score = self.metric(X, estimator.fit_predict(X)) - except ValueError: - # TODO Add a warning for false values - warnings.warn('Put a warning.') - score = np.nan - + score = self._compute_score(estimator, X) return score, parameters def _compute_results(self, X, out): + """Compute result from the values computed by _estimator_fit.""" scores, parameters = zip(*out) return { 'score': np.array(scores), @@ -185,25 +286,100 @@ def _compute_results(self, X, out): # Scorer for supervised metrics class StabilitySearch(_NClusterSearchBase): - def __init__(self, estimator, parameters, metric, n_draws=10, - p_samples=.8, random_state=None): + """Unsupervised metric search on hyper parameters. + + StabilitySearch implements an exhaustive search over specified + parameter values for a clustering estimator using a stability search + process to define the best result. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``metric`` must be passed. + + parameters : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + metric : callable + An unsupervised metric function with signature + ``metric(X, y, **kwargs)``. + If ``None``, the ``score`` method of the estimator is used. + + n_draws : int, defaults to 10 + Number of draws used to define the score. + + p_samples: float, defaults to .8 + Probability to draw point to define the sample sets. + + n_jobs : int, defaults to 1 + Number of jobs to run in parallel. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + results_ : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if refit=False. 
+ + best_score_ : float + Score of best_estimator on the left out data. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + """ + def __init__(self, estimator, parameters, metric=None, n_draws=10, + p_samples=.8, random_state=None, n_jobs=-1, + pre_dispatch='2*n_jobs', verbose=0): super(StabilitySearch, self).__init__( - estimator, parameters) + estimator, parameters, n_jobs, pre_dispatch, verbose) self.metric = metric self.n_draws = n_draws self.p_samples = p_samples self.random_state = random_state def _initialized(self, X, y): + """Define the subsampled sets used by _estimator_fit.""" n_samples, _ = X.shape rng = check_random_state(self.random_state) self.data_ = ( rng.uniform(size=(self.n_draws, 2 * n_samples)) < self.p_samples) def _estimator_fit(self, estimator, X, y, parameters): + """Fit the estimator and return values used to define a score.""" estimator.set_params(**parameters) + # Be sure a warining is the best thing to do ? if parameters['n_clusters'] == 1: - warnings.warn('Put a warning.') + warnings.warn('StabilitySearch can not be used for n_clusers = 1.') return np.nan, parameters draw_scores = np.empty(self.n_draws) @@ -218,6 +394,7 @@ def _estimator_fit(self, estimator, X, y, parameters): return score, parameters def _compute_results(self, X, out): + """Compute result from the values computed by _estimator_fit.""" scores, parameters = zip(*out) return { 'score': np.array(scores), @@ -227,21 +404,96 @@ def _compute_results(self, X, out): # Scorer for distortion jump class DistortionJumpSearch(_NClusterSearchBase): - def __init__(self, estimator, parameters): + """Search on hyper parameters using the distortion jump scheme. + + DistortionJumpSearch implements an exhaustive search over specified + parameter values for a clustering estimator using a distortion jump + process to define the best result. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``metric`` must be passed. + + parameters : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + metric : callable + A supervised metric function with signature + ``metric(y_true, y_pred, **kwargs)``. + + n_draws : int, defaults to 10 + Number of draws used to define the score. + + p_samples: float, defaults to .8 + Probability to draw point to define the sample sets. + + n_jobs : int, defaults to 1 + Number of jobs to run in parallel. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + verbose : integer + Controls the verbosity: the higher, the more messages. 
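+
+    Notes
+    -----
+    A sketch of the criterion implemented below (the "jump method" of
+    Sugar and James): the estimator's distortion d_K is transformed into
+    (d_K / n_features) ** (-n_features / 2) and each K is scored by the
+    difference between the transformed values at K and at K + 1, the
+    largest difference ranking first.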
+ + Attributes + ---------- + results_ : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if refit=False. + + best_score_ : float + Score of best_estimator on the left out data. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + """ + def __init__(self, estimator, parameters, metric=None, n_jobs=-1, + pre_dispatch='2*n_jobs', verbose=0): super(DistortionJumpSearch, self).__init__( - estimator, parameters) + estimator, parameters, n_jobs, pre_dispatch, verbose) + self.metric = metric def _compute_values(self, n_clusters_list): + """Define all n_clusters values needed to compute a score.""" return np.hstack((n_clusters_list, np.array(n_clusters_list) + 1)) def _estimator_fit(self, estimator, X, y, parameters): + """Fit the estimator and return values used to define a score.""" _, n_features = X.shape estimator.set_params(**parameters) - distortion = (np.array(estimator.fit(X).score(X)) / - n_features) ** (-n_features / 2) + score = self._compute_score(estimator, X) + distortion = (score / n_features) ** (-n_features / 2) return distortion, parameters def _compute_results(self, X, out): + """Compute result from the values computed by _estimator_fit.""" distortion, parameters = zip(*out) distortion = np.array(distortion).reshape( len(self.n_clusters_values), -1) @@ -256,20 +508,96 @@ def _compute_results(self, X, out): # Scorer for Pham class PhamSearch(_NClusterSearchBase): - def __init__(self, estimator, parameters): + """Unsupervised metric search on hyper parameters. + + StabilitySearch implements an exhaustive search over specified + parameter values for a clustering estimator using a stability search + process to define the best result. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``metric`` must be passed. + + parameters : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + metric : callable + A supervised metric function with signature + ``metric(y_true, y_pred, **kwargs)``. + + n_draws : int, defaults to 10 + Number of draws used to define the score. + + p_samples: float, defaults to .8 + Probability to draw point to define the sample sets. + + n_jobs : int, defaults to 1 + Number of jobs to run in parallel. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. 
Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + results_ : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if refit=False. + + best_score_ : float + Score of best_estimator on the left out data. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + """ + def __init__(self, estimator, parameters, metric=None): super(PhamSearch, self).__init__( estimator, parameters) + self.metric = metric def _compute_values(self, n_clusters_list): + """Define all n_clusters values needed to compute a score.""" return np.hstack((n_clusters_list, np.array(n_clusters_list) - 1)) def _estimator_fit(self, estimator, X, y, parameters): + """Fit the estimator and return values used to define a score.""" estimator.set_params(**parameters) - if parameters['n_clusters'] == 0: + if parameters['n_clusters'] < 1: return np.nan, parameters - return estimator.fit(X).score(X), parameters + # XXX I clearly don't like to use any unsupervised metric here. + score = self._compute_score(estimator, X) + return score, parameters def _compute_results(self, X, out): + """Compute result from the values computed by _estimator_fit.""" _, n_features = X.shape scores, parameters = zip(*out) @@ -281,15 +609,19 @@ def _compute_results(self, X, out): np.log(n_features)) with ignore_warnings(category=RuntimeWarning): - scores = np.where( - scores[self._index] != 0., scores[self._index] / + scores = np.where(np.logical_and( + np.vstack(( + True, + ~np.isnan(scores[self._index])[:-1])), + scores[self._index] > 0), scores[self._index] / (weights[:, np.newaxis] * scores[:-1][self._index[1:]]), 1.) if self.n_clusters_values[0] == 0: scores[0, :] = 1. # XXX change that to not modify the score - scores[scores > 0.85] = 1. + with ignore_warnings(category=RuntimeWarning): + scores[scores > 0.85] = 1. return { 'score': -scores.ravel(), @@ -299,13 +631,85 @@ def _compute_results(self, X, out): # Scorer for Gap class GapSearch(_NClusterSearchBase): - def __init__(self, estimator, parameters, n_draws=10, n_jobs=1, - random_state=None): + """Unsupervised metric search on hyper parameters. + + StabilitySearch implements an exhaustive search over specified + parameter values for a clustering estimator using a stability search + process to define the best result. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``metric`` must be passed. + + parameters : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + metric : callable + A supervised metric function with signature + ``metric(y_true, y_pred, **kwargs)``. 
+ + n_draws : int, defaults to 10 + Number of draws used to define the score. + + p_samples: float, defaults to .8 + Probability to draw point to define the sample sets. + + n_jobs : int, defaults to 1 + Number of jobs to run in parallel. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + results_ : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if refit=False. + + best_score_ : float + Score of best_estimator on the left out data. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + """ + def __init__(self, estimator, parameters, metric=None, n_draws=10, + n_jobs=1, random_state=None): super(GapSearch, self).__init__(estimator, parameters, n_jobs) + self.metric = metric self.n_draws = n_draws self.random_state = random_state def _initialized(self, X, y): + """Define the subsampled sets used by _estimator_fit.""" n_samples, n_features = X.shape rng = check_random_state(self.random_state) @@ -315,15 +719,33 @@ def _initialized(self, X, y): (bb_max - bb_min) + bb_min) def _compute_values(self, n_clusters_list): + """Define all n_clusters values needed to compute a score.""" return np.hstack((n_clusters_list, np.array(n_clusters_list) + 1)) + # def _compute_score(self, estimator, X): + # """Compute result from the values computed by _estimator_fit.""" + # if self.metric is not None: + # # Add a warning for n_cluster = 1 + # if estimator.n_clusters <= 1: + # return np.nan + # score = self.metric(X, estimator.fit_predict(X)) + # elif hasattr(estimator, 'score'): + # score = estimator.fit(X).score(X) + # else: + # raise ValueError("The estimator does not have score_ attribute.") + # + # return score + def _estimator_fit(self, estimator, X, y, parameters): + """Fit the estimator and return values used to define a score.""" estimator.set_params(**parameters) - estimated_log_inertia = np.log(-estimator.fit(X).score(X)) inertia_n_draws = np.empty(self.n_draws) - for t, Xt in enumerate(self._data): - inertia_n_draws[t] = -estimator.fit(Xt).score(Xt) + with ignore_warnings(category=RuntimeWarning): + estimated_log_inertia = np.log( + np.abs(self._compute_score(estimator, X))) + for t, Xt in enumerate(self._data): + inertia_n_draws[t] = np.abs(self._compute_score(estimator, Xt)) inertia_n_draws = np.log(inertia_n_draws) expected_log_inertia = np.mean(inertia_n_draws) @@ -334,14 +756,15 @@ def _estimator_fit(self, estimator, X, y, parameters): return gap, safety, parameters def _compute_results(self, X, out): + """Compute result from the values computed by _estimator_fit.""" gap, safety, parameters = zip(*out) gap = 
np.array(gap).reshape(len(self.n_clusters_values), -1) safety = np.array(safety).reshape(gap.shape) - scores = (gap[self._index] - gap[1:][self._index[:-1]] + - safety[1:][self._index[:-1]]) >= 0 - + with ignore_warnings(category=RuntimeWarning): + scores = ((gap[self._index] - gap[1:][self._index[:-1]] + + safety[1:][self._index[:-1]]) >= 0).astype(gap.dtype) return { 'gap': gap[self._index].ravel(), 'safety': safety[self._index].ravel(), @@ -352,6 +775,76 @@ def _compute_results(self, X, out): class OptimalNClusterSearch(): + """Unsupervised metric search on hyper parameters. + + StabilitySearch implements an exhaustive search over specified + parameter values for a clustering estimator using a stability search + process to define the best result. + + Parameters + ---------- + estimator : estimator object. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``metric`` must be passed. + + parameters : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + metric : callable + A supervised metric function with signature + ``metric(y_true, y_pred, **kwargs)``. + + n_draws : int, defaults to 10 + Number of draws used to define the score. + + p_samples: float, defaults to .8 + Probability to draw point to define the sample sets. + + n_jobs : int, defaults to 1 + Number of jobs to run in parallel. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + results_ : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if refit=False. + + best_score_ : float + Score of best_estimator on the left out data. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + """ def __init__(self, estimator, parameters, fitting_process='auto', **kwargs): self.estimator = estimator @@ -381,6 +874,19 @@ def _select_fitting_method(self): **self.kwargs) def fit(self, X, y=None): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape = [n_samples, n_features] + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. 
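+
+        Examples
+        --------
+        A minimal sketch on hypothetical data (the default 'pham'
+        fitting process; output omitted while the API is WIP):
+
+        >>> import numpy as np
+        >>> from sklearn.cluster import KMeans, OptimalNClusterSearch
+        >>> X = np.random.RandomState(0).rand(100, 2)
+        >>> search = OptimalNClusterSearch(
+        ...     estimator=KMeans(random_state=0),
+        ...     parameters={'n_clusters': np.arange(1, 6)}).fit(X)
+        >>> search.best_params_  # doctest: +SKIP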
+ """ self._check_initial_parameters(X) self._select_fitting_method()