diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index a675633e19257..c28393cb6a1b0 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -135,6 +135,7 @@ each described by the mean :math:`\mu_j` of the samples in the cluster. The means are commonly called the cluster "centroids"; note that they are not, in general, points from :math:`X`, although they live in the same space. + The K-means algorithm aims to choose centroids that minimise the *inertia*, or within-cluster sum of squared criterion: @@ -1403,3 +1404,207 @@ Drawbacks the silhouette analysis is used to choose an optimal value for n_clusters.
+
+Selecting the number of clusters
+================================
+
+.. figure:: ../auto_examples/cluster/images/plot_chosen_nb_cluster_comparaison.png
+   :target: ../auto_examples/cluster/plot_chosen_nb_cluster_comparaison.html
+   :align: center
+   :scale: 50
+
+   A comparison of algorithms to select the number of clusters in scikit-learn.
+   The clustering algorithm used is spectral clustering.
+
+.. currentmodule:: sklearn.metrics.cluster
+
+Many clustering algorithms require the number of clusters to be specified. If
+one does not know how many clusters to look for, there exist algorithms that
+estimate the most relevant number of clusters, given the data and the
+clustering algorithm used.
+
+.. _calinski_harabaz_index:
+
+Calinski-Harabaz index
+----------------------
+
+The Calinski-Harabaz index is high when the dispersion between clusters is
+large and the dispersion within clusters is small. Let
+
+- :math:`N` be the number of points in our data :math:`E`,
+- :math:`C_q` be the set of points in cluster :math:`q`,
+- :math:`c_q` be the center of cluster :math:`q`,
+- :math:`c` be the center of :math:`E`,
+- :math:`n_q` be the number of points in cluster :math:`q`.
+
+The Calinski-Harabaz index for data partitioned into :math:`k` clusters, noted
+:math:`CH(k)`, is defined as:
+
+.. math::
+
+  CH(k) = \frac{trace(B_k)}{trace(W_k)} \times \frac{N - k}{k - 1}
+
+with
+
+.. math::
+    W_k = \sum_{q=1}^k \sum_{x \in C_q} (x - c_q) (x - c_q)^T \\
+    B_k = \sum_{q=1}^k n_q (c_q - c) (c_q - c)^T
+
+Advantages
+~~~~~~~~~~
+
+- The score is higher when clusters are dense and well separated, which relates
+  to a standard concept of a cluster.
+
+- Fast computation.
+
+
+Drawbacks
+~~~~~~~~~
+
+- The Calinski-Harabaz index is generally higher for convex clusters than for
+  other concepts of clusters, such as density-based clusters like those
+  obtained through DBSCAN.
+
+.. topic:: References
+
+ * "A dendrite method for cluster analysis"
+   Caliński, T., & Harabasz, J., *Communications in Statistics-theory and Methods*, (1974)
+
+.. _stability:
+
+Stability
+---------
+
+A number of clusters is relevant if the clustering algorithm finds similar
+results under small perturbations of the data. In this implementation, we run
+the clustering algorithm on two large overlapping subsets of the data. If the
+number of clusters is relevant, data in both subsets should be clustered in
+a similar way. Given a similarity measure between two clusterings
+:math:`sim(., .)`, we draw subsamples :math:`E_i` from the initial data
+:math:`E`. For each number of clusters :math:`k=2\dots k_{max}`, we repeat
+:math:`N_{draws}` times:
+
+- Draw two subsets :math:`E_1` and :math:`E_2`.
+
+- Run the clustering algorithm on both subsets. Let :math:`C_1` be the
+  clusters obtained on :math:`E_1` and :math:`C_2` those obtained on
+  :math:`E_2`.
+
+- Compute the similarity :math:`s(C_1, C_2)`.
+
+The chosen number of clusters is the one with the maximum average similarity,
+as illustrated in the short sketch below.
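+
+This procedure is exposed as the ``stability`` function in this module. A
+minimal usage sketch (``nb_draw`` is kept small here to keep the run short;
+the estimate depends on the data and on the clustering algorithm used)::
+
+    from sklearn.datasets import make_blobs
+    from sklearn.cluster import KMeans
+    from sklearn.metrics.cluster.stability import stability
+
+    # three well-separated blobs: the most stable partition should use 3 clusters
+    data, _ = make_blobs(n_samples=300, centers=3, random_state=0)
+
+    # average Fowlkes-Mallows similarity between clusterings of overlapping
+    # subsamples, computed for each k = 2 .. k_max; the k with the highest
+    # average similarity is returned
+    print(stability(data, KMeans(), k_max=6, nb_draw=10, random_state=0))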
+
+Advantages
+~~~~~~~~~~
+
+- Finds a number of clusters that is truly relevant to your data and your
+  clustering algorithm.
+
+- The stability score, ranging from 0 to 1, measures how well your data can be
+  clustered into :math:`k` groups.
+
+
+Drawbacks
+~~~~~~~~~
+
+- High computational cost.
+
+.. topic:: References
+
+ * `"A stability based method for discovering structure in clustered data"
+   `_
+   Ben-Hur, A., Elisseeff, A., & Guyon, I., *Pacific symposium on biocomputing* (2001, December)
+
+ * `"Clustering stability: an overview"
+   `_
+   Von Luxburg, U., (2010)
+
+.. _gap_statistic:
+
+Gap statistic
+-------------
+
+The gap statistic compares the :math:`k` clusters obtained by the selected
+clustering algorithm with :math:`k` clusters obtained by the same algorithm on
+random data. The quality of a clustering is judged by the mean distance of the
+points in each cluster to their cluster center. Given a distance function
+:math:`d(., .)`, we define the inertia of a partition of the data into
+:math:`k` clusters :math:`(C_1, \dots, C_k)` as:
+
+.. math:: W_k = \sum_{r=1}^k \frac{\sum_{x, y \in C_r} d(x, y)}{2|C_r|}
+
+By default, random data is drawn from a uniform distribution with the same
+bounds as :math:`E`. Data can also be drawn from a Gaussian distribution with
+the same mean and variance as :math:`E`. Let :math:`W_k` be the inertia of
+randomly-drawn data partitioned into :math:`k` clusters and :math:`W_k^*` the
+inertia of :math:`E` partitioned into :math:`k` clusters. The gap statistic is
+defined as:
+
+.. math:: Gap(k) = \mathbb{E}\left[\log(W_k)\right] - \log(W_k^*)
+
+If our data contain :math:`K` well-separated clusters, we expect :math:`W_k^*`
+to decrease fast for :math:`k \leq K` and slowly for :math:`k > K`.
+We estimate :math:`\mathbb{E}\left[\log(W_k)\right]` by creating :math:`B`
+random datasets. Let :math:`sd_k` be the standard deviation of
+:math:`\log(W_k)`. We select the smallest :math:`k` such that adding one more
+cluster does not increase the gap by more than one standard error:
+
+.. math:: k^* = \mbox{smallest k such that} \;
+   Gap(k) \geq Gap(k+1) - sd_{k+1}\sqrt{1 + 1/B}
+
+Usage
+~~~~~
+
+Given a dataset and a clustering algorithm (a :class:`ClusterMixin`),
+gap_statistic returns the estimated number of clusters:
+
+    >>> from sklearn.datasets import make_blobs
+    >>> from sklearn.cluster import KMeans
+    >>> from sklearn.metrics.cluster.gap_statistic import gap_statistic
+    >>> data, labels = make_blobs(n_samples=1000, centers=4, random_state=0)
+    >>> kmeans_model = KMeans()
+    >>> gap_statistic(data, kmeans_model, k_max=10)
+    4
+
+.. topic:: References
+
+ * `"Estimating the number of clusters in a data set via the gap statistic"
+   `_
+   Tibshirani, R., Walther, G., & Hastie, T., *Journal of the Royal Statistical Society: Series B* (Statistical Methodology), (2001)
+
+.. _distortion_jump:
+
+Distortion jump
+---------------
+
+The distortion jump aims to maximize efficiency (using the smallest number of
+clusters) while minimizing error by information-theoretic standards (here, the
+error is the variance of the data points within each cluster). The data
+:math:`E` consists of :math:`N` points of :math:`d` dimensions. The average
+distortion is:
+
+.. math:: W_k = \frac{1}{d}\sum_{q=1}^k \sum_{x \in C_q} (x-c_q)^T (x-c_q)
+
+
+with :math:`C_q` the set of points in cluster :math:`q` and
+:math:`c_q` the center of cluster :math:`q`. The chosen number of clusters,
+:math:`k^*`, is the one that maximizes the gain in information. Let
+:math:`y = d / 2`.
+
+.. 
math:: k^* = \arg\min_{k=2\dots k_{max}}W_k^{-y} - W_{k-1}^{-y} + +The choice of the transform power :math:`Y = (d/2)` is motivated by asymptotic reasoning using results from rate distortion theory. + + +.. topic:: References + + * `"Distortion Jump" + `_ + + +Advantages +~~~~~~~~~~ + +- Fast computation + +Drawbacks +~~~~~~~~~ + +- The distortion jump works better for convex clusters than other + concepts of clusters, such as density based clusters like those obtained + through DBSCAN. diff --git a/examples/cluster/plot_choose_nb_cluster.py b/examples/cluster/plot_choose_nb_cluster.py new file mode 100644 index 0000000000000..8246bf05e0c61 --- /dev/null +++ b/examples/cluster/plot_choose_nb_cluster.py @@ -0,0 +1,102 @@ +""" +============================================ +Selecting number of clusters on toy datasets +============================================ + +This example shows several algorithms to choose the number of clusters, +for a particular clustering algorithm on a particular dataset. It mainly +illustrates that some algorithms are faster, some algorithms only understand +convex clusters (first dataset) and some algorithms understand non-convex +clusters (second and third datasets). + +The running times only give intuition of which algorithm is faster. Running time +highly depends on a datasets number of samples and number of features. +""" +print(__doc__) + +import time +from operator import itemgetter + +import matplotlib.pyplot as plt +import numpy as np + +from sklearn.cluster.spectral import SpectralClustering +from sklearn.datasets import make_blobs, make_moons, make_circles +from sklearn.metrics.cluster.calinski_harabaz_index import max_CH_index +from sklearn.metrics.cluster.stability import stability +from sklearn.metrics.cluster.distortion_jump import distortion_jump +from sklearn.metrics.cluster.gap_statistic import gap_statistic +from sklearn.metrics.cluster.unsupervised import silhouette_score +from sklearn.preprocessing import StandardScaler + +n_samples = 1500 +seed = 1 +datasets = [ + make_blobs(n_samples=n_samples, random_state=seed), + make_circles(n_samples=n_samples, factor=.5, noise=.05, shuffle=True, random_state=seed), + make_moons(n_samples=n_samples, noise=.05, shuffle=True, random_state=seed), +] + +cluster_estimator = SpectralClustering(eigen_solver='arpack', affinity="nearest_neighbors") + + +def max_silhouette(X, cluster_estimator, k_max=None): + if not k_max: + k_max = int(X.shape[0] / 2) + silhouettes = [] + for k in range(2, k_max + 1): + cluster_estimator.set_params(n_clusters=k) + labels = cluster_estimator.fit_predict(X) + silhouettes.append((k, silhouette_score(X, labels))) + return max(silhouettes, key=itemgetter(1))[0] + + +colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] +nb_colors = len(colors) + +plt.figure(figsize=(13, 9.5)) +plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96, wspace=.05, + hspace=.01) + +plot_num = 1 +printed_header = False + +for dataset in datasets: + X, true_labels = dataset + # normalize dataset for nicer plotting + X = StandardScaler().fit_transform(X) + + for name, func_choose_nb_cluster in { + 'Silhouette': max_silhouette, + 'Stability': stability, + 'Gap statistic': gap_statistic, + 'Calinski-Harabasz index': max_CH_index, + 'Distortion jump': distortion_jump, + }.items(): + # predict cluster memberships + t0 = time.time() + nb_cluster = func_choose_nb_cluster(X, cluster_estimator, k_max=10) + t1 = time.time() + + # retrieving clustering done + cluster_estimator.set_params(n_clusters=nb_cluster) + 
y_pred = cluster_estimator.fit_predict(X) + + # plot + plt.subplot(3, 5, plot_num) + if not printed_header: + plt.title(name, size=18) + points_color = [colors[y % nb_colors] for y in y_pred] + plt.scatter(X[:, 0], X[:, 1], color=points_color, s=10) + + plt.xlim(-2, 2) + plt.ylim(-2, 2) + plt.xticks(()) + plt.yticks(()) + plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), + transform=plt.gca().transAxes, size=15, + horizontalalignment='right') + plot_num += 1 + printed_header = True + +plt.show() diff --git a/sklearn/metrics/cluster/adjacency_matrix.py b/sklearn/metrics/cluster/adjacency_matrix.py new file mode 100644 index 0000000000000..02482b7f0e9f7 --- /dev/null +++ b/sklearn/metrics/cluster/adjacency_matrix.py @@ -0,0 +1,22 @@ +import numpy as np + + +def adjacency_matrix(cluster_assignement): + """ + Parameter + --------- + cluster_assignement: vector (n_samples) of int i, 0 <= i < k + + Return + ------ + adj_matrix: matrix (n_samples, n_samples) + adji_matrix[i, j] = cluster_assignement[i] == cluster_assignement[j] + """ + n_samples = len(cluster_assignement) + adj_matrix = np.zeros((n_samples, n_samples)) + for i, val in enumerate(cluster_assignement): + for j in range(i, n_samples): + linked = val == cluster_assignement[j] + adj_matrix[i, j] = linked + adj_matrix[j, i] = linked + return adj_matrix diff --git a/sklearn/metrics/cluster/calinski_harabaz_index.py b/sklearn/metrics/cluster/calinski_harabaz_index.py new file mode 100644 index 0000000000000..9aaa5031a08ca --- /dev/null +++ b/sklearn/metrics/cluster/calinski_harabaz_index.py @@ -0,0 +1,96 @@ +from collections import defaultdict + +import numpy as np + + +def calinski_harabaz_index(X, labels): + """ + Compute the Calinski and Harabaz (1974). It a ratio between the + within-cluster dispersion and the between-cluster dispersion + + CH(k) = trace(B_k) / (k -1) * (n - k) / trace(W_k) + + With B_k the between group dispersion matrix, W_k the within-cluster + dispersion matrix + + B_k = \sum_q n_q (c_q - c) (c_q -c)^T + W_k = \sum_q \sum_{x \in C_q} (x - c_q) (x - c_q)^T + + Ref: R.B.Calinsky, J.Harabasz: A dendrite method for cluster analysis 1974 + + Parameter + --------- + X: numpy array of size (nb_data, nb_feature) + labels: list of int of length nb_data: labels[i] is the cluster + assigned to X[i, :] + + Return + ------ + res: float: mean silhouette of this clustering + """ + assi = defaultdict(list) + for i, l in enumerate(labels): + assi[l].append(i) + + nb_data, nb_feature = X.shape + disp_intra = np.zeros((nb_feature, nb_feature)) + disp_extra = np.zeros((nb_feature, nb_feature)) + center = np.mean(X, axis=0) + + for points in assi.values(): + clu_points = X[points, :] + # unbiaised estimate of variace is \sum (x - mean_x)^2 / (n - 1) + # so, if I want sum of dispersion, I need + # W_k = cov(X) * (n - 1) + nb_point = clu_points.shape[0] + disp_intra += np.cov(clu_points, rowvar=0) * (nb_point - 1) + extra_var = (np.mean(clu_points, axis=0) - center).reshape( + (nb_feature, 1)) + disp_extra += np.multiply(extra_var, extra_var.transpose()) * nb_point + return (disp_extra.trace() * (nb_data - len(assi)) / + (disp_intra.trace() * (len(assi) - 1))) + + +def calc_calinski_harabaz(X, cluster_estimator, n_clusters): + """ + Compute calinski harabaz for clusters made by cluster estimator + + Parameter + --------- + X numpy array of size (nb_data, nb_feature) + cluster_estimator: ClusterMixing estimator object. 
+ need parameter n_clusters + need method fit_predict: X -> labels + n_clusters: number of clusters + + """ + cluster_estimator.set_params(n_clusters=n_clusters) + return calinski_harabaz_index(X, cluster_estimator.fit_predict(X)) + + +def max_CH_index(X, cluster_estimator, k_max=None): + """ + Select number of cluster maximizing the Calinski and Harabasz (1974). + It a ratio between the within-cluster dispersion and the between-cluster + dispersion + + Ref: R.B.Calinsky, J.Harabasz: A dendrite method for cluster analysis 1974 + + Parameters + ---------- + X: numpy array of shape (nb_date, nb_features) + cluster_estimator: ClusterMixing estimator object. + need parameter n_clusters + need method fit_predict: X -> labels + k_max: int: maximum number of clusters + + Return + ------ + k_star: int: optimal number of cluster + """ + # if no maximum number of clusters set, take datasize divided by 2 + if not k_max: + k_max = X.shape[0] // 2 + + return max((k for k in range(2, k_max + 1)), + key=lambda k: calc_calinski_harabaz(X, cluster_estimator, k)) diff --git a/sklearn/metrics/cluster/distortion.py b/sklearn/metrics/cluster/distortion.py new file mode 100644 index 0000000000000..a257ea5dc932b --- /dev/null +++ b/sklearn/metrics/cluster/distortion.py @@ -0,0 +1,79 @@ +from collections import defaultdict + +import numpy as np +from scipy.spatial.distance import cdist + + +def distortion(X, labels, distortion_meth='sqeuclidean', p=2): + """ + Given data and their cluster assigment, compute the distortion D + + Parameter + --------- + X: numpy array of shape (nb_data, nb_feature) + labels: list of int of length nb_data + distortion_meth: can be a function X, labels -> float, + can be a string naming a scipy.spatial distance. can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return + ------ + distortion: float + """ + if isinstance(distortion_meth, str): + return distortion_metrics(X, labels, distortion_meth, p) + else: + return distortion_meth(X, labels) + + +def distortion_metrics(X, labels, metric='sqeuclidean', p=2): + """ + Given data and their cluster assigment, compute the distortion D + + D = \sum_{x \in X} distance(x, c_x) + + With c_x the center of the cluster containing x, distance is the distance + defined by metrics + + Parameter + --------- + X: numpy array of shape (nb_data, nb_feature) + labels: list of int of length nb_data + metric: string naming a scipy.spatial distance. 
metric can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'wminkowski', 'canberra'] + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return + ------ + distortion: float + """ + if metric == 'l2': + # Translate to something understood by scipy + metric = 'euclidean' + elif metric in ('l1', 'manhattan'): + metric = 'cityblock' + + assi = defaultdict(list) + for i, l in enumerate(labels): + assi[l].append(i) + + distance_sum = .0 + nb_feature = X.shape[1] + for points in assi.values(): + clu_points = X[points, :] + clu_center = np.mean(clu_points, axis=0).reshape(1, nb_feature) + distance_sum += np.sum(cdist( + clu_points, clu_center, metric=metric, p=p)) + + return distance_sum / X.shape[1] diff --git a/sklearn/metrics/cluster/distortion_jump.py b/sklearn/metrics/cluster/distortion_jump.py new file mode 100644 index 0000000000000..af8d0c1cce3b1 --- /dev/null +++ b/sklearn/metrics/cluster/distortion_jump.py @@ -0,0 +1,60 @@ +from math import pow + +from .distortion import distortion + +import numpy as np + + +def distortion_jump(X, cluster_estimator, k_max=None, + distortion_meth='sqeuclidean', p=2): + """ + Find the number of clusters that maximizes efficiency while minimizing + error by information theoretic standards (wikipedia). For each number of + cluster, it calculates the distortion reduction. Roughly, it selects k such + as the difference between distortion with k clusters minus distortion with + k-1 clusters is maximal. + + More precisely, let d(k) equals distortion with k clusters. + Let Y=nb_feature/2, let D[k] = d(k)^{-Y} + k^* = argmax(D[k] - D[k-1]) + + Parameters + ---------- + X: numpy array of shape (nb_date, nb_features) + cluster_estimator: ClusterMixing estimator object. + need parameter n_clusters + need method fit_predict: X -> labels + k_max: int: maximum number of clusters + distortion_meth: can be a function X, labels -> float, + can be a string naming a scipy.spatial distance. 
can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return + ------ + k_star: int: optimal number of cluster + """ + nb_data, nb_feature = X.shape + # if no maximum number of clusters set, take datasize divided by 2 + if not k_max: + k_max = nb_data // 2 + + Y = - nb_feature / 2 + info_gain = 0 + old_dist = pow( + distortion(X, np.zeros(nb_data), distortion_meth, p) / nb_feature, Y) + for k in range(2, k_max + 1): + cluster_estimator.set_params(n_clusters=k) + labs = cluster_estimator.fit_predict(X) + new_dist = pow( + distortion(X, labs, distortion_meth, p) / nb_feature, Y) + if new_dist - old_dist >= info_gain: + k_star = k + info_gain = new_dist - old_dist + old_dist = new_dist + return k_star diff --git a/sklearn/metrics/cluster/fowlkes_mallows.py b/sklearn/metrics/cluster/fowlkes_mallows.py new file mode 100644 index 0000000000000..44ce8d25b20f5 --- /dev/null +++ b/sklearn/metrics/cluster/fowlkes_mallows.py @@ -0,0 +1,37 @@ +from __future__ import division + +from .adjacency_matrix import adjacency_matrix + +from scipy.spatial.distance import cosine + + +def fowlkes_mallows_index(clustering_1, clustering_2): + """ + Mesure the similarity of two clusterings of a set of points. + Let: + - TP be the number of pair of points (x_i, x_j) that belongs + in the same clusters in both clustering_1 and clustering_2 + - FP be the number of pair of points (x_i, x_j) that belongs + in the same clusters in clustering_1 and not in clustering_2 + - FN be the number of pair of points (x_i, x_j) that belongs + in the same clusters in clustering_2 and not in clustering_1 + + The Fowlkes-Mallows index has the following formula: + + fowlkes_mallows_index = TP / sqrt((TP + FP) * (TP + FN)) + + Parameter + --------- + clustering_1: list of int. + "clustering_1[i] = c" means that point i is assigned to cluster c + clustering_2: list of int. + "clustering_2[i] = c" means that point i is assigned to cluster c + + Return + ------ + fowlkes_mallows_index: float between 0 and 1. 1 means that both + clusterings perfectly match, 0 means that they totally disconnect + """ + adj_mat_1 = adjacency_matrix(clustering_1) + adj_mat_2 = adjacency_matrix(clustering_2) + return 1 - cosine(adj_mat_1.flatten(), adj_mat_2.flatten()) diff --git a/sklearn/metrics/cluster/gap_statistic.py b/sklearn/metrics/cluster/gap_statistic.py new file mode 100644 index 0000000000000..180da48f9a675 --- /dev/null +++ b/sklearn/metrics/cluster/gap_statistic.py @@ -0,0 +1,238 @@ +from __future__ import division +from collections import defaultdict + +from math import sqrt, log + +import numpy as np + +from scipy.spatial.distance import cdist + +from ...utils import check_random_state + + +def inertia(X, labels, metric='sqeuclidean', p=2): + """ + Given data and their cluster assigment, compute the sum of + within-cluster mean distance to cluster's mean + + Parameter + --------- + X: numpy array of shape (nb_data, nb_feature) + labels: list of int of length nb_data + metric: a string naming a scipy.spatial distance. 
can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return + ------ + distortion: float + """ + if metric == 'l2': + # Translate to something understood by scipy + metric = 'euclidean' + elif metric in ('l1', 'manhattan'): + metric = 'cityblock' + + assi = defaultdict(list) + for i, l in enumerate(labels): + assi[l].append(i) + + inertia = .0 + nb_feature = X.shape[1] + for points in assi.values(): + clu_points = X[points, :] + clu_center = np.mean(clu_points, axis=0).reshape(1, nb_feature) + inertia += (np.sum(cdist(clu_points, clu_center, metric=metric, p=p)) / + (2 * len(clu_points))) + return inertia + + +def normal_inertia(X, cluster_estimator, nb_draw=100, + metric='sqeuclidean', p=2, random_state=None, + mu=None, sigma=None): + """ + Draw multivariate normal data of size data_shape = (nb_data, nb_feature), + with same mean and covariance as X. + Clusterize data using cluster_estimator and compute inertia + + Parameter + --------- + X numpy array of size (nb_data, nb_feature) + cluster_estimator: ClusterMixing estimator object. + need parameter n_clusters + need method fit_predict: X -> labels + nb_draw: number of samples to calculate expected_inertia + metric: a string naming a scipy.spatial distance. can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + mu: mean of drawn data + sigma: covariance matrix of drawn data + + Return + ------ + dist: list of inertias (float) obtained on random dataset + """ + rng = check_random_state(random_state) + nb_data, nb_feature = X.shape + + if mu is None: + # data mean has no influence on distortion + mu = np.zeros(nb_feature) + if sigma is None: + sigma = np.cov(X.transpose()) + + dist = [] + for i in range(nb_draw): + X_rand = rng.multivariate_normal(mu, sigma, size=nb_data) + dist.append(inertia( + X_rand, cluster_estimator.fit_predict(X_rand), + metric, p)) + + return dist + + +def uniform_inertia(X, cluster_estimator, nb_draw=100, val_min=None, + val_max=None, metric='sqeuclidean', p=2, + random_state=None): + """ + Uniformly draw data of size data_shape = (nb_data, nb_feature) + in the smallest hyperrectangle containing real data X. + Clusterize data using cluster_estimator and compute inertia + + Parameter + --------- + X: numpy array of shape (nb_data, nb_feature) + cluster_estimator: ClusterMixing estimator object. + need parameter n_clusters + need method fit_predict: X -> labels + nb_draw: number of samples to calculate expected_inertia + val_min: minimum values of each dimension of input data + array of length nb_feature + val_max: maximum values of each dimension of input data + array of length nb_feature + metric: a string naming a scipy.spatial distance. 
can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return + ------ + dist: list of distortions (float) obtained on random dataset + """ + rng = check_random_state(random_state) + if val_min is None: + val_min = np.min(X, axis=0) + if val_max is None: + val_max = np.max(X, axis=0) + + dist = [] + for i in range(nb_draw): + X_rand = rng.uniform(size=X.shape) * (val_max - val_min) + val_min + dist.append(inertia(X_rand, cluster_estimator.fit_predict(X_rand), + metric, p)) + + return dist + + +def gap_statistic(X, cluster_estimator, k_max=None, nb_draw=100, + random_state=None, draw_model='uniform', + metric='sqeuclidean', p=2): + """ + Estimating optimal number of cluster for data X with cluster_estimator by + comparing inertia of clustered real data with inertia of clustered + random data. Let W_rand(k) be the inertia of random data in k clusters, + W_real(k) inertia of real data in k clusters, statistic gap is defined + as + + Gap(k) = E(log(W_rand(k))) - log(W_real(k)) + + We draw nb_draw random data "shapened-like X" (shape depend on draw_model) + We select the smallest k such as the gap between inertia of k clusters + of random data and k clusters of real data is superior to the gap with + k + 1 clusters minus a "standard-error" safety. Precisely: + + k_star = min_k k + s.t. Gap(k) >= Gap(k + 1) - s(k + 1) + s(k) = stdev(log(W_rand)) * sqrt(1 + 1 / nb_draw) + + From R.Tibshirani, G. Walther and T.Hastie, Estimating the number of + clusters in a dataset via the Gap statistic, Journal of the Royal + Statistical Socciety: Seris (B) (Statistical Methodology), 63(2), 411-423 + + Parameter + --------- + X: data. array nb_data * nb_feature + cluster_estimator: ClusterMixing estimator object. + need parameter n_clusters + nb_draw: int: number of random data of shape (nb_data, nb_feature) drawn + to estimate E(log(D_rand(k))) + draw_model: under which i.i.d data are draw. default: uniform data + (following Tibshirani et al.) + can be 'uniform', 'normal' (Gaussian distribution) + metric: a string naming a scipy.spatial distance. 
can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return + ------ + k: int: number of cluster that maximizes the gap statistic + """ + rng = check_random_state(random_state) + + # if no maximum number of clusters set, take datasize divided by 2 + if not k_max: + k_max = X.shape[0] // 2 + if draw_model == 'uniform': + val_min = np.min(X, axis=0) + val_max = np.max(X, axis=0) + elif draw_model == 'normal': + mu = np.mean(X, axis=0) + sigma = np.cov(X.transpose()) + + old_gap = - float("inf") + for k in range(2, k_max + 2): + cluster_estimator.set_params(n_clusters=k) + real_dist = inertia(X, cluster_estimator.fit_predict(X), + metric, p) + # expected distortion + if draw_model == 'uniform': + rand_dist = uniform_inertia(X, cluster_estimator, nb_draw, + val_min, val_max, metric, + p) + elif draw_model == 'normal': + rand_dist = normal_inertia(X, cluster_estimator, nb_draw, + metric=metric, + p=p, mu=mu, sigma=sigma) + else: + raise ValueError( + "For gap statistic, model for random data is unknown") + rand_dist = np.log(rand_dist) + exp_dist = np.mean(rand_dist) + std_dist = np.std(rand_dist) + gap = exp_dist - log(real_dist) + safety = std_dist * sqrt(1 + 1 / nb_draw) + if k > 2 and old_gap >= gap - safety: + return k - 1 + old_gap = gap + # if k was found, the function would have returned + # no clusters were found -> only 1 cluster + return 1 diff --git a/sklearn/metrics/cluster/stability.py b/sklearn/metrics/cluster/stability.py new file mode 100644 index 0000000000000..1bd1de4b6d3eb --- /dev/null +++ b/sklearn/metrics/cluster/stability.py @@ -0,0 +1,157 @@ +from __future__ import division + +from math import sqrt + +import numpy as np +from .adjacency_matrix import adjacency_matrix +from .fowlkes_mallows import fowlkes_mallows_index +from scipy.spatial.distance import pdist + +from ...utils import check_random_state + + +def stability(X, cluster_estimator, k_max=None, nb_draw=100, prop_subset=.8, + random_state=None, p=None, distance='fowlkes-mallows', + verbose=False): + """Stability algorithm. + For k from 2 to k_max, compute stability of cluster estimator to produce k + clusters. Stability measures if the estimator produces the same clusters + given small variations in the input data. It draws two overlapping subsets + A and B of input data. For points in the two subsets, we compute the + clustering C_A and C_B done on subsets A and B. We then compute the + similarity of those clustering. We can use the opposite of a distance + as a similarity + + The stability of cluster_estimator with k cluster is the expectation of + similarity(C_A, C_B) + + Ref: Ben-Hur, Elisseeff, Guyon: a stability based method for discovering + structure in clusterd data, 2002 + Overview of stability: Luxburg: clustering stability: an overview + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + The observations to cluster. + cluster_estimator: ClusterMixing estimator object. 
+ need parameter n_clusters + need method fit_predict: X -> labels + k_max: int: maximum number of clusters (default = n_samples / 2) + nb_draw: number of draws to estimate expectation of expectation of + similarity(C_A, C_B) + prop_subset: 0 < float < 1: proportion of input data taken in each subset + distance: a string naming a distance or a cluster similarity. can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski', 'fowlkes-mallows']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + Return + ------ + k: int + """ + rng = check_random_state(random_state) + cluster_similarity = function_cluster_similarity(distance, p) + + n_samples, n_features = X.shape + if not k_max: + k_max = n_samples // 2 + + best_stab, best_k = 0, 0 + for k in range(2, k_max + 1): + cluster_estimator.set_params(n_clusters=k) + this_score = sum( + _one_stability_measure(cluster_estimator, X, prop_subset, + cluster_similarity) + for _ in range(nb_draw)) / nb_draw + if verbose: + print('for %d cluster, stability is %f' % (k, this_score)) + + if this_score >= best_stab: + best_stab = this_score + best_k = k + + return best_k + + +def _one_stability_measure(cluster_estimator, X, prop_sample, + cluster_similarity, random_state=None): + """ + Draws two subsets A and B from X, compute C_A, clustering on subset + A, and C_B, clustering on subset B, then returns + + similarity(C_A, C_B) + + Parameter + --------- + X: array of size n_samples, n_features + cluster_estimator: ClusterMixing estimator object. + need parameter n_clusters + need method fit_predict: X -> labels + prop_sample: 0 < float < 1, proportion of X taken in each subset + cluster_similarity: function (list, list) -> float + """ + rng = check_random_state(random_state) + + n_sample = X.shape[0] + set_1 = rng.uniform(size=n_sample) < prop_sample + set_2 = rng.uniform(size=n_sample) < prop_sample + nb_points_1, nb_points_2 = 0, 0 + points_1, points_2 = [], [] + common_points_1, common_points_2 = [], [] + for i, (is_1, is_2) in enumerate(zip(set_1, set_2)): + if is_1 and is_2: + common_points_1.append(nb_points_1) + common_points_2.append(nb_points_2) + if is_1: + points_1.append(i) + nb_points_1 += 1 + if is_2: + points_2.append(i) + nb_points_2 += 1 + + assi_1 = cluster_estimator.fit_predict(X[np.ix_(points_1)]) + assi_2 = cluster_estimator.fit_predict(X[np.ix_(points_2)]) + + clustering_1 = [assi_1[c] for c in common_points_1] + clustering_2 = [assi_2[c] for c in common_points_2] + return cluster_similarity(clustering_1, clustering_2) + + +def function_cluster_similarity(metric='fowlkes-mallows', p=None): + """ + Given the name of a distance, return function to estimate + two clusterings similarity + + Parameter + -------- + metric: a string naming a distance or a cluster similarity. can be in + ['euclidian', 'minkowski', 'seuclidiean', 'sqeuclidean', 'chebyshev' + 'cityblock', 'cosine', 'correlation', 'hamming', 'jaccard', + 'Bray-Curtis', 'mahalanobis', 'yule', 'matching', 'dice', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', + 'canberra', 'wminkowski', 'fowlkes-mallows']) + p : double + The p-norm to apply (for Minkowski, weighted and unweighted) + + Return: + function (clustering_1, clustering_2) -> similarity; with: + clustering_k: a list. 
clustering_k[i] = c means that + point x_i belongs to cluster c in clustering k + similarity: float + """ + if metric == 'fowlkes-mallows': + return fowlkes_mallows_index + if metric == 'l2': + # Translate to something understood by scipy + metric = 'euclidean' + elif metric in ('l1', 'manhattan'): + metric = 'cityblock' + + def cluster_dist(clustering_1, clustering_2): + adj_mat_1 = adjacency_matrix(clustering_1).flatten() + adj_mat_2 = adjacency_matrix(clustering_2).flatten() + return -pdist([adj_mat_1, adj_mat_2], metric=metric, p=p) + return cluster_dist diff --git a/sklearn/metrics/cluster/tests/test_adjacency.py b/sklearn/metrics/cluster/tests/test_adjacency.py new file mode 100644 index 0000000000000..cac4f350eb9e7 --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_adjacency.py @@ -0,0 +1,12 @@ +import numpy as np + +from sklearn.metrics.cluster.adjacency_matrix import adjacency_matrix +from sklearn.utils.testing import assert_array_equal + + +def test_adjacency_matrix(): + assi = [0, 0, 1, 1] + adj_matrix = np.array([[1, 1, 0, 0], [1, 1, 0, 0], + [0, 0, 1, 1], [0, 0, 1, 1]]) + found_adj = adjacency_matrix(assi) + assert_array_equal(found_adj, adj_matrix) diff --git a/sklearn/metrics/cluster/tests/test_calinski_harabaz_index.py b/sklearn/metrics/cluster/tests/test_calinski_harabaz_index.py new file mode 100644 index 0000000000000..027aae36782ab --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_calinski_harabaz_index.py @@ -0,0 +1,21 @@ +import numpy as np + +from sklearn.utils.testing import (assert_almost_equal, assert_equal) + +from sklearn.cluster.k_means_ import KMeans +from sklearn.metrics.cluster.calinski_harabaz_index import (calinski_harabaz_index, + max_CH_index) +from sklearn.datasets import make_blobs + + +def test_calinsk_harabaz_index(): + X = np.array([[0, 0]] * 50 + [[1, 1]] * 50 + + [[3, 3]] * 50 + [[4, 4]] * 50) + labels = [0] * 100 + [1] * 100 + assert_almost_equal(calinski_harabaz_index(X, labels), 4.5 * (200 - 2) / .5) + + +def test_Calinsk_Harabasz(): + X, _ = make_blobs(90, centers=np.array([[-2, -2], [2, 0], [-2, 2]]), random_state=0) + cluster_estimator = KMeans() + assert_equal(max_CH_index(X, cluster_estimator, k_max=6), 3) diff --git a/sklearn/metrics/cluster/tests/test_distortion.py b/sklearn/metrics/cluster/tests/test_distortion.py new file mode 100644 index 0000000000000..c38db57f69220 --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_distortion.py @@ -0,0 +1,12 @@ +import numpy as np + +from sklearn.utils.testing import assert_almost_equal + +from sklearn.metrics.cluster.distortion import distortion + + +def test_distortion(): + X = np.array([[0, 0], [2, 2], + [5, 5], [6, 6]]) + labels = [0, 0, 1, 1] + assert_almost_equal(distortion(X, labels), 2.5) diff --git a/sklearn/metrics/cluster/tests/test_distortion_jump.py b/sklearn/metrics/cluster/tests/test_distortion_jump.py new file mode 100644 index 0000000000000..d7e7abf154fe1 --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_distortion_jump.py @@ -0,0 +1,14 @@ +import numpy as np + +from sklearn.utils.testing import (assert_almost_equal, assert_equal) + +from sklearn.cluster.k_means_ import KMeans +from sklearn.metrics.cluster.distortion_jump import distortion_jump +from sklearn.datasets import make_blobs + + +def test_distortion_jump(): + X, _ = make_blobs(90, centers=np.array([[-2, -2], [2, 0], [-2, 2]]), + random_state=0) + cluster_estimator = KMeans() + assert_equal(distortion_jump(X, cluster_estimator, k_max=6), 3) diff --git 
a/sklearn/metrics/cluster/tests/test_fowlkes_mallows.py b/sklearn/metrics/cluster/tests/test_fowlkes_mallows.py new file mode 100644 index 0000000000000..56dc7495938ba --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_fowlkes_mallows.py @@ -0,0 +1,14 @@ +from __future__ import division +from math import sqrt + +from sklearn.utils.testing import assert_almost_equal + +from sklearn.metrics.cluster.fowlkes_mallows import fowlkes_mallows_index + + +def test_fowlkes_mallows_index(): + clustering_1 = [0, 0, 0, 1, 1, 1] + clustering_2 = [0, 0, 1, 1, 2, 2] + assert_almost_equal( + fowlkes_mallows_index(clustering_1, clustering_2), + 10 / sqrt(18 * 12)) diff --git a/sklearn/metrics/cluster/tests/test_gap.py b/sklearn/metrics/cluster/tests/test_gap.py new file mode 100644 index 0000000000000..3435dc09e3d42 --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_gap.py @@ -0,0 +1,34 @@ +import numpy as np + +from sklearn.utils.testing import assert_true, assert_equal + +from sklearn.cluster.k_means_ import KMeans +from sklearn.metrics.cluster.gap_statistic import (normal_inertia, + gap_statistic) +from sklearn.datasets import make_blobs + + +def test_normal_inertia(): + class BogusCluster(object): + def fit_predict(self, points): + n = len(points) + mid = n / 2 + return [int(i < mid) for i in range(n)] + var_1_data = np.asarray([[0, 0]] * 100 + [[2, 2]] * 100) + mean_dist = np.mean(normal_inertia( + var_1_data, BogusCluster(), nb_draw=10, random_state=0)) + # Expected mean dist is 2. + # After 100 tries, it should be between 1.80 and 2.2 + assert_true(mean_dist > 1.8) + assert_true(mean_dist < 2.2) + + +def test_gap_statistic(): + # for j in [20 * i: 20 * (i+1)[, x[j] = [rand rand] + [4 * i, 4 * i] + X, _ = make_blobs(90, centers=np.array([[-2, -2], [2, 0], [-2, 2]]), + random_state=0) + cluster_estimator = KMeans() + assert_equal(gap_statistic(X, cluster_estimator, k_max=6, nb_draw=10, + random_state=0, draw_model='normal'), 3) + assert_equal(gap_statistic(X, cluster_estimator, k_max=6, nb_draw=10, + random_state=0, metric='cityblock'), 3) diff --git a/sklearn/metrics/cluster/tests/test_stability.py b/sklearn/metrics/cluster/tests/test_stability.py new file mode 100644 index 0000000000000..6e08603fccec3 --- /dev/null +++ b/sklearn/metrics/cluster/tests/test_stability.py @@ -0,0 +1,46 @@ +import numpy as np +from math import sqrt + +from sklearn.utils.testing import assert_almost_equal, assert_equal + +from sklearn.cluster.k_means_ import KMeans +from sklearn.metrics.cluster.stability import ( + _one_stability_measure, stability, function_cluster_similarity, + fowlkes_mallows_index) +from sklearn.datasets import make_blobs + + +def test_one_stability_measure(): + X = np.arange(10) < 5 + X.reshape((10, 1)) + + # test perfect clustering has 1 stability + class SameCluster(object): + def set_params(self, *args, **kwargs): + pass + + def fit_predict(self, X): + return X + same_clusters = SameCluster() + assert_almost_equal( + _one_stability_measure(same_clusters, X, .8, fowlkes_mallows_index), 1) + + +def test_stability(): + X, _ = make_blobs(90, centers=np.array([[-2, -2], [2, 0], [-2, 2]]), + random_state=0) + cluster_estimator = KMeans() + assert_equal(stability(X, cluster_estimator, k_max=6, + nb_draw=10, random_state=0), 3) + + +def test_function_cluster_dist(): + clustering_1 = [0, 0, 0, 1, 1, 1] + clustering_2 = [0, 0, 1, 1, 2, 2] + dist_fun = function_cluster_similarity(metric='fowlkes-mallows') + assert_almost_equal(dist_fun(clustering_1, clustering_2), + 10 / sqrt(18 * 12)) + dist_fun = 
function_cluster_similarity(metric='cityblock') + assert_almost_equal(dist_fun(clustering_1, clustering_2), -10) + dist_fun = function_cluster_similarity(metric='euclidean') + assert_almost_equal(dist_fun(clustering_1, clustering_2), -sqrt(10))