Use sparse contingency matrix for supervised cluster metrics by stuppie · Pull Request #7203 · scikit-learn/scikit-learn · GitHub

Closed · wants to merge 4 commits
196 changes: 141 additions & 55 deletions sklearn/metrics/cluster/supervised.py
@@ -9,13 +9,15 @@
# Diego Molla <dmolla-aliod@gmail.com>
# Arnaud Fouchet <foucheta@gmail.com>
# Thierry Guillemot <thierry.guillemot.work@gmail.com>
+# Gregory Stupp <stuppie@gmail.com>
# License: BSD 3 clause

from math import log

-from scipy.misc import comb
-from scipy.sparse import coo_matrix
import numpy as np
+from scipy.misc import comb
+from scipy.sparse import coo_matrix, find
+from scipy.sparse.data import _data_matrix

from .expected_mutual_info_fast import expected_mutual_information
from ...utils.fixes import bincount
@@ -46,7 +48,8 @@ def check_clusterings(labels_true, labels_pred):
return labels_true, labels_pred


-def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
+def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000,
+                       sparse=False):
"""Build a contingency matrix describing the relationship between labels.

Parameters
@@ -57,33 +60,42 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
labels_pred : array, shape = [n_samples]
Cluster labels to evaluate

-eps: None or float
+eps: None or float, optional.
If a float, that value is added to all values in the contingency
matrix. This helps to stop NaN propagation.
If ``None``, nothing is adjusted.

max_n_classes : int, optional (default=5000)
Maximal number of classes handled for contingency_matrix.
This helps to avoid a MemoryError with regression targets
-for mutual_information.
+for mutual_information. If ``sparse is True``,
+``max_n_classes`` is ignored.

+sparse: boolean, optional.
+    If True, return a sparse contingency matrix. If ``eps is not None``
+    and ``sparse is True``, a ValueError is raised.

Returns
-------
-contingency: array, shape=[n_classes_true, n_classes_pred]
+contingency: {array-like, sparse}, shape=[n_classes_true, n_classes_pred]
Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in
true class :math:`i` and in predicted class :math:`j`. If
``eps is None``, the dtype of this array will be integer. If ``eps`` is
-given, the dtype will be float.
+given, the dtype will be float. Will be sparse if ``sparse is True``.
"""

+if eps is not None and sparse:
+    raise ValueError("Cannot set 'eps' and return a sparse matrix")

classes, class_idx = np.unique(labels_true, return_inverse=True)
clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)
n_classes = classes.shape[0]
n_clusters = clusters.shape[0]
-if n_classes > max_n_classes:
+if not sparse and (n_classes > max_n_classes):
raise ValueError("Too many classes for a clustering metric. If you "
"want to increase the limit, pass parameter "
"max_n_classes to the scoring function")
-if n_clusters > max_n_classes:
+if not sparse and (n_clusters > max_n_classes):
raise ValueError("Too many clusters for a clustering metric. If you "
"want to increase the limit, pass parameter "
"max_n_classes to the scoring function")
@@ -93,7 +105,9 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000,
contingency = coo_matrix((np.ones(class_idx.shape[0]),
(class_idx, cluster_idx)),
shape=(n_classes, n_clusters),
-dtype=np.int).toarray()
+dtype=np.int)
+if not sparse:
+    contingency = contingency.toarray()
if eps is not None:
# don't use += as contingency is integer
contingency = contingency + eps
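For concreteness, here is a minimal standalone sketch of what this hunk builds (an illustration, not part of the diff; `np.int64` stands in for the patch's `np.int`, which newer NumPy versions removed). Each sample contributes an entry of weight 1 at its `(true class, predicted cluster)` coordinate, and COO duplicates are summed when the matrix is densified:

```python
import numpy as np
from scipy.sparse import coo_matrix

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([0, 0, 1, 2, 2, 2])

# Map arbitrary label values to contiguous integer indices
classes, class_idx = np.unique(labels_true, return_inverse=True)
clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)

# One weight-1 entry per sample; duplicate (i, j) coordinates are
# summed when the COO matrix is converted or densified
contingency = coo_matrix((np.ones(class_idx.shape[0]),
                          (class_idx, cluster_idx)),
                         shape=(classes.shape[0], clusters.shape[0]),
                         dtype=np.int64)
print(contingency.toarray())
# [[2 0 0]
#  [0 1 1]
#  [0 0 2]]
```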
@@ -102,7 +116,8 @@

# clustering measures

-def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
+def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
+                        contingency=None):
"""Rand index adjusted for chance.

The Rand Index computes a similarity measure between two clusterings
@@ -139,6 +154,11 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
metric. Setting it too high can lead to MemoryError or OS
freeze

+contingency: {None, sparse matrix}, shape = [n_classes_true, n_classes_pred]
+    A contingency matrix given by the :func:`contingency_matrix` function.
+    If value is ``None``, it will be computed, otherwise the given value is
+    used, with ``labels_true`` and ``labels_pred`` ignored.

Returns
-------
ari : float
@@ -188,33 +208,54 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
adjusted_mutual_info_score: Adjusted Mutual Information

"""
-labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
-n_samples = labels_true.shape[0]
-classes = np.unique(labels_true)
-clusters = np.unique(labels_pred)
+if contingency is None:
+    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
+    n_samples = labels_true.shape[0]
+    n_classes = np.unique(labels_true).shape[0]
+    n_clusters = np.unique(labels_pred).shape[0]
+elif isinstance(contingency, _data_matrix):
+    n_samples = contingency.nnz
+    n_classes, n_clusters = contingency.shape
+else:
+    raise ValueError("'contingency' must be a sparse matrix or None")
Member: Use check_array to get the data into an appropriate format.

Contributor Author (stuppie): I don't understand. sklearn.utils.check_array? I didn't change the logic of this at all.

# Special limit cases: no clustering since the data is not split;
# or trivial clustering where each document is assigned a unique cluster.
# These are perfect matches hence return 1.0.
-if (classes.shape[0] == clusters.shape[0] == 1 or
-        classes.shape[0] == clusters.shape[0] == 0 or
-        classes.shape[0] == clusters.shape[0] == len(labels_true)):
+if (n_classes == n_clusters == 1 or
+        n_classes == n_clusters == 0 or
+        n_classes == n_clusters == n_samples):
return 1.0

-contingency = contingency_matrix(labels_true, labels_pred,
-                                 max_n_classes=max_n_classes)
+# Compute contingency matrix if we weren't given it
+if contingency is None:
+    contingency = contingency_matrix(labels_true, labels_pred,
+                                     max_n_classes=max_n_classes)
ogrisel (Member) commented Sep 14, 2016:

This still builds a dense contingency matrix by default. Maybe we should have an internal heuristic: if n_classes is small we use a dense array (because it's faster to compute), and if it's larger than some threshold we switch to sparse.

Or we could decide to always build sparse contingency matrices internally to keep the code simpler, and expect that the difference in speed will never exceed a couple of seconds for small to medium inputs, while very large inputs will only be addressable via sparse contingency matrices anyway.

Member: I'm benchmarking now; will post a WIP to supersede this one.
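The tradeoff described above is easy to probe with a rough timing sketch (an illustration, not from the thread; it assumes a build of this branch so that `sparse=True` is accepted):

```python
import numpy as np
from timeit import timeit
from sklearn.metrics.cluster import contingency_matrix

rng = np.random.RandomState(0)
n_samples, n_labels = 100000, 2000
labels_true = rng.randint(n_labels, size=n_samples)
labels_pred = rng.randint(n_labels, size=n_samples)

# Dense: materializes an n_labels x n_labels array however empty it is
t_dense = timeit(lambda: contingency_matrix(labels_true, labels_pred),
                 number=3)
# Sparse: stores only the occupied (class, cluster) cells
t_sparse = timeit(lambda: contingency_matrix(labels_true, labels_pred,
                                             sparse=True), number=3)
print(t_dense, t_sparse)
```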


# Compute the ARI using the contingency data
-sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
-sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))
+if isinstance(contingency, np.ndarray):
+    # For an array
+    sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
+    sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))
+    sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
+elif isinstance(contingency, _data_matrix):
+    # For a sparse matrix
+    sum_comb_c = sum(
+        comb2(n_c) for n_c in np.array(contingency.sum(axis=1)))
+    sum_comb_k = sum(
+        comb2(n_k) for n_k in np.array(contingency.sum(axis=0)).T)
+    sum_comb = sum(comb2(n_ij) for n_ij in find(contingency)[2])
Member: If it's a _data_matrix, surely contingency.data.flatten() will do without the more expensive find operation?

stuppie (Contributor Author) commented Aug 18, 2016:

If it's a coo_matrix, data contains only 1s:

In [121]: C.toarray()
Out[121]:
array([[5, 1, 0],
       [1, 4, 1],
       [0, 2, 3]])

In [122]: C.data
Out[122]: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

I can do:

In [123]: C.tocsr().data
Out[123]: array([5, 1, 1, 4, 1, 2, 3], dtype=int64)

?
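The COO behavior at issue here: duplicate coordinates are stored as separate entries until a conversion coalesces them, which is exactly why `contingency.data` is all ones for the per-sample construction. A small illustration (not from the thread):

```python
import numpy as np
from scipy.sparse import coo_matrix

# Two samples fall into the same (class 0, cluster 0) cell
C = coo_matrix((np.ones(3), ([0, 0, 1], [0, 0, 1])),
               shape=(2, 2), dtype=np.int64)
print(C.data)          # [1 1 1] -- duplicates still separate
print(C.tocsr().data)  # [2 1]   -- CSR conversion sums duplicates
C.sum_duplicates()     # or coalesce the COO matrix in place
print(C.data)          # [2 1]
```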

+else:
+    raise ValueError(
+        "Unsupported type for 'contingency': " + str(type(contingency)))

-sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2))
mean_comb = (sum_comb_k + sum_comb_c) / 2.
-return ((sum_comb - prod_comb) / (mean_comb - prod_comb))
+return float((sum_comb - prod_comb) / (mean_comb - prod_comb))
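With the signature proposed here, a caller can build the contingency matrix once and reuse it across metrics. A usage sketch against this branch (the `contingency=` keyword exists only in this proposal, so treat it as illustrative):

```python
import numpy as np
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.cluster import contingency_matrix

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([0, 0, 1, 2, 2, 2])

# Build the sparse contingency matrix once...
C = contingency_matrix(labels_true, labels_pred, sparse=True)
# ...then hand it to the metric; the labels are ignored when
# 'contingency' is given
print(adjusted_rand_score(labels_true, labels_pred, contingency=C))
```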


def homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                       max_n_classes=5000):
+                                       max_n_classes=5000, sparse=False):
"""Compute the homogeneity and completeness and V-Measure scores at once.

Those metrics are based on normalized conditional entropy measures of
@@ -253,6 +294,10 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
metric. Setting it too high can lead to MemoryError or OS
freeze

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

Returns
-------
homogeneity: float
@@ -278,8 +323,12 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
entropy_C = entropy(labels_true)
entropy_K = entropy(labels_pred)

-MI = mutual_info_score(labels_true, labels_pred,
-                       max_n_classes=max_n_classes)
+if sparse:
+    contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
+    MI = mutual_info_score(None, None, contingency=contingency)
+else:
+    MI = mutual_info_score(labels_true, labels_pred,
+                           max_n_classes=max_n_classes)

homogeneity = MI / (entropy_C) if entropy_C else 1.0
completeness = MI / (entropy_K) if entropy_K else 1.0
@@ -293,7 +342,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
return homogeneity, completeness, v_measure_score
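On this branch, the three scores can then be computed over a sparse intermediate by passing the flag. A usage sketch (`sparse=` is this PR's proposed keyword, not released API):

```python
import numpy as np
from sklearn.metrics import homogeneity_completeness_v_measure

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([0, 0, 1, 2, 2, 2])

# The contingency matrix behind MI is built sparse; labels stay as-is
h, c, v = homogeneity_completeness_v_measure(labels_true, labels_pred,
                                             sparse=True)
print(h, c, v)
```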


-def homogeneity_score(labels_true, labels_pred, max_n_classes=5000):
+def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
+                      sparse=False):
"""Homogeneity metric of a cluster labeling given a ground truth.

A clustering result satisfies homogeneity if all of its clusters
@@ -317,6 +367,10 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
labels_pred : array, shape = [n_samples]
cluster labels to evaluate

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

max_n_classes: int, optional (default=5000)
Maximal number of classes handled by the adjusted_rand_score
metric. Setting it too high can lead to MemoryError or OS
@@ -369,11 +423,13 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
0.0...

"""
-return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                          max_n_classes)[0]
+return \
+    homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse,
+                                       max_n_classes=max_n_classes)[0]


-def completeness_score(labels_true, labels_pred, max_n_classes=5000):
+def completeness_score(labels_true, labels_pred, max_n_classes=5000,
+                       sparse=False):
"""Completeness metric of a cluster labeling given a ground truth.

A clustering result satisfies completeness if all the data points
@@ -397,6 +453,10 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000,
labels_pred : array, shape = [n_samples]
cluster labels to evaluate

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

max_n_classes: int, optional (default=5000)
Maximal number of classes handled by the adjusted_rand_score
metric. Setting it too high can lead to MemoryError or OS
@@ -445,11 +505,12 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000,
0.0

"""
-return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                          max_n_classes)[1]
+return \
+    homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse,
+                                       max_n_classes=max_n_classes)[1]


-def v_measure_score(labels_true, labels_pred, max_n_classes=5000):
+def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False):
"""V-measure cluster labeling given a ground truth.

This score is identical to :func:`normalized_mutual_info_score`.
@@ -477,6 +538,10 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False):
labels_pred : array, shape = [n_samples]
cluster labels to evaluate

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

max_n_classes: int, optional (default=5000)
Maximal number of classes handled by the adjusted_rand_score
metric. Setting it too high can lead to MemoryError or OS
@@ -547,7 +612,8 @@

"""
return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                          max_n_classes)[2]
+                                          max_n_classes=max_n_classes,
+                                          sparse=sparse)[2]


def mutual_info_score(labels_true, labels_pred, contingency=None,
@@ -586,7 +652,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None,
labels_pred : array, shape = [n_samples]
A clustering of the data into disjoint subsets.

-contingency: None or array, shape = [n_classes_true, n_classes_pred]
+contingency: {None, array, sparse matrix},
+    shape = [n_classes_true, n_classes_pred]
A contingency matrix given by the :func:`contingency_matrix` function.
If value is ``None``, it will be computed, otherwise the given value is
used, with ``labels_true`` and ``labels_pred`` ignored.
@@ -610,22 +677,41 @@
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
contingency = contingency_matrix(labels_true, labels_pred,
max_n_classes=max_n_classes)
-contingency = np.array(contingency, dtype='float')
-contingency_sum = np.sum(contingency)
-pi = np.sum(contingency, axis=1)
-pj = np.sum(contingency, axis=0)
-outer = np.outer(pi, pj)
-nnz = contingency != 0.0
-# normalized contingency
-contingency_nm = contingency[nnz]
-log_contingency_nm = np.log(contingency_nm)
-contingency_nm /= contingency_sum
-# log(a / b) should be calculated as log(a) - log(b) to avoid
-# possible loss of precision
-log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
-mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
-      contingency_nm * log_outer)
-return mi.sum()
+if isinstance(contingency, np.ndarray):
+    # For an array
+    contingency = np.array(contingency, dtype='float')
+    contingency_sum = np.sum(contingency)
+    pi = np.sum(contingency, axis=1)
+    pj = np.sum(contingency, axis=0)
+    outer = np.outer(pi, pj)
+    nnz = contingency != 0.0
+    # normalized contingency
+    contingency_nm = contingency[nnz]
+    log_contingency_nm = np.log(contingency_nm)
+    contingency_nm /= contingency_sum
+    # log(a / b) should be calculated as log(a) - log(b) to avoid
+    # possible loss of precision
+    log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
+    mi = (contingency_nm * (log_contingency_nm - log(contingency_sum))
+          + contingency_nm * log_outer)
+    return mi.sum()
+elif isinstance(contingency, _data_matrix):
+    # For a sparse matrix
+    contingency_sum = contingency.sum()
+    pi = np.array(contingency.sum(axis=1))
+    pj = np.array(contingency.sum(axis=0)).T
+    nnzx, nnzy, nnz_val = find(contingency)
+    log_contingency_nm = np.log(nnz_val)
+    contingency_nm = nnz_val * 1.0 / contingency_sum
+    # No need to compute the full outer product; only the entries at
+    # the non-zero coordinates are used
+    outer = np.array([pi[x] * pj[y] for x, y in zip(nnzx, nnzy)]).T
+    log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())
+    mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
+          contingency_nm * log_outer)
+    return mi.sum()
+else:
+    raise ValueError(
+        "Unsupported type for 'contingency': " + str(type(contingency)))


def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
@@ -714,7 +800,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
# Special limit cases: no clustering since the data is not split.
# This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or
-    classes.shape[0] == clusters.shape[0] == 0):
+        classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred,
max_n_classes=max_n_classes)
@@ -801,7 +887,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
# Special limit cases: no clustering since the data is not split.
# This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or
-    classes.shape[0] == clusters.shape[0] == 0):
+        classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred,
max_n_classes=max_n_classes)
@@ -883,7 +969,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000):
.. [2] `Wikipedia entry for the Fowlkes-Mallows Index
<https://en.wikipedia.org/wiki/Fowlkes-Mallows_index>`_
"""
-labels_true, labels_pred = check_clusterings(labels_true, labels_pred,)
+labels_true, labels_pred = check_clusterings(labels_true, labels_pred, )
n_samples, = labels_true.shape

c = contingency_matrix(labels_true, labels_pred,