Use sparse contingency matrix for supervised cluster metrics #7203
@@ -9,13 +9,15 @@
 # Diego Molla <dmolla-aliod@gmail.com>
 # Arnaud Fouchet <foucheta@gmail.com>
 # Thierry Guillemot <thierry.guillemot.work@gmail.com>
+# Gregory Stupp <stuppie@gmail.com>
 # License: BSD 3 clause

 from math import log

-from scipy.misc import comb
-from scipy.sparse import coo_matrix
+import numpy as np
+from scipy.misc import comb
+from scipy.sparse import coo_matrix, find
+from scipy.sparse.data import _data_matrix

 from .expected_mutual_info_fast import expected_mutual_information
 from ...utils.fixes import bincount
@@ -46,7 +48,8 @@ def check_clusterings(labels_true, labels_pred):
     return labels_true, labels_pred


-def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
+def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000,
+                       sparse=False):
     """Build a contingency matrix describing the relationship between labels.

     Parameters
@@ -57,33 +60,42 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
     labels_pred : array, shape = [n_samples]
         Cluster labels to evaluate

-    eps : None or float
+    eps : None or float, optional
         If a float, that value is added to all values in the contingency
         matrix. This helps to stop NaN propagation.
         If ``None``, nothing is adjusted.

     max_n_classes : int, optional (default=5000)
         Maximal number of classes handled for contingency_matrix.
         This helps to avoid MemoryError with regression targets
-        for mutual_information.
+        for mutual_information. If ``sparse is True``,
+        ``max_n_classes`` is ignored.
+
+    sparse : boolean, optional
+        If True, return a sparse contingency matrix. If ``eps is not None``
+        and ``sparse is True``, a ValueError is raised.

     Returns
     -------
-    contingency : array, shape=[n_classes_true, n_classes_pred]
+    contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred]
         Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in
         true class :math:`i` and in predicted class :math:`j`. If
         ``eps is None``, the dtype of this array will be integer. If ``eps`` is
-        given, the dtype will be float.
+        given, the dtype will be float. Will be sparse if ``sparse is True``.
     """
+    if eps is not None and sparse:
+        raise ValueError("Cannot set 'eps' and return a sparse matrix")
+
     classes, class_idx = np.unique(labels_true, return_inverse=True)
     clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)
     n_classes = classes.shape[0]
     n_clusters = clusters.shape[0]
-    if n_classes > max_n_classes:
+    if not sparse and (n_classes > max_n_classes):
         raise ValueError("Too many classes for a clustering metric. If you "
                          "want to increase the limit, pass parameter "
                          "max_n_classes to the scoring function")
-    if n_clusters > max_n_classes:
+    if not sparse and (n_clusters > max_n_classes):
         raise ValueError("Too many clusters for a clustering metric. If you "
                          "want to increase the limit, pass parameter "
                          "max_n_classes to the scoring function")
@@ -93,7 +105,9 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000): | |
contingency = coo_matrix((np.ones(class_idx.shape[0]), | ||
(class_idx, cluster_idx)), | ||
shape=(n_classes, n_clusters), | ||
dtype=np.int).toarray() | ||
dtype=np.int) | ||
if not sparse: | ||
contingency = contingency.toarray() | ||
if eps is not None: | ||
# don't use += as contingency is integer | ||
contingency = contingency + eps | ||
|
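A minimal usage sketch of the new ``sparse`` flag (made-up labels; assumes this branch is installed):

    import numpy as np
    from sklearn.metrics.cluster import contingency_matrix

    labels_true = np.array([0, 0, 1, 1, 2, 2])  # hypothetical ground truth
    labels_pred = np.array([0, 0, 1, 2, 2, 2])  # hypothetical clustering

    # Default path is unchanged: a dense ndarray of co-occurrence counts
    dense = contingency_matrix(labels_true, labels_pred)

    # New path: the COO matrix is returned without calling .toarray(),
    # so very large numbers of classes/clusters no longer exhaust memory
    C = contingency_matrix(labels_true, labels_pred, sparse=True)
    assert (C.toarray() == dense).all()  # duplicates are summed on toarray()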
@@ -102,7 +116,8 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):

 # clustering measures

-def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
+def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
+                        contingency=None):
     """Rand index adjusted for chance.

     The Rand Index computes a similarity measure between two clusterings
@@ -139,6 +154,11 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
         metric. Setting it too high can lead to MemoryError or OS
         freeze

+    contingency : {None, sparse matrix}, shape = [n_classes_true, n_classes_pred]
+        A contingency matrix given by the :func:`contingency_matrix` function.
+        If value is ``None``, it will be computed, otherwise the given value is
+        used, with ``labels_true`` and ``labels_pred`` ignored.

     Returns
     -------
     ari : float
@@ -188,33 +208,54 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
     adjusted_mutual_info_score: Adjusted Mutual Information

     """
-    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
-    n_samples = labels_true.shape[0]
-    classes = np.unique(labels_true)
-    clusters = np.unique(labels_pred)
+    if contingency is None:
+        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
+        n_samples = labels_true.shape[0]
+        n_classes = np.unique(labels_true).shape[0]
+        n_clusters = np.unique(labels_pred).shape[0]
+    elif isinstance(contingency, _data_matrix):
+        n_samples = contingency.nnz
+        n_classes, n_clusters = contingency.shape
+    else:
+        raise ValueError("'contingency' must be a sparse matrix or None")

     # Special limit cases: no clustering since the data is not split;
     # or trivial clustering where each document is assigned a unique cluster.
     # These are perfect matches hence return 1.0.
-    if (classes.shape[0] == clusters.shape[0] == 1 or
-            classes.shape[0] == clusters.shape[0] == 0 or
-            classes.shape[0] == clusters.shape[0] == len(labels_true)):
+    if (n_classes == n_clusters == 1 or
+            n_classes == n_clusters == 0 or
+            n_classes == n_clusters == n_samples):
         return 1.0

-    contingency = contingency_matrix(labels_true, labels_pred,
-                                     max_n_classes=max_n_classes)
+    # Compute contingency matrix if we weren't given it
+    if contingency is None:
+        contingency = contingency_matrix(labels_true, labels_pred,
+                                         max_n_classes=max_n_classes)
[Review comment] This still builds a dense contingency matrix by default. Maybe we should have an internal heuristic: if n_classes is small we use a dense array (because it's faster to compute), and if it's larger than some threshold we switch to sparse. Or we could decide to always build sparse contingency matrices internally to keep the code simpler, and expect that a difference in speed will never exceed a couple of seconds for small to medium inputs, while very large inputs will only be addressable via sparse contingency matrices anyway.

[Reply] I'm benchmarking now. Will post a WIP to supersede this one.

     # Compute the ARI using the contingency data
-    sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
-    sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))
+    if isinstance(contingency, np.ndarray):
+        # For an array
+        sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
+        sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))
+        sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
+    elif isinstance(contingency, _data_matrix):
+        # For a sparse matrix
+        sum_comb_c = sum(
+            comb2(n_c) for n_c in np.array(contingency.sum(axis=1)))
+        sum_comb_k = sum(
+            comb2(n_k) for n_k in np.array(contingency.sum(axis=0)).T)
+        sum_comb = sum(comb2(n_ij) for n_ij in find(contingency)[2])
[Review comment] If it's a … [rest of comment not captured]

[Reply] If it's a coo_matrix, data contains only 1s. I can do: […]?
+    else:
+        raise ValueError(
+            "Unsupported type for 'contingency': " + str(type(contingency)))

-    sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
     prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2))
     mean_comb = (sum_comb_k + sum_comb_c) / 2.
-    return ((sum_comb - prod_comb) / (mean_comb - prod_comb))
+    return float((sum_comb - prod_comb) / (mean_comb - prod_comb))
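The intended call pattern for ARI with a precomputed contingency matrix looks like this (a sketch with made-up labels; assumes this branch, where only a sparse matrix or None is accepted for ``contingency``):

    import numpy as np
    from sklearn.metrics import adjusted_rand_score
    from sklearn.metrics.cluster import contingency_matrix

    labels_true = np.array([0, 0, 1, 1, 2, 2])
    labels_pred = np.array([0, 0, 1, 2, 2, 2])

    # Build the sparse contingency matrix once...
    C = contingency_matrix(labels_true, labels_pred, sparse=True)

    # ...then reuse it; the label arguments are ignored when
    # 'contingency' is given, so None is fine
    ari_sparse = adjusted_rand_score(None, None, contingency=C)
    ari_dense = adjusted_rand_score(labels_true, labels_pred)
    assert np.isclose(ari_sparse, ari_dense)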

 def homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                       max_n_classes=5000):
+                                       max_n_classes=5000, sparse=False):
     """Compute the homogeneity and completeness and V-Measure scores at once.

     Those metrics are based on normalized conditional entropy measures of
@@ -253,6 +294,10 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
         metric. Setting it too high can lead to MemoryError or OS
         freeze

+    sparse : boolean, optional
+        If True, the contingency matrix computed internally
+        will be a sparse matrix.

     Returns
     -------
     homogeneity : float
@@ -278,8 +323,12 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
     entropy_C = entropy(labels_true)
     entropy_K = entropy(labels_pred)

-    MI = mutual_info_score(labels_true, labels_pred,
-                           max_n_classes=max_n_classes)
+    if sparse:
+        contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
+        MI = mutual_info_score(None, None, contingency=contingency)
+    else:
+        MI = mutual_info_score(labels_true, labels_pred,
+                               max_n_classes=max_n_classes)

     homogeneity = MI / (entropy_C) if entropy_C else 1.0
     completeness = MI / (entropy_K) if entropy_K else 1.0
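Both branches compute the same MI, so the sparse path should reproduce the dense scores exactly; a quick sanity-check sketch (made-up labels; assumes this branch):

    import numpy as np
    from sklearn.metrics import homogeneity_completeness_v_measure

    labels_true = np.array([0, 0, 1, 1, 2, 2])
    labels_pred = np.array([0, 0, 1, 1, 1, 2])

    # h = MI / H(C), c = MI / H(K), v = 2 * h * c / (h + c)
    dense_scores = homogeneity_completeness_v_measure(labels_true, labels_pred)
    sparse_scores = homogeneity_completeness_v_measure(labels_true, labels_pred,
                                                       sparse=True)
    assert np.allclose(dense_scores, sparse_scores)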
@@ -293,7 +342,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
     return homogeneity, completeness, v_measure_score


-def homogeneity_score(labels_true, labels_pred, max_n_classes=5000):
+def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
+                      sparse=False):
     """Homogeneity metric of a cluster labeling given a ground truth.

     A clustering result satisfies homogeneity if all of its clusters
@@ -317,6 +367,10 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000):
     labels_pred : array, shape = [n_samples]
         cluster labels to evaluate

+    sparse : boolean, optional
+        If True, the contingency matrix computed internally
+        will be a sparse matrix.

     max_n_classes : int, optional (default=5000)
         Maximal number of classes handled by the adjusted_rand_score
         metric. Setting it too high can lead to MemoryError or OS
@@ -369,11 +423,13 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000):
     0.0...

     """
-    return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                              max_n_classes)[0]
+    return \
+        homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse,
+                                           max_n_classes=max_n_classes)[0]

-def completeness_score(labels_true, labels_pred, max_n_classes=5000):
+def completeness_score(labels_true, labels_pred, max_n_classes=5000,
+                       sparse=False):
     """Completeness metric of a cluster labeling given a ground truth.

     A clustering result satisfies completeness if all the data points
@@ -397,6 +453,10 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000):
     labels_pred : array, shape = [n_samples]
         cluster labels to evaluate

+    sparse : boolean, optional
+        If True, the contingency matrix computed internally
+        will be a sparse matrix.

     max_n_classes : int, optional (default=5000)
         Maximal number of classes handled by the adjusted_rand_score
         metric. Setting it too high can lead to MemoryError or OS
@@ -445,11 +505,12 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000):
     0.0

     """
-    return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                              max_n_classes)[1]
+    return \
+        homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse,
+                                           max_n_classes=max_n_classes)[1]
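The wrapper scores just forward the flag to homogeneity_completeness_v_measure; for example (made-up labels; assumes this branch):

    import numpy as np
    from sklearn.metrics import completeness_score, homogeneity_score

    labels_true = np.array([0, 0, 1, 1])
    labels_pred = np.array([0, 1, 1, 1])

    # sparse=True only changes how the intermediate contingency matrix
    # is built, not the returned values
    h = homogeneity_score(labels_true, labels_pred, sparse=True)
    # homogeneity of (a, b) equals completeness of (b, a)
    assert np.isclose(h, completeness_score(labels_pred, labels_true,
                                            sparse=True))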

-def v_measure_score(labels_true, labels_pred, max_n_classes=5000):
+def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False):
     """V-measure cluster labeling given a ground truth.

     This score is identical to :func:`normalized_mutual_info_score`.
@@ -477,6 +538,10 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000):
     labels_pred : array, shape = [n_samples]
         cluster labels to evaluate

+    sparse : boolean, optional
+        If True, the contingency matrix computed internally
+        will be a sparse matrix.

     max_n_classes : int, optional (default=5000)
         Maximal number of classes handled by the adjusted_rand_score
         metric. Setting it too high can lead to MemoryError or OS
@@ -547,7 +612,8 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000):

     """
     return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                              max_n_classes)[2]
+                                              max_n_classes=max_n_classes,
+                                              sparse=sparse)[2]


 def mutual_info_score(labels_true, labels_pred, contingency=None,
@@ -586,7 +652,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None,
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.

-    contingency : None or array, shape = [n_classes_true, n_classes_pred]
+    contingency : {None, array, sparse matrix},
+                  shape = [n_classes_true, n_classes_pred]
         A contingency matrix given by the :func:`contingency_matrix` function.
         If value is ``None``, it will be computed, otherwise the given value is
         used, with ``labels_true`` and ``labels_pred`` ignored.
@@ -610,22 +677,41 @@ def mutual_info_score(labels_true, labels_pred, contingency=None,
         labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
         contingency = contingency_matrix(labels_true, labels_pred,
                                          max_n_classes=max_n_classes)
-    contingency = np.array(contingency, dtype='float')
-    contingency_sum = np.sum(contingency)
-    pi = np.sum(contingency, axis=1)
-    pj = np.sum(contingency, axis=0)
-    outer = np.outer(pi, pj)
-    nnz = contingency != 0.0
-    # normalized contingency
-    contingency_nm = contingency[nnz]
-    log_contingency_nm = np.log(contingency_nm)
-    contingency_nm /= contingency_sum
-    # log(a / b) should be calculated as log(a) - log(b) for
-    # possible loss of precision
-    log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
-    mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
-          contingency_nm * log_outer)
-    return mi.sum()
+    if isinstance(contingency, np.ndarray):
+        # For an array
+        contingency = np.array(contingency, dtype='float')
+        contingency_sum = np.sum(contingency)
+        pi = np.sum(contingency, axis=1)
+        pj = np.sum(contingency, axis=0)
+        outer = np.outer(pi, pj)
+        nnz = contingency != 0.0
+        # normalized contingency
+        contingency_nm = contingency[nnz]
+        log_contingency_nm = np.log(contingency_nm)
+        contingency_nm /= contingency_sum
+        # log(a / b) should be calculated as log(a) - log(b) for
+        # possible loss of precision
+        log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
+        mi = (contingency_nm * (log_contingency_nm - log(contingency_sum))
+              + contingency_nm * log_outer)
+        return mi.sum()
+    elif isinstance(contingency, _data_matrix):
+        # For a sparse matrix
+        contingency_sum = contingency.sum()
+        pi = np.array(contingency.sum(axis=1))
+        pj = np.array(contingency.sum(axis=0)).T
+        nnzx, nnzy, nnz_val = find(contingency)
+        log_contingency_nm = np.log(nnz_val)
+        contingency_nm = nnz_val * 1.0 / contingency_sum
+        # No need to compute the full outer product; only the values
+        # at the non-zero positions are needed
+        outer = np.array([pi[x] * pj[y] for x, y in zip(nnzx, nnzy)]).T
+        log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())
+        mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
+              contingency_nm * log_outer)
+        return mi.sum()
+    else:
+        raise ValueError(
+            "Unsupported type for 'contingency': " + str(type(contingency)))
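To see the sparse branch agree with the dense one, a check sketch using a hand-built COO contingency whose data are all 1s, as discussed in the review thread (assumes this branch):

    import numpy as np
    from scipy.sparse import coo_matrix
    from sklearn.metrics import mutual_info_score

    labels_true = np.array([0, 0, 1, 1, 2, 2])
    labels_pred = np.array([0, 0, 1, 2, 2, 2])

    # Dense path, computed from the labels
    mi_dense = mutual_info_score(labels_true, labels_pred)

    # Sparse path: duplicate (i, j) pairs are implicitly summed into
    # counts by find() and sum(), so unit data entries are enough
    C = coo_matrix((np.ones(len(labels_true)),
                    (labels_true, labels_pred)))
    mi_sparse = mutual_info_score(None, None, contingency=C)
    assert np.isclose(mi_dense, mi_sparse)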

 def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):

@@ -714,7 +800,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     # Special limit cases: no clustering since the data is not split.
     # This is a perfect match hence return 1.0.
     if (classes.shape[0] == clusters.shape[0] == 1 or
-        classes.shape[0] == clusters.shape[0] == 0):
+            classes.shape[0] == clusters.shape[0] == 0):
         return 1.0
     contingency = contingency_matrix(labels_true, labels_pred,
                                      max_n_classes=max_n_classes)
@@ -801,7 +887,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
     # Special limit cases: no clustering since the data is not split.
     # This is a perfect match hence return 1.0.
     if (classes.shape[0] == clusters.shape[0] == 1 or
-        classes.shape[0] == clusters.shape[0] == 0):
+            classes.shape[0] == clusters.shape[0] == 0):
         return 1.0
     contingency = contingency_matrix(labels_true, labels_pred,
                                      max_n_classes=max_n_classes)
@@ -883,7 +969,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000):
     .. [2] `Wikipedia entry for the Fowlkes-Mallows Index
            <https://en.wikipedia.org/wiki/Fowlkes-Mallows_index>`_
     """
-    labels_true, labels_pred = check_clusterings(labels_true, labels_pred,)
+    labels_true, labels_pred = check_clusterings(labels_true, labels_pred, )
     n_samples, = labels_true.shape

     c = contingency_matrix(labels_true, labels_pred,
[Review comment] Use check_array to get the data into an appropriate format.

[Reply] I don't understand. sklearn.utils.check_array? I didn't change the logic of this at all.