Use sparse contingency matrix for supervised cluster metrics by stuppie · Pull Request #7203 · scikit-learn/scikit-learn · GitHub

Closed · wants to merge 4 commits
196 changes: 141 additions & 55 deletions sklearn/metrics/cluster/supervised.py
@@ -9,13 +9,15 @@
# Diego Molla <dmolla-aliod@gmail.com>
# Arnaud Fouchet <foucheta@gmail.com>
# Thierry Guillemot <thierry.guillemot.work@gmail.com>
+# Gregory Stupp <stuppie@gmail.com>
# License: BSD 3 clause

from math import log

-from scipy.misc import comb
-from scipy.sparse import coo_matrix
import numpy as np
+from scipy.misc import comb
+from scipy.sparse import coo_matrix, find
+from scipy.sparse.data import _data_matrix

from .expected_mutual_info_fast import expected_mutual_information
from ...utils.fixes import bincount
@@ -46,7 +48,8 @@ def check_clusterings(labels_true, labels_pred):
return labels_true, labels_pred


-def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
+def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000,
+                       sparse=False):
"""Build a contingency matrix describing the relationship between labels.

Parameters
@@ -57,33 +60,42 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
labels_pred : array, shape = [n_samples]
Cluster labels to evaluate

-eps: None or float
+eps: None or float, optional.
If a float, that value is added to all values in the contingency
matrix. This helps to stop NaN propagation.
If ``None``, nothing is adjusted.

max_n_classes : int, optional (default=5000)
Maximal number of classes handled for contingency_matrix.
This helps to avoid a MemoryError with regression targets
-for mutual_information.
+for mutual_information. If ``sparse is True``,
+``max_n_classes`` is ignored.

+sparse: boolean, optional.
+    If True, return a sparse contingency matrix. If ``eps is not None``
+    and ``sparse is True``, a ValueError is raised.

Returns
-------
-contingency: array, shape=[n_classes_true, n_classes_pred]
+contingency: {array-like, sparse}, shape=[n_classes_true, n_classes_pred]
Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in
true class :math:`i` and in predicted class :math:`j`. If
``eps is None``, the dtype of this array will be integer. If ``eps`` is
-given, the dtype will be float.
+given, the dtype will be float. Will be sparse if ``sparse is True``.
"""

+if eps is not None and sparse:
+    raise ValueError("Cannot set 'eps' and return a sparse matrix")

classes, class_idx = np.unique(labels_true, return_inverse=True)
clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)
n_classes = classes.shape[0]
n_clusters = clusters.shape[0]
-if n_classes > max_n_classes:
+if not sparse and (n_classes > max_n_classes):
raise ValueError("Too many classes for a clustering metric. If you "
"want to increase the limit, pass parameter "
"max_n_classes to the scoring function")
-if n_clusters > max_n_classes:
+if not sparse and (n_clusters > max_n_classes):
raise ValueError("Too many clusters for a clustering metric. If you "
"want to increase the limit, pass parameter "
"max_n_classes to the scoring function")
@@ -93,7 +105,9 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000,
contingency = coo_matrix((np.ones(class_idx.shape[0]),
(class_idx, cluster_idx)),
shape=(n_classes, n_clusters),
-dtype=np.int).toarray()
+dtype=np.int)
+if not sparse:
+    contingency = contingency.toarray()
if eps is not None:
# don't use += as contingency is integer
contingency = contingency + eps
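For concreteness, here is a minimal standalone sketch of what this hunk builds (an illustration, not part of the diff; `np.int64` stands in for the patch's `np.int`, which newer NumPy versions removed). Each sample contributes an entry of weight 1 at its `(true class, predicted cluster)` coordinate, and COO duplicates are summed when the matrix is densified:

```python
import numpy as np
from scipy.sparse import coo_matrix

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([0, 0, 1, 2, 2, 2])

# Map arbitrary label values to contiguous integer indices
classes, class_idx = np.unique(labels_true, return_inverse=True)
clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)

# One weight-1 entry per sample; duplicate (i, j) coordinates are
# summed when the COO matrix is converted or densified
contingency = coo_matrix((np.ones(class_idx.shape[0]),
                          (class_idx, cluster_idx)),
                         shape=(classes.shape[0], clusters.shape[0]),
                         dtype=np.int64)
print(contingency.toarray())
# [[2 0 0]
#  [0 1 1]
#  [0 0 2]]
```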
@@ -102,7 +116,8 @@

# clustering measures

-def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
+def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
+                        contingency=None):
"""Rand index adjusted for chance.

The Rand Index computes a similarity measure between two clusterings
@@ -139,6 +154,11 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
metric. Setting it too high can lead to MemoryError or OS
freeze

+contingency: {None, sparse matrix}, shape = [n_classes_true, n_classes_pred]
+    A contingency matrix given by the :func:`contingency_matrix` function.
+    If value is ``None``, it will be computed, otherwise the given value is
+    used, with ``labels_true`` and ``labels_pred`` ignored.

Returns
-------
ari : float
@@ -188,33 +208,54 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000,
adjusted_mutual_info_score: Adjusted Mutual Information

"""
-labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
-n_samples = labels_true.shape[0]
-classes = np.unique(labels_true)
-clusters = np.unique(labels_pred)
+if contingency is None:
+    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
+    n_samples = labels_true.shape[0]
+    n_classes = np.unique(labels_true).shape[0]
+    n_clusters = np.unique(labels_pred).shape[0]
+elif isinstance(contingency, _data_matrix):
+    n_samples = contingency.nnz
+    n_classes, n_clusters = contingency.shape
+else:
+    raise ValueError("'contingency' must be a sparse matrix or None")
Member: Use check_array to get the data into an appropriate format.

Contributor Author (stuppie): I don't understand. sklearn.utils.check_array? I didn't change the logic of this at all.

# Special limit cases: no clustering since the data is not split;
# or trivial clustering where each document is assigned a unique cluster.
# These are perfect matches hence return 1.0.
-if (classes.shape[0] == clusters.shape[0] == 1 or
-        classes.shape[0] == clusters.shape[0] == 0 or
-        classes.shape[0] == clusters.shape[0] == len(labels_true)):
+if (n_classes == n_clusters == 1 or
+        n_classes == n_clusters == 0 or
+        n_classes == n_clusters == n_samples):
return 1.0

-contingency = contingency_matrix(labels_true, labels_pred,
-                                 max_n_classes=max_n_classes)
+# Compute contingency matrix if we weren't given it
+if contingency is None:
+    contingency = contingency_matrix(labels_true, labels_pred,
+                                     max_n_classes=max_n_classes)
ogrisel (Member) commented Sep 14, 2016:

This still builds a dense contingency matrix by default. Maybe we should have an internal heuristic: if n_classes is small we use a dense array (because it's faster to compute), and if it's larger than some threshold we switch to sparse.

Or we could decide to always build sparse contingency matrices internally to keep the code simpler, and expect that the difference in speed will never exceed a couple of seconds for small to medium inputs, while very large inputs will only be addressable via sparse contingency matrices anyway.

Member: I'm benchmarking now; will post a WIP to supersede this one.
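The tradeoff described above is easy to probe with a rough timing sketch (an illustration, not from the thread; it assumes a build of this branch so that `sparse=True` is accepted):

```python
import numpy as np
from timeit import timeit
from sklearn.metrics.cluster import contingency_matrix

rng = np.random.RandomState(0)
n_samples, n_labels = 100000, 2000
labels_true = rng.randint(n_labels, size=n_samples)
labels_pred = rng.randint(n_labels, size=n_samples)

# Dense: materializes an n_labels x n_labels array however empty it is
t_dense = timeit(lambda: contingency_matrix(labels_true, labels_pred),
                 number=3)
# Sparse: stores only the occupied (class, cluster) cells
t_sparse = timeit(lambda: contingency_matrix(labels_true, labels_pred,
                                             sparse=True), number=3)
print(t_dense, t_sparse)
```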


# Compute the ARI using the contingency data
-sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
-sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))
+if isinstance(contingency, np.ndarray):
+    # For an array
+    sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
+    sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))
+    sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
+elif isinstance(contingency, _data_matrix):
+    # For a sparse matrix
+    sum_comb_c = sum(
+        comb2(n_c) for n_c in np.array(contingency.sum(axis=1)))
+    sum_comb_k = sum(
+        comb2(n_k) for n_k in np.array(contingency.sum(axis=0)).T)
+    sum_comb = sum(comb2(n_ij) for n_ij in find(contingency)[2])
Member: If it's a _data_matrix, surely contingency.data.flatten() will do without the more expensive find operation?

stuppie (Contributor Author) commented Aug 18, 2016:

If it's a coo_matrix, data contains only 1s:

In [121]: C.toarray()
Out[121]:
array([[5, 1, 0],
       [1, 4, 1],
       [0, 2, 3]])

In [122]: C.data
Out[122]: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

I can do:

In [123]: C.tocsr().data
Out[123]: array([5, 1, 1, 4, 1, 2, 3], dtype=int64)

?
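The COO behavior at issue here: duplicate coordinates are stored as separate entries until a conversion coalesces them, which is exactly why `contingency.data` is all ones for the per-sample construction. A small illustration (not from the thread):

```python
import numpy as np
from scipy.sparse import coo_matrix

# Two samples fall into the same (class 0, cluster 0) cell
C = coo_matrix((np.ones(3), ([0, 0, 1], [0, 0, 1])),
               shape=(2, 2), dtype=np.int64)
print(C.data)          # [1 1 1] -- duplicates still separate
print(C.tocsr().data)  # [2 1]   -- CSR conversion sums duplicates
C.sum_duplicates()     # or coalesce the COO matrix in place
print(C.data)          # [2 1]
```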

+else:
+    raise ValueError(
+        "Unsupported type for 'contingency': " + str(type(contingency)))

-sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2))
mean_comb = (sum_comb_k + sum_comb_c) / 2.
-return ((sum_comb - prod_comb) / (mean_comb - prod_comb))
+return float((sum_comb - prod_comb) / (mean_comb - prod_comb))
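With the signature proposed here, a caller can build the contingency matrix once and reuse it across metrics. A usage sketch against this branch (the `contingency=` keyword exists only in this proposal, so treat it as illustrative):

```python
import numpy as np
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.cluster import contingency_matrix

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([0, 0, 1, 2, 2, 2])

# Build the sparse contingency matrix once...
C = contingency_matrix(labels_true, labels_pred, sparse=True)
# ...then hand it to the metric; the labels are ignored when
# 'contingency' is given
print(adjusted_rand_score(labels_true, labels_pred, contingency=C))
```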


def homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                       max_n_classes=5000):
+                                       max_n_classes=5000, sparse=False):
"""Compute the homogeneity and completeness and V-Measure scores at once.

Those metrics are based on normalized conditional entropy measures of
@@ -253,6 +294,10 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
metric. Setting it too high can lead to MemoryError or OS
freeze

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

Returns
-------
homogeneity: float
@@ -278,8 +323,12 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
entropy_C = entropy(labels_true)
entropy_K = entropy(labels_pred)

-MI = mutual_info_score(labels_true, labels_pred,
-                       max_n_classes=max_n_classes)
+if sparse:
+    contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
+    MI = mutual_info_score(None, None, contingency=contingency)
+else:
+    MI = mutual_info_score(labels_true, labels_pred,
+                           max_n_classes=max_n_classes)

homogeneity = MI / (entropy_C) if entropy_C else 1.0
completeness = MI / (entropy_K) if entropy_K else 1.0
@@ -293,7 +342,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred,
return homogeneity, completeness, v_measure_score
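On this branch, the three scores can then be computed over a sparse intermediate by passing the flag. A usage sketch (`sparse=` is this PR's proposed keyword, not released API):

```python
import numpy as np
from sklearn.metrics import homogeneity_completeness_v_measure

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([0, 0, 1, 2, 2, 2])

# The contingency matrix behind MI is built sparse; labels stay as-is
h, c, v = homogeneity_completeness_v_measure(labels_true, labels_pred,
                                             sparse=True)
print(h, c, v)
```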


-def homogeneity_score(labels_true, labels_pred, max_n_classes=5000):
+def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
+                      sparse=False):
"""Homogeneity metric of a cluster labeling given a ground truth.

A clustering result satisfies homogeneity if all of its clusters
@@ -317,6 +367,10 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
labels_pred : array, shape = [n_samples]
cluster labels to evaluate

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

max_n_classes: int, optional (default=5000)
Maximal number of classes handled by the adjusted_rand_score
metric. Setting it too high can lead to MemoryError or OS
@@ -369,11 +423,13 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000,
0.0...

"""
-return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                          max_n_classes)[0]
+return \
+    homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse,
+                                       max_n_classes=max_n_classes)[0]


-def completeness_score(labels_true, labels_pred, max_n_classes=5000):
+def completeness_score(labels_true, labels_pred, max_n_classes=5000,
+                       sparse=False):
"""Completeness metric of a cluster labeling given a ground truth.

A clustering result satisfies completeness if all the data points
@@ -397,6 +453,10 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000,
labels_pred : array, shape = [n_samples]
cluster labels to evaluate

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

max_n_classes: int, optional (default=5000)
Maximal number of classes handled by the adjusted_rand_score
metric. Setting it too high can lead to MemoryError or OS
@@ -445,11 +505,12 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000,
0.0

"""
-return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                          max_n_classes)[1]
+return \
+    homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse,
+                                       max_n_classes=max_n_classes)[1]


-def v_measure_score(labels_true, labels_pred, max_n_classes=5000):
+def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False):
"""V-measure cluster labeling given a ground truth.

This score is identical to :func:`normalized_mutual_info_score`.
@@ -477,6 +538,10 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False):
labels_pred : array, shape = [n_samples]
cluster labels to evaluate

+sparse: boolean, optional.
+    If True, the intermediate contingency matrix is computed
+    as a sparse matrix.

max_n_classes: int, optional (default=5000)
Maximal number of classes handled by the adjusted_rand_score
metric. Setting it too high can lead to MemoryError or OS
@@ -547,7 +612,8 @@

"""
return homogeneity_completeness_v_measure(labels_true, labels_pred,
-                                          max_n_classes)[2]
+                                          max_n_classes=max_n_classes,
+                                          sparse=sparse)[2]


def mutual_info_score(labels_true, labels_pred, contingency=None,
@@ -586,7 +652,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None,
labels_pred : array, shape = [n_samples]
A clustering of the data into disjoint subsets.

-contingency: None or array, shape = [n_classes_true, n_classes_pred]
+contingency: {None, array, sparse matrix},
+    shape = [n_classes_true, n_classes_pred]
A contingency matrix given by the :func:`contingency_matrix` function.
If value is ``None``, it will be computed, otherwise the given value is
used, with ``labels_true`` and ``labels_pred`` ignored.
@@ -610,22 +677,41 @@
labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
contingency = contingency_matrix(labels_true, labels_pred,
max_n_classes=max_n_classes)
-contingency = np.array(contingency, dtype='float')
-contingency_sum = np.sum(contingency)
-pi = np.sum(contingency, axis=1)
-pj = np.sum(contingency, axis=0)
-outer = np.outer(pi, pj)
-nnz = contingency != 0.0
-# normalized contingency
-contingency_nm = contingency[nnz]
-log_contingency_nm = np.log(contingency_nm)
-contingency_nm /= contingency_sum
-# log(a / b) should be calculated as log(a) - log(b) to avoid
-# possible loss of precision
-log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
-mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
-      contingency_nm * log_outer)
-return mi.sum()
+if isinstance(contingency, np.ndarray):
+    # For an array
+    contingency = np.array(contingency, dtype='float')
+    contingency_sum = np.sum(contingency)
+    pi = np.sum(contingency, axis=1)
+    pj = np.sum(contingency, axis=0)
+    outer = np.outer(pi, pj)
+    nnz = contingency != 0.0
+    # normalized contingency
+    contingency_nm = contingency[nnz]
+    log_contingency_nm = np.log(contingency_nm)
+    contingency_nm /= contingency_sum
+    # log(a / b) should be calculated as log(a) - log(b) to avoid
+    # possible loss of precision
+    log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
+    mi = (contingency_nm * (log_contingency_nm - log(contingency_sum))
+          + contingency_nm * log_outer)
+    return mi.sum()
+elif isinstance(contingency, _data_matrix):
+    # For a sparse matrix
+    contingency_sum = contingency.sum()
+    pi = np.array(contingency.sum(axis=1))
+    pj = np.array(contingency.sum(axis=0)).T
+    nnzx, nnzy, nnz_val = find(contingency)
+    log_contingency_nm = np.log(nnz_val)
+    contingency_nm = nnz_val * 1.0 / contingency_sum
+    # No need to compute the full outer product; only the entries at
+    # the non-zero coordinates are used
+    outer = np.array([pi[x] * pj[y] for x, y in zip(nnzx, nnzy)]).T
+    log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())
+    mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
+          contingency_nm * log_outer)
+    return mi.sum()
+else:
+    raise ValueError(
+        "Unsupported type for 'contingency': " + str(type(contingency)))


def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
@@ -714,7 +800,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
# Special limit cases: no clustering since the data is not split.
# This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or
-    classes.shape[0] == clusters.shape[0] == 0):
+        classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred,
max_n_classes=max_n_classes)
@@ -801,7 +887,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000):
# Special limit cases: no clustering since the data is not split.
# This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or
-    classes.shape[0] == clusters.shape[0] == 0):
+        classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred,
max_n_classes=max_n_classes)
@@ -883,7 +969,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000):
.. [2] `Wikipedia entry for the Fowlkes-Mallows Index
<https://en.wikipedia.org/wiki/Fowlkes-Mallows_index>`_
"""
-labels_true, labels_pred = check_clusterings(labels_true, labels_pred,)
+labels_true, labels_pred = check_clusterings(labels_true, labels_pred, )
n_samples, = labels_true.shape

c = contingency_matrix(labels_true, labels_pred,