Add benchmarks · scikit-learn/scikit-learn@baa7761 · GitHub

Commit baa7761

Add benchmarks
1 parent b3c5d88 commit baa7761

File tree

3 files changed: +106 -60 lines changed
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+from collections import defaultdict
+from itertools import product
+from time import time
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from sklearn.metrics import adjusted_mutual_info_score
+
+repeat = 10
+n_samples = [1_000, 10_000, 100_000, 1_000_000]
+n_labels = [10, 100, 1_000]
+
+rng = np.random.default_rng(0)
+
+result = defaultdict(list)
+for ns, nl in product(n_samples, n_labels):
+    local_result = []
+    for i in range(repeat):
+        print(f"Repetition {i+1} for n_samples={ns} and n_labels={nl}")
+        x = rng.integers(low=0, high=nl, size=ns)
+        y = rng.integers(low=0, high=nl, size=ns)
+
+        start = time()
+        adjusted_mutual_info_score(x, y)
+        end = time()
+        local_result.append(end - start)
+
+    result["n_samples"].append(ns)
+    result["n_labels"].append(nl)
+    result["mean_time"].append(np.mean(local_result))
+
+result = pd.DataFrame(result)
+plt.figure("Adjusted Mutual Info Score Benchmarks against number of Labels")
+for n_sample in n_samples:
+    samples = result[result["n_samples"] == n_sample]
+    plt.plot(
+        samples["n_labels"], samples["mean_time"], label=f"{str(n_sample)} samples"
+    )
+plt.xlabel("n_labels")
+plt.ylabel("Time (s)")
+plt.legend()
+
+plt.figure("Adjusted Mutual Info Score Benchmarks against number of Samples")
+for n_label in n_labels:
+    labels = result[result["n_labels"] == n_label]
+    plt.plot(labels["n_samples"], labels["mean_time"], label=f"{str(n_label)} labels")
+plt.xlabel("n_samples")
+plt.ylabel("Time (s)")
+plt.legend()
+
+plt.show()
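A rough back-of-envelope sketch of why both grid axes matter (an estimate derived from the loop bounds in the Cython rewrite below, not a measurement from this commit): with random labelings the contingency marginals are roughly uniform, a_i ~ b_j ~ n_samples / n_labels, so the EMI triple loop runs on the order of n_labels^2 * (n_samples / n_labels) = n_labels * n_samples inner iterations.

# Back-of-envelope iteration counts for the EMI triple loop, assuming
# roughly uniform marginals (a_i ~ b_j ~ n_samples / n_labels).
for ns in (1_000, 10_000, 100_000, 1_000_000):
    for nl in (10, 100, 1_000):
        inner = ns // nl         # ~min(a_i, b_j) iterations per (i, j) cell
        total = nl * nl * inner  # ~n_labels * n_samples overall
        print(f"n_samples={ns:>9}, n_labels={nl:>5} -> ~{total:.1e} iterations")

Under this assumption the largest grid point (1_000_000 samples, 1_000 labels) is on the order of 10^9 inner iterations, which is the regime the benchmark is designed to stress.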
Lines changed: 52 additions & 60 deletions
@@ -1,71 +1,63 @@
-# Authors: Robert Layton <robertlayton@gmail.com>
-#          Corey Lynch <coreylynch9@gmail.com>
-# License: BSD 3 clause
+from libc.math cimport exp, lgamma, log
+import cython
+cimport numpy as cnp
 
-from libc.math cimport exp, lgamma
-from scipy.special import gammaln
 import numpy as np
-cimport numpy as cnp
+
 
 cnp.import_array()
-ctypedef cnp.float64_t DOUBLE
 
+cpdef expected_mutual_information(contingency, cnp.int64_t n_samples):
+    """Compute the expected mutual information.
+
+    Parameters
+    ----------
+    contingency : sparse-matrix of shape (n_labels, n_labels), dtype=np.int64
+        Contingency matrix storing the label counts. This is a sparse
+        matrix in CSR format.
+
+    n_samples : int
+        The number of samples, which is the sum of the counts in the
+        contingency matrix.
+
+    Returns
+    -------
+    emi : float
+        The expected mutual information between the two vectors of labels.
+    """
+    cdef:
+        unsigned long long n_rows, n_cols
+        cnp.ndarray[cnp.int64_t] a, b
+        cnp.int64_t a_i, b_j
+        cnp.intp_t i, j, nij, nij_nz
+        cnp.intp_t start, end
+        cnp.float64_t emi = 0.0
+        double term1, term2, term3
+        double log_n_samples = log(n_samples)
 
-def expected_mutual_information(contingency, int n_samples):
-    """Calculate the expected mutual information for two labelings."""
-    cdef int R, C
-    cdef DOUBLE N, gln_N, emi, term2, term3, gln
-    cdef cnp.ndarray[DOUBLE] gln_a, gln_b, gln_Na, gln_Nb, gln_nij, log_Nnij
-    cdef cnp.ndarray[DOUBLE] nijs, term1
-    cdef cnp.ndarray[DOUBLE] log_a, log_b
-    cdef cnp.ndarray[cnp.int32_t] a, b
-    # cdef np.ndarray[int, ndim=2] start, end
-    R, C = contingency.shape
-    N = <DOUBLE>n_samples
-    a = np.ravel(contingency.sum(axis=1).astype(np.int32, copy=False))
-    b = np.ravel(contingency.sum(axis=0).astype(np.int32, copy=False))
+    n_rows, n_cols = contingency.shape
+    a = np.ravel(contingency.sum(axis=1))
+    b = np.ravel(contingency.sum(axis=0))
 
-    # any labelling with zero entropy implies EMI = 0
     if a.size == 1 or b.size == 1:
         return 0.0
 
-    # There are three major terms to the EMI equation, which are multiplied
-    # together and then summed over varying nij values.
-    # While nijs[0] will never be used, having it simplifies the indexing.
-    nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float')
-    nijs[0] = 1  # Stops divide by zero warnings. As it's not used, no issue.
-    # term1 is nij / N
-    term1 = nijs / N
-    # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b)
-    log_a = np.log(a)
-    log_b = np.log(b)
-    # term2 uses log(N * nij) = log(N) + log(nij)
-    log_Nnij = np.log(N) + np.log(nijs)
-    # term3 is large and involves many factorials. Calculate these in log
-    # space to stop overflows.
-    gln_a = gammaln(a + 1)
-    gln_b = gammaln(b + 1)
-    gln_Na = gammaln(N - a + 1)
-    gln_Nb = gammaln(N - b + 1)
-    gln_N = gammaln(N + 1)
-    gln_nij = gammaln(nijs + 1)
-    # start and end values for nij terms for each summation.
-    start = np.array(np.meshgrid(a, b), dtype='int').T
-    start = np.array([start[i].sum(axis=1) - N for i in range(len(start))])
-    start = np.maximum(start, 1)
-    end = np.minimum(np.resize(a, (C, R)).T, np.resize(b, (R, C))) + 1
-    # emi itself is a summation over the various values.
-    emi = 0.0
-    cdef Py_ssize_t i, j, nij
-    for i in range(R):
-        for j in range(C):
-            for nij in range(start[i, j], end[i, j]):
-                term2 = log_Nnij[nij] - log_a[i] - log_b[j]
-                # Numerators are positive, denominators are negative.
-                gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j]
-                       - gln_N - gln_nij[nij] - lgamma(a[i] - nij + 1)
-                       - lgamma(b[j] - nij + 1)
-                       - lgamma(N - a[i] - b[j] + nij + 1))
-                term3 = exp(gln)
-                emi += (term1[nij] * term2 * term3)
+    for i in range(n_rows):
+        for j in range(n_cols):
+            a_i, b_j = a[i], b[j]
+            start = max(1, a_i - n_samples + b_j)
+            end = min(a_i, b_j) + 1
+            for nij in range(start, end):
+                nij_nz = 1 if nij == 0 else nij
+                term1 = nij_nz / <double>n_samples
+                term2 = log_n_samples + log(nij_nz) - log(a_i) - log(b_j)
+                term3 = exp(
+                    lgamma(a_i + 1) + lgamma(b_j + 1)
+                    + lgamma(n_samples - a_i + 1) + lgamma(n_samples - b_j + 1)
+                    - lgamma(n_samples + 1) - lgamma(nij_nz + 1)
+                    - lgamma(a_i - nij + 1)
+                    - lgamma(b_j - nij + 1)
+                    - lgamma(n_samples - a_i - b_j + nij + 1)
+                )
+                emi += (term1 * term2 * term3)
     return emi
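For reference, the rewritten triple loop is a direct transcription of the standard closed-form expression for the expected mutual information under a hypergeometric null model (Vinh, Epps & Bailey, 2010); term1, term2, and term3 in the Cython code correspond to the three factors below. A sketch in LaTeX, assuming R and C are the contingency dimensions, a_i and b_j the row and column marginals, and N = n_samples:

% The inner summation bounds match start = max(1, a_i + b_j - N) and
% end = min(a_i, b_j) + 1 (exclusive) in the loop above.
\[
\mathrm{EMI}
  = \sum_{i=1}^{R} \sum_{j=1}^{C}
    \sum_{n_{ij}=\max(1,\, a_i + b_j - N)}^{\min(a_i,\, b_j)}
    \underbrace{\frac{n_{ij}}{N}}_{\texttt{term1}}
    \underbrace{\log\!\frac{N\, n_{ij}}{a_i\, b_j}}_{\texttt{term2}}
    \underbrace{\frac{a_i!\, b_j!\, (N - a_i)!\, (N - b_j)!}
                     {N!\, n_{ij}!\, (a_i - n_{ij})!\, (b_j - n_{ij})!\, (N - a_i - b_j + n_{ij})!}}_{\texttt{term3}}
\]

Evaluating term3 as exp of a sum of lgamma calls keeps the factorial ratio in log space, which is why the old gammaln precomputation arrays could be dropped without reintroducing overflow.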

sklearn/metrics/cluster/_supervised.py

Lines changed: 1 addition & 0 deletions
@@ -937,6 +937,7 @@ def adjusted_mutual_info_score(
 
     contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
     contingency = contingency.astype(np.float64, copy=False)
+
     # Calculate the MI for the two clusterings
     mi = mutual_info_score(labels_true, labels_pred, contingency=contingency)
     # Calculate the expected value for the mutual information

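The public entry point touched here can be smoke-tested directly; a minimal sketch of the call path the benchmark script above exercises (since AMI is adjusted for chance, two independent random labelings should score near 0):

import numpy as np
from sklearn.metrics import adjusted_mutual_info_score

rng = np.random.default_rng(0)
x = rng.integers(low=0, high=10, size=1_000)  # random "true" labels
y = rng.integers(low=0, high=10, size=1_000)  # random "predicted" labels
print(adjusted_mutual_info_score(x, y))  # expected to be close to 0.0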
0 commit comments