|
1 |
| -# Authors: Robert Layton <robertlayton@gmail.com> |
2 |
| -# Corey Lynch <coreylynch9@gmail.com> |
3 |
| -# License: BSD 3 clause |
| 1 | +from libc.math cimport exp, lgamma, log |
| 2 | +import cython |
| 3 | +cimport numpy as cnp |
4 | 4 |
|
5 |
| -from libc.math cimport exp, lgamma |
6 |
| -from scipy.special import gammaln |
7 | 5 | import numpy as np
|
8 |
| -cimport numpy as cnp |
| 6 | + |
9 | 7 |
|
10 | 8 | cnp.import_array()
|
11 |
| -ctypedef cnp.float64_t DOUBLE |
12 | 9 |
|
| 10 | +cpdef expected_mutual_information(contingency, cnp.int64_t n_samples): |
| 11 | + """Compute the expected mutual information. |
| 12 | +
|
| 13 | + Parameters |
| 14 | + ---------- |
| 15 | + contingency : sparse-matrix of shape (n_labels, n_labels), dtype=np.int64 |
| 16 | + Contengency matrix storing the label counts. This is as sparse |
| 17 | + matrix at the CSR format. |
| 18 | +
|
| 19 | + n_samples : int |
| 20 | + The number of samples which is the sum of the counts in the contengency |
| 21 | + matrix. |
| 22 | +
|
| 23 | + Returns |
| 24 | + ------- |
| 25 | + emi : float |
| 26 | + The expected mutual information between the two vectors of labels. |
| 27 | + """ |
| 28 | + cdef: |
| 29 | + unsigned long long n_rows, n_cols |
| 30 | + cnp.ndarray[cnp.int64_t] a, b |
| 31 | + cnp.int64_t a_i, b_j |
| 32 | + cnp.intp_t i, j, nij, nij_nz |
| 33 | + cnp.intp_t start, end |
| 34 | + cnp.float64_t emi = 0.0 |
| 35 | + double term1, term2, term3 |
| 36 | + double log_n_samples = log(n_samples) |
13 | 37 |
|
14 |
| -def expected_mutual_information(contingency, int n_samples): |
15 |
| - """Calculate the expected mutual information for two labelings.""" |
16 |
| - cdef int R, C |
17 |
| - cdef DOUBLE N, gln_N, emi, term2, term3, gln |
18 |
| - cdef cnp.ndarray[DOUBLE] gln_a, gln_b, gln_Na, gln_Nb, gln_nij, log_Nnij |
19 |
| - cdef cnp.ndarray[DOUBLE] nijs, term1 |
20 |
| - cdef cnp.ndarray[DOUBLE] log_a, log_b |
21 |
| - cdef cnp.ndarray[cnp.int32_t] a, b |
22 |
| - #cdef np.ndarray[int, ndim=2] start, end |
23 |
| - R, C = contingency.shape |
24 |
| - N = <DOUBLE>n_samples |
25 |
| - a = np.ravel(contingency.sum(axis=1).astype(np.int32, copy=False)) |
26 |
| - b = np.ravel(contingency.sum(axis=0).astype(np.int32, copy=False)) |
| 38 | + n_rows, n_cols = contingency.shape |
| 39 | + a = np.ravel(contingency.sum(axis=1)) |
| 40 | + b = np.ravel(contingency.sum(axis=0)) |
27 | 41 |
|
28 |
| - # any labelling with zero entropy implies EMI = 0 |
29 | 42 | if a.size == 1 or b.size == 1:
|
30 | 43 | return 0.0
|
31 | 44 |
|
32 |
| - # There are three major terms to the EMI equation, which are multiplied to |
33 |
| - # and then summed over varying nij values. |
34 |
| - # While nijs[0] will never be used, having it simplifies the indexing. |
35 |
| - nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float') |
36 |
| - nijs[0] = 1 # Stops divide by zero warnings. As its not used, no issue. |
37 |
| - # term1 is nij / N |
38 |
| - term1 = nijs / N |
39 |
| - # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) |
40 |
| - log_a = np.log(a) |
41 |
| - log_b = np.log(b) |
42 |
| - # term2 uses log(N * nij) = log(N) + log(nij) |
43 |
| - log_Nnij = np.log(N) + np.log(nijs) |
44 |
| - # term3 is large, and involved many factorials. Calculate these in log |
45 |
| - # space to stop overflows. |
46 |
| - gln_a = gammaln(a + 1) |
47 |
| - gln_b = gammaln(b + 1) |
48 |
| - gln_Na = gammaln(N - a + 1) |
49 |
| - gln_Nb = gammaln(N - b + 1) |
50 |
| - gln_N = gammaln(N + 1) |
51 |
| - gln_nij = gammaln(nijs + 1) |
52 |
| - # start and end values for nij terms for each summation. |
53 |
| - start = np.array(np.meshgrid(a,b),dtype='int').T |
54 |
| - start = np.array([start[i].sum(axis=1)-N for i in range(len(start))]) |
55 |
| - start = np.maximum(start, 1) |
56 |
| - end = np.minimum(np.resize(a, (C, R)).T, np.resize(b, (R, C))) + 1 |
57 |
| - # emi itself is a summation over the various values. |
58 |
| - emi = 0.0 |
59 |
| - cdef Py_ssize_t i, j, nij |
60 |
| - for i in range(R): |
61 |
| - for j in range(C): |
62 |
| - for nij in range(start[i,j], end[i,j]): |
63 |
| - term2 = log_Nnij[nij] - log_a[i] - log_b[j] |
64 |
| - # Numerators are positive, denominators are negative. |
65 |
| - gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] |
66 |
| - - gln_N - gln_nij[nij] - lgamma(a[i] - nij + 1) |
67 |
| - - lgamma(b[j] - nij + 1) |
68 |
| - - lgamma(N - a[i] - b[j] + nij + 1)) |
69 |
| - term3 = exp(gln) |
70 |
| - emi += (term1[nij] * term2 * term3) |
| 45 | + for i in range(n_rows): |
| 46 | + for j in range(n_cols): |
| 47 | + a_i, b_j = a[i], b[j] |
| 48 | + start = max(1, a_i - n_samples + b_j) |
| 49 | + end = min(a_i, b_j) + 1 |
| 50 | + for nij in range(start, end): |
| 51 | + nij_nz = 1 if nij == 0 else nij |
| 52 | + term1 = nij_nz / <double>n_samples |
| 53 | + term2 = log_n_samples + log(nij_nz) - log(a_i) - log(b_j) |
| 54 | + term3 = exp( |
| 55 | + lgamma(a_i + 1) + lgamma(b_j + 1) |
| 56 | + + lgamma(n_samples - a_i + 1) + lgamma(n_samples - b_j + 1) |
| 57 | + - lgamma(n_samples + 1) - lgamma(nij_nz + 1) |
| 58 | + - lgamma(a_i - nij + 1) |
| 59 | + - lgamma(b_j - nij + 1) |
| 60 | + - lgamma(n_samples - a_i - b_j + nij + 1) |
| 61 | + ) |
| 62 | + emi += (term1 * term2 * term3) |
71 | 63 | return emi
|
0 commit comments