Use contingency_matrix in prediction_strength_score · scikit-learn/scikit-learn@e298446 · GitHub
[go: up one dir, main page]

Skip to content

Commit e298446

Browse files
committed
Use contingency_matrix in prediction_strength_score
Avoids using a for loop over all pairs
1 parent 35c7ad9 commit e298446

File tree

1 file changed

+9
-24
lines changed

1 file changed

+9
-24
lines changed

sklearn/metrics/cluster/unsupervised.py

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,16 @@
55
# Thierry Guillemot <thierry.guillemot.work@gmail.com>
66
# License: BSD 3 clause
77

8-
from itertools import chain
9-
from itertools import permutations
10-
118
import numpy as np
129

13-
from ...externals.six.moves import xrange
1410
from ...utils import check_array
1511
from ...utils import check_consistent_length
1612
from ...utils import check_random_state
1713
from ...utils import check_X_y
1814
from ...utils.fixes import bincount
1915
from ..pairwise import pairwise_distances
2016
from ...preprocessing import LabelEncoder
17+
from .supervised import contingency_matrix
2118

2219

2320
def check_number_of_labels(n_labels, n_samples):
@@ -304,27 +301,15 @@ def prediction_strength_score(labels_train, labels_test):
304301
labels_test = check_array(labels_test, dtype=np.int32, ensure_2d=False,
305302
warn_on_dtype=True)
306303

307-
clusters = set(chain(labels_train, labels_test))
308-
n_clusters = len(clusters)
304+
n_clusters = max(np.unique(labels_train).shape[0],
305+
np.unique(labels_test).shape[0])
309306
if n_clusters == 1:
310307
return 1.0 # by definition
311308

312-
strength = 1.0
313-
for k in clusters:
314-
# samples assigned to k-th cluster based on test data
315-
samples_test_k = np.flatnonzero(labels_test == k)
316-
cluster_test_size = samples_test_k.shape[0]
317-
318-
if cluster_test_size < 2:
319-
continue
320-
321-
matches = 0
322-
for i, j in permutations(xrange(cluster_test_size), 2):
323-
ki, kj = samples_test_k[j], samples_test_k[i]
324-
if labels_train[ki] == labels_train[kj]:
325-
matches += 1
326-
327-
strength = min(strength, matches / (cluster_test_size *
328-
(cluster_test_size - 1.)))
309+
C = contingency_matrix(labels_train, labels_test)
310+
pairs_matching = (C * (C - 1) / 2).sum(axis=0)
311+
M = C.sum(axis=0)
312+
pairs_total = (M * (M - 1) / 2)
313+
nz = pairs_total.nonzero()[0]
329314

330-
return strength
315+
return (pairs_matching[nz] / pairs_total[nz]).min()

0 commit comments

Comments
 (0)
0