8000 [MRG+1] Fix regression in silhouette_score for clusters of size 1 (#7… · scikit-learn/scikit-learn@6bc1257 · GitHub
[go: up one dir, main page]

Skip to content
8000

Commit 6bc1257

Browse files
jnothmanamueller
authored andcommitted
[MRG+1] Fix regression in silhouette_score for clusters of size 1 (#7438)
1 parent 2f43bf2 commit 6bc1257

File tree

3 files changed

+46
-24
lines changed

3 files changed

+46
-24
lines changed

doc/whats_new.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,14 @@ Bug fixes
467467
(`#7350 <https://github.com/scikit-learn/scikit-learn/pull/7350/>`_)
468468
By `Russell Smith`_.
469469

470+
- Fix bug in :func:`metrics.silhouette_score` in which clusters of
471+
size 1 were incorrectly scored. They should get a score of 0.
472+
By `Joel Nothman`_.
473+
474+
- Fix bug in :func:`metrics.silhouette_samples` so that it now works with
475+
arbitrary labels, not just those ranging from 0 to n_clusters - 1.
476+
By `Joel Nothman`_.
477+
470478

471479
API changes summary
472480
-------------------

sklearn/metrics/cluster/tests/test_unsupervised.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
from sklearn import datasets
66
from sklearn.utils.testing import assert_false
77
from sklearn.utils.testing import assert_almost_equal
8+
from sklearn.utils.testing import assert_array_equal
89
from sklearn.utils.testing import assert_equal
910
from sklearn.utils.testing import assert_raises_regexp
1011
from sklearn.utils.testing import assert_raise_message
1112
from sklearn.utils.testing import assert_greater
1213
from sklearn.metrics.cluster import silhouette_score
13-
from sklearn.metrics.cluster import calinski_harabaz_score
14+
from sklearn.metrics.cluster import silhouette_samples
1415
from sklearn.metrics import pairwise_distances
16+
from sklearn.metrics.cluster import calinski_harabaz_score
1517

1618

1719
def test_silhouette():
@@ -56,16 +58,27 @@ def test_silhouette():
5658
assert_almost_equal(score_euclidean, score_dense_with_sampling)
5759

5860

59-
def test_no_nan():
60-
# Assert Silhouette Coefficient != nan when there is 1 sample in a class.
61-
# This tests for the condition that caused issue 960.
62-
# Note that there is only one sample in cluster 0. This used to cause the
63-
# silhouette_score to return nan (see bug #960).
64-
labels = np.array([1, 0, 1, 1, 1])
65-
# The distance matrix doesn't actually matter.
66-
D = np.random.RandomState(0).rand(len(labels), len(labels))
67-
silhouette = silhouette_score(D, labels, metric='precomputed')
61+
def test_cluster_size_1():
62+
# Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
63+
# (cluster 0). We also test the case where there are identical samples
64+
# as the only members of a cluster (cluster 2). To our knowledge, this case
65+
# is not discussed in reference material, and we choose for it a sample
66+
# score of 1.
67+
X = [[0.], [1.], [1.], [2.], [3.], [3.]]
68+
labels = np.array([0, 1, 1, 1, 2, 2])
69+
70+
# Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
71+
# Cluster 1: intra-cluster = [.5, .5, 1]
72+
# inter-cluster = [1, 1, 1]
73+
# silhouette = [.5, .5, 0]
74+
# Cluster 2: intra-cluster = [0, 0]
75+
# inter-cluster = [arbitrary, arbitrary]
76+
# silhouette = [1., 1.]
77+
78+
silhouette = silhouette_score(X, labels)
6879
assert_false(np.isnan(silhouette))
80+
ss = silhouette_samples(X, labels)
81+
assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
6982

7083

7184
def test_correct_labelsize():
@@ -93,7 +106,9 @@ def test_non_encoded_labels():
93106
X = dataset.data
94107
labels = dataset.target
95108
assert_equal(
96-
silhouette_score(X, labels + 10), silhouette_score(X, labels))
109+
silhouette_score(X, labels * 2 + 10), silhouette_score(X, labels))
110+
assert_array_equal(
111+
silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))
97112

98113

99114
def test_non_numpy_labels():

sklearn/metrics/cluster/unsupervised.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from ...utils import check_random_state
1111
from ...utils import check_X_y
12+
from ...utils.fixes import bincount
1213
from ..pairwise import pairwise_distances
1314
from ...preprocessing import LabelEncoder
1415

@@ -88,15 +89,8 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
8889
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
8990
9091
"""
91-
X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
92-
le = LabelEncoder()
93-
labels = le.fit_transform(labels)
94-
n_labels = len(le.classes_)
95-
n_samples = X.shape[0]
96-
97-
check_number_of_labels(n_labels, n_samples)
98-
9992
if sample_size is not None:
93+
X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
10094
random_state = check_random_state(random_state)
10195
indices = random_state.permutation(X.shape[0])[:sample_size]
10296
if metric == "precomputed":
@@ -166,36 +160,39 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
166160
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
167161
168162
"""
163+
X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
169164
le = LabelEncoder()
170165
labels = le.fit_transform(labels)
166+
check_number_of_labels(len(le.classes_), X.shape[0])
171167

172168
distances = pairwise_distances(X, metric=metric, **kwds)
173169
unique_labels = le.classes_
170+
n_samples_per_label = bincount(labels, minlength=len(unique_labels))
174171

175172
# For sample i, store the mean distance of the cluster to which
176173
# it belongs in intra_clust_dists[i]
177-
intra_clust_dists = np.ones(distances.shape[0], dtype=distances.dtype)
174+
intra_clust_dists = np.zeros(distances.shape[0], dtype=distances.dtype)
178175

179176
# For sample i, store the mean distance of the second closest
180177
# cluster in inter_clust_dists[i]
181-
inter_clust_dists = np.inf * intra_clust_dists
178+
inter_clust_dists = np.inf + intra_clust_dists
182179

183-
for curr_label in unique_labels:
180+
for curr_label in range(len(unique_labels)):
184181

185182
# Find inter_clust_dist for all samples belonging to the same
186183
# label.
187184
mask = labels == curr_label
188185
current_distances = distances[mask]
189186

190187
# Leave out current sample.
191-
n_samples_curr_lab = np.sum(mask) - 1
188+
n_samples_curr_lab = n_samples_per_label[curr_label] - 1
192189
if n_samples_curr_lab != 0:
193190
intra_clust_dists[mask] = np.sum(
194191
current_distances[:, mask], axis=1) / n_samples_curr_lab
195192

196193
# Now iterate over all other labels, finding the mean
197194
# cluster distance that is closest to every sample.
198-
for other_label in unique_labels:
195+
for other_label in range(len(unique_labels)):
199196
if other_label != curr_label:
200197
other_mask = labels == other_label
201198
other_distances = np.mean(
@@ -205,6 +202,8 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
205202

206203
sil_samples = inter_clust_dists - intra_clust_dists
207204
sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
205+
# score 0 for clusters of size 1, according to the paper
206+
sil_samples[n_samples_per_label.take(labels) == 1] = 0
208207
return sil_samples
209208

210209

0 commit comments

Comments
 (0)
0