9
9
10
10
from ...utils import check_random_state
11
11
from ...utils import check_X_y
12
+ from ...utils .fixes import bincount
12
13
from ..pairwise import pairwise_distances
13
14
from ...preprocessing import LabelEncoder
14
15
@@ -88,15 +89,8 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
88
89
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
89
90
90
91
"""
91
- X , labels = check_X_y (X , labels , accept_sparse = ['csc' , 'csr' ])
92
- le = LabelEncoder ()
93
- labels = le .fit_transform (labels )
94
- n_labels = len (le .classes_ )
95
- n_samples = X .shape [0 ]
96
-
97
- check_number_of_labels (n_labels , n_samples )
98
-
99
92
if sample_size is not None :
93
+ X , labels = check_X_y (X , labels , accept_sparse = ['csc' , 'csr' ])
100
94
random_state = check_random_state (random_state )
101
95
indices = random_state .permutation (X .shape [0 ])[:sample_size ]
102
96
if metric == "precomputed" :
@@ -166,36 +160,39 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
166
160
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
167
161
168
162
"""
163
+ X , labels = check_X_y (X , labels , accept_sparse = ['csc' , 'csr' ])
169
164
le = LabelEncoder ()
170
165
labels = le .fit_transform (labels )
166
+ check_number_of_labels (len (le .classes_ ), X .shape [0 ])
171
167
172
168
distances = pairwise_distances (X , metric = metric , ** kwds )
173
169
unique_labels = le .classes_
170
+ n_samples_per_label = bincount (labels , minlength = len (unique_labels ))
174
171
175
172
# For sample i, store the mean distance of the cluster to which
176
173
# it belongs in intra_clust_dists[i]
177
- intra_clust_dists = np .ones (distances .shape [0 ], dtype = distances .dtype )
174
+ intra_clust_dists = np .zeros (distances .shape [0 ], dtype = distances .dtype )
178
175
179
176
# For sample i, store the mean distance of the second closest
180
177
# cluster in inter_clust_dists[i]
181
- inter_clust_dists = np .inf * intra_clust_dists
178
+ inter_clust_dists = np .inf + intra_clust_dists
182
179
183
- for curr_label in unique_labels :
180
+ for curr_label in range ( len ( unique_labels )) :
184
181
185
182
# Find intra_clust_dist for all samples belonging to the same
186
183
# label.
187
184
mask = labels == curr_label
188
185
current_distances = distances [mask ]
189
186
190
187
# Leave out current sample.
191
- n_samples_curr_lab = np . sum ( mask ) - 1
188
+ n_samples_curr_lab = n_samples_per_label [ curr_label ] - 1
192
189
if n_samples_curr_lab != 0 :
193
190
intra_clust_dists [mask ] = np .sum (
194
191
current_distances [:, mask ], axis = 1 ) / n_samples_curr_lab
195
192
196
193
# Now iterate over all other labels, finding the mean
197
194
# cluster distance that is closest to every sample.
198
- for other_label in unique_labels :
195
+ for other_label in range ( len ( unique_labels )) :
199
196
if other_label != curr_label :
200
197
other_mask = labels == other_label
201
198
other_distances = np .mean (
@@ -205,6 +202,8 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
205
202
206
203
sil_samples = inter_clust_dists - intra_clust_dists
207
204
sil_samples /= np .maximum (intra_clust_dists , inter_clust_dists )
205
+ # score 0 for clusters of size 1, according to the paper
206
+ sil_samples [n_samples_per_label .take (labels ) == 1 ] = 0
208
207
return sil_samples
209
208
210
209
0 commit comments