7
7
import numpy as np
8
8
9
9
from ...utils import check_random_state
10
+ from ...utils import check_X_y
10
11
from ..pairwise import pairwise_distances
12
+ from ...preprocessing import LabelEncoder
11
13
12
14
13
15
def silhouette_score (X , labels , metric = 'euclidean' , sample_size = None ,
@@ -79,8 +81,12 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
79
81
<http://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
80
82
81
83
"""
84
+ X , labels = check_X_y (X , labels )
85
+ le = LabelEncoder ()
86
+ labels = le .fit_transform (labels )
82
87
n_labels = len (np .unique (labels ))
83
88
n_samples = X .shape [0 ]
89
+
84
90
if not 1 < n_labels < n_samples :
85
91
raise ValueError ("Number of labels is %d. Valid values are 2 "
86
92
"to n_samples - 1 (inclusive)" % n_labels )
@@ -155,67 +161,45 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
155
161
<http://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
156
162
157
163
"""
158
- distances = pairwise_distances (X , metric = metric , ** kwds )
159
- n = labels .shape [0 ]
160
- A = np .array ([_intra_cluster_distance (distances [i ], labels , i )
161
- for i in range (n )])
162
- B = np .array ([_nearest_cluster_distance (distances [i ], labels , i )
163
- for i in range (n )])
164
- sil_samples = (B - A ) / np .maximum (A , B )
165
- # nan values are for clusters of size 1, and should be 0
166
- return np .nan_to_num (sil_samples )
167
-
168
-
169
- def _intra_cluster_distance (distances_row , labels , i ):
170
- """Calculate the mean intra-cluster distance for sample i.
171
-
172
- Parameters
173
- ----------
174
- distances_row : array, shape = [n_samples]
175
- Pairwise distance matrix between sample i and each sample.
176
-
177
- labels : array, shape = [n_samples]
178
- label values for each sample
179
-
180
- i : int
181
- Sample index being calculated. It is excluded from calculation and
182
- used to determine the current label
164
+ le = LabelEncoder ()
165
+ labels = le .fit_transform (labels )
183
166
184
- Returns
185
- -------
186
- a : float
187
- Mean intra-cluster distance for sample i
188
- """
189
- mask = labels == labels [i ]
190
- mask [i ] = False
191
- if not np .any (mask ):
192
- # cluster of size 1
193
- return 0
194
- a = np .mean (distances_row [mask ])
195
- return a
196
-
197
-
198
- def _nearest_cluster_distance (distances_row , labels , i ):
199
- """Calculate the mean nearest-cluster distance for sample i.
200
-
201
- Parameters
202
- ----------
203
- distances_row : array, shape = [n_samples]
204
- Pairwise distance matrix between sample i and each sample.
205
-
206
- labels : array, shape = [n_samples]
207
- label values for each sample
208
-
209
- i : int
210
- Sample index being calculated. It is used to determine the current
211
- label.
167
+ distances = pairwise_distances (X , metric = metric , ** kwds )
168
+ unique_labels = le .classes_
169
+
170
+ # For sample i, store the mean distance of the cluster to which
171
+ # it belongs in intra_clust_dists[i]
172
+ intra_clust_dists = np .ones (distances .shape [0 ], dtype = distances .dtype )
173
+
174
+ # For sample i, store the mean distance to the nearest other
175
+ # cluster in inter_clust_dists[i]
176
+ inter_clust_dists = np .inf * intra_clust_dists
177
+
178
+ for curr_label in unique_labels :
179
+
180
+ # Find intra_clust_dist for all samples belonging to the same
181
+ # label.
182
+ mask = labels == curr_label
183
+ current_distances = distances [mask ]
184
+
185
+ # Leave out current sample.
186
+ n_samples_curr_lab = np .sum (mask ) - 1
187
+ if n_samples_curr_lab != 0 :
188
+ intra_clust_dists [mask ] = np .sum (
189
+ current_distances [:, mask ], axis = 1 ) / n_samples_curr_lab
190
+
191
+ # Now iterate over all other labels, finding the mean
192
+ # cluster distance that is closest to every sample.
193
+ for other_label in unique_labels :
194
+ if other_label != curr_label :
195
+ other_mask = labels == other_label
196
+ other_distances = np .mean (
197
+ current_distances [:, other_mask ], axis = 1 )
198
+ inter_clust_dists [mask ] = np .minimum (
199
+ inter_clust_dists [mask ], other_distances )
200
+
201
+ sil_samples = inter_clust_dists - intra_clust_dists
202
+ sil_samples /= np .maximum (intra_clust_dists , inter_clust_dists )
212
203
213
- Returns
214
- -------
215
- b : float
216
- Mean nearest-cluster distance for sample i
217
- """
218
- label = labels [i ]
219
- b = np .min ([np .mean (distances_row [labels == cur_label ])
220
- for cur_label in set (labels ) if not cur_label == label ])
221
- return b
204
+ # nan values are for clusters of size 1, and should be 0
205
+ return np .nan_to_num (sil_samples )
0 commit comments