9
9
import functools
10
10
11
11
import numpy as np
12
+ from scipy .sparse import issparse
12
13
13
14
from ...utils import check_random_state
14
15
from ...utils import check_X_y
@@ -122,31 +123,53 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs):
122
123
123
124
Parameters
124
125
----------
125
- D_chunk : array-like of shape (n_chunk_samples, n_samples)
126
- Precomputed distances for a chunk.
126
+ D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples)
127
+ Precomputed distances for a chunk. If a sparse matrix is provided,
128
+ only CSR format is accepted.
127
129
start : int
128
130
First index in the chunk.
129
131
labels : array-like of shape (n_samples,)
130
132
Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
131
133
label_freqs : array-like
132
134
Distribution of cluster labels in ``labels``.
133
135
"""
136
+ n_chunk_samples = D_chunk .shape [0 ]
134
137
# accumulate distances from each sample to each cluster
135
- clust_dists = np .zeros ((len (D_chunk ), len (label_freqs )), dtype = D_chunk .dtype )
136
- for i in range (len (D_chunk )):
137
- clust_dists [i ] += np .bincount (
138
- labels , weights = D_chunk [i ], minlength = len (label_freqs )
139
- )
138
+ cluster_distances = np .zeros (
139
+ (n_chunk_samples , len (label_freqs )), dtype = D_chunk .dtype
140
+ )
140
141
141
- # intra_index selects intra-cluster distances within clust_dists
142
- intra_index = (np .arange (len (D_chunk )), labels [start : start + len (D_chunk )])
143
- # intra_clust_dists are averaged over cluster size outside this function
144
- intra_clust_dists = clust_dists [intra_index ]
142
+ if issparse (D_chunk ):
143
+ if D_chunk .format != "csr" :
144
+ raise TypeError (
145
+ "Expected CSR matrix. Please pass sparse matrix in CSR format."
146
+ )
147
+ for i in range (n_chunk_samples ):
148
+ indptr = D_chunk .indptr
149
+ indices = D_chunk .indices [indptr [i ] : indptr [i + 1 ]]
150
+ sample_weights = D_chunk .data [indptr [i ] : indptr [i + 1 ]]
151
+ sample_labels = np .take (labels , indices )
152
+ cluster_distances [i ] += np .bincount (
153
+ sample_labels , weights = sample_weights , minlength = len (label_freqs )
154
+ )
155
+ else :
156
+ for i in range (n_chunk_samples ):
157
+ sample_weights = D_chunk [i ]
158
+ sample_labels = labels
159
+ cluster_distances [i ] += np .bincount (
160
+ sample_labels , weights = sample_weights , minlength = len (label_freqs )
161
+ )
162
+
163
+ # intra_index selects intra-cluster distances within cluster_distances
164
+ end = start + n_chunk_samples
165
+ intra_index = (np .arange (n_chunk_samples ), labels [start :end ])
166
+ # intra_cluster_distances are averaged over cluster size outside this function
167
+ intra_cluster_distances = cluster_distances [intra_index ]
145
168
# of the remaining distances we normalise and extract the minimum
146
- clust_dists [intra_index ] = np .inf
147
- clust_dists /= label_freqs
148
- inter_clust_dists = clust_dists .min (axis = 1 )
149
- return intra_clust_dists , inter_clust_dists
169
+ cluster_distances [intra_index ] = np .inf
170
+ cluster_distances /= label_freqs
171
+ inter_cluster_distances = cluster_distances .min (axis = 1 )
172
+ return intra_cluster_distances , inter_cluster_distances
150
173
151
174
152
175
def silhouette_samples (X , labels , * , metric = "euclidean" , ** kwds ):
@@ -174,9 +197,11 @@ def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
174
197
175
198
Parameters
176
199
----------
177
- X : array-like of shape (n_samples_a, n_samples_a) if metric == \
200
+ X : { array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
178
201
"precomputed" or (n_samples_a, n_features) otherwise
179
- An array of pairwise distances between samples, or a feature array.
202
+ An array of pairwise distances between samples, or a feature array. If
203
+ a sparse matrix is provided, CSR format should be favoured avoiding
204
+ an additional copy.
180
205
181
206
labels : array-like of shape (n_samples,)
182
207
Label values for each sample.
@@ -209,7 +234,7 @@ def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
209
234
.. [2] `Wikipedia entry on the Silhouette Coefficient
210
235
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
211
236
"""
212
- X , labels = check_X_y (X , labels , accept_sparse = ["csc" , " csr" ])
237
+ X , labels = check_X_y (X , labels , accept_sparse = ["csr" ])
213
238
214
239
# Check for non-zero diagonal entries in precomputed distance matrix
215
240
if metric == "precomputed" :
@@ -219,10 +244,10 @@ def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
219
244
)
220
245
if X .dtype .kind == "f" :
221
246
atol = np .finfo (X .dtype ).eps * 100
222
- if np .any (np .abs (np .diagonal (X )) > atol ):
223
- raise ValueError ( error_msg )
224
- elif np .any (np .diagonal (X ) != 0 ): # integral dtype
225
- raise ValueError ( error_msg )
247
+ if np .any (np .abs (X .diagonal ()) > atol ):
248
+ raise error_msg
249
+ elif np .any (X .diagonal () != 0 ): # integral dtype
250
+ raise error_msg
226
251
227
252
le = LabelEncoder ()
228
253
labels = le .fit_transform (labels )
0 commit comments