1
1
# cython: profile=True
2
- # Profiling is enabled by default as the overhead does not seem to be measurable
3
- # on this specific use case.
2
+ # Profiling is enabled by default as the overhead does not seem to be
3
+ # measurable on this specific use case.
4
4
5
5
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
6
6
# Olivier Grisel <olivier.grisel@ensta.org>
@@ -34,6 +34,7 @@ np.import_array()
34
34
@ cython.wraparound (False )
35
35
@ cython.cdivision (True )
36
36
cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim= 2 ] X,
37
+ np.ndarray[floating, ndim= 1 ] sample_weight,
37
38
np.ndarray[floating, ndim= 1 ] x_squared_norms,
38
39
np.ndarray[floating, ndim= 2 ] centers,
39
40
np.ndarray[INT, ndim= 1 ] labels,
@@ -89,6 +90,7 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
89
90
dist *= - 2
90
91
dist += center_squared_norms[center_idx]
91
92
dist += x_squared_norms[sample_idx]
93
+ dist *= sample_weight[sample_idx]
92
94
if min_dist == - 1 or dist < min_dist:
93
95
min_dist = dist
94
96
labels[sample_idx] = center_idx
@@ -103,7 +105,8 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
103
105
@ cython.boundscheck (False )
104
106
@ cython.wraparound (False )
105
107
@ cython.cdivision (True )
106
- cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim= 1 ] x_squared_norms,
108
+ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating, ndim= 1 ] sample_weight,
109
+ np.ndarray[DOUBLE, ndim= 1 ] x_squared_norms,
107
110
np.ndarray[floating, ndim= 2 ] centers,
108
111
np.ndarray[INT, ndim= 1 ] labels,
109
112
np.ndarray[floating, ndim= 1 ] distances):
@@ -141,7 +144,8 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
141
144
142
145
for center_idx in range (n_clusters):
143
146
center_squared_norms[center_idx] = dot(
144
- n_features, & centers[center_idx, 0 ], 1 , & centers[center_idx, 0 ], 1 )
147
+ n_features, & centers[center_idx, 0 ], 1 ,
148
+ & centers[center_idx, 0 ], 1 )
145
149
146
150
for sample_idx in range (n_samples):
147
151
min_dist = - 1
@@ -154,6 +158,7 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
154
158
dist *= - 2
155
159
dist += center_squared_norms[center_idx]
156
160
dist += x_squared_norms[sample_idx]
161
+ dist *= sample_weight[sample_idx]
157
162
if min_dist == - 1 or dist < min_dist:
158
163
min_dist = dist
159
164
labels[sample_idx] = center_idx
@@ -167,9 +172,10 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
167
172
@ cython.boundscheck (False )
168
173
@ cython.wraparound (False )
169
174
@ cython.cdivision (True )
170
- def _mini_batch_update_csr (X , np.ndarray[DOUBLE , ndim = 1 ] x_squared_norms,
175
+ def _mini_batch_update_csr (X , np.ndarray[floating , ndim = 1 ] sample_weight,
176
+ np.ndarray[DOUBLE , ndim = 1 ] x_squared_norms,
171
177
np.ndarray[floating , ndim = 2 ] centers,
172
- np.ndarray[INT , ndim = 1 ] counts ,
178
+ np.ndarray[floating , ndim = 1 ] weight_sums ,
173
179
np.ndarray[INT , ndim = 1 ] nearest_center,
174
180
np.ndarray[floating , ndim = 1 ] old_center,
175
181
int compute_squared_diff ):
@@ -192,7 +198,7 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
192
198
-------
193
199
inertia : float
194
200
The inertia of the batch prior to centers update, i.e. the sum
195
- of squared distances to the closest center for each sample. This
201
+ of squared distances to the closest center for each sample. This
196
202
is the objective function being minimized by the k-means algorithm.
197
203
198
204
squared_diff : float
@@ -213,29 +219,29 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
213
219
214
220
unsigned int sample_idx, center_idx, feature_idx
215
221
unsigned int k
216
- int old_count, new_count
222
+ DOUBLE old_weight_sum, new_weight_sum
217
223
DOUBLE center_diff
218
224
DOUBLE squared_diff = 0.0
219
225
220
226
# move centers to the mean of both old and newly assigned samples
221
227
for center_idx in range (n_clusters):
222
- old_count = counts [center_idx]
223
- new_count = old_count
228
+ old_weight_sum = weight_sums [center_idx]
229
+ new_weight_sum = old_weight_sum
224
230
225
231
# count the number of samples assigned to this center
226
232
for sample_idx in range (n_samples):
227
233
if nearest_center[sample_idx] == center_idx:
228
- new_count += 1
234
+ new_weight_sum += sample_weight[sample_idx]
229
235
230
- if new_count == old_count :
236
+ if new_weight_sum == old_weight_sum :
231
237
# no new sample: leave this center as it stands
232
238
continue
233
239
234
240
# rescale the old center to reflect it previous accumulated weight
235
241
# with regards to the new data that will be incrementally contributed
236
242
if compute_squared_diff:
237
243
old_center[:] = centers[center_idx]
238
- centers[center_idx] *= old_count
244
+ centers[center_idx] *= old_weight_sum
239
245
240
246
# iterate of over samples assigned to this cluster to move the center
241
247
# location by inplace summation
@@ -250,12 +256,12 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
250
256
centers[center_idx, X_indices[k]] += X_data[k]
251
257
252
258
# inplace rescale center with updated count
253
- if new_count > old_count :
259
+ if new_weight_sum > old_weight_sum :
254
260
# update the count statistics for this center
255
- counts [center_idx] = new_count
261
+ weight_sums [center_idx] = new_weight_sum
256
262
257
263
# re-scale the updated center with the total new counts
258
- centers[center_idx] /= new_count
264
+ centers[center_idx] /= new_weight_sum
259
265
260
266
# update the incremental computation of the squared total
261
267
# centers position change
@@ -271,6 +277,7 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
271
277
@ cython.wraparound (False )
272
278
@ cython.cdivision (True )
273
279
def _centers_dense (np.ndarray[floating , ndim = 2 ] X,
280
+ np.ndarray[floating , ndim = 1 ] sample_weight,
274
281
np.ndarray[INT , ndim = 1 ] labels, int n_clusters ,
275
282
np.ndarray[floating , ndim = 1 ] distances):
276
283
""" M step of the K-means EM algorithm
@@ -281,6 +288,9 @@ def _centers_dense(np.ndarray[floating, ndim=2] X,
281
288
----------
282
289
X : array-like, shape (n_samples, n_features)
283
290
291
+ sample_weight : array-like, shape (n_samples,)
292
+ The weights for each observation in X.
293
+
284
294
labels : array of integers, shape (n_samples)
285
295
Current label assignment
286
296
@@ -301,13 +311,16 @@ def _centers_dense(np.ndarray[floating, ndim=2] X,
301
311
n_features = X.shape[1 ]
302
312
cdef int i, j, c
303
313
cdef np.ndarray[floating, ndim= 2 ] centers
304
- if floating is float :
305
- centers = np.zeros((n_clusters, n_features), dtype = np.float32)
306
- else :
307
- centers = np.zeros((n_clusters, n_features), dtype = np.float64)
314
+ cdef np.ndarray[floating, ndim= 1 ] weight_in_cluster
315
+
316
+ dtype = np.float32 if floating is float else np.float64
317
+ centers = np.zeros((n_clusters, n_features), dtype = dtype)
318
+ weight_in_cluster = np.zeros((n_clusters,), dtype = dtype)
308
319
309
- n_samples_in_cluster = np.bincount(labels, minlength = n_clusters)
310
- empty_clusters = np.where(n_samples_in_cluster == 0 )[0 ]
320
+ for i in range (n_samples):
321
+ c = labels[i]
322
+ weight_in_cluster[c] += sample_weight[i]
323
+ empty_clusters = np.where(weight_in_cluster == 0 )[0 ]
311
324
# maybe also relocate small clusters?
312
325
313
326
if len (empty_clusters):
@@ -316,23 +329,25 @@ def _centers_dense(np.ndarray[floating, ndim=2] X,
316
329
317
330
for i, cluster_id in enumerate (empty_clusters):
318
331
# XXX two relocated clusters could be close to each other
319
- new_center = X[far_from_centers[i]]
332
+ far_index = far_from_centers[i]
333
+ new_center = X[far_index]
320
334
centers[cluster_id] = new_center
321
- n_samples_in_cluster [cluster_id] = 1
335
+ weight_in_cluster [cluster_id] = sample_weight[far_index]
322
336
323
337
for i in range (n_samples):
324
338
for j in range (n_features):
325
- centers[labels[i], j] += X[i, j]
339
+ centers[labels[i], j] += X[i, j] * sample_weight[i]
326
340
327
- centers /= n_samples_in_cluster [:, np.newaxis]
341
+ centers /= weight_in_cluster [:, np.newaxis]
328
342
329
343
return centers
330
344
331
345
332
346
@ cython.boundscheck (False )
333
347
@ cython.wraparound (False )
334
348
@ cython.cdivision (True )
335
- def _centers_sparse (X , np.ndarray[INT , ndim = 1 ] labels, n_clusters ,
349
+ def _centers_sparse (X , np.ndarray[floating , ndim = 1 ] sample_weight,
350
+ np.ndarray[INT , ndim = 1 ] labels, n_clusters ,
336
351
np.ndarray[floating , ndim = 1 ] distances):
337
352
""" M step of the K-means EM algorithm
338
353
@@ -342,6 +357,9 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
342
357
----------
343
358
X : scipy.sparse.csr_matrix, shape (n_samples, n_features)
344
359
360
+ sample_weight : array-like, shape (n_samples,)
361
+ The weights for each observation in X.
362
+
345
363
labels : array of integers, shape (n_samples)
346
364
Current label assignment
347
365
@@ -356,7 +374,9 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
356
374
centers : array, shape (n_clusters, n_features)
357
375
The resulting centers
358
376
"""
359
- cdef int n_features = X.shape[1 ]
377
+ cdef int n_samples, n_features
378
+ n_samples = X.shape[0 ]
379
+ n_features = X.shape[1 ]
360
380
cdef int curr_label
361
381
362
382
cdef np.ndarray[floating, ndim= 1 ] data = X.data
@@ -365,17 +385,17 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
365
385
366
386
cdef np.ndarray[floating, ndim= 2 , mode= " c" ] centers
367
387
cdef np.ndarray[np.npy_intp, ndim= 1 ] far_from_centers
368
- cdef np.ndarray[np.npy_intp, ndim= 1 , mode= " c" ] n_samples_in_cluster = \
369
- np.bincount(labels, minlength = n_clusters)
388
+ cdef np.ndarray[floating, ndim= 1 ] weight_in_cluster
389
+ dtype = np.float32 if floating is float else np.float64
390
+ centers = np.zeros((n_clusters, n_features), dtype = dtype)
391
+ weight_in_cluster = np.zeros((n_clusters,), dtype = dtype)
392
+ for i in range (n_samples):
393
+ c = labels[i]
394
+ weight_in_cluster[c] += sample_weight[i]
370
395
cdef np.ndarray[np.npy_intp, ndim= 1 , mode= " c" ] empty_clusters = \
371
- np.where(n_samples_in_cluster == 0 )[0 ]
396
+ np.where(weight_in_cluster == 0 )[0 ]
372
397
cdef int n_empty_clusters = empty_clusters.shape[0 ]
373
398
374
- if floating is float :
375
- centers = np.zeros((n_clusters, n_features), dtype = np.float32)
376
- else :
377
- centers = np.zeros((n_clusters, n_features), dtype = np.float64)
378
-
379
399
# maybe also relocate small clusters?
380
400
381
401
if n_empty_clusters > 0 :
@@ -386,14 +406,14 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
386
406
assign_rows_csr(X, far_from_centers, empty_clusters, centers)
387
407
388
408
for i in range (n_empty_clusters):
389
- n_samples_in_cluster [empty_clusters[i]] = 1
409
+ weight_in_cluster [empty_clusters[i]] = 1
390
410
391
411
for i in range (labels.shape[0 ]):
392
412
curr_label = labels[i]
393
413
for ind in range (ind
10000
ptr[i], indptr[i + 1 ]):
394
414
j = indices[ind]
395
- centers[curr_label, j] += data[ind]
415
+ centers[curr_label, j] += data[ind] * sample_weight[i]
396
416
397
- centers /= n_samples_in_cluster [:, np.newaxis]
417
+ centers /= weight_in_cluster [:, np.newaxis]
398
418
399
419
return centers
0 commit comments