@@ -13,6 +13,7 @@ import numpy as np
13
13
import scipy.sparse as sp
14
14
cimport numpy as np
15
15
cimport cython
16
+ from cython cimport floating
16
17
17
18
from ..utils.extmath import norm
18
19
from sklearn.utils.sparsefuncs_fast cimport add_row_csr
@@ -23,18 +24,19 @@ ctypedef np.int32_t INT
23
24
24
25
cdef extern from " cblas.h" :
25
26
double ddot " cblas_ddot" (int N, double * X, int incX, double * Y, int incY)
27
+ float sdot " cblas_sdot" (int N, float * X, int incX, float * Y, int incY)
26
28
27
29
np.import_array()
28
30
29
31
30
32
@ cython.boundscheck (False )
31
33
@ cython.wraparound (False )
32
34
@ cython.cdivision (True )
33
- cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE , ndim= 2 ] X,
34
- np.ndarray[DOUBLE , ndim= 1 ] x_squared_norms,
35
- np.ndarray[DOUBLE , ndim= 2 ] centers,
35
+ cpdef DOUBLE _assign_labels_array(np.ndarray[floating , ndim= 2 ] X,
36
+ np.ndarray[floating , ndim= 1 ] x_squared_norms,
37
+ np.ndarray[floating , ndim= 2 ] centers,
36
38
np.ndarray[INT, ndim= 1 ] labels,
37
- np.ndarray[DOUBLE , ndim= 1 ] distances):
39
+ np.ndarray[floating , ndim= 1 ] distances):
38
40
""" Compute label assignment and inertia for a dense array
39
41
40
42
Return the inertia (sum of squared distances to the centers).
@@ -43,33 +45,58 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
43
45
unsigned int n_clusters = centers.shape[0 ]
44
46
unsigned int n_features = centers.shape[1 ]
45
47
unsigned int n_samples = X.shape[0 ]
46
- unsigned int x_stride = X.strides[ 1 ] / sizeof(DOUBLE)
47
- unsigned int center_stride = centers.strides[ 1 ] / sizeof(DOUBLE)
48
+ unsigned int x_stride
49
+ unsigned int center_stride
48
50
unsigned int sample_idx, center_idx, feature_idx
49
51
unsigned int store_distances = 0
50
52
unsigned int k
53
+ np.ndarray[floating, ndim= 1 ] center_squared_norms
54
+ # the following variables are always double because making them floating
55
+ # does not save any memory, but makes the code much bigger
51
56
DOUBLE inertia = 0.0
52
57
DOUBLE min_dist
53
58
DOUBLE dist
54
- np.ndarray[DOUBLE, ndim= 1 ] center_squared_norms = np.zeros(
55
- n_clusters, dtype = np.float64)
59
+
60
+ if floating is float :
61
+ center_squared_norms = np.zeros(n_clusters, dtype = np.float32)
62
+ x_stride = X.strides[1 ] / sizeof(float )
63
+ center_stride = centers.strides[1 ] / sizeof(float )
64
+ elif floating is double :
65
+ center_squared_norms = np.zeros(n_clusters, dtype = np.float64)
66
+ x_stride = X.strides[1 ] / sizeof(DOUBLE)
67
+ center_stride = centers.strides[1 ] / sizeof(DOUBLE)
68
+ else :
69
+ raise ValueError (" Unknown floating type." )
56
70
57
71
if n_samples == distances.shape[0 ]:
58
72
store_distances = 1
59
73
60
74
for center_idx in range (n_clusters):
61
- center_squared_norms[center_idx] = ddot(
62
- n_features, & centers[center_idx, 0 ], center_stride,
63
- & centers[center_idx, 0 ], center_stride)
75
+ if floating is float :
76
+ center_squared_norms[center_idx] = sdot(
77
+ n_features, & centers[center_idx, 0 ], center_stride,
78
+ & centers[center_idx, 0 ], center_stride)
79
+ elif floating is double :
80
+ center_squared_norms[center_idx] = ddot(
81
+ n_features, & centers[center_idx, 0 ], center_stride,
82
+ & centers[center_idx, 0 ], center_stride)
83
+ else :
84
+ raise ValueError (" Unknown floating type." )
64
85
65
86
for sample_idx in range (n_samples):
66
87
min_dist = - 1
67
88
for center_idx in range (n_clusters):
68
89
dist = 0.0
69
90
# hardcoded: minimize euclidean distance to cluster center:
70
91
# ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
71
- dist += ddot(n_features, & X[sample_idx, 0 ], x_stride,
72
- & centers[center_idx, 0 ], center_stride)
92
+ if floating is float :
93
+ dist += sdot(n_features, & X[sample_idx, 0 ], x_stride,
94
+ & centers[center_idx, 0 ], center_stride)
95
+ elif floating is double :
96
+ dist += ddot(n_features, & X[sample_idx, 0 ], x_stride,
97
+ & centers[center_idx, 0 ], center_stride)
98
+ else :
99
+ raise ValueError (" Unknown floating type." )
73
100
dist *= - 2
74
101
dist += center_squared_norms[center_idx]
75
102
dist += x_squared_norms[sample_idx]
@@ -87,16 +114,16 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
87
114
@ cython.boundscheck (False )
88
115
@ cython.wraparound (False )
89
116
@ cython.cdivision (True )
90
- cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE , ndim= 1 ] x_squared_norms,
91
- np.ndarray[DOUBLE , ndim= 2 ] centers,
117
+ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating , ndim= 1 ] x_squared_norms,
118
+ np.ndarray[floating , ndim= 2 ] centers,
92
119
np.ndarray[INT, ndim= 1 ] labels,
93
- np.ndarray[DOUBLE , ndim= 1 ] distances):
120
+ np.ndarray[floating , ndim= 1 ] distances):
94
121
""" Compute label assignment and inertia for a CSR input
95
122
96
123
Return the inertia (sum of squared distances to the centers).
97
124
"""
98
125
cdef:
99
- np.ndarray[DOUBLE , ndim= 1 ] X_data = X.data
126
+ np.ndarray[floating , ndim= 1 ] X_data = X.data
100
127
np.ndarray[INT, ndim= 1 ] X_indices = X.indices
101
128
np.ndarray[INT, ndim= 1 ] X_indptr = X.indptr
102
129
unsigned int n_clusters = centers.shape[0 ]
@@ -105,18 +132,32 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
105
132
unsigned int store_distances = 0
106
133
unsigned int sample_idx, center_idx, feature_idx
107
134
unsigned int k
135
+ np.ndarray[floating, ndim= 1 ] center_squared_norms
136
+ # the following variables are always double because making them floating
137
+ # does not save any memory, but makes the code much bigger
108
138
DOUBLE inertia = 0.0
109
139
DOUBLE min_dist
110
140
DOUBLE dist
111
- np.ndarray[DOUBLE, ndim= 1 ] center_squared_norms = np.zeros(
112
- n_clusters, dtype = np.float64)
141
+
142
+ if floating is float :
143
+ center_squared_norms = np.zeros(n_clusters, dtype = np.float32)
144
+ elif floating is double :
145
+ center_squared_norms = np.zeros(n_clusters, dtype = np.float64)
146
+ else :
147
+ raise ValueError (" Unknown floating type." )
113
148
114
149
if n_samples == distances.shape[0 ]:
115
150
store_distances = 1
116
151
117
152
for center_idx in range (n_clusters):
118
- center_squared_norms[center_idx] = ddot(
119
- n_features, & centers[center_idx, 0 ], 1 , & centers[center_idx, 0 ], 1 )
153
+ if floating is float :
154
+ center_squared_norms[center_idx] = sdot(
155
+ n_features, & centers[center_idx, 0 ], 1 , & centers[center_idx, 0 ], 1 )
156
+ elif floating is double :
157
+ center_squared_norms[center_idx] = ddot(
158
+ n_features, & centers[center_idx, 0 ], 1 , & centers[center_idx, 0 ], 1 )
159
+ else :
160
+ raise ValueError (" Unknown floating type." )
120
161
121
162
for sample_idx in range (n_samples):
122
163
min_dist = - 1
@@ -142,18 +183,18 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
142
183
@ cython.boundscheck (False )
143
184
@ cython.wraparound (False )
144
185
@ cython.cdivision (True )
145
- def _mini_batch_update_csr (X , np.ndarray[DOUBLE , ndim = 1 ] x_squared_norms,
146
- np.ndarray[DOUBLE , ndim = 2 ] centers,
186
+ def _mini_batch_update_csr (X , np.ndarray[floating , ndim = 1 ] x_squared_norms,
187
+ np.ndarray[floating , ndim = 2 ] centers,
147
188
np.ndarray[INT , ndim = 1 ] counts,
148
189
np.ndarray[INT , ndim = 1 ] nearest_center,
149
- np.ndarray[DOUBLE , ndim = 1 ] old_center,
190
+ np.ndarray[floating , ndim = 1 ] old_center,
150
191
int compute_squared_diff ):
151
192
""" Incremental update of the centers for sparse MiniBatchKMeans.
152
193
153
194
Parameters
154
195
----------
155
196
156
- X: CSR matrix, dtype float64
197
+ X: CSR matrix, dtype float32 or float64
157
198
The complete (pre allocated) training set as a CSR matrix.
158
199
159
200
centers: array, shape (n_clusters, n_features)
@@ -179,7 +220,7 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
179
220
of the algorithm.
180
221
"""
181
222
cdef:
182
- np.ndarray[DOUBLE , ndim= 1 ] X_data = X.data
223
+ np.ndarray[floating , ndim= 1 ] X_data = X.data
183
224
np.ndarray[int , ndim= 1 ] X_indices = X.indices
184
225
np.ndarray[int , ndim= 1 ] X_indptr = X.indptr
185
226
unsigned int n_samples = X.shape[0 ]
@@ -245,9 +286,9 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
245
286
@ cython.boundscheck (False )
246
287
@ cython.wraparound (False )
247
288
@ cython.cdivision (True )
248
- def _centers_dense (np.ndarray[DOUBLE , ndim = 2 ] X,
289
+ def _centers_dense (np.ndarray[floating , ndim = 2 ] X,
249
290
np.ndarray[INT , ndim = 1 ] labels, int n_clusters ,
250
- np.ndarray[DOUBLE , ndim = 1 ] distances):
291
+ np.ndarray[floating , ndim = 1 ] distances):
251
292
""" M step of the K-means EM algorithm
252
293
253
294
Computation of cluster centers / means.
@@ -275,7 +316,14 @@ def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
275
316
n_samples = X.shape[0 ]
276
317
n_features = X.shape[1 ]
277
318
cdef int i, j, c
278
- cdef np.ndarray[DOUBLE, ndim= 2 ] centers = np.zeros((n_clusters, n_features))
319
+ cdef np.ndarray[floating, ndim= 2 ] centers
320
+ if floating is float :
321
+ centers = np.zeros((n_clusters, n_features), dtype = np.float32)
322
+ elif floating is double :
323
+ centers = np.zeros((n_clusters, n_features), dtype = np.float64)
324
+ else :
325
+ raise ValueError (" Unknown floating type." )
326
+
279
327
n_samples_in_cluster = bincount(labels, minlength = n_clusters)
280
328
empty_clusters = np.where(n_samples_in_cluster == 0 )[0 ]
281
329
# maybe also relocate small clusters?
@@ -300,7 +348,7 @@ def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
300
348
301
349
302
350
def _centers_sparse (X , np.ndarray[INT , ndim = 1 ] labels, n_clusters ,
303
- np.ndarray[DOUBLE , ndim = 1 ] distances):
351
+ np.ndarray[floating , ndim = 1 ] distances):
304
352
""" M step of the K-means EM algorithm
305
353
306
354
Computation of cluster centers / means.
@@ -327,18 +375,24 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
327
375
328
376
cdef np.npy_intp cluster_id
329
377
330
- cdef np.ndarray[DOUBLE , ndim= 1 ] data = X.data
378
+ cdef np.ndarray[floating , ndim= 1 ] data = X.data
331
379
cdef np.ndarray[int , ndim= 1 ] indices = X.indices
332
380
cdef np.ndarray[int , ndim= 1 ] indptr = X.indptr
333
381
334
- cdef np.ndarray[DOUBLE, ndim= 2 , mode= " c" ] centers = \
335
- np.zeros((n_clusters, n_features))
382
+ cdef np.ndarray[floating, ndim= 2 , mode= " c" ] centers
336
383
cdef np.ndarray[np.npy_intp, ndim= 1 ] far_from_centers
337
384
cdef np.ndarray[np.npy_intp, ndim= 1 , mode= " c" ] n_samples_in_cluster = \
338
385
bincount(labels, minlength = n_clusters)
339
386
cdef np.ndarray[np.npy_intp, ndim= 1 , mode= " c" ] empty_clusters = \
340
387
np.where(n_samples_in_cluster == 0 )[0 ]
341
388
389
+ if floating is float :
390
+ centers = np.zeros((n_clusters, n_features), dtype = np.float32)
391
+ elif floating is double :
392
+ centers = np.zeros((n_clusters, n_features), dtype = np.float64)
393
+ else :
394
+ raise ValueError (" Unknown floating type." )
395
+
342
396
# maybe also relocate small clusters?
343
397
344
398
if empty_clusters.shape[0 ] > 0 :
0 commit comments