8000 Adds support and tests for KMeans/MiniBatchKMeans to work with float3… · scikit-learn/scikit-learn@5e0e925 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5e0e925

Browse files
author
Sebastian Saeger
committed
Adds support and tests for KMeans/MiniBatchKMeans to work with float32 to save memory
1 parent c9d66db commit 5e0e925

File tree

4 files changed

+371
-51
lines changed

4 files changed

+371
-51
lines changed

sklearn/cluster/_k_means.pyx

Lines changed: 87 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import numpy as np
1313
import scipy.sparse as sp
1414
cimport numpy as np
1515
cimport cython
16+
from cython cimport floating
1617

1718
from ..utils.extmath import norm
1819
from sklearn.utils.sparsefuncs_fast cimport add_row_csr
@@ -23,18 +24,19 @@ ctypedef np.int32_t INT
2324

2425
cdef extern from "cblas.h":
2526
double ddot "cblas_ddot"(int N, double *X, int incX, double *Y, int incY)
27+
float sdot "cblas_sdot"(int N, float *X, int incX, float *Y, int incY)
2628

2729
np.import_array()
2830

2931

3032
@cython.boundscheck(False)
3133
@cython.wraparound(False)
3234
@cython.cdivision(True)
33-
cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
34-
np.ndarray[DOUBLE, ndim=1] x_squared_norms,
35-
np.ndarray[DOUBLE, ndim=2] centers,
35+
cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
36+
np.ndarray[floating, ndim=1] x_squared_norms,
37+
np.ndarray[floating, ndim=2] centers,
3638
np.ndarray[INT, ndim=1] labels,
37-
np.ndarray[DOUBLE, ndim=1] distances):
39+
np.ndarray[floating, ndim=1] distances):
3840
"""Compute label assignment and inertia for a dense array
3941
4042
Return the inertia (sum of squared distances to the centers).
@@ -43,33 +45,58 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
4345
unsigned int n_clusters = centers.shape[0]
4446
unsigned int n_features = centers.shape[1]
4547
unsigned int n_samples = X.shape[0]
46-
unsigned int x_stride = X.strides[1] / sizeof(DOUBLE)
47-
unsigned int center_stride = centers.strides[1] / sizeof(DOUBLE)
48+
unsigned int x_stride
49+
unsigned int center_stride
4850
unsigned int sample_idx, center_idx, feature_idx
4951
unsigned int store_distances = 0
5052
unsigned int k
53+
np.ndarray[floating, ndim=1] center_squared_norms
54+
# the following variables are always double cause make them floating
55+
# does not save any memory, but makes the code much bigger
5156
DOUBLE inertia = 0.0
5257
DOUBLE min_dist
5358
DOUBLE dist
54-
np.ndarray[DOUBLE, ndim=1] center_squared_norms = np.zeros(
55-
n_clusters, dtype=np.float64)
59+
60+
if floating is float:
61+
center_squared_norms = np.zeros(n_clusters, dtype=np.float32)
62+
x_stride = X.strides[1] / sizeof(float)
63+
center_stride = centers.strides[1] / sizeof(float)
64+
elif floating is double:
65+
center_squared_norms = np.zeros(n_clusters, dtype=np.float64)
66+
x_stride = X.strides[1] / sizeof(DOUBLE)
67+
center_stride = centers.strides[1] / sizeof(DOUBLE)
68+
else:
69+
raise ValueError("Unknown floating type.")
5670

5771
if n_samples == distances.shape[0]:
5872
store_distances = 1
5973

6074
for center_idx in range(n_clusters):
61-
center_squared_norms[center_idx] = ddot(
62-
n_features, &centers[center_idx, 0], center_stride,
63-
&centers[center_idx, 0], center_stride)
75+
if floating is float:
76+
center_squared_norms[center_idx] = sdot(
77+
n_features, &centers[center_idx, 0], center_stride,
78+
&centers[center_idx, 0], center_stride)
79+
elif floating is double:
80+
center_squared_norms[center_idx] = ddot(
81+
n_features, &centers[center_idx, 0], center_stride,
82+
&centers[center_idx, 0], center_stride)
83+
else:
84+
raise ValueError("Unknown floating type.")
6485

6586
for sample_idx in range(n_samples):
6687
min_dist = -1
6788
for center_idx in range(n_clusters):
6889
dist = 0.0
6990
# hardcoded: minimize euclidean distance to cluster center:
7091
# ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
71-
dist += ddot(n_features, &X[sample_idx, 0], x_stride,
72-
&centers[center_idx, 0], center_stride)
92+
if floating is float:
93+
dist += sdot(n_features, &X[sample_idx, 0], x_stride,
94+
&centers[center_idx, 0], center_stride)
95+
elif floating is double:
96+
dist += ddot(n_features, &X[sample_idx, 0], x_stride,
97+
&centers[center_idx, 0], center_stride)
98+
else:
99+
raise ValueError("Unknown floating type.")
73100
dist *= -2
74101
dist += center_squared_norms[center_idx]
75102
dist += x_squared_norms[sample_idx]
@@ -87,16 +114,16 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
87114
@cython.boundscheck(False)
88115
@cython.wraparound(False)
89116
@cython.cdivision(True)
90-
cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
91-
np.ndarray[DOUBLE, ndim=2] centers,
117+
cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating, ndim=1] x_squared_norms,
118+
np.ndarray[floating, ndim=2] centers,
92119
np.ndarray[INT, ndim=1] labels,
93-
np.ndarray[DOUBLE, ndim=1] distances):
120+
np.ndarray[floating, ndim=1] distances):
94121
"""Compute label assignment and inertia for a CSR input
95122
96123
Return the inertia (sum of squared distances to the centers).
97124
"""
98125
cdef:
99-
np.ndarray[DOUBLE, ndim=1] X_data = X.data
126+
np.ndarray[floating, ndim=1] X_data = X.data
100127
np.ndarray[INT, ndim=1] X_indices = X.indices
101128
np.ndarray[INT, ndim=1] X_indptr = X.indptr
102129
unsigned int n_clusters = centers.shape[0]
@@ -105,18 +132,32 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
105132
unsigned int store_distances = 0
106133
unsigned int sample_idx, center_idx, feature_idx
107134
unsigned int k
135+
np.ndarray[floating, ndim=1] center_squared_norms
136+
# the following variables are always double cause make them floating
137+
# does not save any memory, but makes the code much bigger
108138
DOUBLE inertia = 0.0
109139
DOUBLE min_dist
110140
DOUBLE dist
111-
np.ndarray[DOUBLE, ndim=1] center_squared_norms = np.zeros(
112-
n_clusters, dtype=np.float64)
141+
142+
if floating is float:
143+
center_squared_norms = np.zeros(n_clusters, dtype=np.float32)
144+
elif floating is double:
145+
center_squared_norms = np.zeros(n_clusters, dtype=np.float64)
146+
else:
147+
raise ValueError("Unknown floating type.")
113148

114149
if n_samples == distances.shape[0]:
115150
store_distances = 1
116151

117152
for center_idx in range(n_clusters):
118-
center_squared_norms[center_idx] = ddot(
119-
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
153+
if floating is float:
154+
center_squared_norms[center_idx] = sdot(
155+
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
156+
elif floating is double:
157+
center_squared_norms[center_idx] = ddot(
158+
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
159+
else:
160+
raise ValueError("Unknown floating type.")
120161

121162
for sample_idx in range(n_samples):
122163
min_dist = -1
@@ -142,18 +183,18 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
142183
@cython.boundscheck(False)
143184
@cython.wraparound(False)
144185
@cython.cdivision(True)
145-
def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
146-
np.ndarray[DOUBLE, ndim=2] centers,
186+
def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] x_squared_norms,
187+
np.ndarray[floating, ndim=2] centers,
147188
np.ndarray[INT, ndim=1] counts,
148189
np.ndarray[INT, ndim=1] nearest_center,
149-
np.ndarray[DOUBLE, ndim=1] old_center,
190+
np.ndarray[floating, ndim=1] old_center,
150191
int compute_squared_diff):
151192
"""Incremental update of the centers for sparse MiniBatchKMeans.
152193
153194
Parameters
154195
----------
155196
156-
X: CSR matrix, dtype float64
197+
X: CSR matrix, dtype float
157198
The complete (pre allocated) training set as a CSR matrix.
158199
159200
centers: array, shape (n_clusters, n_features)
@@ -179,7 +220,7 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
179220
of the algorithm.
180221
"""
181222
cdef:
182-
np.ndarray[DOUBLE, ndim=1] X_data = X.data
223+
np.ndarray[floating, ndim=1] X_data = X.data
183224
np.ndarray[int, ndim=1] X_indices = X.indices
184225
np.ndarray[int, ndim=1] X_indptr = X.indptr
185226
unsigned int n_samples = X.shape[0]
@@ -245,9 +286,9 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
245286
@cython.boundscheck(False)
246287
@cython.wraparound(False)
247288
@cython.cdivision(True)
248-
def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
289+
def _centers_dense(np.ndarray[floating, ndim=2] X,
249290
np.ndarray[INT, ndim=1] labels, int n_clusters,
250-
np.ndarray[DOUBLE, ndim=1] distances):
291+
np.ndarray[floating, ndim=1] distances):
251292
"""M step of the K-means EM algorithm
252293
253294
Computation of cluster centers / means.
@@ -275,7 +316,14 @@ def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
275316
n_samples = X.shape[0]
276317
n_features = X.shape[1]
277318
cdef int i, j, c
278-
cdef np.ndarray[DOUBLE, ndim=2] centers = np.zeros((n_clusters, n_features))
319+
cdef np.ndarray[floating, ndim=2] centers
320+
if floating is float:
321+
centers = np.zeros((n_clusters, n_features), dtype=np.float32)
322+
elif floating is double:
323+
centers = np.zeros((n_clusters, n_features), dtype=np.float64)
324+
else:
325+
raise ValueError("Unknown floating type.")
326+
279327
n_samples_in_cluster = bincount(labels, minlength=n_clusters)
280328
empty_clusters = np.where(n_samples_in_cluster == 0)[0]
281329
# maybe also relocate small clusters?
@@ -300,7 +348,7 @@ def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
300348

301349

302350
def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
303-
np.ndarray[DOUBLE, ndim=1] distances):
351+
np.ndarray[floating, ndim=1] distances):
304352
"""M step of the K-means EM algorithm
305353
306354
Computation of cluster centers / means.
@@ -327,18 +375,24 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
327375

328376
cdef np.npy_intp cluster_id
329377

330-
cdef np.ndarray[DOUBLE, ndim=1] data = X.data
378+
cdef np.ndarray[floating, ndim=1] data = X.data
331379
cdef np.ndarray[int, ndim=1] indices = X.indices
332380
cdef np.ndarray[int, ndim=1] indptr = X.indptr
333381

334-
cdef np.ndarray[DOUBLE, ndim=2, mode="c"] centers = \
335-
np.zeros((n_clusters, n_features))
382+
cdef np.ndarray[floating, ndim=2, mode="c"] centers
336383
cdef np.ndarray[np.npy_intp, ndim=1] far_from_centers
337384
cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] n_samples_in_cluster = \
338385
bincount(labels, minlength=n_clusters)
339386
cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \
340387
np.where(n_samples_in_cluster == 0)[0]
341388

389+
if floating is float:
390+
centers = np.zeros((n_clusters, n_features), dtype=np.float32)
391+
elif floating is double:
392+
centers = np.zeros((n_clusters, n_features), dtype=np.float64)
393+
else:
394+
raise ValueError("Unknown floating type.")
395+
342396
# maybe also relocate small clusters?
343397

344398
if empty_clusters.shape[0] > 0:

sklearn/cluster/k_means_.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
7676
"""
7777
n_samples, n_features = X.shape
7878

79-
centers = np.empty((n_clusters, n_features))
79+
centers = np.empty((n_clusters, n_features), dtype=X.dtype)
8080

8181
assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
8282

@@ -435,7 +435,7 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
435435

436436
# Allocate memory to store the distances for each sample to its
437437
# closer center for reallocation in case of ties
438-
distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)
438+
distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
439439

440440
# iterations
441441
for i in range(max_iter):
@@ -542,13 +542,13 @@ def _labels_inertia(X, x_squared_norms, centers,
542542
Precomputed squared euclidean norm of each data point, to speed up
543543
computations.
544544
545-
centers: float64 array, shape (k, n_features)
545+
centers: float array, shape (k, n_features)
546546
The cluster centers.
547547
548548
precompute_distances : boolean, default: True
549549
Precompute distances (faster but takes more memory).
550550
551-
distances: float64 array, shape (n_samples,)
551+
distances: float array, shape (n_samples,)
552552
Pre-allocated array to be filled in with each sample's distance
553553
to the closest center.
554554
@@ -565,7 +565,7 @@ def _labels_inertia(X, x_squared_norms, centers,
565565
# easily
566566
labels = -np.ones(n_samples, np.int32)
567567
if distances is None:
568-
distances = np.zeros(shape=(0,), dtype=np.float64)
568+
distances = np.zeros(shape=(0,), dtype=X.dtype)
569569
# distances will be changed in-place
570570
if sp.issparse(X):
571571
inertia = _k_means._assign_labels_csr(
@@ -642,7 +642,9 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
642642
seeds = random_state.permutation(n_samples)[:k]
643643
centers = X[seeds]
644644
elif hasattr(init, '__array__'):
645-
centers = init
645+
# ensure that the centers have the same dtype as X
646+
# this is a requirement of fused types of cython
647+
centers = np.array(init, dtype=X.dtype)
646648
elif callable(init):
647649
centers = init(X, k, random_state=random_state)
648650
else:
@@ -783,7 +785,11 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
783785

784786
def _check_fit_data(self, X):
785787
"""Verify that the number of samples given is larger than k"""
786-
X = check_array(X, accept_sparse='csr', dtype=np.float64)
788+
# to handle sparse data which only works as float64 at the moment
789+
if sp.issparse(X):
790+
X = check_array(X, accept_sparse='csr', dtype=np.float64)
791+
else:
792+
X = check_array(X, dtype=None)
787793
if X.shape[0] < self.n_clusters:
788794
raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
789795
X.shape[0], self.n_clusters))
@@ -933,7 +939,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
933939
The vector in which we keep track of the numbers of elements in a
934940
cluster. This array is MODIFIED IN PLACE
935941
936-
distances : array, dtype float64, shape (n_samples), optional
942+
distances : array, dtype float, shape (n_samples), optional
937943
If not None, should be a pre-allocated array that will be used to store
938944
the distances of each sample to its closest center.
939945
May not be None when random_reassign is True.
@@ -1034,7 +1040,9 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
10341040
counts[center_idx] += count
10351041

10361042
# inplace rescale to compute mean of all points (old and new)
1037-
centers[center_idx] /= counts[center_idx]
1043+
# Note: numpy >= 1.10 does not support '/=' for the following
1044+
# expression for a mixture of int and float (see numpy issue #6464)
1045+
centers[center_idx] = centers[center_idx]/counts[center_idx]
10381046

10391047
# update the squared diff if necessary
10401048
if compute_squared_diff:
@@ -1232,15 +1240,20 @@ def fit(self, X, y=None):
12321240
Coordinates of the data points to cluster
12331241
"""
12341242
random_state = check_random_state(self.random_state)
1235-
X = check_array(X, accept_sparse="csr", order='C', dtype=np.float64)
1243+
# to handle sparse data which only works as float64 at the moment
1244+
if sp.issparse(X):
1245+
X = check_array(X, accept_sparse="csr", order='C',
1246+
dtype=np.float64)
1247+
else:
1248+
X = check_array(X, accept_sparse="csr", order='C')
12361249
n_samples, n_features = X.shape
12371250
if n_samples < self.n_clusters:
12381251
raise ValueError("Number of samples smaller than number "
12391252
"of clusters.")
12401253

12411254
n_init = self.n_init
12421255
if hasattr(self.init, '__array__'):
1243-
self.init = np.ascontiguousarray(self.init, dtype=np.float64)
1256+
self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
12441257
if n_init != 1:
12451258
warnings.warn(
12461259
'Explicit initial center position passed: '
@@ -1264,7 +1277,7 @@ def fit(self, X, y=None):
12641277
# disabled
12651278
old_center_buffer = np.zeros(0, np.double)
12661279

1267-
distances = np.zeros(self.batch_size, dtype=np.float64)
1280+
distances = np.zeros(self.batch_size, dtype=X.dtype)
12681281
n_batches = int(np.ceil(float(n_samples) / self.batch_size))
12691282
n_iter = int(self.max_iter * n_batches)
12701283

@@ -1397,7 +1410,7 @@ def partial_fit(self, X, y=None):
13971410
X = check_array(X, accept_sparse="csr")
13981411
n_samples, n_features = X.shape
13991412
if hasattr(self.init, '__array__'):
1400-
self.init = np.ascontiguousarray(self.init, dtype=np.float64)
1413+
self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
14011414

14021415
if n_samples == 0:
14031416
return self
@@ -1423,7 +1436,7 @@ def partial_fit(self, X, y=None):
14231436
# reassignment too often, to allow for building up counts
14241437
random_reassign = self.random_state_.randint(
14251438
10 * (1 + self.counts_.min())) == 0
1426-
distances = np.zeros(X.shape[0], dtype=np.float64)
1439+
distances = np.zeros(X.shape[0], dtype=X.dtype)
14271440

14281441
_mini_batch_step(X, x_squared_norms, self.cluster_centers_,
14291442
self.counts_, np.zeros(0, np.double), 0,

0 commit comments

Comments
 (0)
0