@@ -106,8 +106,8 @@ def k_init(X, k, n_samples_max=500, rng=None):
106
106
################################################################################
107
107
# K-means estimation by EM (expectation maximisation)
108
108
109
- def k_means (X , k , init = 'k-means++' , n_init = 10 , max_iter = 300 , verbose = 0 ,
110
- delta = 1e-4 , rng = None ):
109
+ def k_means (X , k ,init = 'k-means++' , n_init = 10 , max_iter = 300 , verbose = 0 ,
110
+ delta = 1e-4 , rng = None , copy_x = True ):
111
111
""" K-means clustering algorithm.
112
112
113
113
Parameters
@@ -150,7 +150,13 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
150
150
Verbosity mode
151
151
152
152
rng: numpy.RandomState, optional
153
- The generator used to initialize the centers
153
+ The generator used to initialize the centers. Defaults to numpy.random.
154
+
155
+ copy_x: boolean, optional
156
+ When pre-computing distances it is more numerically accurate to center the data first.
157
+ If copy_x is True, then the original data is not modified. If False, the original data
158
+ is modified, and put back before the function returns, but small numerical differences
159
+ may be introduced by subtracting and then adding the data mean.
154
160
155
161
Returns
156
162
-------
@@ -180,7 +186,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
180
186
n_init = 1
181
187
'subtract of mean of x for more accurate distance computations'
182
188
Xmean = X .mean (axis = 0 )
183
- X = X - Xmean # TODO: offer an argument to allow doing this inplace
189
+ if copy_x :
190
+ X = X .copy ()
191
+ X -= Xmean
184
192
for it in range (n_init ):
185
193
# init
186
194
if init == 'k-means++' :
@@ -219,6 +227,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
219
227
best_centers = centers
220
228
best_labels = labels
221
229
best_inertia = inertia
230
+ if not copy_x :
231
+ X += Xmean
222
232
return best_centers + Xmean , best_labels , best_inertia
223
233
224
234
@@ -372,19 +382,22 @@ class KMeans(BaseEstimator):
372
382
373
383
374
384
def __init__ (self , k = 8 , init = 'random' , n_init = 10 , max_iter = 300 ,
375
- verbose = 0 ):
385
+ verbose = 0 , rng = None , copy_x = True ):
376
386
self .k = k
377
387
self .init = init
378
388
self .max_iter = max_iter
379
389
self .n_init = n_init
380
390
self .verbose = verbose
391
+ self .rng = rng
392
+ self .copy_x = copy_x
381
393
382
394
def fit (self , X , ** params ):
383
395
""" Compute k-means"""
384
396
X = np .asanyarray (X )
385
397
self ._set_params (** params )
386
398
self .cluster_centers_ , self .labels_ , self .inertia_ = k_means (X ,
387
399
k = self .k , init = self .init , n_init = self .n_init ,
388
- max_iter = self .max_iter , verbose = self .verbose )
400
+ max_iter = self .max_iter , verbose = self .verbose ,
401
+ rng = self .rng , copy_x = self .copy_x )
389
402
return self
390
403
0 commit comments