13
13
from ..base import BaseEstimator
14
14
from ..metrics .pairwise import euclidian_distances
15
15
16
- ################################################################################
16
+
17
+ ###############################################################################
17
18
# Initialisation heuristic
18
19
19
- # kinit originaly from pybrain:
20
- # http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
21
20
def k_init (X , k , n_samples_max = 500 , rng = None ):
22
21
"""Init k seeds according to kmeans++
23
22
@@ -44,6 +43,9 @@ def k_init(X, k, n_samples_max=500, rng=None):
44
43
45
44
Implementation from Yong Sun's website
46
45
http://blogs.sun.com/yongsun/entry/k_means_and_k_means
46
+
47
+ kinit originaly from pybrain:
48
+ http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
47
49
"""
48
50
n_samples = X .shape [0 ]
49
51
if rng is None :
@@ -55,8 +57,8 @@ def k_init(X, k, n_samples_max=500, rng=None):
55
57
56
58
distances = euclidian_distances (X , X , squared = True )
57
59
58
- ' choose the 1st seed randomly, and store D(x)^2 in D[]'
59
- first_idx = rng .randint (n_samples )
60
+ # choose the 1st seed randomly, and store D(x)^2 in D[]
61
+ first_idx = rng .randint (n_samples )
60
62
centers = [X [first_idx ]]
61
63
D = distances [first_idx ]
62
64
@@ -76,10 +78,10 @@ def k_init(X, k, n_samples_max=500, rng=None):
76
78
return np .array (centers )
77
79
78
80
79
- ################################################################################
81
+ ###############################################################################
80
82
# K-means estimation by EM (expectation maximisation)
81
83
82
- def k_means (X , k , init = 'k-means++' , n_init = 10 , max_iter = 300 , verbose = 0 ,
84
+ def k_means (X , k , init = 'k-means++' , n_init = 10 , max_iter = 300 , verbose = 0 ,
83
85
delta = 1e-4 , rng = None , copy_x = True ):
84
86
""" K-means clustering algorithm.
85
87
@@ -99,9 +101,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
99
101
Maximum number of iterations of the k-means algorithm to run.
100
102
101
103
n_init: int, optional, default: 10
102
- Number of time the k-means algorithm will be run with different centroid
103
- seeds. The final results will be the best output of n_init consecutive
104
- runs in terms of inertia.
104
+ Number of time the k-means algorithm will be run with different
105
+ centroid seeds. The final results will be the best output of
106
+ n_init consecutive runs in terms of inertia.
105
107
106
108
init: {'k-means++', 'random', or ndarray, or a callable}, optional
107
109
Method for initialization, default to 'k-means++':
@@ -180,7 +182,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
180
182
"'%s' (type '%s') was passed." )
181
183
182
184
if verbose :
183
- print 'Initialization complete'
185
+ print 'Initialization complete'
184
186
# iterations
185
187
for i in range (max_iter ):
186
188
centers_old = centers .copy ()
@@ -194,19 +196,19 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
194
196
break
195
197
196
198
if inertia < best_inertia :
199
+ best_labels = labels .copy ()
197
200
best_centers = centers .copy ()
198
- best_labels = labels .copy ()
199
201
best_inertia = inertia
200
202
else :
203
+ best_labels = labels
201
204
best_centers = centers
202
- best_labels = labels
203
205
best_inertia = inertia
204
- if not copy_x :
206
+ if not copy_x :
205
207
X += Xmean
206
- return best_centers + Xmean , best_labels , best_inertia
208
+ return best_centers + Xmean , best_labels , best_inertia
207
209
208
210
209
- def _m_step (x , z , k ):
211
+ def _m_step (x , z , k ):
210
212
""" M step of the K-means EM algorithm
211
213
212
214
Computation of cluster centers/means
@@ -228,10 +230,10 @@ def _m_step(x, z ,k):
228
230
dim = x .shape [1 ]
229
231
centers = np .repeat (np .reshape (x .mean (0 ), (1 , dim )), k , 0 )
230
232
for q in range (k ):
231
- if np .sum (z == q ) == 0 :
233
+ if np .sum (z == q ) == 0 :
232
234
pass
233
235
else :
234
- centers [q ] = np .mean (x [z == q ], axis = 0 )
236
+ centers [q ] = np .mean (x [z == q ], axis = 0 )
235
237
return centers
236
238
237
239
@@ -261,23 +263,21 @@ def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
261
263
k = centers .shape [0 ]
262
264
263
265
if precompute_distances :
264
- distances = euclidian_distances (centers , x , x_squared_norms , squared = True )
266
+ distances = euclidian_distances (centers , x , x_squared_norms ,
267
+ squared = True )
265
268
z = - np .ones (n_samples ).astype (np .int )
266
269
mindist = np .infty * np .ones (n_samples )
267
270
for q in range (k ):
268
271
if precompute_distances :
269
272
dist = distances [q ]
270
273
else :
271
274
dist = np .sum ((x - centers [q ]) ** 2 , axis = 1 )
272
- z [dist < mindist ] = q
275
+ z [dist < mindist ] = q
273
276
mindist = np .minimum (dist , mindist )
274
277
inertia = mindist .sum ()
275
278
return z , inertia
276
279
277
280
278
-
279
- ################################################################################
280
-
281
281
class KMeans (BaseEstimator ):
282
282
""" K-Means clustering
283
283
@@ -295,12 +295,13 @@ class KMeans(BaseEstimator):
295
295
interpreted as initial cluster to use instead.
296
296
297
297
max_iter : int
298
- Maximum number of iterations of the k-means algorithm for a single run.
298
+ Maximum number of iterations of the k-means algorithm for a
299
+ single run.
299
300
300
301
n_init: int, optional, default: 10
301
- Number of time the k-means algorithm will be run with different centroid
302
- seeds. The final results will be the best output of n_init consecutive
303
- runs in terms of inertia.
302
+ Number of time the k-means algorithm will be run with different
303
+ centroid seeds. The final results will be the best output of
304
+ n_init consecutive runs in terms of inertia.
304
305
305
306
init : {'k-means++', 'random', 'points', 'matrix'}
306
307
Method for initialization, defaults to 'k-means++':
@@ -354,7 +355,6 @@ class KMeans(BaseEstimator):
354
355
it can be useful to restart it several times.
355
356
"""
356
357
357
-
358
358
def __init__ (self , k = 8 , init = 'random' , n_init = 10 , max_iter = 300 ,
359
359
verbose = 0 , rng = None , copy_x = True ):
360
360
self .k = k
@@ -371,7 +371,7 @@ def fit(self, X, **params):
371
371
self ._set_params (** params )
372
372
self .cluster_centers_ , self .labels_ , self .inertia_ = k_means (X ,
373
373
k = self .k , init = self .init , n_init = self .n_init ,
374
- max_iter = self .max_iter , verbose = self .verbose ,
374
+ max_iter = self .max_iter , verbose = self .verbose ,
375
375
rng = self .rng , copy_x = self .copy_x )
376
376
return self
377
377
0 commit comments