@@ -288,131 +288,17 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
        Returned only if `return_n_iter` is set to True.

    """
-    if n_init <= 0:
-        raise ValueError("Invalid number of initializations."
-                         " n_init=%d must be bigger than zero." % n_init)
-    random_state = check_random_state(random_state)
-
-    if max_iter <= 0:
-        raise ValueError('Number of iterations should be a positive number,'
-                         ' got %d instead' % max_iter)
-
-    # avoid forcing order when copy_x=False
-    order = "C" if copy_x else None
-    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
-                    order=order, copy=copy_x)
-    # verify that the number of samples given is larger than k
-    if _num_samples(X) < n_clusters:
-        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
-            _num_samples(X), n_clusters))
-
-    tol = _tolerance(X, tol)
-
-    # If the distances are precomputed every job will create a matrix of shape
-    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
-    # activate this if the created matrix is guaranteed to be under 100MB. 12
-    # million entries consume a little under 100MB if they are of type double.
-    if precompute_distances == 'auto':
-        n_samples = X.shape[0]
-        precompute_distances = (n_clusters * n_samples) < 12e6
-    elif isinstance(precompute_distances, bool):
-        pass
-    else:
-        raise ValueError("precompute_distances should be 'auto' or True/False"
-                         ", but a value of %r was passed" %
-                         precompute_distances)
-
-    # Validate init array
-    if hasattr(init, '__array__'):
-        init = check_array(init, dtype=X.dtype.type, copy=True)
-        _validate_center_shape(X, n_clusters, init)
-
-        if n_init != 1:
-            warnings.warn(
-                'Explicit initial center position passed: '
-                'performing only one init in k-means instead of n_init=%d'
-                % n_init, RuntimeWarning, stacklevel=2)
-            n_init = 1
-
-    # subtract of mean of x for more accurate distance computations
-    if not sp.issparse(X):
-        X_mean = X.mean(axis=0)
-        # The copy was already done above
-        X -= X_mean
-
-        if hasattr(init, '__array__'):
-            init -= X_mean
-
-    # precompute squared norms of data points
-    x_squared_norms = row_norms(X, squared=True)
-
-    best_labels, best_inertia, best_centers = None, None, None
-    if n_clusters == 1:
-        # elkan doesn't make sense for a single cluster, full will produce
-        # the right result.
-        algorithm = "full"
-    if algorithm == "auto":
-        algorithm = "full" if sp.issparse(X) else 'elkan'
-    if algorithm == "full":
-        kmeans_single = _kmeans_single_lloyd
-    elif algorithm == "elkan":
-        kmeans_single = _kmeans_single_elkan
-    else:
-        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
-                         " %s" % str(algorithm))
-
-    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
-    if effective_n_jobs(n_jobs) == 1:
-        # For a single thread, less memory is needed if we just store one set
-        # of the best results (as opposed to one set per run per thread).
-        for seed in seeds:
-            # run a k-means once
-            labels, inertia, centers, n_iter_ = kmeans_single(
-                X, sample_weight, n_clusters, max_iter=max_iter, init=init,
-                verbose=verbose, precompute_distances=precompute_distances,
-                tol=tol, x_squared_norms=x_squared_norms,
-                random_state=seed)
-            # determine if these results are the best so far
-            if best_inertia is None or inertia < best_inertia:
-                best_labels = labels.copy()
-                best_centers = centers.copy()
-                best_inertia = inertia
-                best_n_iter = n_iter_
-    else:
-        # parallelisation of k-means runs
-        results = Parallel(n_jobs=n_jobs, verbose=0)(
-            delayed(kmeans_single)(X, sample_weight, n_clusters,
-                                   max_iter=max_iter, init=init,
-                                   verbose=verbose, tol=tol,
-                                   precompute_distances=precompute_distances,
-                                   x_squared_norms=x_squared_norms,
-                                   # Change seed to ensure variety
-                                   random_state=seed)
-            for seed in seeds)
-        # Get results with the lowest inertia
-        labels, inertia, centers, n_iters = zip(*results)
-        best = np.argmin(inertia)
-        best_labels = labels[best]
-        best_inertia = inertia[best]
-        best_centers = centers[best]
-        best_n_iter = n_iters[best]
-
-    if not sp.issparse(X):
-        if not copy_x:
-            X += X_mean
-        best_centers += X_mean
-
-    distinct_clusters = len(set(best_labels))
-    if distinct_clusters < n_clusters:
-        warnings.warn("Number of distinct clusters ({}) found smaller than "
-                      "n_clusters ({}). Possibly due to duplicate points "
-                      "in X.".format(distinct_clusters, n_clusters),
-                      ConvergenceWarning, stacklevel=2)

+    est = KMeans(
+        n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter,
+        verbose=verbose, precompute_distances=precompute_distances, tol=tol,
+        random_state=random_state, copy_x=copy_x, n_jobs=n_jobs,
+        algorithm=algorithm
+    ).fit(X, sample_weight=sample_weight)
    if return_n_iter:
-        return best_centers, best_labels, best_inertia, best_n_iter
+        return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_
    else:
-        return best_centers, best_labels, best_inertia
+        return est.cluster_centers_, est.labels_, est.inertia_


def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
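A minimal usage sketch of the hunk above (illustrative only; the toy data is
an assumption, not from the patch): k_means is now a thin functional wrapper
that constructs and fits a KMeans estimator, so the functional and
object-oriented APIs should agree exactly for a fixed random_state.

    import numpy as np
    from sklearn.cluster import KMeans, k_means

    X = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
    # functional API, now delegating to the estimator internally
    centers, labels, inertia = k_means(X, n_clusters=2, random_state=0)
    # object-oriented API, same code path
    est = KMeans(n_clusters=2, random_state=0).fit(X)
    assert np.allclose(centers, est.cluster_centers_)
    assert np.array_equal(labels, est.labels_)

Since both calls run the identical fit logic with the same seed, the
assertions should hold exactly, not just approximately.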
@@ -953,15 +839,144 @@ def fit(self, X, y=None, sample_weight=None):
        """
        random_state = check_random_state(self.random_state)

-        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
-            k_means(
-                X, n_clusters=self.n_clusters, sample_weight=sample_weight,
-                init=self.init, n_init=self.n_init,
-                max_iter=self.max_iter, verbose=self.verbose,
-                precompute_distances=self.precompute_distances,
-                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
-                n_jobs=self.n_jobs, algorithm=self.algorithm,
-                return_n_iter=True)
+        n_init = self.n_init
+        if n_init <= 0:
+            raise ValueError("Invalid number of initializations."
+                             " n_init=%d must be bigger than zero." % n_init)
+
+        if self.max_iter <= 0:
+            raise ValueError(
+                'Number of iterations should be a positive number,'
+                ' got %d instead' % self.max_iter
+            )
+
+        # avoid forcing order when copy_x=False
+        order = "C" if self.copy_x else None
+        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
+                        order=order, copy=self.copy_x)
+        # verify that the number of samples given is larger than k
+        if _num_samples(X) < self.n_clusters:
+            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
+                _num_samples(X), self.n_clusters))
+
+        tol = _tolerance(X, self.tol)
+
+        # If the distances are precomputed every job will create a matrix of
+        # shape (n_clusters, n_samples). To stop KMeans from eating up memory
+        # we only activate this if the created matrix is guaranteed to be
+        # under 100MB. 12 million entries consume a little under 100MB if they
+        # are of type double.
+        precompute_distances = self.precompute_distances
+        if precompute_distances == 'auto':
+            n_samples = X.shape[0]
+            precompute_distances = (self.n_clusters * n_samples) < 12e6
+        elif isinstance(precompute_distances, bool):
+            pass
+        else:
+            raise ValueError(
+                "precompute_distances should be 'auto' or True/False"
+                ", but a value of %r was passed" %
+                precompute_distances
+            )
+
+        # Validate init array
+        init = self.init
+        if hasattr(init, '__array__'):
+            init = check_array(init, dtype=X.dtype.type, copy=True)
+            _validate_center_shape(X, self.n_clusters, init)
+
+            if n_init != 1:
+                warnings.warn(
+                    'Explicit initial center position passed: '
+                    'performing only one init in k-means instead of n_init=%d'
+                    % n_init, RuntimeWarning, stacklevel=2)
+                n_init = 1
+
+        # subtract of mean of x for more accurate distance computations
+        if not sp.issparse(X):
+            X_mean = X.mean(axis=0)
+            # The copy was already done above
+            X -= X_mean
+
+            if hasattr(init, '__array__'):
+                init -= X_mean
+
+        # precompute squared norms of data points
+        x_squared_norms = row_norms(X, squared=True)
+
+        best_labels, best_inertia, best_centers = None, None, None
+        algorithm = self.algorithm
+        if self.n_clusters == 1:
+            # elkan doesn't make sense for a single cluster, full will produce
+            # the right result.
+            algorithm = "full"
+        if algorithm == "auto":
+            algorithm = "full" if sp.issparse(X) else 'elkan'
+        if algorithm == "full":
+            kmeans_single = _kmeans_single_lloyd
+        elif algorithm == "elkan":
+            kmeans_single = _kmeans_single_elkan
+        else:
+            raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
+                             " %s" % str(algorithm))
+
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        if effective_n_jobs(self.n_jobs) == 1:
+            # For a single thread, less memory is needed if we just store one
+            # set of the best results (as opposed to one set per run per
+            # thread).
+            for seed in seeds:
+                # run a k-means once
+                labels, inertia, centers, n_iter_ = kmeans_single(
+                    X, sample_weight, self.n_clusters,
+                    max_iter=self.max_iter, init=init, verbose=self.verbose,
+                    precompute_distances=precompute_distances, tol=tol,
+                    x_squared_norms=x_squared_norms, random_state=seed)
+                # determine if these results are the best so far
+                if best_inertia is None or inertia < best_inertia:
+                    best_labels = labels.copy()
+                    best_centers = centers.copy()
+                    best_inertia = inertia
+                    best_n_iter = n_iter_
+        else:
+            # parallelisation of k-means runs
+            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
+                delayed(kmeans_single)(
+                    X, sample_weight, self.n_clusters,
+                    max_iter=self.max_iter, init=init,
+                    verbose=self.verbose, tol=tol,
+                    precompute_distances=precompute_distances,
+                    x_squared_norms=x_squared_norms,
+                    # Change seed to ensure variety
+                    random_state=seed
+                )
+                for seed in seeds)
+            # Get results with the lowest inertia
+            labels, inertia, centers, n_iters = zip(*results)
+            best = np.argmin(inertia)
+            best_labels = labels[best]
+            best_inertia = inertia[best]
+            best_centers = centers[best]
+            best_n_iter = n_iters[best]
+
+        if not sp.issparse(X):
+            if not self.copy_x:
+                X += X_mean
+            best_centers += X_mean
+
+        distinct_clusters = len(set(best_labels))
+        if distinct_clusters < self.n_clusters:
+            warnings.warn(
+                "Number of distinct clusters ({}) found smaller than "
+                "n_clusters ({}). Possibly due to duplicate points "
+                "in X.".format(distinct_clusters, self.n_clusters),
+                ConvergenceWarning, stacklevel=2
+            )
+
+        self.cluster_centers_ = best_centers
+        self.labels_ = best_labels
+        self.inertia_ = best_inertia
+        self.n_iter_ = best_n_iter
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
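A second sketch of behaviour preserved by the inlined fit logic above
(illustrative only; the data and initial centers are assumptions): passing an
explicit array as init performs a single initialization and emits a
RuntimeWarning whenever n_init != 1, exactly as the moved validation code
shows.

    import warnings
    import numpy as np
    from sklearn.cluster import KMeans

    X = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
    init = np.array([[0., 0.], [10., 10.]])  # explicit initial centers

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        KMeans(n_clusters=2, init=init, n_init=10).fit(X)
    # only one init is performed, and the n_init override is reported
    assert any(issubclass(w.category, RuntimeWarning) for w in caught)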