63
63
{
64
64
"X" : ["array-like" , "sparse matrix" ],
65
65
"n_clusters" : [Interval (Integral , 1 , None , closed = "left" )],
66
+ "sample_weight" : ["array-like" , None ],
66
67
"x_squared_norms" : ["array-like" , None ],
67
68
"random_state" : ["random_state" ],
68
69
"n_local_trials" : [Interval (Integral , 1 , None , closed = "left" ), None ],
69
70
}
70
71
)
71
72
def kmeans_plusplus (
72
- X , n_clusters , * , x_squared_norms = None , random_state = None , n_local_trials = None
73
+ X ,
74
+ n_clusters ,
75
+ * ,
76
+ sample_weight = None ,
77
+ x_squared_norms = None ,
78
+ random_state = None ,
79
+ n_local_trials = None ,
73
80
):
74
81
"""Init n_clusters seeds according to k-means++.
75
82
@@ -83,6 +90,13 @@ def kmeans_plusplus(
83
90
n_clusters : int
84
91
The number of centroids to initialize.
85
92
93
+ sample_weight : array-like of shape (n_samples,), default=None
94
+ The weights for each observation in `X`. If `None`, all observations
95
+ are assigned equal weight. `sample_weight` is ignored if `init`
96
+ is a callable or a user provided array.
97
+
98
+ .. versionadded:: 1.3
99
+
86
100
x_squared_norms : array-like of shape (n_samples,), default=None
87
101
Squared Euclidean norm of each data point.
88
102
@@ -125,13 +139,14 @@ def kmeans_plusplus(
125
139
... [10, 2], [10, 4], [10, 0]])
126
140
>>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)
127
141
>>> centers
128
- array([[10, 4 ],
142
+ array([[10, 2 ],
129
143
[ 1, 0]])
130
144
>>> indices
131
- array([4 , 2])
145
+ array([3 , 2])
132
146
"""
133
147
# Check data
134
148
check_array (X , accept_sparse = "csr" , dtype = [np .float64 , np .float32 ])
149
+ sample_weight = _check_sample_weight (sample_weight , X , dtype = X .dtype )
135
150
136
151
if X .shape [0 ] < n_clusters :
137
152
raise ValueError (
@@ -154,13 +169,15 @@ def kmeans_plusplus(
154
169
155
170
# Call private k-means++
156
171
centers , indices = _kmeans_plusplus (
157
- X , n_clusters , x_squared_norms , random_state , n_local_trials
172
+ X , n_clusters , x_squared_norms , sample_weight , random_state , n_local_trials
158
173
)
159
174
160
175
return centers , indices
161
176
162
177
163
- def _kmeans_plusplus (X , n_clusters , x_squared_norms , random_state , n_local_trials = None ):
178
+ def _kmeans_plusplus (
179
+ X , n_clusters , x_squared_norms , sample_weight , random_state , n_local_trials = None
180
+ ):
164
181
"""Computational component for initialization of n_clusters by
165
182
k-means++. Prior validation of data is assumed.
166
183
@@ -172,6 +189,9 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trial
172
189
n_clusters : int
173
190
The number of seeds to choose.
174
191
192
+ sample_weight : ndarray of shape (n_samples,)
193
+ The weights for each observation in `X`.
194
+
175
195
x_squared_norms : ndarray of shape (n_samples,)
176
196
Squared Euclidean norm of each data point.
177
197
@@ -206,7 +226,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trial
206
226
n_local_trials = 2 + int (np .log (n_clusters ))
207
227
208
228
# Pick first center randomly and track index of point
209
- center_id = random_state .randint (n_samples )
229
+ center_id = random_state .choice (n_samples , p = sample_weight / sample_weight . sum () )
210
230
indices = np .full (n_clusters , - 1 , dtype = int )
211
231
if sp .issparse (X ):
212
232
centers [0 ] = X [center_id ].toarray ()
@@ -218,14 +238,16 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trial
218
238
closest_dist_sq = _euclidean_distances (
219
239
centers [0 , np .newaxis ], X , Y_norm_squared = x_squared_norms , squared = True
220
240
)
221
- current_pot = closest_dist_sq . sum ()
241
+ current_pot = closest_dist_sq @ sample_weight
222
242
223
243
# Pick the remaining n_clusters-1 points
224
244
for c in range (1 , n_clusters ):
225
245
# Choose center candidates by sampling with probability proportional
226
246
# to the squared distance to the closest existing center
227
247
rand_vals = random_state .uniform (size = n_local_trials ) * current_pot
228
- candidate_ids = np .searchsorted (stable_cumsum (closest_dist_sq ), rand_vals )
248
+ candidate_ids = np .searchsorted (
249
+ stable_cumsum (sample_weight * closest_dist_sq ), rand_vals
250
+ )
229
251
# XXX: numerical imprecision can result in a candidate_id out of range
230
252
np .clip (candidate_ids , None , closest_dist_sq .size - 1 , out = candidate_ids )
231
253
@@ -236,7 +258,7 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trial
236
258
237
259
# update closest distances squared and potential for each candidate
238
260
np .minimum (closest_dist_sq , distance_to_candidates , out = distance_to_candidates )
239
- candidates_pot = distance_to_candidates . sum ( axis = 1 )
261
+ candidates_pot = distance_to_candidates @ sample_weight . reshape ( - 1 , 1 )
240
262
241
263
# Decide which candidate is the best
242
264
best_candidate = np .argmin (candidates_pot )
@@ -323,7 +345,8 @@ def k_means(
323
345
324
346
sample_weight : array-like of shape (n_samples,), default=None
325
347
The weights for each observation in `X`. If `None`, all observations
326
- are assigned equal weight.
348
+ are assigned equal weight. `sample_weight` is not used during
349
+ initialization if `init` is a callable or a user provided array.
327
350
328
351
init : {'k-means++', 'random'}, callable or array-like of shape \
329
352
(n_clusters, n_features), default='k-means++'
@@ -939,7 +962,14 @@ def _check_test_data(self, X):
939
962
return X
940
963
941
964
def _init_centroids (
942
- self , X , x_squared_norms , init , random_state , init_size = None , n_centroids = None
965
+ self ,
966
+ X ,
967
+ x_squared_norms ,
968
+ init ,
969
+ random_state ,
970
+ init_size = None ,
971
+ n_centroids = None ,
972
+ sample_weight = None ,
943
973
):
944
974
"""Compute the initial centroids.
945
975
@@ -969,6 +999,11 @@ def _init_centroids(
969
999
If left to 'None' the number of centroids will be equal to
970
1000
number of clusters to form (self.n_clusters)
971
1001
1002
+ sample_weight : ndarray of shape (n_samples,), default=None
1003
+ The weights for each observation in X. If None, all observations
1004
+ are assigned equal weight. `sample_weight` is not used during
1005
+ initialization if `init` is a callable or a user provided array.
1006
+
972
1007
Returns
973
1008
-------
974
1009
centers : ndarray of shape (n_clusters, n_features)
@@ -981,16 +1016,23 @@ def _init_centroids(
981
1016
X = X [init_indices ]
982
1017
x_squared_norms = x_squared_norms [init_indices ]
983
1018
n_samples = X .shape [0 ]
1019
+ sample_weight = sample_weight [init_indices ]
984
1020
985
1021
if isinstance (init , str ) and init == "k-means++" :
986
1022
centers , _ = _kmeans_plusplus (
987
1023
X ,
988
1024
n_clusters ,
989
1025
random_state = random_state ,
990
1026
x_squared_norms = x_squared_norms ,
1027
+ sample_weight = sample_weight ,
991
1028
)
992
1029
elif isinstance (init , str ) and init == "random" :
993
- seeds = random_state .permutation (n_samples )[:n_clusters ]
1030
+ seeds = random_state .choice (
1031
+ n_samples ,
1032
+ size = n_clusters ,
1033
+ replace = False ,
1034
+ p = sample_weight / sample_weight .sum (),
1035
+ )
994
1036
centers = X [seeds ]
995
1037
elif _is_arraylike_not_scalar (self .init ):
996
1038
centers = init
@@ -1412,7 +1454,8 @@ def fit(self, X, y=None, sample_weight=None):
1412
1454
1413
1455
sample_weight : array-like of shape (n_samples,), default=None
1414
1456
The weights for each observation in X. If None, all observations
1415
- are assigned equal weight.
1457
+ are assigned equal weight. `sample_weight` is not used during
1458
+ initialization if `init` is a callable or a user provided array.
1416
1459
1417
1460
.. versionadded:: 0.20
1418
1461
@@ -1468,7 +1511,11 @@ def fit(self, X, y=None, sample_weight=None):
1468
1511
for i in range (self ._n_init ):
1469
1512
# Initialize centers
1470
1513
centers_init = self ._init_centroids (
1471
- X , x_squared_norms = x_squared_norms , init = init , random_state = random_state
1514
+ X ,
1515
+ x_squared_norms = x_squared_norms ,
1516
+ init = init ,
1517
+ random_state = random_state ,
1518
+ sample_weight = sample_weight ,
1472
1519
)
1473
1520
if self .verbose :
1474
1521
print ("Initialization complete" )
@@ -1545,7 +1592,7 @@ def _mini_batch_step(
1545
1592
Squared euclidean norm of each data point.
1546
1593
1547
1594
sample_weight : ndarray of shape (n_samples,)
1548
- The weights for each observation in X .
1595
+ The weights for each observation in `X` .
1549
1596
1550
1597
centers : ndarray of shape (n_clusters, n_features)
1551
1598
The cluster centers before the current iteration
@@ -1818,19 +1865,19 @@ class MiniBatchKMeans(_BaseKMeans):
1818
1865
>>> kmeans = kmeans.partial_fit(X[0:6,:])
1819
1866
>>> kmeans = kmeans.partial_fit(X[6:12,:])
1820
1867
>>> kmeans.cluster_centers_
1821
- array([[2. , 1. ],
1822
- [3.5, 4.5 ]])
1868
+ array([[3.375, 3. ],
1869
+ [0.75 , 0.5 ]])
1823
1870
>>> kmeans.predict([[0, 0], [4, 4]])
1824
- array([0, 1 ], dtype=int32)
1871
+ array([1, 0 ], dtype=int32)
1825
1872
>>> # fit on the whole data
1826
1873
>>> kmeans = MiniBatchKMeans(n_clusters=2,
1827
1874
... random_state=0,
1828
1875
... batch_size=6,
1829
1876
... max_iter=10,
1830
1877
... n_init="auto").fit(X)
1831
1878
>>> kmeans.cluster_centers_
1832
- array([[3.97727273 , 2.43181818 ],
1833
- [1.125 , 1.6 ]])
1879
+ array([[3.55102041 , 2.48979592 ],
1880
+ [1.06896552 , 1. ]])
1834
1881
>>> kmeans.predict([[0, 0], [4, 4]])
1835
1882
array([1, 0], dtype=int32)
1836
1883
"""
@@ -2015,7 +2062,8 @@ def fit(self, X, y=None, sample_weight=None):
2015
2062
2016
2063
sample_weight : array-like of shape (n_samples,), default=None
2017
2064
The weights for each observation in X. If None, all observations
2018
- are assigned equal weight.
2065
+ are assigned equal weight. `sample_weight` is not used during
2066
+ initialization if `init` is a callable or a user provided array.
2019
2067
2020
2068
.. versionadded:: 0.20
2021
2069
@@ -2070,6 +2118,7 @@ def fit(self, X, y=None, sample_weight=None):
2070
2118
init = init ,
2071
2119
random_state = random_state ,
2072
2120
init_size = self ._init_size ,
2121
+ sample_weight = sample_weight ,
2073
2122
)
2074
2123
2075
2124
# Compute inertia on a validation set.
@@ -2170,7 +2219,8 @@ def partial_fit(self, X, y=None, sample_weight=None):
2170
2219
2171
2220
sample_weight : array-like of shape (n_samples,), default=None
2172
2221
The weights for each observation in X. If None, all observations
2173
- are assigned equal weight.
2222
+ are assigned equal weight. `sample_weight` is not used during
2223
+ initialization if `init` is a callable or a user provided array.
2174
2224
2175
2225
Returns
2176
2226
-------
@@ -2220,6 +2270,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
2220
2270
init = init ,
2221
2271
random_state = self ._random_state ,
2222
2272
init_size = self ._init_size ,
2273
+ sample_weight = sample_weight ,
2223
2274
)
2224
2275
2225
2276
# Initialize counts
0 commit comments