Merge branch 'master' into warningdoc · scikit-learn/scikit-learn@1084747 · GitHub
Commit 1084747

Merge branch 'master' into warningdoc
2 parents 6a6015e + d2476fb

File tree

6 files changed (+410 -313 lines)


sklearn/cluster/k_means_.py

Lines changed: 146 additions & 131 deletions

```diff
@@ -288,131 +288,17 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
         Returned only if `return_n_iter` is set to True.
 
     """
-    if n_init <= 0:
-        raise ValueError("Invalid number of initializations."
-                         " n_init=%d must be bigger than zero." % n_init)
-    random_state = check_random_state(random_state)
-
-    if max_iter <= 0:
-        raise ValueError('Number of iterations should be a positive number,'
-                         ' got %d instead' % max_iter)
-
-    # avoid forcing order when copy_x=False
-    order = "C" if copy_x else None
-    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
-                    order=order, copy=copy_x)
-    # verify that the number of samples given is larger than k
-    if _num_samples(X) < n_clusters:
-        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
-            _num_samples(X), n_clusters))
-
-    tol = _tolerance(X, tol)
-
-    # If the distances are precomputed every job will create a matrix of shape
-    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
-    # activate this if the created matrix is guaranteed to be under 100MB. 12
-    # million entries consume a little under 100MB if they are of type double.
-    if precompute_distances == 'auto':
-        n_samples = X.shape[0]
-        precompute_distances = (n_clusters * n_samples) < 12e6
-    elif isinstance(precompute_distances, bool):
-        pass
-    else:
-        raise ValueError("precompute_distances should be 'auto' or True/False"
-                         ", but a value of %r was passed" %
-                         precompute_distances)
-
-    # Validate init array
-    if hasattr(init, '__array__'):
-        init = check_array(init, dtype=X.dtype.type, copy=True)
-        _validate_center_shape(X, n_clusters, init)
-
-        if n_init != 1:
-            warnings.warn(
-                'Explicit initial center position passed: '
-                'performing only one init in k-means instead of n_init=%d'
-                % n_init, RuntimeWarning, stacklevel=2)
-            n_init = 1
-
-    # subtract of mean of x for more accurate distance computations
-    if not sp.issparse(X):
-        X_mean = X.mean(axis=0)
-        # The copy was already done above
-        X -= X_mean
-
-        if hasattr(init, '__array__'):
-            init -= X_mean
-
-    # precompute squared norms of data points
-    x_squared_norms = row_norms(X, squared=True)
-
-    best_labels, best_inertia, best_centers = None, None, None
-    if n_clusters == 1:
-        # elkan doesn't make sense for a single cluster, full will produce
-        # the right result.
-        algorithm = "full"
-    if algorithm == "auto":
-        algorithm = "full" if sp.issparse(X) else 'elkan'
-    if algorithm == "full":
-        kmeans_single = _kmeans_single_lloyd
-    elif algorithm == "elkan":
-        kmeans_single = _kmeans_single_elkan
-    else:
-        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
-                         " %s" % str(algorithm))
-
-    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
-    if effective_n_jobs(n_jobs) == 1:
-        # For a single thread, less memory is needed if we just store one set
-        # of the best results (as opposed to one set per run per thread).
-        for seed in seeds:
-            # run a k-means once
-            labels, inertia, centers, n_iter_ = kmeans_single(
-                X, sample_weight, n_clusters, max_iter=max_iter, init=init,
-                verbose=verbose, precompute_distances=precompute_distances,
-                tol=tol, x_squared_norms=x_squared_norms,
-                random_state=seed)
-            # determine if these results are the best so far
-            if best_inertia is None or inertia < best_inertia:
-                best_labels = labels.copy()
-                best_centers = centers.copy()
-                best_inertia = inertia
-                best_n_iter = n_iter_
-    else:
-        # parallelisation of k-means runs
-        results = Parallel(n_jobs=n_jobs, verbose=0)(
-            delayed(kmeans_single)(X, sample_weight, n_clusters,
-                                   max_iter=max_iter, init=init,
-                                   verbose=verbose, tol=tol,
-                                   precompute_distances=precompute_distances,
-                                   x_squared_norms=x_squared_norms,
-                                   # Change seed to ensure variety
-                                   random_state=seed)
-            for seed in seeds)
-        # Get results with the lowest inertia
-        labels, inertia, centers, n_iters = zip(*results)
-        best = np.argmin(inertia)
-        best_labels = labels[best]
-        best_inertia = inertia[best]
-        best_centers = centers[best]
-        best_n_iter = n_iters[best]
-
-    if not sp.issparse(X):
-        if not copy_x:
-            X += X_mean
-        best_centers += X_mean
-
-    distinct_clusters = len(set(best_labels))
-    if distinct_clusters < n_clusters:
-        warnings.warn("Number of distinct clusters ({}) found smaller than "
-                      "n_clusters ({}). Possibly due to duplicate points "
-                      "in X.".format(distinct_clusters, n_clusters),
-                      ConvergenceWarning, stacklevel=2)
 
+    est = KMeans(
+        n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter,
+        verbose=verbose, precompute_distances=precompute_distances, tol=tol,
+        random_state=random_state, copy_x=copy_x, n_jobs=n_jobs,
+        algorithm=algorithm
+    ).fit(X, sample_weight=sample_weight)
     if return_n_iter:
-        return best_centers, best_labels, best_inertia, best_n_iter
+        return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_
     else:
-        return best_centers, best_labels, best_inertia
+        return est.cluster_centers_, est.labels_, est.inertia_
 
 
 def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
```
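
The hunk above reduces the functional `k_means` helper to a thin wrapper around the `KMeans` estimator. A minimal sketch of the equivalence this delegation implies (toy data invented for illustration; with a shared `random_state`, both entry points now run the same fitting code):

```python
import numpy as np
from sklearn.cluster import KMeans, k_means

# Toy data invented for illustration: two well-separated pairs of points.
X = np.array([[0.0, 0.0], [0.0, 1.0],
              [10.0, 10.0], [10.0, 11.0]])

# Functional interface: returns (centers, labels, inertia).
centers, labels, inertia = k_means(X, n_clusters=2, random_state=0)

# Estimator interface: the same quantities live on fitted attributes,
# which is exactly what the rewritten k_means now forwards.
est = KMeans(n_clusters=2, random_state=0).fit(X)

assert np.isclose(inertia, est.inertia_)
assert np.allclose(centers, est.cluster_centers_)
assert np.array_equal(labels, est.labels_)
```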
```diff
@@ -953,15 +839,144 @@ def fit(self, X, y=None, sample_weight=None):
         """
         random_state = check_random_state(self.random_state)
 
-        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
-            k_means(
-                X, n_clusters=self.n_clusters, sample_weight=sample_weight,
-                init=self.init, n_init=self.n_init,
-                max_iter=self.max_iter, verbose=self.verbose,
-                precompute_distances=self.precompute_distances,
-                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
-                n_jobs=self.n_jobs, algorithm=self.algorithm,
-                return_n_iter=True)
+        n_init = self.n_init
+        if n_init <= 0:
+            raise ValueError("Invalid number of initializations."
+                             " n_init=%d must be bigger than zero." % n_init)
+
+        if self.max_iter <= 0:
+            raise ValueError(
+                'Number of iterations should be a positive number,'
+                ' got %d instead' % self.max_iter
+            )
+
+        # avoid forcing order when copy_x=False
+        order = "C" if self.copy_x else None
+        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
+                        order=order, copy=self.copy_x)
+        # verify that the number of samples given is larger than k
+        if _num_samples(X) < self.n_clusters:
+            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
+                _num_samples(X), self.n_clusters))
+
+        tol = _tolerance(X, self.tol)
+
+        # If the distances are precomputed every job will create a matrix of
+        # shape (n_clusters, n_samples). To stop KMeans from eating up memory
+        # we only activate this if the created matrix is guaranteed to be
+        # under 100MB. 12 million entries consume a little under 100MB if they
+        # are of type double.
+        precompute_distances = self.precompute_distances
+        if precompute_distances == 'auto':
+            n_samples = X.shape[0]
+            precompute_distances = (self.n_clusters * n_samples) < 12e6
+        elif isinstance(precompute_distances, bool):
+            pass
+        else:
+            raise ValueError(
+                "precompute_distances should be 'auto' or True/False"
+                ", but a value of %r was passed" %
+                precompute_distances
+            )
+
+        # Validate init array
+        init = self.init
+        if hasattr(init, '__array__'):
+            init = check_array(init, dtype=X.dtype.type, copy=True)
+            _validate_center_shape(X, self.n_clusters, init)
+
+            if n_init != 1:
+                warnings.warn(
+                    'Explicit initial center position passed: '
+                    'performing only one init in k-means instead of n_init=%d'
+                    % n_init, RuntimeWarning, stacklevel=2)
+                n_init = 1
+
+        # subtract of mean of x for more accurate distance computations
+        if not sp.issparse(X):
+            X_mean = X.mean(axis=0)
+            # The copy was already done above
+            X -= X_mean
+
+            if hasattr(init, '__array__'):
+                init -= X_mean
+
+        # precompute squared norms of data points
+        x_squared_norms = row_norms(X, squared=True)
+
+        best_labels, best_inertia, best_centers = None, None, None
+        algorithm = self.algorithm
+        if self.n_clusters == 1:
+            # elkan doesn't make sense for a single cluster, full will
+            # produce the right result.
+            algorithm = "full"
+        if algorithm == "auto":
+            algorithm = "full" if sp.issparse(X) else 'elkan'
+        if algorithm == "full":
+            kmeans_single = _kmeans_single_lloyd
+        elif algorithm == "elkan":
+            kmeans_single = _kmeans_single_elkan
+        else:
+            raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
+                             " %s" % str(algorithm))
+
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        if effective_n_jobs(self.n_jobs) == 1:
+            # For a single thread, less memory is needed if we just store one
+            # set of the best results (as opposed to one set per run per
+            # thread).
+            for seed in seeds:
+                # run a k-means once
+                labels, inertia, centers, n_iter_ = kmeans_single(
+                    X, sample_weight, self.n_clusters,
+                    max_iter=self.max_iter, init=init, verbose=self.verbose,
+                    precompute_distances=precompute_distances, tol=tol,
+                    x_squared_norms=x_squared_norms, random_state=seed)
+                # determine if these results are the best so far
+                if best_inertia is None or inertia < best_inertia:
+                    best_labels = labels.copy()
+                    best_centers = centers.copy()
+                    best_inertia = inertia
+                    best_n_iter = n_iter_
+        else:
+            # parallelisation of k-means runs
+            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
+                delayed(kmeans_single)(
+                    X, sample_weight, self.n_clusters,
+                    max_iter=self.max_iter, init=init,
+                    verbose=self.verbose, tol=tol,
+                    precompute_distances=precompute_distances,
+                    x_squared_norms=x_squared_norms,
+                    # Change seed to ensure variety
+                    random_state=seed
+                )
+                for seed in seeds)
+            # Get results with the lowest inertia
+            labels, inertia, centers, n_iters = zip(*results)
+            best = np.argmin(inertia)
+            best_labels = labels[best]
+            best_inertia = inertia[best]
+            best_centers = centers[best]
+            best_n_iter = n_iters[best]
+
+        if not sp.issparse(X):
+            if not self.copy_x:
+                X += X_mean
+            best_centers += X_mean
+
+        distinct_clusters = len(set(best_labels))
+        if distinct_clusters < self.n_clusters:
+            warnings.warn(
+                "Number of distinct clusters ({}) found smaller than "
+                "n_clusters ({}). Possibly due to duplicate points "
+                "in X.".format(distinct_clusters, self.n_clusters),
+                ConvergenceWarning, stacklevel=2
+            )
+
+        self.cluster_centers_ = best_centers
+        self.labels_ = best_labels
+        self.inertia_ = best_inertia
+        self.n_iter_ = best_n_iter
         return self
 
     def fit_predict(self, X, y=None, sample_weight=None):
```
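
The inlined `fit` body keeps the memory heuristic for `precompute_distances='auto'`: the dense `(n_clusters, n_samples)` distance matrix is precomputed only when it is guaranteed to stay under roughly 100MB of float64 entries. A standalone sketch of that resolution logic; `resolve_precompute` is a hypothetical name used for illustration, not a scikit-learn function:

```python
# Hypothetical helper mirroring the 'auto' branch in the diff above:
# 12e6 float64 entries at 8 bytes each is a little under 100MB, so the
# dense (n_clusters, n_samples) matrix is precomputed only below that size.
def resolve_precompute(precompute_distances, n_clusters, n_samples):
    if precompute_distances == 'auto':
        return (n_clusters * n_samples) < 12e6
    if isinstance(precompute_distances, bool):
        return precompute_distances
    raise ValueError("precompute_distances should be 'auto' or True/False"
                     ", but a value of %r was passed" % precompute_distances)

# 8 clusters x 1,000,000 samples = 8e6 entries: small enough, precompute.
assert resolve_precompute('auto', 8, 1_000_000) is True
# 8 clusters x 2,000,000 samples = 16e6 entries: too large, skip.
assert resolve_precompute('auto', 8, 2_000_000) is False
```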

sklearn/discriminant_analysis.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -186,7 +186,7 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin,
     coef_ : array, shape (n_features,) or (n_classes, n_features)
         Weight vector(s).
 
-    intercept_ : array, shape (n_features,)
+    intercept_ : array, shape (n_classes,)
        Intercept term.
 
     covariance_ : array-like, shape (n_features, n_features)
```
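
This docstring fix matches the fitted shapes: `LinearDiscriminantAnalysis` learns one intercept per class, not per feature. A quick check on toy data invented for illustration:

```python
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Toy data invented for illustration: 6 samples, 2 features, 3 classes.
X = np.array([[0.0, 0.0], [0.2, 1.0],
              [2.0, 2.0], [3.0, 2.2],
              [5.0, 5.0], [5.2, 6.0]])
y = np.array([0, 0, 1, 1, 2, 2])

lda = LinearDiscriminantAnalysis().fit(X, y)
print(lda.coef_.shape)       # (3, 2): (n_classes, n_features)
print(lda.intercept_.shape)  # (3,): (n_classes,), as the corrected doc says
```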
