diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 99e2a0dcd7ce1..b3d7ae82e783f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -3,6 +3,7 @@
 # Authors: Gael Varoquaux
 #          Thomas Rueckstiess
+#          James Bergstra
 # License: BSD
 
 import warnings
@@ -10,13 +11,14 @@
 import numpy as np
 
 from ..base import BaseEstimator
+from ..metrics.pairwise import euclidian_distances
 
 ################################################################################
 # Initialisation heuristic
 
 # kinit originally from pybrain:
 # http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
-def k_init(X, k, n_samples_max=500):
+def k_init(X, k, n_samples_max=500, rng=None):
     """Init k seeds according to kmeans++
 
     Parameters
@@ -44,27 +46,32 @@ def k_init(X, k, n_samples_max=500):
     http://blogs.sun.com/yongsun/entry/k_means_and_k_means
     """
     n_samples = X.shape[0]
+    if rng is None:
+        rng = np.random
+
     if n_samples >= n_samples_max:
-        X = X[np.random.randint(n_samples, size=n_samples_max)]
+        X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
+    distances = euclidian_distances(X, X, squared=True)
+
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
-    centers = [X[np.random.randint(n_samples)]]
-    D = ((X - centers[0]) ** 2).sum(axis=-1)
+    first_idx = rng.randint(n_samples)
+    centers = [X[first_idx]]
+    D = distances[first_idx]
 
     for _ in range(k - 1):
         bestDsum = bestIdx = -1
 
         for i in range(n_samples):
             'Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)'
-            Dsum = np.minimum(D, ((X - X[i]) ** 2).sum(axis=-1)
-                              ).sum()
+            Dsum = np.minimum(D, distances[i]).sum()
 
             if bestDsum < 0 or Dsum < bestDsum:
                 bestDsum, bestIdx = Dsum, i
 
         centers.append(X[bestIdx])
-        D = np.minimum(D, ((X - X[bestIdx]) ** 2).sum(axis=-1))
+        D = np.minimum(D, distances[bestIdx])
 
     return np.array(centers)
 
@@ -72,8 +79,8 @@ def k_init(X, k, n_samples_max=500):
 ################################################################################
 # K-means estimation by EM (expectation maximisation)
 
-def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
-            delta=1e-4):
+def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
+            delta=1e-4, rng=None, copy_x=True):
     """ K-means clustering algorithm.
 
     Parameters
@@ -96,7 +103,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         seeds. The final results will be the best output of
         n_init consecutive runs in terms of inertia.
 
-    init: {'k-means++', 'random', or an ndarray}, optional
+    init: {'k-means++', 'random', an ndarray, or a callable}, optional
         Method for initialization, default to 'k-means++':
 
         'k-means++' : selects initial cluster centers for k-mean
@@ -115,6 +122,16 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
     verbose: boolean, optional
         Verbosity mode
 
+    rng: numpy.RandomState, optional
+        The generator used to initialize the centers. Defaults to numpy.random.
+
+    copy_x: boolean, optional
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
+
     Returns
     -------
     centroid: ndarray
@@ -129,6 +146,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         The final value of the inertia criterion
 
     """
+    if rng is None:
+        rng = np.random
     n_samples = X.shape[0]
 
     vdata = np.mean(np.var(X, 0))
@@ -139,20 +158,29 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         warnings.warn('Explicit initial center position passed: '
                       'performing only one init in the k-means')
         n_init = 1
+    # subtract mean of X for more accurate distance computations
+    Xmean = X.mean(axis=0)
+    if copy_x:
+        X = X.copy()
+    X -= Xmean
     for it in range(n_init):
         # init
         if init == 'k-means++':
-            centers = k_init(X, k)
+            centers = k_init(X, k, rng=rng)
         elif init == 'random':
-            seeds = np.argsort(np.random.rand(n_samples))[:k]
+            seeds = np.argsort(rng.rand(n_samples))[:k]
             centers = X[seeds]
         elif hasattr(init, '__array__'):
             centers = np.asanyarray(init).copy()
+        elif callable(init):
+            centers = init(X, k, rng=rng)
         else:
             raise ValueError("the init parameter for the k-means should "
-                "be 'k-mean++' or 'random' or an ndarray, "
+                "be 'k-means++' or 'random' or an ndarray, "
                 "'%s' (type '%s') was passed.")
+        if verbose:
+            print 'Initialization complete'
 
         # iterations
         for i in range(max_iter):
             centers_old = centers.copy()
@@ -173,7 +201,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             best_centers = centers
             best_labels = labels
             best_inertia = inertia
-    return best_centers, best_labels, best_inertia
+    if not copy_x:
+        X += Xmean
+    return best_centers + Xmean, best_labels, best_inertia
 
 
 def _m_step(x, z, k):
     """M step of the K-means EM algorithm
 
     Computation of cluster centers/means
@@ -205,7 +235,7 @@ def _m_step(x, z, k):
     return centers
 
 
-def _e_step(x, centers):
+def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
     """E step of the K-means EM algorithm
 
     Computation of the input-to-cluster assignment
@@ -226,12 +256,19 @@ def _e_step(x, centers):
     inertia: float
         The value of the inertia criterion with the assignment
     """
+    n_samples = x.shape[0]
+    k = centers.shape[0]
+
+    if precompute_distances:
+        distances = euclidian_distances(centers, x, x_squared_norms, squared=True)
     z = -np.ones(n_samples).astype(np.int)
     mindist = np.infty * np.ones(n_samples)
-    k = centers.shape[0]
     for q in range(k):
-        dist = np.sum((x - centers[q]) ** 2, 1)
+        if precompute_distances:
+            dist = distances[q]
+        else:
+            dist = np.sum((x - centers[q]) ** 2, axis=1)
         z[dist < mindist] = q
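
Review note: the copy_x / Xmean handling introduced above is the subtlest part of this patch, so here is a minimal standalone sketch of the centering trick it relies on. The function and variable names below are illustrative, not part of the patch:

import numpy as np

def centered_fit_demo(X, centers):
    # Squared distances computed via the ||a||^2 - 2 a.b + ||b||^2 expansion
    # lose precision when the data sits far from the origin; centering first
    # keeps the intermediate terms small.
    Xmean = X.mean(axis=0)
    Xc = X - Xmean            # copy_x=True behaviour: the caller's X is untouched
    centers_c = centers - Xmean
    # ... the k-means EM iterations would run on Xc / centers_c here ...
    return centers_c + Xmean  # shift the fitted centers back, as k_means now does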
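The _e_step hunk is truncated above, so as a rough guide to what the precompute_distances path buys, here is a self-contained sketch of the same assignment step. The squared_distances helper below is an assumption standing in for euclidian_distances(..., squared=True), not the library function itself:

import numpy as np

def squared_distances(A, B, B_squared_norms=None):
    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2: the cross term becomes a single
    # BLAS matrix product, and precomputed row norms of B (the x_squared_norms
    # argument in the patch) can be reused across e-steps.
    if B_squared_norms is None:
        B_squared_norms = (B ** 2).sum(axis=1)
    A_squared_norms = (A ** 2).sum(axis=1)
    d = A_squared_norms[:, np.newaxis] - 2 * np.dot(A, B.T) + B_squared_norms
    return np.maximum(d, 0)  # clip small negatives caused by rounding

def assignment_step(X, centers):
    # Row q of `distances` holds the squared distances from center q to every
    # sample, mirroring _e_step with precompute_distances=True.
    distances = squared_distances(centers, X, (X ** 2).sum(axis=1))
    labels = distances.argmin(axis=0)
    inertia = distances.min(axis=0).sum()
    return labels, inertia

Computing all center-to-sample distances in one matrix product, rather than one broadcasted subtraction per center, is where most of the speedup from this patch should come from.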
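Finally, a hypothetical usage sketch for the new rng and callable init parameters; my_init and the random data are made up for illustration, and the import path assumes the scikits.learn layout this patch targets:

import numpy as np
from scikits.learn.cluster import k_means

def my_init(X, k, rng=None):
    # Any callable taking (X, k, rng=...) and returning k centers now works.
    rng = np.random if rng is None else rng
    return X[rng.permutation(X.shape[0])[:k]]

X = np.random.RandomState(42).randn(100, 2)
centers, labels, inertia = k_means(X, k=3, init=my_init,
                                   rng=np.random.RandomState(0))

Passing an explicit numpy.RandomState makes both the subsampling in k_init and the 'random' init reproducible, which was not possible before this change.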