improvements to k-means (and one PCA thing) by jaberg · Pull Request #42 · scikit-learn/scikit-learn · GitHub

improvements to k-means (and one PCA thing) #42

Merged: 19 commits, Jan 14, 2011
Commits
1cb7799
k_means_ - added optional rng parameter to work routines
jaberg Jan 10, 2011
5b9605e
Centering data for k-means before fitting
jaberg Jan 10, 2011
c182b37
k-means - added verbose-level print after initialization
jaberg Jan 10, 2011
d0084d7
added faster distance-computation algorithm to k-means _e_step
jaberg Jan 10, 2011
5fcbbbf
PCA train() stores eigenvalues associated with components
jaberg Jan 12, 2011
a5af706
adding James Bergstra as author of k_means_ file
jaberg Jan 12, 2011
1563c31
k-means adding all_paris_l2_distance_squared function
jaberg Jan 12, 2011
73c6024
k-means - modified k_init to use pre-computed distances for faster, c…
jaberg Jan 12, 2011
e66a033
k-means - added support for a callable "init" argument instead of cop…
jaberg Jan 12, 2011
38bd96e
k-means - fixed misleading typo in error message
jaberg Jan 12, 2011
9242e46
k-means - added optional parameters "precompute_distances" and "x_squ…
jaberg Jan 12, 2011
21122d7
k-means - added "verbose" parameter to KMeans class
jaberg Jan 12, 2011
c2e17ac
k-means - added copy_x parameter to worker routine and BaseEstimator,…
jaberg Jan 12, 2011
e7929e6
added optional args to euclidean_distances and removed k_means_.all_p…
jaberg Jan 12, 2011
eefd4f1
fixed typo in my previous patch to PCA
jaberg Jan 13, 2011
8783af1
added PCA.inverse_transform and unit test
jaberg Jan 13, 2011
41e4bc8
added components_coefs_ (eigenvalues) member to RandomizedPCA to matc…
jaberg Jan 13, 2011
4beecdc
test_pca - modified to use assert_almost_equal
jaberg Jan 13, 2011
fb7632f
euclidian_distances - repair special case for when X is Y
jaberg Jan 13, 2011
80 changes: 61 additions & 19 deletions scikits/learn/cluster/k_means_.py
Expand Up @@ -3,20 +3,22 @@

# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Thomas Rueckstiess <ruecksti@in.tum.de>
# James Bergstra <james.bergstra@umontreal.ca>
# License: BSD

import warnings

import numpy as np

from ..base import BaseEstimator
from ..metrics.pairwise import euclidian_distances

################################################################################
# Initialisation heuristic

# kinit originaly from pybrain:
# http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
def k_init(X, k, n_samples_max=500):
def k_init(X, k, n_samples_max=500, rng=None):
"""Init k seeds according to kmeans++

Parameters
Expand Down Expand Up @@ -44,36 +46,41 @@ def k_init(X, k, n_samples_max=500):
http://blogs.sun.com/yongsun/entry/k_means_and_k_means
"""
n_samples = X.shape[0]
if rng is None:
rng = np.random

if n_samples >= n_samples_max:
X = X[np.random.randint(n_samples, size=n_samples_max)]
X = X[rng.randint(n_samples, size=n_samples_max)]
n_samples = n_samples_max

distances = euclidian_distances(X, X, squared=True)

'choose the 1st seed randomly, and store D(x)^2 in D[]'
centers = [X[np.random.randint(n_samples)]]
D = ((X - centers[0]) ** 2).sum(axis=-1)
first_idx = rng.randint(n_samples)
centers = [X[first_idx]]
D = distances[first_idx]

for _ in range(k - 1):
bestDsum = bestIdx = -1

for i in range(n_samples):
'Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)'
Dsum = np.minimum(D, ((X - X[i]) ** 2).sum(axis=-1)
).sum()
Dsum = np.minimum(D, distances[i]).sum()

if bestDsum < 0 or Dsum < bestDsum:
bestDsum, bestIdx = Dsum, i

centers.append(X[bestIdx])
D = np.minimum(D, ((X - X[bestIdx]) ** 2).sum(axis=-1))
D = np.minimum(D, distances[bestIdx])

return np.array(centers)


################################################################################
# K-means estimation by EM (expectation maximisation)

def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
delta=1e-4):
def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
delta=1e-4, rng=None, copy_x=True):
""" K-means clustering algorithm.

Parameters
Expand All @@ -96,7 +103,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
seeds. The final results will be the best output of n_init consecutive
runs in terms of inertia.

init: {'k-means++', 'random', or an ndarray}, optional
init: {'k-means++', 'random', or ndarray, or a callable}, optional
Method for initialization, default to 'k-means++':

'k-means++' : selects initial cluster centers for k-mean
Expand All @@ -115,6 +122,16 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
verbose: boolean, optional
Verbosity mode

rng: numpy.RandomState, optional
The generator used to initialize the centers. Defaults to numpy.random.

copy_x: boolean, optional
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True, then the original data is not
modified. If False, the original data is modified, and put back before
the function returns, but small numerical differences may be introduced
by subtracting and then adding the data mean.

Returns
-------
centroid: ndarray
Expand All @@ -129,6 +146,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
The final value of the inertia criterion

"""
if rng is None:
rng = np.random
n_samples = X.shape[0]

vdata = np.mean(np.var(X, 0))
Expand All @@ -139,20 +158,29 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
warnings.warn('Explicit initial center position passed: '
'performing only one init in the k-means')
n_init = 1
'subtract of mean of x for more accurate distance computations'
Xmean = X.mean(axis=0)
if copy_x:
X = X.copy()
X -= Xmean
for it in range(n_init):
# init
if init == 'k-means++':
centers = k_init(X, k)
centers = k_init(X, k, rng=rng)
elif init == 'random':
seeds = np.argsort(np.random.rand(n_samples))[:k]
seeds = np.argsort(rng.rand(n_samples))[:k]
centers = X[seeds]
elif hasattr(init, '__array__'):
centers = np.asanyarray(init).copy()
elif callable(init):
centers = init(X, k, rng=rng)
else:
raise ValueError("the init parameter for the k-means should "
"be 'k-mean++' or 'random' or an ndarray, "
"be 'k-means++' or 'random' or 10000 an ndarray, "
"'%s' (type '%s') was passed.")

if verbose:
print 'Initialization complete'
# iterations
for i in range(max_iter):
centers_old = centers.copy()
Expand All @@ -173,7 +201,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
best_centers = centers
best_labels = labels
best_inertia = inertia
return best_centers, best_labels, best_inertia
if not copy_x:
X += Xmean
return best_centers+Xmean, best_labels, best_inertia


def _m_step(x, z ,k):
Expand Down Expand Up @@ -205,7 +235,7 @@ def _m_step(x, z ,k):
return centers


def _e_step(x, centers):
def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
"""E step of the K-means EM algorithm

Computation of the input-to-cluster assignment
Expand All @@ -226,12 +256,19 @@ def _e_step(x, centers):
inertia: float
The value of the inertia criterion with the assignment
"""

n_samples = x.shape[0]
k = centers.shape[0]

if precompute_distances:
distances = euclidian_distances(centers, x, x_squared_norms, squared=True)
z = -np.ones(n_samples).astype(np.int)
mindist = np.infty * np.ones(n_samples)
k = centers.shape[0]
for q in range(k):
dist = np.sum((x - centers[q]) ** 2, 1)
if precompute_distances:
dist = distances[q]
else:
dist = np.sum((x - centers[q]) ** 2, axis=1)
z[dist<mindist] = q
mindist = np.minimum(dist, mindist)
inertia = mindist.sum()
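
The precompute_distances path above replaces the per-center pass ((x - centers[q]) ** 2).sum(axis=1) with a single euclidian_distances call that reuses the sample norms. A minimal NumPy sketch of why the two give the same assignment, using illustrative data and variable names:

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(100, 5)          # illustrative samples
centers = x[:3].copy()         # illustrative centers

# slow path: one full pass over x per center
slow = np.array([((x - c) ** 2).sum(axis=1) for c in centers])

# fast path: ||x||^2 computed once, then a single matrix product
x_squared_norms = (x ** 2).sum(axis=1)
c_squared_norms = (centers ** 2).sum(axis=1)
fast = (c_squared_norms[:, np.newaxis]
        - 2 * np.dot(centers, x.T)
        + x_squared_norms[np.newaxis, :])

assert np.allclose(slow, fast)
# the cluster assignment (argmin over centers) is therefore identical
assert (slow.argmin(axis=0) == fast.argmin(axis=0)).all()
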
Expand Down Expand Up @@ -318,18 +355,23 @@ class KMeans(BaseEstimator):
"""


def __init__(self, k=8, init='random', n_init=10, max_iter=300):
def __init__(self, k=8, init='random', n_init=10, max_iter=300,
verbose=0, rng=None, copy_x=True):
self.k = k
self.init = init
self.max_iter = max_iter
self.n_init = n_init
self.verbose = verbose
self.rng = rng
self.copy_x = copy_x

def fit(self, X, **params):
""" Compute k-means"""
X = np.asanyarray(X)
self._set_params(**params)
self.cluster_centers_, self.labels_, self.inertia_ = k_means(X,
k=self.k, init=self.init, n_init=self.n_init,
max_iter=self.max_iter)
max_iter=self.max_iter, verbose=self.verbose,
rng=self.rng, copy_x=self.copy_x)
return self

41 changes: 32 additions & 9 deletions scikits/learn/metrics/pairwise.py
Expand Up @@ -9,7 +9,9 @@
import numpy as np


def euclidian_distances(X, Y):
def euclidian_distances(X, Y,
Y_norm_squared=None,
squared=False):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
Expand All @@ -20,6 +22,12 @@ def euclidian_distances(X, Y):

Y: array of shape (n_samples_2, n_features)

Y_norm_squared: array [n_samples_2], optional
pre-computed (Y**2).sum(axis=1)

squared: boolean, optional
This routine will return squared Euclidean distances instead.

Returns
-------
distances: array of shape (n_samples_1, n_samples_2)
Expand All @@ -37,22 +45,37 @@ def euclidian_distances(X, Y):
array([[ 1. ],
[ 1.41421356]])
"""
# shortcut in the common case euclidean_distances(X, X)
compute_Y = X is not Y

X = np.asanyarray(X)
Y = np.asanyarray(Y)
# should not need X_norm_squared because if you could precompute that as
# well as Y, then you should just pre-compute the output and not even
# call this function.
if X is Y:
X = Y = np.asanyarray(X)
else:
X = np.asanyarray(X)
Y = np.asanyarray(Y)

if X.shape[1] != Y.shape[1]:
raise ValueError("Incompatible dimension for X and Y matrices")

XX = np.sum(X * X, axis=1)[:, np.newaxis]
if compute_Y:
if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is None:
YY = np.sum(Y * Y, axis=1)[np.newaxis, :]
else:
YY = XX.T
YY = np.asanyarray(Y_norm_squared)
if YY.shape != (Y.shape[0],):
raise ValueError("Incompatible dimension for Y and Y_norm_squared")
YY = YY[np.newaxis,:]

# TODO:
# a faster cython implementation would do the dot product first,
# and then add XX, add YY, and do the clipping of negative values in
# a single pass over the output matrix.
distances = XX + YY # Using broadcasting
distances -= 2 * np.dot(X, Y.T)
distances = np.maximum(distances, 0)
return np.sqrt(distances)
if squared:
return distances
else:
return np.sqrt(distances)
13 changes: 13 additions & 0 deletions scikits/learn/pca.py
Expand Up @@ -132,6 +132,9 @@ class PCA(BaseEstimator):
components_: array, [n_features, n_components]
Components with maximum variance.

components_coefs_: array, [n_components]
Eigenvalues associated with principal components_.

explained_variance_ratio_: array, [n_components]
Percentage of variance explained by each of the selected components.
k is not set then all components are stored and the sum of
Expand Down Expand Up @@ -182,15 +185,18 @@ def fit(self, X, **params):
if self.whiten:
n = X.shape[0]
self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
self.components_coefs_ = S / np.sqrt(n)
else:
self.components_ = V.T
self.components_coefs_ = np.ones_like(S)

if self.n_components == 'mle':
self.n_components = _infer_dimension_(self.explained_variance_,
n_samples, X.shape[1])

if self.n_components is not None:
self.components_ = self.components_[:, :self.n_components]
self.components_coefs_ = self.components_coefs_[:self.n_components]
self.explained_variance_ = \
self.explained_variance_[:self.n_components]
self.explained_variance_ratio_ = \
Expand All @@ -204,6 +210,11 @@ def transform(self, X):
Xr = np.dot(Xr, self.components_)
return Xr

def inverse_transform(self, Y):
"""Return an input X whose transform would be Y"""
r = np.dot(Y * self.components_coefs_, self.components_.T)
r += self.mean_
return r

class ProbabilisticPCA(PCA):
"""Additional layer on top of PCA that add a probabilistic evaluation
Expand Down Expand Up @@ -365,8 +376,10 @@ def fit(self, X, **params):
if self.whiten:
n = X.shape[0]
self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
self.components_coefs_ = S / np.sqrt(n)
else:
self.components_ = V.T
self.components_coefs_ = np.ones_like(S)

return self

Expand Down
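
With components_coefs_ stored at fit time, the new PCA.inverse_transform maps reduced data back to the input space, up to the discarded components. A minimal round-trip sketch along the lines of the added unit test, assuming the scikits.learn 0.x import path:

import numpy as np
from scikits.learn.pca import PCA

rng = np.random.RandomState(0)
x = rng.randn(50, 3)
x[:, 1] *= 1e-5           # one nearly flat direction
x += [5, 4, 3]            # non-zero mean, restored via mean_ in inverse_transform

pca = PCA(n_components=2)
pca.fit(x)
y = pca.transform(x)

# with whiten=False (the default) the stored eigenvalue coefficients are all ones
assert np.allclose(pca.components_coefs_, 1.0)

x_back = pca.inverse_transform(y)
# reconstruction error is bounded by the variance of the dropped component
assert np.abs(x - x_back).max() < 1e-3
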
14 changes: 14 additions & 0 deletions scikits/learn/tests/test_pca.py
Expand Up @@ -79,6 +79,20 @@ def test_pca_check_projection():

np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)

def test_pca_inverse():
"""Test that the projection of data can be inverted"""

n, p = 50, 3
X = randn(n,p) # spherical data
X[:,1] *= .00001 # make middle component relatively small
X += [5,4,3] # make a large mean

pca = PCA(n_components=2)
pca.fit(X)
Y = pca.transform(X)
Xlike = pca.inverse_transform(Y)
assert_almost_equal(X, Xlike, decimal=3)


def test_randomized_pca_check_projection():
"""Test that the projection by RandomizedPCA on dense data is correct"""
Expand Down