improvements to k-means (and one PCA thing) by jaberg · Pull Request #42 · scikit-learn/scikit-learn · GitHub

improvements to k-means (and one PCA thing) #42

Merged: 19 commits, Jan 14, 2011
Commits
1cb7799
k_means_ - added optional rng parameter to work routines
jaberg Jan 10, 2011
5b9605e
Centering data for k-means before fitting
jaberg Jan 10, 2011
c182b37
k-means - added verbose-level print after initialization
jaberg Jan 10, 2011
d0084d7
added faster distance-computation algorithm to k-means _e_step
jaberg Jan 10, 2011
5fcbbbf
PCA train() stores eigenvalues associated with components
jaberg Jan 12, 2011
a5af706
adding James Bergstra as author of k_means_ file
jaberg Jan 12, 2011
1563c31
k-means adding all_paris_l2_distance_squared function
jaberg Jan 12, 2011
73c6024
k-means - modified k_init to use pre-computed distances for faster, c…
jaberg Jan 12, 2011
e66a033
k-means - added support for a callable "init" argument instead of cop…
jaberg Jan 12, 2011
38bd96e
k-means - fixed misleading typo in error message
jaberg Jan 12, 2011
9242e46
k-means - added optional parameters "precompute_distances" and "x_squ…
jaberg Jan 12, 2011
21122d7
k-means - added "verbose" parameter to KMeans class
jaberg Jan 12, 2011
c2e17ac
k-means - added copy_x parameter to worker routine and BaseEstimator,…
jaberg Jan 12, 2011
e7929e6
added optional args to euclidean_distances and removed k_means_.all_p…
jaberg Jan 12, 2011
eefd4f1
fixed typo in my previous patch to PCA
jaberg Jan 13, 2011
8783af1
added PCA.inverse_transform and unit test
jaberg Jan 13, 2011
41e4bc8
added components_coefs_ (eigenvalues) member to RandomizedPCA to matc…
jaberg Jan 13, 2011
4beecdc
test_pca - modified to use assert_almost_equal
jaberg Jan 13, 2011
fb7632f
euclidian_distances - repair special case for when X is Y
jaberg Jan 13, 2011
80 changes: 61 additions & 19 deletions scikits/learn/cluster/k_means_.py
Expand Up @@ -3,20 +3,22 @@

# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Thomas Rueckstiess <ruecksti@in.tum.de>
# James Bergstra <james.bergstra@umontreal.ca>
# License: BSD

import warnings

import numpy as np

from ..base import BaseEstimator
from ..metrics.pairwise import euclidian_distances

################################################################################
# Initialisation heuristic

# kinit originaly from pybrain:
# http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
def k_init(X, k, n_samples_max=500):
def k_init(X, k, n_samples_max=500, rng=None):
"""Init k seeds according to kmeans++

Parameters
Expand Down Expand Up @@ -44,36 +46,41 @@ def k_init(X, k, n_samples_max=500):
http://blogs.sun.com/yongsun/entry/k_means_and_k_means
"""
n_samples = X.shape[0]
if rng is None:
rng = np.random

if n_samples >= n_samples_max:
X = X[np.random.randint(n_samples, size=n_samples_max)]
X = X[rng.randint(n_samples, size=n_samples_max)]
n_samples = n_samples_max

distances = euclidian_distances(X, X, squared=True)

'choose the 1st seed randomly, and store D(x)^2 in D[]'
centers = [X[np.random.randint(n_samples)]]
D = ((X - centers[0]) ** 2).sum(axis=-1)
first_idx = rng.randint(n_samples)
centers = [X[first_idx]]
D = distances[first_idx]

for _ in range(k - 1):
bestDsum = bestIdx = -1

for i in range(n_samples):
'Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)'
Dsum = np.minimum(D, ((X - X[i]) ** 2).sum(axis=-1)
).sum()
Dsum = np.minimum(D, distances[i]).sum()

if bestDsum < 0 or Dsum < bestDsum:
bestDsum, bestIdx = Dsum, i

centers.append(X[bestIdx])
D = np.minimum(D, ((X - X[bestIdx]) ** 2).sum(axis=-1))
D = np.minimum(D, distances[bestIdx])

return np.array(centers)


################################################################################
# K-means estimation by EM (expectation maximisation)

def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
delta=1e-4):
def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
delta=1e-4, rng=None, copy_x=True):
""" K-means clustering algorithm.

Parameters
Expand All @@ -96,7 +103,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
seeds. The final results will be the best output of n_init consecutive
runs in terms of inertia.

init: {'k-means++', 'random', or an ndarray}, optional
init: {'k-means++', 'random', or ndarray, or a callable}, optional
Method for initialization, default to 'k-means++':

'k-means++' : selects initial cluster centers for k-mean
Expand All @@ -115,6 +122,16 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
verbose: boolean, optional
Verbosity mode

rng: numpy.RandomState, optional
The generator used to initialize the centers. Defaults to numpy.random.

copy_x: boolean, optional
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True, then the original data is not
modified. If False, the original data is modified, and put back before
the function returns, but small numerical differences may be introduced
by subtracting and then adding the data mean.

Returns
-------
centroid: ndarray
Expand All @@ -129,6 +146,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
The final value of the inertia criterion

"""
if rng is None:
rng = np.random
n_samples = X.shape[0]

vdata = np.mean(np.var(X, 0))
Expand All @@ -139,20 +158,29 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
warnings.warn('Explicit initial center position passed: '
'performing only one init in the k-means')
n_init = 1
'subtract of mean of x for more accurate distance computations'
Xmean = X.mean(axis=0)
if copy_x:
X = X.copy()
X -= Xmean
for it in range(n_init):
# init
if init == 'k-means++':
centers = k_init(X, k)
centers = k_init(X, k, rng=rng)
elif init == 'random':
seeds = np.argsort(np.random.rand(n_samples))[:k]
seeds = np.argsort(rng.rand(n_samples))[:k]
centers = X[seeds]
elif hasattr(init, '__array__'):
centers = np.asanyarray(init).copy()
elif callable(init):
centers = init(X, k, rng=rng)
else:
raise ValueError("the init parameter for the k-means should "
"be 'k-mean++' or 'random' or an ndarray, "
"be 'k-means++' or 'random' or 10000 an ndarray, "
"'%s' (type '%s') was passed.")

if verbose:
print 'Initialization complete'
# iterations
for i in range(max_iter):
centers_old = centers.copy()
Expand All @@ -173,7 +201,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
best_centers = centers
best_labels = labels
best_inertia = inertia
return best_centers, best_labels, best_inertia
if not copy_x:
X += Xmean
return best_centers+Xmean, best_labels, best_inertia


def _m_step(x, z ,k):
Expand Down Expand Up @@ -205,7 +235,7 @@ def _m_step(x, z ,k):
return centers


def _e_step(x, centers):
def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
"""E step of the K-means EM algorithm

Computation of the input-to-cluster assignment
Expand All @@ -226,12 +256,19 @@ def _e_step(x, centers):
inertia: float
The value of the inertia criterion with the assignment
"""

n_samples = x.shape[0]
k = centers.shape[0]

if precompute_distances:
distances = euclidian_distances(centers, x, x_squared_norms, squared=True)
z = -np.ones(n_samples).astype(np.int)
mindist = np.infty * np.ones(n_samples)
k = centers.shape[0]
for q in range(k):
dist = np.sum((x - centers[q]) ** 2, 1)
if precompute_distances:
dist = distances[q]
else:
dist = np.sum((x - centers[q]) ** 2, axis=1)
z[dist<mindist] = q
mindist = np.minimum(dist, mindist)
inertia = mindist.sum()
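
The precompute_distances path above replaces the per-center pass ((x - centers[q]) ** 2).sum(axis=1) with a single euclidian_distances call that reuses the sample norms. A minimal NumPy sketch of why the two give the same assignment, using illustrative data and variable names:

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(100, 5)          # illustrative samples
centers = x[:3].copy()         # illustrative centers

# slow path: one full pass over x per center
slow = np.array([((x - c) ** 2).sum(axis=1) for c in centers])

# fast path: ||x||^2 computed once, then a single matrix product
x_squared_norms = (x ** 2).sum(axis=1)
c_squared_norms = (centers ** 2).sum(axis=1)
fast = (c_squared_norms[:, np.newaxis]
        - 2 * np.dot(centers, x.T)
        + x_squared_norms[np.newaxis, :])

assert np.allclose(slow, fast)
# the cluster assignment (argmin over centers) is therefore identical
assert (slow.argmin(axis=0) == fast.argmin(axis=0)).all()
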
Expand Down Expand Up @@ -318,18 +355,23 @@ class KMeans(BaseEstimator):
"""


def __init__(self, k=8, init='random', n_init=10, max_iter=300):
def __init__(self, k=8, init='random', n_init=10, max_iter=300,
verbose=0, rng=None, copy_x=True):
self.k = k
self.init = init
self.max_iter = max_iter
self.n_init = n_init
self.verbose = verbose
self.rng = rng
self.copy_x = copy_x

def fit(self, X, **params):
""" Compute k-means"""
X = np.asanyarray(X)
self._set_params(**params)
self.cluster_centers_, self.labels_, self.inertia_ = k_means(X,
k=self.k, init=self.init, n_init=self.n_init,
max_iter=self.max_iter)
max_iter=self.max_iter, verbose=self.verbose,
rng=self.rng, copy_x=self.copy_x)
return self

41 changes: 32 additions & 9 deletions scikits/learn/metrics/pairwise.py
Expand Up @@ -9,7 +9,9 @@
import numpy as np


def euclidian_distances(X, Y):
def euclidian_distances(X, Y,
Y_norm_squared=None,
squared=False):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
Expand All @@ -20,6 +22,12 @@ def euclidian_distances(X, Y):

Y: array of shape (n_samples_2, n_features)

Y_norm_squared: array [n_samples_2], optional
pre-computed (Y**2).sum(axis=1)

squared: boolean, optional
This routine will return squared Euclidean distances instead.

Returns
-------
distances: array of shape (n_samples_1, n_samples_2)
Expand All @@ -37,22 +45,37 @@ def euclidian_distances(X, Y):
array([[ 1. ],
[ 1.41421356]])
"""
# shortcut in the common case euclidean_distances(X, X)
compute_Y = X is not Y

X = np.asanyarray(X)
Y = np.asanyarray(Y)
# should not need X_norm_squared because if you could precompute that as
# well as Y, then you should just pre-compute the output and not even
# call this function.
if X is Y:
X = Y = np.asanyarray(X)
else:
X = np.asanyarray(X)
Y = np.asanyarray(Y)

if X.shape[1] != Y.shape[1]:
raise ValueError("Incompatible dimension for X and Y matrices")

XX = np.sum(X * X, axis=1)[:, np.newaxis]
if compute_Y:
if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is None:
YY = np.sum(Y * Y, axis=1)[np.newaxis, :]
else:
YY = XX.T
YY = np.asanyarray(Y_norm_squared)
if YY.shape != (Y.shape[0],):
raise ValueError("Incompatible dimension for Y and Y_norm_squared")
YY = YY[np.newaxis,:]

# TODO:
# a faster cython implementation would do the dot product first,
# and then add XX, add YY, and do the clipping of negative values in
# a single pass over the output matrix.
distances = XX + YY # Using broadcasting
distances -= 2 * np.dot(X, Y.T)
distances = np.maximum(distances, 0)
return np.sqrt(distances)
if squared:
return distances
else:
return np.sqrt(distances)
13 changes: 13 additions & 0 deletions scikits/learn/pca.py
Expand Up @@ -132,6 +132,9 @@ class PCA(BaseEstimator):
components_: array, [n_features, n_components]
Components with maximum variance.

components_coefs_: array, [n_components]
Eigenvalues associated with principal components_.

explained_variance_ratio_: array, [n_components]
Percentage of variance explained by each of the selected components.
k is not set then all components are stored and the sum of
Expand Down Expand Up @@ -182,15 +185,18 @@ def fit(self, X, **params):
if self.whiten:
n = X.shape[0]
self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
self.components_coefs_ = S / np.sqrt(n)
else:
self.components_ = V.T
self.components_coefs_ = np.ones_like(S)

if self.n_components == 'mle':
self.n_components = _infer_dimension_(self.explained_variance_,
n_samples, X.shape[1])

if self.n_components is not None:
self.components_ = self.components_[:, :self.n_components]
self.components_coefs_ = self.components_coefs_[:self.n_components]
self.explained_variance_ = \
self.explained_variance_[:self.n_components]
self.explained_variance_ratio_ = \
Expand All @@ -204,6 +210,11 @@ def transform(self, X):
Xr = np.dot(Xr, self.components_)
return Xr

def inverse_transform(self, Y):
"""Return an input X whose transform would be Y"""
r = np.dot(Y * self.components_coefs_, self.components_.T)
r += self.mean_
return r

class ProbabilisticPCA(PCA):
"""Additional layer on top of PCA that add a probabilistic evaluation
Expand Down Expand Up @@ -365,8 +376,10 @@ def fit(self, X, **params):
if self.whiten:
n = X.shape[0]
self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
self.components_coefs_ = S / np.sqrt(n)
else:
self.components_ = V.T
self.components_coefs_ = np.ones_like(S)

return self

Expand Down
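
With components_coefs_ stored at fit time, the new PCA.inverse_transform maps reduced data back to the input space, up to the discarded components. A minimal round-trip sketch along the lines of the added unit test, assuming the scikits.learn 0.x import path:

import numpy as np
from scikits.learn.pca import PCA

rng = np.random.RandomState(0)
x = rng.randn(50, 3)
x[:, 1] *= 1e-5           # one nearly flat direction
x += [5, 4, 3]            # non-zero mean, restored via mean_ in inverse_transform

pca = PCA(n_components=2)
pca.fit(x)
y = pca.transform(x)

# with whiten=False (the default) the stored eigenvalue coefficients are all ones
assert np.allclose(pca.components_coefs_, 1.0)

x_back = pca.inverse_transform(y)
# reconstruction error is bounded by the variance of the dropped component
assert np.abs(x - x_back).max() < 1e-3
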
14 changes: 14 additions & 0 deletions scikits/learn/tests/test_pca.py
Expand Up @@ -79,6 +79,20 @@ def test_pca_check_projection():

np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)

def test_pca_inverse():
"""Test that the projection of data can be inverted"""

n, p = 50, 3
X = randn(n,p) # spherical data
X[:,1] *= .00001 # make middle component relatively small
X += [5,4,3] # make a large mean

pca = PCA(n_components=2)
pca.fit(X)
Y = pca.transform(X)
Xlike = pca.inverse_transform(Y)
assert_almost_equal(X, Xlike, decimal=3)


def test_randomized_pca_check_projection():
"""Test that the projection by RandomizedPCA on dense data is correct"""
Expand Down