From 1cb77993c3f3013b0ceaf87bb3dbead05c33e6fd Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:10:07 -0500
Subject: [PATCH 01/19] k_means_ - added optional rng parameter to worker
 routines

---
 scikits/learn/cluster/k_means_.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 99e2a0dcd7ce1..507b2969e5d41 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -16,7 +16,7 @@
 # kinit originally from pybrain:
 # http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
 
-def k_init(X, k, n_samples_max=500):
+def k_init(X, k, n_samples_max=500, rng=None):
     """Init k seeds according to kmeans++
 
     Parameters
@@ -44,12 +44,14 @@ def k_init(X, k, n_samples_max=500):
     http://blogs.sun.com/yongsun/entry/k_means_and_k_means
     """
     n_samples = X.shape[0]
+    if rng is None:
+        rng = np.random
     if n_samples >= n_samples_max:
-        X = X[np.random.randint(n_samples, size=n_samples_max)]
+        X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
-    centers = [X[np.random.randint(n_samples)]]
+    centers = [X[rng.randint(n_samples)]]
     D = ((X - centers[0]) ** 2).sum(axis=-1)
 
     for _ in range(k - 1):
@@ -73,7 +75,7 @@
 # K-means estimation by EM (expectation maximisation)
 
-def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
-            delta=1e-4):
+def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
+            delta=1e-4, rng=None):
     """ K-means clustering algorithm.
 
     Parameters
@@ -115,6 +117,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
     verbose: boolean, optional
         Verbosity mode
 
+    rng: numpy.RandomState, optional
+        The generator used to initialize the centers
+
     Returns
     -------
     centroid: ndarray
@@ -129,6 +134,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         The final value of the inertia criterion
 
     """
+    if rng is None:
+        rng = np.random
     n_samples = X.shape[0]
     vdata = np.mean(np.var(X, 0))
 
@@ -142,9 +149,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
     for it in range(n_init):
         # init
         if init == 'k-means++':
-            centers = k_init(X, k)
+            centers = k_init(X, k, rng=rng)
         elif init == 'random':
-            seeds = np.argsort(np.random.rand(n_samples))[:k]
+            seeds = np.argsort(rng.rand(n_samples))[:k]
             centers = X[seeds]
         elif hasattr(init, '__array__'):
            centers = np.asanyarray(init).copy()
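For illustration, a minimal sketch of what the new rng parameter enables:
passing a seeded generator makes runs reproducible. The import path and
toy data here are assumptions for the example, not part of the patch:

    import numpy as np
    from scikits.learn.cluster.k_means_ import k_means

    X = np.random.RandomState(0).randn(100, 2)  # hypothetical toy data

    # the same seed should now reproduce the same centers
    c1, _, _ = k_means(X, k=3, rng=np.random.RandomState(42))
    c2, _, _ = k_means(X, k=3, rng=np.random.RandomState(42))
    assert np.allclose(c1, c2)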
From 5b9605ee290595e2b7d5b95950b68adfa144a677 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:17:13 -0500
Subject: [PATCH 02/19] Centering data for k-means before fitting

---
 scikits/learn/cluster/k_means_.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 507b2969e5d41..8072ad64bbd8c 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -146,6 +146,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         warnings.warn('Explicit initial center position passed: '
                       'performing only one init in the k-means')
         n_init = 1
+    'subtract mean of x for more accurate distance computations'
+    Xmean = X.mean(axis=0)
+    X = X-Xmean # TODO: offer an argument to allow doing this inplace
     for it in range(n_init):
         # init
         if init == 'k-means++':
@@ -180,7 +183,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             best_centers = centers
             best_labels = labels
             best_inertia = inertia
-    return best_centers, best_labels, best_inertia
+    return best_centers+Xmean, best_labels, best_inertia
 
 
 def _m_step(x, z ,k):

From c182b3776172026783b18059d057ca61e2e35c45 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:18:38 -0500
Subject: [PATCH 03/19] k-means - added verbose-level print after
 initialization

---
 scikits/learn/cluster/k_means_.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 8072ad64bbd8c..95bc566741d57 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -163,6 +163,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
                              "be 'k-mean++' or 'random' or an ndarray, "
                              "'%s' (type '%s') was passed.")
 
+        if verbose:
+            print 'Initialization complete'
         # iterations
         for i in range(max_iter):
             centers_old = centers.copy()

From d0084d7f3c2ba52a08708adb847299370402f1ba Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:19:49 -0500
Subject: [PATCH 04/19] added faster distance-computation algorithm to k-means
 _e_step

---
 scikits/learn/cluster/k_means_.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 95bc566741d57..38d2db5fc8dbd 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -238,12 +238,26 @@ def _e_step(x, centers):
     inertia: float
         The value of the inertia criterion with the assignment
 
     """
+    n_samples = x.shape[0]
+    k = centers.shape[0]
+
+    there_is_memory_to_compute_distances_matrix = True
+
+    if there_is_memory_to_compute_distances_matrix:
+        distances = (
+                (x**2).sum(axis=1)
+                + (centers**2).sum(axis=1).reshape((k,1))
+                - 2*np.dot(centers, x.T))
+        # distances is a matrix of shape (k, n_samples)
+
     z = -np.ones(n_samples).astype(np.int)
     mindist = np.infty * np.ones(n_samples)
-    k = centers.shape[0]
     for q in range(k):
-        dist = np.sum((x - centers[q]) ** 2, 1)
+        if there_is_memory_to_compute_distances_matrix:
+            dist = distances[q]
+        else:
+            dist = np.sum((x - centers[q]) ** 2, 1)
         z[dist < mindist] = q
         mindist = np.minimum(dist, mindist)
     inertia = mindist.sum()
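The speedup above rests on the expansion ||x - c||^2 = ||x||^2 + ||c||^2 - 2 x.c,
which turns the per-center loop into one matrix product. A quick sketch
checking the precomputed matrix against the brute-force sums (the data here
is invented):

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.randn(10, 4)        # samples
    centers = rng.randn(3, 4)   # cluster centers

    fast = ((x**2).sum(axis=1)
            + (centers**2).sum(axis=1).reshape((3, 1))
            - 2*np.dot(centers, x.T))
    slow = np.array([((x - c)**2).sum(axis=1) for c in centers])
    assert np.allclose(fast, slow)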
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:39:37 -0500
Subject: [PATCH 05/19] PCA train() stores eigenvalues associated with
 components

---
 scikits/learn/pca.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index 38fc4fe1bbe18..916b9c5187ee4 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -132,6 +132,9 @@ class PCA(BaseEstimator):
     components_: array, [n_features, n_components]
         Components with maximum variance.
 
+    components_coefs_: array, [n_components]
+        Eigenvalues associated with principal components_.
+
     explained_variance_ratio_: array, [n_components]
         Percentage of variance explained by each of the selected components.
         If k is not set then all components are stored and the sum of
@@ -182,8 +185,10 @@ def fit(self, X, **params):
         if self.whiten:
             n = X.shape[0]
             self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
+            self.components_coefs_ = coefs = S / np.sqrt(n)
         else:
             self.components_ = V.T
+            self.components_coefs = np.ones_like(S, dtype=V.dtype)
 
         if self.n_components == 'mle':
             self.n_components = _infer_dimension_(self.explained_variance_,
@@ -191,6 +196,7 @@ def fit(self, X, **params):
 
         if self.n_components is not None:
             self.components_ = self.components_[:, :self.n_components]
+            self.components_coefs_ = self.components_coefs_[:self.n_components]
             self.explained_variance_ = \
                 self.explained_variance_[:self.n_components]
             self.explained_variance_ratio_ = \

From a5af706bce8957011bf4e63da6c6e8be4a57c48e Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:49:56 -0500
Subject: [PATCH 06/19] adding James Bergstra as author of k_means_ file

---
 scikits/learn/cluster/k_means_.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 38d2db5fc8dbd..794fa9471753f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -3,6 +3,7 @@
 
 # Authors: Gael Varoquaux
 #          Thomas Rueckstiess
+#          James Bergstra
 # License: BSD
 
 import warnings

From 1563c3185ce4527671d7b44879e0ec362a1287c1 Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:50:57 -0500
Subject: [PATCH 07/19] k-means - adding all_pairs_l2_distance_squared function

---
 scikits/learn/cluster/k_means_.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 794fa9471753f..9176b48e95a47 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -12,6 +12,34 @@
 import numpy as np
 
 from ..base import BaseEstimator
 
+def all_pairs_l2_distance_squared(A, B, B_norm_squared=None):
+    """
+    Returns the squared l2 norms of the differences between rows of A and B.
+
+    Parameters
+    ----------
+    A: array, [n_rows_A, n_cols]
+
+    B: array, [n_rows_B, n_cols]
+
+    B_norm_squared: array [n_rows_B], or None
+        pre-computed (B**2).sum(axis=1)
+
+    Returns
+    -------
+
+    array [n_rows_A, n_rows_B]
+        entry [i,j] is ((A[i] - B[j])**2).sum()
+
+    """
+    if B_norm_squared is None:
+        B_norm_squared = (B**2).sum(axis=1)
+    if A is B:
+        A_norm_squared = B_norm_squared
+    else:
+        A_norm_squared = (A**2).sum(axis=1)
+    return (B_norm_squared + A_norm_squared.reshape((A.shape[0],1))
+            - 2*np.dot(A, B.T))
 
 ################################################################################
 # Initialisation heuristic
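A short usage sketch of the new helper (shapes and data invented for the
example); the B_norm_squared argument lets callers reuse precomputed norms:

    import numpy as np
    from scikits.learn.cluster.k_means_ import all_pairs_l2_distance_squared

    rng = np.random.RandomState(0)
    A = rng.randn(5, 3)
    B = rng.randn(7, 3)

    D = all_pairs_l2_distance_squared(A, B)
    assert D.shape == (5, 7)
    assert np.allclose(D[2, 4], ((A[2] - B[4])**2).sum())

    # precomputing the B norms once gives the same result
    B_norms = (B**2).sum(axis=1)
    assert np.allclose(D, all_pairs_l2_distance_squared(A, B, B_norms))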
From 73c6024f223231928f9c6acf8a6e8cded81e4a9d Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:52:47 -0500
Subject: [PATCH 08/19] k-means - modified k_init to use pre-computed
 distances for faster, clearer code

---
 scikits/learn/cluster/k_means_.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 9176b48e95a47..fc90ae236900f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -75,27 +75,30 @@ def k_init(X, k, n_samples_max=500, rng=None):
     n_samples = X.shape[0]
     if rng is None:
         rng = np.random
+
     if n_samples >= n_samples_max:
         X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
+    distances = all_pairs_l2_distance_squared(X, X)
+
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
-    centers = [X[rng.randint(n_samples)]]
-    D = ((X - centers[0]) ** 2).sum(axis=-1)
+    first_idx = rng.randint(n_samples)
+    centers = [X[first_idx]]
+    D = distances[first_idx]
 
     for _ in range(k - 1):
         bestDsum = bestIdx = -1
 
         for i in range(n_samples):
             'Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)'
-            Dsum = np.minimum(D, ((X - X[i]) ** 2).sum(axis=-1)
-                              ).sum()
+            Dsum = np.minimum(D, distances[i]).sum()
 
             if bestDsum < 0 or Dsum < bestDsum:
                 bestDsum, bestIdx = Dsum, i
 
         centers.append(X[bestIdx])
-        D = np.minimum(D, ((X - X[bestIdx]) ** 2).sum(axis=-1))
+        D = np.minimum(D, distances[bestIdx])
 
     return np.array(centers)

From e66a033095aa7b4a3d51672abeb8c5a5f79e09fa Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:55:40 -0500
Subject: [PATCH 09/19] k-means - added support for a callable "init" argument
 instead of copying all the k_init parameters as optional arguments - invite
 user to use a lambda or something

---
 scikits/learn/cluster/k_means_.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index fc90ae236900f..8f67e36234da0 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -130,7 +130,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         seeds. The final results will be the best output of n_init
         consecutive runs in terms of inertia.
 
-    init: {'k-means++', 'random', or an ndarray}, optional
+    init: {'k-means++', 'random', or ndarray, or a callable}, optional
         Method for initialization, default to 'k-means++':
 
         'k-means++' : selects initial cluster centers for k-mean
@@ -190,6 +190,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             centers = X[seeds]
         elif hasattr(init, '__array__'):
             centers = np.asanyarray(init).copy()
+        elif callable(init):
+            centers = init(X, k, rng=rng)
         else:
             raise ValueError("the init parameter for the k-means should "
                              "be 'k-mean++' or 'random' or an ndarray, "
                              "'%s' (type '%s') was passed.")
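A sketch of the callable init in use: the function is invoked as
init(X, k, rng=rng), so a small wrapper or lambda can carry any extra
options. The chooser below is hypothetical:

    import numpy as np
    from scikits.learn.cluster.k_means_ import k_means

    def random_rows(X, k, rng=None):
        # pick k rows of X at random as the initial centers
        if rng is None:
            rng = np.random
        return X[np.argsort(rng.rand(X.shape[0]))[:k]]

    X = np.random.RandomState(1).randn(100, 2)
    centers, labels, inertia = k_means(X, k=3, init=random_rows)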
From 38bd96e207f4a25efc28774d587d7e611de6204d Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:56:35 -0500
Subject: [PATCH 10/19] k-means - fixed misleading typo in error message

---
 scikits/learn/cluster/k_means_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 8f67e36234da0..6346c2bdf926f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -194,7 +194,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             centers = init(X, k, rng=rng)
         else:
             raise ValueError("the init parameter for the k-means should "
-                             "be 'k-mean++' or 'random' or an ndarray, "
+                             "be 'k-means++' or 'random' or an ndarray, "
                              "'%s' (type '%s') was passed.")
 
         if verbose:

From 9242e46cc45400f855c7350a2d5e8bf5f7e94a39 Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:59:51 -0500
Subject: [PATCH 11/19] k-means - added optional parameters
 "precompute_distances" and "x_squared_norms"

---
 scikits/learn/cluster/k_means_.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 6346c2bdf926f..7dbec0709b9a7 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -251,7 +251,7 @@ def _m_step(x, z ,k):
     return centers
 
 
-def _e_step(x, centers):
+def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
     """E step of the K-means EM algorithm
 
     Computation of the input-to-cluster assignment
@@ -276,22 +276,15 @@ def _e_step(x, centers):
     n_samples = x.shape[0]
     k = centers.shape[0]
 
-    there_is_memory_to_compute_distances_matrix = True
-
-    if there_is_memory_to_compute_distances_matrix:
-        distances = (
-                (x**2).sum(axis=1)
-                + (centers**2).sum(axis=1).reshape((k,1))
-                - 2*np.dot(centers, x.T))
-        # distances is a matrix of shape (k, n_samples)
-
+    if precompute_distances:
+        distances = all_pairs_l2_distance_squared(centers, x, x_squared_norms)
     z = -np.ones(n_samples).astype(np.int)
     mindist = np.infty * np.ones(n_samples)
     for q in range(k):
-        if there_is_memory_to_compute_distances_matrix:
+        if precompute_distances:
             dist = distances[q]
         else:
-            dist = np.sum((x - centers[q]) ** 2, 1)
+            dist = np.sum((x - centers[q]) ** 2, axis=1)
         z[dist < mindist] = q
         mindist = np.minimum(dist, mindist)
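A sketch of how the two new parameters might be exercised (note that
_e_step is a private helper and its return value is inferred from the
docstring; the data is invented). x_squared_norms lets a caller compute
the sample norms once and reuse them across iterations, while
precompute_distances=False trades speed for memory:

    import numpy as np
    from scikits.learn.cluster.k_means_ import _e_step

    rng = np.random.RandomState(2)
    x = rng.randn(200, 5)
    centers = x[:4].copy()

    x_sq = (x**2).sum(axis=1)  # computed once, shared across E steps
    z_fast, inertia_fast = _e_step(x, centers, x_squared_norms=x_sq)
    z_slow, inertia_slow = _e_step(x, centers, precompute_distances=False)
    # the two paths agree up to floating-point rounding
    assert np.allclose(inertia_fast, inertia_slow)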
From: "james.bergstra"
Date: Tue, 11 Jan 2011 22:00:58 -0500
Subject: [PATCH 12/19] k-means - added "verbose" parameter to KMeans class

---
 scikits/learn/cluster/k_means_.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 7dbec0709b9a7..05117fb340618 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -371,11 +371,13 @@ class KMeans(BaseEstimator):
 
     """
 
-    def __init__(self, k=8, init='random', n_init=10, max_iter=300):
+    def __init__(self, k=8, init='random', n_init=10, max_iter=300,
+                 verbose=0):
         self.k = k
         self.init = init
         self.max_iter = max_iter
         self.n_init = n_init
+        self.verbose = verbose
 
     def fit(self, X, **params):
         """ Compute k-means"""
@@ -383,6 +385,6 @@ def fit(self, X, **params):
         self._set_params(**params)
         self.cluster_centers_, self.labels_, self.inertia_ = k_means(X,
                     k=self.k, init=self.init, n_init=self.n_init,
-                    max_iter=self.max_iter)
+                    max_iter=self.max_iter, verbose=self.verbose)
         return self
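With the estimator now forwarding verbose, progress can be printed from the
object API as well. A sketch, assuming KMeans is exported from
scikits.learn.cluster and using invented data:

    import numpy as np
    from scikits.learn.cluster import KMeans

    X = np.random.RandomState(3).randn(300, 2)
    km = KMeans(k=4, verbose=1).fit(X)  # prints 'Initialization complete', etc.
    print km.inertia_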
Returns ------- @@ -180,7 +186,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0, n_init = 1 'subtract of mean of x for more accurate distance computations' Xmean = X.mean(axis=0) - X = X-Xmean # TODO: offer an argument to allow doing this inplace + if copy_x: + X = X.copy() + X -= Xmean for it in range(n_init): # init if init == 'k-means++': @@ -219,6 +227,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0, best_centers = centers best_labels = labels best_inertia = inertia + if not copy_x: + X += Xmean return best_centers+Xmean, best_labels, best_inertia @@ -372,12 +382,14 @@ class KMeans(BaseEstimator): def __init__(self, k=8, init='random', n_init=10, max_iter=300, - verbose=0): + verbose=0, rng=None, copy_x=True): self.k = k self.init = init self.max_iter = max_iter self.n_init = n_init self.verbose = verbose + self.rng = rng + self.copy_x = copy_x def fit(self, X, **params): """ Compute k-means""" @@ -385,6 +397,7 @@ def fit(self, X, **params): self._set_params(**params) self.cluster_centers_, self.labels_, self.inertia_ = k_means(X, k=self.k, init=self.init, n_init=self.n_init, - max_iter=self.max_iter, verbose=self.verbose) + max_iter=self.max_iter, verbose=self.verbose, + rng=self.rng, copy_x=self.copy_x) return self From e7929e61e8844b476943d4aacb50200f523d43b3 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 12 Jan 2011 16:31:13 -0500 Subject: [PATCH 14/19] added optional args to euclidean_distances and removed k_means_.all_pairs_l2_distances_squared --- scikits/learn/cluster/k_means_.py | 44 +++++++------------------------ scikits/learn/metrics/pairwise.py | 33 ++++++++++++++++++----- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py index d0ac1dcac13e3..b3d7ae82e783f 100644 --- a/scikits/learn/cluster/k_means_.py +++ b/scikits/learn/cluster/k_means_.py @@ -11,34 +11,7 @@ import numpy as np from ..base import BaseEstimator - -def all_pairs_l2_distance_squared(A, B, B_norm_squared=None): - """ - Returns the squared l2 norms of the differences between rows of A and B. 
From e7929e61e8844b476943d4aacb50200f523d43b3 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Wed, 12 Jan 2011 16:31:13 -0500
Subject: [PATCH 14/19] added optional args to euclidean_distances and removed
 k_means_.all_pairs_l2_distance_squared

---
 scikits/learn/cluster/k_means_.py | 44 +++++++------------------------
 scikits/learn/metrics/pairwise.py | 33 ++++++++++++++++++-----
 2 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index d0ac1dcac13e3..b3d7ae82e783f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -11,34 +11,7 @@
 import numpy as np
 
 from ..base import BaseEstimator
-
-def all_pairs_l2_distance_squared(A, B, B_norm_squared=None):
-    """
-    Returns the squared l2 norms of the differences between rows of A and B.
-
-    Parameters
-    ----------
-    A: array, [n_rows_A, n_cols]
-
-    B: array, [n_rows_B, n_cols]
-
-    B_norm_squared: array [n_rows_B], or None
-        pre-computed (B**2).sum(axis=1)
-
-    Returns
-    -------
-
-    array [n_rows_A, n_rows_B]
-        entry [i,j] is ((A[i] - B[j])**2).sum()
-
-    """
-    if B_norm_squared is None:
-        B_norm_squared = (B**2).sum(axis=1)
-    if A is B:
-        A_norm_squared = B_norm_squared
-    else:
-        A_norm_squared = (A**2).sum(axis=1)
-    return (B_norm_squared + A_norm_squared.reshape((A.shape[0],1))
-            - 2*np.dot(A, B.T))
+from ..metrics.pairwise import euclidian_distances
 
 ################################################################################
 # Initialisation heuristic
@@ -80,7 +53,7 @@ def k_init(X, k, n_samples_max=500, rng=None):
         X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
-    distances = all_pairs_l2_distance_squared(X, X)
+    distances = euclidian_distances(X, X, squared=True)
 
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
     first_idx = rng.randint(n_samples)
@@ -106,7 +79,7 @@
 ################################################################################
 # K-means estimation by EM (expectation maximisation)
 
-def k_means(X, k,init='k-means++', n_init=10, max_iter=300, verbose=0,
+def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             delta=1e-4, rng=None, copy_x=True):
     """ K-means clustering algorithm.
@@ -153,10 +126,11 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         The generator used to initialize the centers. Defaults to numpy.random.
 
     copy_x: boolean, optional
-        When pre-computing distances it is more numerically accurate to center the data first.
-        If copy_x is True, then the original data is not modified. If False, the original data
-        is modified, and put back before the function returns, but small numerical differences
-        may be introduced by subtracting and then adding the data mean.
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
 
     Returns
     -------
@@ -287,7 +261,7 @@ def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
     k = centers.shape[0]
 
     if precompute_distances:
-        distances = all_pairs_l2_distance_squared(centers, x, x_squared_norms)
+        distances = euclidian_distances(centers, x, x_squared_norms, squared=True)
diff --git a/scikits/learn/metrics/pairwise.py b/scikits/learn/metrics/pairwise.py
index cc9c486ac67ce..ad74cd6b8b88b 100644
--- a/scikits/learn/metrics/pairwise.py
+++ b/scikits/learn/metrics/pairwise.py
@@ -9,7 +9,9 @@
 import numpy as np
 
 
-def euclidian_distances(X, Y):
+def euclidian_distances(X, Y,
+                        Y_norm_squared=None,
+                        squared=False):
     """
     Considering the rows of X (and Y=X) as vectors, compute the
     distance matrix between each pair of vectors.
@@ -20,6 +22,12 @@ def euclidian_distances(X, Y):
 
     Y: array of shape (n_samples_2, n_features)
 
+    Y_norm_squared: array [n_samples_2], optional
+        pre-computed (Y**2).sum(axis=1)
+
+    squared: boolean, optional
+        If True, return squared Euclidean distances.
+
     Returns
     -------
     distances: array of shape (n_samples_1, n_samples_2)
@@ -37,8 +45,9 @@ def euclidian_distances(X, Y):
     array([[ 1.        ],
            [ 1.41421356]])
     """
-    # shortcut in the common case euclidean_distances(X, X)
-    compute_Y = X is not Y
+    # should not need X_norm_squared because if you could precompute that as
+    # well as Y, then you should just pre-compute the output and not even
+    # call this function.
 
     X = np.asanyarray(X)
     Y = np.asanyarray(Y)
 
     if X.shape[1] != Y.shape[1]:
         raise ValueError("Incompatible dimension for X and Y matrices")
 
     XX = np.sum(X * X, axis=1)[:, np.newaxis]
-    if compute_Y:
+    if X is Y: # shortcut in the common case euclidean_distances(X, X)
+        YY = XX.T
+    elif Y_norm_squared is None:
         YY = np.sum(Y * Y, axis=1)[np.newaxis, :]
     else:
-        YY = XX.T
+        YY = np.asanyarray(Y_norm_squared)
+        if YY.shape != (Y.shape[0],):
+            raise ValueError("Incompatible dimension for Y and Y_norm_squared")
+        YY = YY[np.newaxis,:]
+    # TODO:
+    # a faster cython implementation would do the dot product first,
+    # and then add XX, add YY, and do the clipping of negative values in
+    # a single pass over the output matrix.
     distances = XX + YY # Using broadcasting
     distances -= 2 * np.dot(X, Y.T)
     distances = np.maximum(distances, 0)
-    return np.sqrt(distances)
+    if squared:
+        return distances
+    else:
+        return np.sqrt(distances)
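The two new arguments in use (invented data; note the function name is
spelled euclidian_distances in this module):

    import numpy as np
    from scikits.learn.metrics.pairwise import euclidian_distances

    rng = np.random.RandomState(5)
    X = rng.randn(4, 3)
    Y = rng.randn(6, 3)

    D = euclidian_distances(X, Y)
    D2 = euclidian_distances(X, Y, squared=True)
    assert np.allclose(D**2, D2)

    # row norms of Y can be precomputed and reused across calls
    YY = (Y**2).sum(axis=1)
    assert np.allclose(D2, euclidian_distances(X, Y, Y_norm_squared=YY,
                                               squared=True))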
From eefd4f1126d81031adba33b0570558fb6a78fba9 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 10:49:23 -0500
Subject: [PATCH 15/19] fixed typo in my previous patch to PCA

---
 scikits/learn/pca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index 916b9c5187ee4..f5df2548db643 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -188,7 +188,7 @@ def fit(self, X, **params):
             self.components_coefs_ = coefs = S / np.sqrt(n)
         else:
             self.components_ = V.T
-            self.components_coefs = np.ones_like(S, dtype=V.dtype)
+            self.components_coefs_ = np.ones_like(S)
 
         if self.n_components == 'mle':

From 8783af1d4293384f6572606c9488282b8e5920f9 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 10:51:08 -0500
Subject: [PATCH 16/19] added PCA.inverse_transform and unit test

---
 scikits/learn/pca.py            |  5 +++++
 scikits/learn/tests/test_pca.py | 14 ++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index f5df2548db643..8d572893e7b42 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -210,6 +210,11 @@ def transform(self, X):
         Xr = np.dot(Xr, self.components_)
         return Xr
 
+    def inverse_transform(self, Y):
+        """Return an input X whose transform would be Y"""
+        r = np.dot(Y * self.components_coefs_, self.components_.T)
+        r += self.mean_
+        return r
 
 class ProbabilisticPCA(PCA):
     """Additional layer on top of PCA that add a probabilistic evaluation
diff --git a/scikits/learn/tests/test_pca.py b/scikits/learn/tests/test_pca.py
index 713e66746e165..e4176fd6ea053 100644
--- a/scikits/learn/tests/test_pca.py
+++ b/scikits/learn/tests/test_pca.py
@@ -79,6 +79,20 @@ def test_pca_check_projection():
 
     np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
 
+
+def test_pca_inverse():
+    """Test that the projection of data can be inverted"""
+
+    n, p = 50, 3
+    X = randn(n,p) # spherical data
+    X[:,1] *= .0001 # make middle component relatively small
+    X += [5,4,3] # make a large mean
+
+    pca = PCA(n_components=2)
+    pca.fit(X)
+    Y = pca.transform(X)
+    Xlike = pca.inverse_transform(Y)
+    assert np.max(abs(X-Xlike)) < .001
+
 def test_randomized_pca_check_projection():
     """Test that the projection by RandomizedPCA on dense data is correct"""
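A sketch of the round trip outside the test suite (invented data): with the
default whiten=False and no discarded components, the reconstruction should
be exact up to rounding, which is what the unit test above approximates
with a near-full decomposition:

    import numpy as np
    from scikits.learn.pca import PCA

    X = np.random.RandomState(6).randn(50, 3)

    pca = PCA(n_components=3)
    pca.fit(X)
    Xr = pca.inverse_transform(pca.transform(X))
    assert np.max(np.abs(X - Xr)) < 1e-10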
From 41e4bc8aa1c76ebfef6aad32d8adc9fbf47b384d Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 10:57:51 -0500
Subject: [PATCH 17/19] added components_coefs_ (eigenvalues) member to
 RandomizedPCA to match PCA

---
 scikits/learn/pca.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index 8d572893e7b42..ae8e38ae1ed02 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -185,7 +185,7 @@ def fit(self, X, **params):
         if self.whiten:
             n = X.shape[0]
             self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
-            self.components_coefs_ = coefs = S / np.sqrt(n)
+            self.components_coefs_ = S / np.sqrt(n)
         else:
             self.components_ = V.T
             self.components_coefs_ = np.ones_like(S)
@@ -376,8 +376,10 @@ def fit(self, X, **params):
         if self.whiten:
             n = X.shape[0]
             self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
+            self.components_coefs_ = S / np.sqrt(n)
         else:
             self.components_ = V.T
+            self.components_coefs_ = np.ones_like(S)
 
         return self

From 4beecdc340ea863019d5a8873482dfcbe0b010e1 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 18:22:40 -0500
Subject: [PATCH 18/19] test_pca - modified to use assert_almost_equal

---
 scikits/learn/tests/test_pca.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scikits/learn/tests/test_pca.py b/scikits/learn/tests/test_pca.py
index e4176fd6ea053..b1d68a375c45a 100644
--- a/scikits/learn/tests/test_pca.py
+++ b/scikits/learn/tests/test_pca.py
@@ -84,14 +84,14 @@ def test_pca_inverse():
 
     n, p = 50, 3
     X = randn(n,p) # spherical data
-    X[:,1] *= .0001 # make middle component relatively small
+    X[:,1] *= .00001 # make middle component relatively small
     X += [5,4,3] # make a large mean
 
     pca = PCA(n_components=2)
     pca.fit(X)
     Y = pca.transform(X)
     Xlike = pca.inverse_transform(Y)
-    assert np.max(abs(X-Xlike)) < .001
+    assert_almost_equal(X, Xlike, decimal=3)

From fb7632fbe91039b2308d8463b41fc72484753d62 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 18:26:26 -0500
Subject: [PATCH 19/19] euclidian_distances - repair special case for when X
 is Y

---
 scikits/learn/metrics/pairwise.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scikits/learn/metrics/pairwise.py b/scikits/learn/metrics/pairwise.py
index ad74cd6b8b88b..d278921cc8210 100644
--- a/scikits/learn/metrics/pairwise.py
+++ b/scikits/learn/metrics/pairwise.py
@@ -48,9 +48,11 @@ def euclidian_distances(X, Y,
     # should not need X_norm_squared because if you could precompute that as
     # well as Y, then you should just pre-compute the output and not even
     # call this function.
-
-    X = np.asanyarray(X)
-    Y = np.asanyarray(Y)
+    if X is Y:
+        X = Y = np.asanyarray(X)
+    else:
+        X = np.asanyarray(X)
+        Y = np.asanyarray(Y)
 
     if X.shape[1] != Y.shape[1]:
         raise ValueError("Incompatible dimension for X and Y matrices")
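The repaired special case matters when the input is not already an ndarray
(toy data below): converting once keeps the X is Y identity, so the
symmetric shortcut still applies after conversion:

    import numpy as np
    from scikits.learn.metrics.pairwise import euclidian_distances

    data = [[0., 0.], [3., 4.]]          # a plain list, not an ndarray
    D = euclidian_distances(data, data)  # X is Y: converted only once
    assert np.allclose(D, D.T)
    assert D[0, 1] == 5.0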