From 1cb77993c3f3013b0ceaf87bb3dbead05c33e6fd Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:10:07 -0500
Subject: [PATCH 01/19] k_means_ - added optional rng parameter to worker
 routines

---
 scikits/learn/cluster/k_means_.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 99e2a0dcd7ce1..507b2969e5d41 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -16,7 +16,7 @@
 # kinit originally from pybrain:
 # http://github.com/pybrain/pybrain/raw/master/pybrain/auxiliary/kmeans.py
 
-def k_init(X, k, n_samples_max=500):
+def k_init(X, k, n_samples_max=500, rng=None):
     """Init k seeds according to kmeans++
 
     Parameters
@@ -44,12 +44,14 @@ def k_init(X, k, n_samples_max=500):
     http://blogs.sun.com/yongsun/entry/k_means_and_k_means
     """
     n_samples = X.shape[0]
+    if rng is None:
+        rng = np.random
     if n_samples >= n_samples_max:
-        X = X[np.random.randint(n_samples, size=n_samples_max)]
+        X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
-    centers = [X[np.random.randint(n_samples)]]
+    centers = [X[rng.randint(n_samples)]]
     D = ((X - centers[0]) ** 2).sum(axis=-1)
 
     for _ in range(k - 1):
@@ -73,7 +75,7 @@
 # K-means estimation by EM (expectation maximisation)
 
-def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
-            delta=1e-4):
+def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
+            delta=1e-4, rng=None):
     """ K-means clustering algorithm.
 
     Parameters
@@ -115,6 +117,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
     verbose: boolean, optional
         Verbosity mode
 
+    rng: numpy.RandomState, optional
+        The generator used to initialize the centers
+
     Returns
     -------
     centroid: ndarray
@@ -129,6 +134,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         The final value of the inertia criterion
 
     """
+    if rng is None:
+        rng = np.random
     n_samples = X.shape[0]
     vdata = np.mean(np.var(X, 0))
 
@@ -142,9 +149,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
     for it in range(n_init):
         # init
         if init == 'k-means++':
-            centers = k_init(X, k)
+            centers = k_init(X, k, rng=rng)
         elif init == 'random':
-            seeds = np.argsort(np.random.rand(n_samples))[:k]
+            seeds = np.argsort(rng.rand(n_samples))[:k]
             centers = X[seeds]
         elif hasattr(init, '__array__'):
            centers = np.asanyarray(init).copy()
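For illustration, a minimal sketch of what the new rng parameter enables:
passing a seeded generator makes runs reproducible. The import path and
toy data here are assumptions for the example, not part of the patch:

    import numpy as np
    from scikits.learn.cluster.k_means_ import k_means

    X = np.random.RandomState(0).randn(100, 2)  # hypothetical toy data

    # the same seed should now reproduce the same centers
    c1, _, _ = k_means(X, k=3, rng=np.random.RandomState(42))
    c2, _, _ = k_means(X, k=3, rng=np.random.RandomState(42))
    assert np.allclose(c1, c2)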
From 5b9605ee290595e2b7d5b95950b68adfa144a677 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:17:13 -0500
Subject: [PATCH 02/19] Centering data for k-means before fitting

---
 scikits/learn/cluster/k_means_.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 507b2969e5d41..8072ad64bbd8c 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -146,6 +146,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         warnings.warn('Explicit initial center position passed: '
                       'performing only one init in the k-means')
         n_init = 1
+    'subtract mean of x for more accurate distance computations'
+    Xmean = X.mean(axis=0)
+    X = X-Xmean # TODO: offer an argument to allow doing this inplace
     for it in range(n_init):
         # init
         if init == 'k-means++':
@@ -180,7 +183,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             best_centers = centers
             best_labels = labels
             best_inertia = inertia
-    return best_centers, best_labels, best_inertia
+    return best_centers+Xmean, best_labels, best_inertia
 
 
 def _m_step(x, z ,k):

From c182b3776172026783b18059d057ca61e2e35c45 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:18:38 -0500
Subject: [PATCH 03/19] k-means - added verbose-level print after
 initialization

---
 scikits/learn/cluster/k_means_.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 8072ad64bbd8c..95bc566741d57 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -163,6 +163,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
                              "be 'k-mean++' or 'random' or an ndarray, "
                              "'%s' (type '%s') was passed.")
 
+        if verbose:
+            print 'Initialization complete'
         # iterations
         for i in range(max_iter):
             centers_old = centers.copy()

From d0084d7f3c2ba52a08708adb847299370402f1ba Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Mon, 10 Jan 2011 13:19:49 -0500
Subject: [PATCH 04/19] added faster distance-computation algorithm to k-means
 _e_step

---
 scikits/learn/cluster/k_means_.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 95bc566741d57..38d2db5fc8dbd 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -238,12 +238,26 @@ def _e_step(x, centers):
     inertia: float
         The value of the inertia criterion with the assignment
 
     """
+    n_samples = x.shape[0]
+    k = centers.shape[0]
+
+    there_is_memory_to_compute_distances_matrix = True
+
+    if there_is_memory_to_compute_distances_matrix:
+        distances = (
+                (x**2).sum(axis=1)
+                + (centers**2).sum(axis=1).reshape((k,1))
+                - 2*np.dot(centers, x.T))
+        # distances is a matrix of shape (k, n_samples)
+
     z = -np.ones(n_samples).astype(np.int)
     mindist = np.infty * np.ones(n_samples)
-    k = centers.shape[0]
     for q in range(k):
-        dist = np.sum((x - centers[q]) ** 2, 1)
+        if there_is_memory_to_compute_distances_matrix:
+            dist = distances[q]
+        else:
+            dist = np.sum((x - centers[q]) ** 2, 1)
         z[dist < mindist] = q
         mindist = np.minimum(dist, mindist)
     inertia = mindist.sum()
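The speedup above rests on the expansion ||x - c||^2 = ||x||^2 + ||c||^2 - 2 x.c,
which turns the per-center loop into one matrix product. A quick sketch
checking the precomputed matrix against the brute-force sums (the data here
is invented):

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.randn(10, 4)        # samples
    centers = rng.randn(3, 4)   # cluster centers

    fast = ((x**2).sum(axis=1)
            + (centers**2).sum(axis=1).reshape((3, 1))
            - 2*np.dot(centers, x.T))
    slow = np.array([((x - c)**2).sum(axis=1) for c in centers])
    assert np.allclose(fast, slow)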
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:39:37 -0500
Subject: [PATCH 05/19] PCA train() stores eigenvalues associated with
 components

---
 scikits/learn/pca.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index 38fc4fe1bbe18..916b9c5187ee4 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -132,6 +132,9 @@ class PCA(BaseEstimator):
     components_: array, [n_features, n_components]
         Components with maximum variance.
 
+    components_coefs_: array, [n_components]
+        Eigenvalues associated with principal components_.
+
     explained_variance_ratio_: array, [n_components]
         Percentage of variance explained by each of the selected components.
         If k is not set then all components are stored and the sum of
@@ -182,8 +185,10 @@ def fit(self, X, **params):
         if self.whiten:
             n = X.shape[0]
             self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
+            self.components_coefs_ = coefs = S / np.sqrt(n)
         else:
             self.components_ = V.T
+            self.components_coefs = np.ones_like(S, dtype=V.dtype)
 
         if self.n_components == 'mle':
             self.n_components = _infer_dimension_(self.explained_variance_,
@@ -191,6 +196,7 @@ def fit(self, X, **params):
 
         if self.n_components is not None:
             self.components_ = self.components_[:, :self.n_components]
+            self.components_coefs_ = self.components_coefs_[:self.n_components]
             self.explained_variance_ = \
                 self.explained_variance_[:self.n_components]
             self.explained_variance_ratio_ = \

From a5af706bce8957011bf4e63da6c6e8be4a57c48e Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:49:56 -0500
Subject: [PATCH 06/19] adding James Bergstra as author of k_means_ file

---
 scikits/learn/cluster/k_means_.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 38d2db5fc8dbd..794fa9471753f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -3,6 +3,7 @@
 
 # Authors: Gael Varoquaux
 #          Thomas Rueckstiess
+#          James Bergstra
 # License: BSD
 
 import warnings

From 1563c3185ce4527671d7b44879e0ec362a1287c1 Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:50:57 -0500
Subject: [PATCH 07/19] k-means - adding all_pairs_l2_distance_squared function

---
 scikits/learn/cluster/k_means_.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 794fa9471753f..9176b48e95a47 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -12,6 +12,34 @@
 import numpy as np
 
 from ..base import BaseEstimator
 
+def all_pairs_l2_distance_squared(A, B, B_norm_squared=None):
+    """
+    Returns the squared l2 norms of the differences between rows of A and B.
+
+    Parameters
+    ----------
+    A: array, [n_rows_A, n_cols]
+
+    B: array, [n_rows_B, n_cols]
+
+    B_norm_squared: array [n_rows_B], or None
+        pre-computed (B**2).sum(axis=1)
+
+    Returns
+    -------
+
+    array [n_rows_A, n_rows_B]
+        entry [i,j] is ((A[i] - B[j])**2).sum()
+
+    """
+    if B_norm_squared is None:
+        B_norm_squared = (B**2).sum(axis=1)
+    if A is B:
+        A_norm_squared = B_norm_squared
+    else:
+        A_norm_squared = (A**2).sum(axis=1)
+    return (B_norm_squared + A_norm_squared.reshape((A.shape[0],1))
+            - 2*np.dot(A, B.T))
 
 ################################################################################
 # Initialisation heuristic
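A short usage sketch of the new helper (shapes and data invented for the
example); the B_norm_squared argument lets callers reuse precomputed norms:

    import numpy as np
    from scikits.learn.cluster.k_means_ import all_pairs_l2_distance_squared

    rng = np.random.RandomState(0)
    A = rng.randn(5, 3)
    B = rng.randn(7, 3)

    D = all_pairs_l2_distance_squared(A, B)
    assert D.shape == (5, 7)
    assert np.allclose(D[2, 4], ((A[2] - B[4])**2).sum())

    # precomputing the B norms once gives the same result
    B_norms = (B**2).sum(axis=1)
    assert np.allclose(D, all_pairs_l2_distance_squared(A, B, B_norms))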
From 73c6024f223231928f9c6acf8a6e8cded81e4a9d Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:52:47 -0500
Subject: [PATCH 08/19] k-means - modified k_init to use pre-computed
 distances for faster, clearer code

---
 scikits/learn/cluster/k_means_.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 9176b48e95a47..fc90ae236900f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -75,27 +75,30 @@ def k_init(X, k, n_samples_max=500, rng=None):
     n_samples = X.shape[0]
     if rng is None:
         rng = np.random
+
     if n_samples >= n_samples_max:
         X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
+    distances = all_pairs_l2_distance_squared(X, X)
+
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
-    centers = [X[rng.randint(n_samples)]]
-    D = ((X - centers[0]) ** 2).sum(axis=-1)
+    first_idx = rng.randint(n_samples)
+    centers = [X[first_idx]]
+    D = distances[first_idx]
 
     for _ in range(k - 1):
         bestDsum = bestIdx = -1
 
         for i in range(n_samples):
             'Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)'
-            Dsum = np.minimum(D, ((X - X[i]) ** 2).sum(axis=-1)
-                              ).sum()
+            Dsum = np.minimum(D, distances[i]).sum()
 
             if bestDsum < 0 or Dsum < bestDsum:
                 bestDsum, bestIdx = Dsum, i
 
         centers.append(X[bestIdx])
-        D = np.minimum(D, ((X - X[bestIdx]) ** 2).sum(axis=-1))
+        D = np.minimum(D, distances[bestIdx])
 
     return np.array(centers)

From e66a033095aa7b4a3d51672abeb8c5a5f79e09fa Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:55:40 -0500
Subject: [PATCH 09/19] k-means - added support for a callable "init" argument
 instead of copying all the k_init parameters as optional arguments - invite
 user to use a lambda or something

---
 scikits/learn/cluster/k_means_.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index fc90ae236900f..8f67e36234da0 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -130,7 +130,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         seeds. The final results will be the best output of n_init
         consecutive runs in terms of inertia.
 
-    init: {'k-means++', 'random', or an ndarray}, optional
+    init: {'k-means++', 'random', or ndarray, or a callable}, optional
         Method for initialization, default to 'k-means++':
 
         'k-means++' : selects initial cluster centers for k-mean
@@ -190,6 +190,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             centers = X[seeds]
         elif hasattr(init, '__array__'):
             centers = np.asanyarray(init).copy()
+        elif callable(init):
+            centers = init(X, k, rng=rng)
         else:
             raise ValueError("the init parameter for the k-means should "
                              "be 'k-mean++' or 'random' or an ndarray, "
                              "'%s' (type '%s') was passed.")
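A sketch of the callable init in use: the function is invoked as
init(X, k, rng=rng), so a small wrapper or lambda can carry any extra
options. The chooser below is hypothetical:

    import numpy as np
    from scikits.learn.cluster.k_means_ import k_means

    def random_rows(X, k, rng=None):
        # pick k rows of X at random as the initial centers
        if rng is None:
            rng = np.random
        return X[np.argsort(rng.rand(X.shape[0]))[:k]]

    X = np.random.RandomState(1).randn(100, 2)
    centers, labels, inertia = k_means(X, k=3, init=random_rows)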
From 38bd96e207f4a25efc28774d587d7e611de6204d Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:56:35 -0500
Subject: [PATCH 10/19] k-means - fixed misleading typo in error message

---
 scikits/learn/cluster/k_means_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 8f67e36234da0..6346c2bdf926f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -194,7 +194,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             centers = init(X, k, rng=rng)
         else:
             raise ValueError("the init parameter for the k-means should "
-                             "be 'k-mean++' or 'random' or an ndarray, "
+                             "be 'k-means++' or 'random' or an ndarray, "
                              "'%s' (type '%s') was passed.")
 
         if verbose:

From 9242e46cc45400f855c7350a2d5e8bf5f7e94a39 Mon Sep 17 00:00:00 2001
From: "james.bergstra"
Date: Tue, 11 Jan 2011 21:59:51 -0500
Subject: [PATCH 11/19] k-means - added optional parameters
 "precompute_distances" and "x_squared_norms"

---
 scikits/learn/cluster/k_means_.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 6346c2bdf926f..7dbec0709b9a7 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -251,7 +251,7 @@ def _m_step(x, z ,k):
     return centers
 
 
-def _e_step(x, centers):
+def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
     """E step of the K-means EM algorithm
 
     Computation of the input-to-cluster assignment
@@ -276,22 +276,15 @@ def _e_step(x, centers):
     n_samples = x.shape[0]
     k = centers.shape[0]
 
-    there_is_memory_to_compute_distances_matrix = True
-
-    if there_is_memory_to_compute_distances_matrix:
-        distances = (
-                (x**2).sum(axis=1)
-                + (centers**2).sum(axis=1).reshape((k,1))
-                - 2*np.dot(centers, x.T))
-        # distances is a matrix of shape (k, n_samples)
-
+    if precompute_distances:
+        distances = all_pairs_l2_distance_squared(centers, x, x_squared_norms)
     z = -np.ones(n_samples).astype(np.int)
     mindist = np.infty * np.ones(n_samples)
     for q in range(k):
-        if there_is_memory_to_compute_distances_matrix:
+        if precompute_distances:
             dist = distances[q]
         else:
-            dist = np.sum((x - centers[q]) ** 2, 1)
+            dist = np.sum((x - centers[q]) ** 2, axis=1)
         z[dist < mindist] = q
         mindist = np.minimum(dist, mindist)
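A sketch of how the two new parameters might be exercised (note that
_e_step is a private helper and its return value is inferred from the
docstring; the data is invented). x_squared_norms lets a caller compute
the sample norms once and reuse them across iterations, while
precompute_distances=False trades speed for memory:

    import numpy as np
    from scikits.learn.cluster.k_means_ import _e_step

    rng = np.random.RandomState(2)
    x = rng.randn(200, 5)
    centers = x[:4].copy()

    x_sq = (x**2).sum(axis=1)  # computed once, shared across E steps
    z_fast, inertia_fast = _e_step(x, centers, x_squared_norms=x_sq)
    z_slow, inertia_slow = _e_step(x, centers, precompute_distances=False)
    # the two paths agree up to floating-point rounding
    assert np.allclose(inertia_fast, inertia_slow)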
From: "james.bergstra"
Date: Tue, 11 Jan 2011 22:00:58 -0500
Subject: [PATCH 12/19] k-means - added "verbose" parameter to KMeans class

---
 scikits/learn/cluster/k_means_.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index 7dbec0709b9a7..05117fb340618 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -371,11 +371,13 @@ class KMeans(BaseEstimator):
 
     """
 
-    def __init__(self, k=8, init='random', n_init=10, max_iter=300):
+    def __init__(self, k=8, init='random', n_init=10, max_iter=300,
+                 verbose=0):
         self.k = k
         self.init = init
         self.max_iter = max_iter
         self.n_init = n_init
+        self.verbose = verbose
 
     def fit(self, X, **params):
         """ Compute k-means"""
@@ -383,6 +385,6 @@ def fit(self, X, **params):
         self._set_params(**params)
         self.cluster_centers_, self.labels_, self.inertia_ = k_means(X,
                     k=self.k, init=self.init, n_init=self.n_init,
-                    max_iter=self.max_iter)
+                    max_iter=self.max_iter, verbose=self.verbose)
         return self
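With the estimator now forwarding verbose, progress can be printed from the
object API as well. A sketch, assuming KMeans is exported from
scikits.learn.cluster and using invented data:

    import numpy as np
    from scikits.learn.cluster import KMeans

    X = np.random.RandomState(3).randn(300, 2)
    km = KMeans(k=4, verbose=1).fit(X)  # prints 'Initialization complete', etc.
    print km.inertia_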
Returns ------- @@ -180,7 +186,9 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0, n_init = 1 'subtract of mean of x for more accurate distance computations' Xmean = X.mean(axis=0) - X = X-Xmean # TODO: offer an argument to allow doing this inplace + if copy_x: + X = X.copy() + X -= Xmean for it in range(n_init): # init if init == 'k-means++': @@ -219,6 +227,8 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0, best_centers = centers best_labels = labels best_inertia = inertia + if not copy_x: + X += Xmean return best_centers+Xmean, best_labels, best_inertia @@ -372,12 +382,14 @@ class KMeans(BaseEstimator): def __init__(self, k=8, init='random', n_init=10, max_iter=300, - verbose=0): + verbose=0, rng=None, copy_x=True): self.k = k self.init = init self.max_iter = max_iter self.n_init = n_init self.verbose = verbose + self.rng = rng + self.copy_x = copy_x def fit(self, X, **params): """ Compute k-means""" @@ -385,6 +397,7 @@ def fit(self, X, **params): self._set_params(**params) self.cluster_centers_, self.labels_, self.inertia_ = k_means(X, k=self.k, init=self.init, n_init=self.n_init, - max_iter=self.max_iter, verbose=self.verbose) + max_iter=self.max_iter, verbose=self.verbose, + rng=self.rng, copy_x=self.copy_x) return self From e7929e61e8844b476943d4aacb50200f523d43b3 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 12 Jan 2011 16:31:13 -0500 Subject: [PATCH 14/19] added optional args to euclidean_distances and removed k_means_.all_pairs_l2_distances_squared --- scikits/learn/cluster/k_means_.py | 44 +++++++------------------------ scikits/learn/metrics/pairwise.py | 33 ++++++++++++++++++----- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py index d0ac1dcac13e3..b3d7ae82e783f 100644 --- a/scikits/learn/cluster/k_means_.py +++ b/scikits/learn/cluster/k_means_.py @@ -11,34 +11,7 @@ import numpy as np from ..base import BaseEstimator - -def all_pairs_l2_distance_squared(A, B, B_norm_squared=None): - """ - Returns the squared l2 norms of the differences between rows of A and B. 
From e7929e61e8844b476943d4aacb50200f523d43b3 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Wed, 12 Jan 2011 16:31:13 -0500
Subject: [PATCH 14/19] added optional args to euclidean_distances and removed
 k_means_.all_pairs_l2_distance_squared

---
 scikits/learn/cluster/k_means_.py | 44 +++++++------------------------
 scikits/learn/metrics/pairwise.py | 33 ++++++++++++++++++-----
 2 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py
index d0ac1dcac13e3..b3d7ae82e783f 100644
--- a/scikits/learn/cluster/k_means_.py
+++ b/scikits/learn/cluster/k_means_.py
@@ -11,34 +11,7 @@
 import numpy as np
 
 from ..base import BaseEstimator
-
-def all_pairs_l2_distance_squared(A, B, B_norm_squared=None):
-    """
-    Returns the squared l2 norms of the differences between rows of A and B.
-
-    Parameters
-    ----------
-    A: array, [n_rows_A, n_cols]
-
-    B: array, [n_rows_B, n_cols]
-
-    B_norm_squared: array [n_rows_B], or None
-        pre-computed (B**2).sum(axis=1)
-
-    Returns
-    -------
-
-    array [n_rows_A, n_rows_B]
-        entry [i,j] is ((A[i] - B[j])**2).sum()
-
-    """
-    if B_norm_squared is None:
-        B_norm_squared = (B**2).sum(axis=1)
-    if A is B:
-        A_norm_squared = B_norm_squared
-    else:
-        A_norm_squared = (A**2).sum(axis=1)
-    return (B_norm_squared + A_norm_squared.reshape((A.shape[0],1))
-            - 2*np.dot(A, B.T))
+from ..metrics.pairwise import euclidian_distances
 
 ################################################################################
 # Initialisation heuristic
@@ -80,7 +53,7 @@ def k_init(X, k, n_samples_max=500, rng=None):
         X = X[rng.randint(n_samples, size=n_samples_max)]
         n_samples = n_samples_max
 
-    distances = all_pairs_l2_distance_squared(X, X)
+    distances = euclidian_distances(X, X, squared=True)
 
     'choose the 1st seed randomly, and store D(x)^2 in D[]'
     first_idx = rng.randint(n_samples)
@@ -106,7 +79,7 @@
 ################################################################################
 # K-means estimation by EM (expectation maximisation)
 
-def k_means(X, k,init='k-means++', n_init=10, max_iter=300, verbose=0,
+def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
             delta=1e-4, rng=None, copy_x=True):
     """ K-means clustering algorithm.
@@ -153,10 +126,11 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
         The generator used to initialize the centers. Defaults to numpy.random.
 
     copy_x: boolean, optional
-        When pre-computing distances it is more numerically accurate to center the data first.
-        If copy_x is True, then the original data is not modified. If False, the original data
-        is modified, and put back before the function returns, but small numerical differences
-        may be introduced by subtracting and then adding the data mean.
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
 
     Returns
     -------
@@ -287,7 +261,7 @@ def _e_step(x, centers, precompute_distances=True, x_squared_norms=None):
     k = centers.shape[0]
 
     if precompute_distances:
-        distances = all_pairs_l2_distance_squared(centers, x, x_squared_norms)
+        distances = euclidian_distances(centers, x, x_squared_norms, squared=True)
diff --git a/scikits/learn/metrics/pairwise.py b/scikits/learn/metrics/pairwise.py
index cc9c486ac67ce..ad74cd6b8b88b 100644
--- a/scikits/learn/metrics/pairwise.py
+++ b/scikits/learn/metrics/pairwise.py
@@ -9,7 +9,9 @@
 import numpy as np
 
 
-def euclidian_distances(X, Y):
+def euclidian_distances(X, Y,
+                        Y_norm_squared=None,
+                        squared=False):
     """
     Considering the rows of X (and Y=X) as vectors, compute the
     distance matrix between each pair of vectors.
@@ -20,6 +22,12 @@ def euclidian_distances(X, Y):
 
     Y: array of shape (n_samples_2, n_features)
 
+    Y_norm_squared: array [n_samples_2], optional
+        pre-computed (Y**2).sum(axis=1)
+
+    squared: boolean, optional
+        If True, return squared Euclidean distances.
+
     Returns
     -------
     distances: array of shape (n_samples_1, n_samples_2)
@@ -37,8 +45,9 @@ def euclidian_distances(X, Y):
     array([[ 1.        ],
            [ 1.41421356]])
     """
-    # shortcut in the common case euclidean_distances(X, X)
-    compute_Y = X is not Y
+    # should not need X_norm_squared because if you could precompute that as
+    # well as Y, then you should just pre-compute the output and not even
+    # call this function.
 
     X = np.asanyarray(X)
     Y = np.asanyarray(Y)
 
     if X.shape[1] != Y.shape[1]:
         raise ValueError("Incompatible dimension for X and Y matrices")
 
     XX = np.sum(X * X, axis=1)[:, np.newaxis]
-    if compute_Y:
+    if X is Y: # shortcut in the common case euclidean_distances(X, X)
+        YY = XX.T
+    elif Y_norm_squared is None:
         YY = np.sum(Y * Y, axis=1)[np.newaxis, :]
     else:
-        YY = XX.T
+        YY = np.asanyarray(Y_norm_squared)
+        if YY.shape != (Y.shape[0],):
+            raise ValueError("Incompatible dimension for Y and Y_norm_squared")
+        YY = YY[np.newaxis,:]
+    # TODO:
+    # a faster cython implementation would do the dot product first,
+    # and then add XX, add YY, and do the clipping of negative values in
+    # a single pass over the output matrix.
     distances = XX + YY # Using broadcasting
     distances -= 2 * np.dot(X, Y.T)
     distances = np.maximum(distances, 0)
-    return np.sqrt(distances)
+    if squared:
+        return distances
+    else:
+        return np.sqrt(distances)
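The two new arguments in use (invented data; note the function name is
spelled euclidian_distances in this module):

    import numpy as np
    from scikits.learn.metrics.pairwise import euclidian_distances

    rng = np.random.RandomState(5)
    X = rng.randn(4, 3)
    Y = rng.randn(6, 3)

    D = euclidian_distances(X, Y)
    D2 = euclidian_distances(X, Y, squared=True)
    assert np.allclose(D**2, D2)

    # row norms of Y can be precomputed and reused across calls
    YY = (Y**2).sum(axis=1)
    assert np.allclose(D2, euclidian_distances(X, Y, Y_norm_squared=YY,
                                               squared=True))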
From eefd4f1126d81031adba33b0570558fb6a78fba9 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 10:49:23 -0500
Subject: [PATCH 15/19] fixed typo in my previous patch to PCA

---
 scikits/learn/pca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index 916b9c5187ee4..f5df2548db643 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -188,7 +188,7 @@ def fit(self, X, **params):
             self.components_coefs_ = coefs = S / np.sqrt(n)
         else:
             self.components_ = V.T
-            self.components_coefs = np.ones_like(S, dtype=V.dtype)
+            self.components_coefs_ = np.ones_like(S)
 
         if self.n_components == 'mle':

From 8783af1d4293384f6572606c9488282b8e5920f9 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 10:51:08 -0500
Subject: [PATCH 16/19] added PCA.inverse_transform and unit test

---
 scikits/learn/pca.py            |  5 +++++
 scikits/learn/tests/test_pca.py | 14 ++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index f5df2548db643..8d572893e7b42 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -210,6 +210,11 @@ def transform(self, X):
         Xr = np.dot(Xr, self.components_)
         return Xr
 
+    def inverse_transform(self, Y):
+        """Return an input X whose transform would be Y"""
+        r = np.dot(Y * self.components_coefs_, self.components_.T)
+        r += self.mean_
+        return r
 
 class ProbabilisticPCA(PCA):
     """Additional layer on top of PCA that add a probabilistic evaluation
diff --git a/scikits/learn/tests/test_pca.py b/scikits/learn/tests/test_pca.py
index 713e66746e165..e4176fd6ea053 100644
--- a/scikits/learn/tests/test_pca.py
+++ b/scikits/learn/tests/test_pca.py
@@ -79,6 +79,20 @@ def test_pca_check_projection():
 
     np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
 
+
+def test_pca_inverse():
+    """Test that the projection of data can be inverted"""
+
+    n, p = 50, 3
+    X = randn(n,p) # spherical data
+    X[:,1] *= .0001 # make middle component relatively small
+    X += [5,4,3] # make a large mean
+
+    pca = PCA(n_components=2)
+    pca.fit(X)
+    Y = pca.transform(X)
+    Xlike = pca.inverse_transform(Y)
+    assert np.max(abs(X-Xlike)) < .001
+
 def test_randomized_pca_check_projection():
     """Test that the projection by RandomizedPCA on dense data is correct"""
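A sketch of the round trip outside the test suite (invented data): with the
default whiten=False and no discarded components, the reconstruction should
be exact up to rounding, which is what the unit test above approximates
with a near-full decomposition:

    import numpy as np
    from scikits.learn.pca import PCA

    X = np.random.RandomState(6).randn(50, 3)

    pca = PCA(n_components=3)
    pca.fit(X)
    Xr = pca.inverse_transform(pca.transform(X))
    assert np.max(np.abs(X - Xr)) < 1e-10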
From 41e4bc8aa1c76ebfef6aad32d8adc9fbf47b384d Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 10:57:51 -0500
Subject: [PATCH 17/19] added components_coefs_ (eigenvalues) member to
 RandomizedPCA to match PCA

---
 scikits/learn/pca.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index 8d572893e7b42..ae8e38ae1ed02 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -185,7 +185,7 @@ def fit(self, X, **params):
         if self.whiten:
             n = X.shape[0]
             self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
-            self.components_coefs_ = coefs = S / np.sqrt(n)
+            self.components_coefs_ = S / np.sqrt(n)
         else:
             self.components_ = V.T
             self.components_coefs_ = np.ones_like(S)
@@ -376,8 +376,10 @@ def fit(self, X, **params):
         if self.whiten:
             n = X.shape[0]
             self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
+            self.components_coefs_ = S / np.sqrt(n)
         else:
             self.components_ = V.T
+            self.components_coefs_ = np.ones_like(S)
 
         return self

From 4beecdc340ea863019d5a8873482dfcbe0b010e1 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 18:22:40 -0500
Subject: [PATCH 18/19] test_pca - modified to use assert_almost_equal

---
 scikits/learn/tests/test_pca.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scikits/learn/tests/test_pca.py b/scikits/learn/tests/test_pca.py
index e4176fd6ea053..b1d68a375c45a 100644
--- a/scikits/learn/tests/test_pca.py
+++ b/scikits/learn/tests/test_pca.py
@@ -84,14 +84,14 @@ def test_pca_inverse():
 
     n, p = 50, 3
     X = randn(n,p) # spherical data
-    X[:,1] *= .0001 # make middle component relatively small
+    X[:,1] *= .00001 # make middle component relatively small
     X += [5,4,3] # make a large mean
 
     pca = PCA(n_components=2)
     pca.fit(X)
     Y = pca.transform(X)
     Xlike = pca.inverse_transform(Y)
-    assert np.max(abs(X-Xlike)) < .001
+    assert_almost_equal(X, Xlike, decimal=3)

From fb7632fbe91039b2308d8463b41fc72484753d62 Mon Sep 17 00:00:00 2001
From: James Bergstra
Date: Thu, 13 Jan 2011 18:26:26 -0500
Subject: [PATCH 19/19] euclidian_distances - repair special case for when X
 is Y

---
 scikits/learn/metrics/pairwise.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scikits/learn/metrics/pairwise.py b/scikits/learn/metrics/pairwise.py
index ad74cd6b8b88b..d278921cc8210 100644
--- a/scikits/learn/metrics/pairwise.py
+++ b/scikits/learn/metrics/pairwise.py
@@ -48,9 +48,11 @@ def euclidian_distances(X, Y,
     # should not need X_norm_squared because if you could precompute that as
     # well as Y, then you should just pre-compute the output and not even
     # call this function.
-
-    X = np.asanyarray(X)
-    Y = np.asanyarray(Y)
+    if X is Y:
+        X = Y = np.asanyarray(X)
+    else:
+        X = np.asanyarray(X)
+        Y = np.asanyarray(Y)
 
     if X.shape[1] != Y.shape[1]:
         raise ValueError("Incompatible dimension for X and Y matrices")
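The repaired special case matters when the input is not already an ndarray
(toy data below): converting once keeps the X is Y identity, so the
symmetric shortcut still applies after conversion:

    import numpy as np
    from scikits.learn.metrics.pairwise import euclidian_distances

    data = [[0., 0.], [3., 4.]]          # a plain list, not an ndarray
    D = euclidian_distances(data, data)  # X is Y: converted only once
    assert np.allclose(D, D.T)
    assert D[0, 1] == 5.0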