From 0c26bd2c97c00cdf1b0cc1b7600263aaf55dc02c Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Mon, 31 Dec 2012 23:50:02 +0100 Subject: [PATCH 1/8] Added 2D PCA with tests and applications --- .../applications/face_recognition_pca_2d.py | 179 ++++++++++++ sklearn/decomposition/pca_2d.py | 268 ++++++++++++++++++ sklearn/decomposition/tests/test_pca_2d.py | 121 ++++++++ 3 files changed, 568 insertions(+) create mode 100644 examples/applications/face_recognition_pca_2d.py create mode 100644 sklearn/decomposition/pca_2d.py create mode 100644 sklearn/decomposition/tests/test_pca_2d.py diff --git a/examples/applications/face_recognition_pca_2d.py b/examples/applications/face_recognition_pca_2d.py new file mode 100644 index 0000000000000..9e834fad6f51d --- /dev/null +++ b/examples/applications/face_recognition_pca_2d.py @@ -0,0 +1,179 @@ +""" +=================================================== +Faces recognition example using 2DPCA and Random Forests +=================================================== + +This dataset contains a set of face images taken between April 1992 and +April 1994 at AT&T Laboratories Cambridge. It is the Olivetti faces : + + http://cs.nyu.edu/~roweis/data/olivettifaces.mat (4MB) + +.. Olivetti: http://www.cs.nyu.edu/~roweis/ + +Expected results for the 40 people in the dataset:: + + +precision recall f1-score support + + Face 0 1.00 0.40 0.57 5 + Face 1 1.00 1.00 1.00 5 + Face 2 0.80 0.80 0.80 5 + Face 3 1.00 1.00 1.00 5 + Face 4 1.00 0.80 0.89 5 + Face 5 0.83 1.00 0.91 5 + Face 6 0.71 1.00 0.83 5 + Face 7 0.80 0.80 0.80 5 + Face 8 1.00 1.00 1.00 5 + Face 9 1.00 0.80 0.89 5 + Face 10 1.00 1.00 1.00 5 + Face 11 0.67 0.80 0.73 5 + Face 12 1.00 0.80 0.89 5 + Face 13 1.00 1.00 1.00 5 + Face 14 0.83 1.00 0.91 5 + Face 15 1.00 1.00 1.00 5 + Face 16 1.00 1.00 1.00 5 + Face 17 0.83 1.00 0.91 5 + Face 18 1.00 1.00 1.00 5 + Face 19 1.00 0.80 0.89 5 + Face 20 0.71 1.00 0.83 5 + Face 21 1.00 0.80 0.89 5 + Face 22 1.00 1.00 1.00 5 + Face 23 0.83 1.00 0.91 5 + Face 24 0.83 1.00 0.91 5 + Face 25 0.80 0.80 0.80 5 + Face 26 1.00 1.00 1.00 5 + Face 27 1.00 1.00 1.00 5 + Face 28 1.00 1.00 1.00 5 + Face 29 1.00 1.00 1.00 5 + Face 30 1.00 1.00 1.00 5 + Face 31 1.00 0.60 0.75 5 + Face 32 1.00 1.00 1.00 5 + Face 33 1.00 1.00 1.00 5 + Face 34 1.00 0.80 0.89 5 + Face 35 1.00 0.80 0.89 5 + Face 36 0.83 1.00 0.91 5 + Face 37 1.00 1.00 1.00 5 + Face 38 0.83 1.00 0.91 5 + Face 39 0.80 0.80 0.80 5 + +avg / total 0.93 0.92 0.91 200 + + + +""" +print __doc__ + +from time import time +import numpy as np + + +from sklearn.grid_search import GridSearchCV +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix +from sklearn.ensemble import RandomForestClassifier +from sklearn.datasets import fetch_olivetti_faces +from sklearn.cross_validation import StratifiedShuffleSplit +from sklearn.decomposition import PCA2D + + +############################################################################## +# Download the data, if not already on disk and load it as numpy arrays + +olivetti_faces = fetch_olivetti_faces() + +# introspect the images arrays to find the shapes +n_samples, h, w = olivetti_faces.images.shape + +# we use the 2 data directly +X = olivetti_faces.images + + +# the label to predict is the id of the person +y = olivetti_faces.target + +target_names = np.array(["Face %d" % x for x in range(0, + np.size(olivetti_faces.target), 1)]) +# target_names = np.array(map(str, +# (np.arange(np.size(olivetti_faces.target)).reshape( +# olivetti_faces.target.shape)))) 
+n_classes = target_names.shape[0]
+
+print "Total dataset size:"
+print "n_samples: %d" % n_samples
+print "features: %d X %d" % (h, w)
+print "n_classes: %d" % n_classes
+
+
+##############################################################################
+# Split into a training set and a test set using a stratified shuffle split
+
+# split into a training and testing set
+sss = StratifiedShuffleSplit(y, 1, test_size=0.5, random_state=0)
+for train_index, test_index in sss:
+    X_train, X_test = X[train_index], X[test_index]
+    y_train, y_test = y[train_index], y[test_index]
+
+
+##############################################################################
+# Compute a 2DPCA (eigenfaces) on the face dataset (treated as unlabeled
+# dataset): unsupervised feature extraction / dimensionality reduction
+
+n_row_components = 0.9
+n_column_components = 0.9
+
+print("Reducing images by keeping %d %% of the variance on the row"
+      % (100 * n_row_components))
+print "and %d %% of the variance on the column from %d faces" % (
+    100 * n_column_components, X_train.shape[0])
+t0 = time()
+pca = PCA2D(n_row_components=n_row_components, n_column_components=
+            n_column_components, row_whiten=True,
+            column_whiten=True, epsilon=0).fit(X_train)
+print "done in %0.3fs" % (time() - t0)
+
+
+print "Projecting the input data on the eigenfaces orthonormal basis"
+t0 = time()
+X_train_pca = pca.transform(X_train)
+X_test_pca = pca.transform(X_test)
+print "done in %0.3fs" % (time() - t0)
+
+# Converting the images to 1D for classification
+X_train_pca = X_train_pca.reshape(X_train_pca.shape[0], X_train_pca.shape[1] *
+                                  X_train_pca.shape[2])
+
+X_test_pca = X_test_pca.reshape(X_test_pca.shape[0], X_test_pca.shape[1] *
+                                X_test_pca.shape[2])
+
+X_test = X_test.reshape(X_test.shape[0], h * w)
+X_train = X_train.reshape(X_train.shape[0], h * w)
+
+
+##############################################################################
+# Train a Random Forests classification model
+
+print "Fitting the classifier to the training set"
+t0 = time()
+param_grid = {
+    'n_estimators': [10, 50, 90, 300, 500, 700, 1000, 1500, 2000],
+}
+clf = GridSearchCV(RandomForestClassifier(), param_grid)
+clf = clf.fit(X_train_pca, y_train)
+print "done in %0.3fs" % (time() - t0)
+print "Best estimator found by grid search:"
+print clf.best_estimator_
+
+
+##############################################################################
+# Quantitative evaluation of the model quality on the test set
+
+print "Predicting the people names on the testing set"
+t0 = time()
+y_pred = clf.predict(X_test_pca)
+print "done in %0.3fs" % (time() - t0)
+
+print classification_report(y_test, y_pred, target_names=target_names)
+print confusion_matrix(y_test, y_pred, labels=range(n_classes))
+
+print "Size of the components retained %d X %d" % (pca.n_row_components_,
+                                                   pca.n_column_components_)
diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py
new file mode 100644
index 0000000000000..d86ac831fcee2
--- /dev/null
+++ b/sklearn/decomposition/pca_2d.py
@@ -0,0 +1,268 @@
+""" 2 Dimensional Principal Component Analysis
+"""
+
+# Author : Aristide TOSSOU
+#
+# License: BSD Style
+
+import numpy as np
+from scipy import linalg
+from ..base import BaseEstimator, TransformerMixin
+from ..utils import as_float_array
+
+
+class PCA2D(BaseEstimator, TransformerMixin):
+    """ Two-directional two-dimensional principal component analysis
+    This technique is based on 2D matrices as opposed to standard PCA which
+    is based on 1D vectors. It considers simultaneously the row and column
+    directions. 2D PCA is computed by examining the covariance of the data.
+    It can easily get higher accuracy than 1D PCA.
+
+    This implements the method described in `Daoqiang Zhang, Zhi-Hua Zhou, :
+    Two-directional two-dimensional PCA for efficient face representation and
+    recognition, Neurocomputing, Volume 69, Issues 1-3, December 2005,
+    Pages 224-231`
+
+    This implementation uses the scipy.linalg implementation of singular
+    value decomposition. It works on dense matrices. The time complexity is
+    O(n_row**3 + n_column**3 + n_row**2 * n_samples + n_column**2 * n_samples)
+    where n_samples is the number of examples, n_row and n_column are the
+    dimensions of the original 2D matrices. In practice this means that it
+    can be used if n_row, n_column and n_samples are all less than 300.
+    More formally, it can be used as long as none of the terms in the
+    complexity is too large (less than 10**10).
+
+    In most case, it is then significantly faster than standard 1D PCA
+    which has a complexity of O(n_row**3 * n_column**3).
+
+
+    Parameters
+    ----------
+
+    n_row_components : int, float or None
+        Number of components in the row direction to keep
+        if n_row_components is not set or is 0, all components are kept::
+            n_row_components = n_row
+        if ``0 < n_row_components < 1``, select the number of components
+        such that a proportion (defined by n_row_components)
+        of the variance is retained
+
+    n_column_components : int, float or None
+        Number of components in the column direction to keep
+        if n_column_components is not set or is 0, all components are kept::
+            n_column_components = n_column
+        if ``0 < n_column_components < 1``, select the number of components
+        such that a proportion (defined by n_column_components)
+        of the variance is retained
+
+    row_whiten : bool, optional
+        When True (False by default) the `row_components` vectors are divided
+        by the square root of singular values to ensure uncorrelated outputs
+        with unit component-wise variances.
+
+        Whitening will remove some information from the transformed signal
+        (the relative variance scales of the components) but can sometimes
+        improve the predictive accuracy of the downstream estimators by
+        making their data respect some hard-wired assumptions.
+
+
+    column_whiten : bool, optional
+        When True (False by default) the `column_components` vectors are
+        divided by the square root of singular values to ensure
+        uncorrelated outputs with unit component-wise variances.
+
+        Whitening will remove some information from the transformed signal
+        (the relative variance scales of the components) but can sometimes
+        improve the predictive accuracy of the downstream estimators by
+        making their data respect some hard-wired assumptions.
+
+    epsilon : float, optional
+        (default value 1e-5). This is a regularization parameter when
+        whitening is enabled. Whitening involves division by the square
+        root of singular values. When the singular values are close to
+        zero it could lead to numerical instability. This parameter is
+        added to the singular values to prevent this.
+ + copy : bool + If False (True by default), data passed to fit are overwritten + + Attributes + ---------- + + `row_components_` : array, [n_row, n_row_components] + components with maximum variance on the row direction + + `column_components_` : array, [n_column, n_column_components] + components with maximum variance on the column direction + + See also + -------- + PCA + ProbabilisticPCA + RandomizedPCA + KernelPCA + SparsePCA + + + + """ + def __init__(self, n_row_components=None, n_column_components=None, + row_whiten=False, column_whiten=False, epsilon=1e-5, + copy=True): + self.n_row_components = n_row_components + self.n_column_components = n_column_components + self.row_whiten = row_whiten + self.column_whiten = column_whiten + self.epsilon = epsilon + self.copy = copy + + def fit(self, X, y=None): + """ Fit the model from data in X + Parameters + ---------- + + X: array-like, shape (n_samples, n_row, n_column) + Training matrix where n_samples is the number of samples + and n_row x n_column is the dimension of the features + + Returns + ------- + self : object + Returns the instance itself. + """ + + # Converting the data to 3 dimensions + X = np.atleast_3d(X) + + n_samples, n_row, n_column = X.shape + + # Making sure the type of the data is float + X = as_float_array(X, copy=self.copy) + + # Center data + self.mean_ = np.mean(X, axis=0) + X -= self.mean_ + + # As we don't want to change the default n_row_components let's copy it + + self.n_row_components_ = np.copy(self.n_row_components) + self.n_column_components_ = np.copy(self.n_column_components) + + # Computing Alternate 2DPCA + if (self.n_row_components_ is not None and + self.n_row_components_ > 0): + U, S, V = linalg.svd(np.tensordot(X, X, axes=([0, 2], [0, 2])) + / n_samples, full_matrices=False) + + if self.n_row_components_ < 1: + self.n_row_components_ = np.sum((S / np.sum(S)).cumsum() < + self.n_row_components_) + 1 + + if self.row_whiten: + # self.row_reg = np.diag(1/np.sqrt(S)+self.epsilon) + + # To save time we can multiply the whitening matrix + # by U beforehand + self.row_components_ = (U[:, :self.n_row_components_]. + dot(np.diag(1. / np.sqrt(S[:self. + n_row_components_] + self. + epsilon)))) + else: + self.row_components_ = U[:, :self.n_row_components_] + else: + + self.row_components_ = np.eye(n_row, n_row) + + # Computing 2DPCA + if (self.n_column_components_ is not None and + self.n_column_components_ > 0): + U, S, V = linalg.svd(np.tensordot(X, X, axes=([0, 1], [0, 1])) + / n_samples, full_matrices=False) + + if self.n_column_components_ < 1: + self.n_column_components_ = np.sum((S / np.sum(S)).cumsum() < + self.n_column_components_ + ) + 1 + + if self.column_whiten: + # self.column_reg = np.diag(1/np.sqrt(S)+self.epsilon) + + self.column_components_ = (U[:, :self.n_column_components_]. + dot(np.diag(1. / np.sqrt(S[:self. + n_column_components_] + + self.epsilon)))) + else: + self.column_components_ = U[:, :self.n_column_components_] + else: + + self.column_components_ = np.eye(n_column, n_column) + return self + + def transform(self, X): + """Apply the dimensionality reduction on X. + Parameters + ---------- + + X: array-like, shape (n_samples, n_row, n_column) + Training matrix where n_samples is the number of samples + and n_row x n_column is the dimension of the features. + Or the shape can also be (n_row, n_column) + + Returns + ------- + + X_new : array-like, + shape (n_samples, n_row_components, n_column_components) + or shape is (n_row_components, n_column_components) + according to X. 
+ + + """ + + # X -= self.mean_ + if np.size(X.shape) == 2: + return (self.row_components_.T.dot(X - self.mean_). + dot(self.column_components_)) + + elif np.size(X.shape) == 3: + return ((X - self.mean_).dot(self.column_components_). + transpose((0, 2, 1)).dot(self.row_components_). + transpose((0, 2, 1))) + + def inverse_transform(self, X): + """ Transform data back to its original space, i.e., + return an input X_original whose transform would be X + + Parameters + ---------- + + X: array-like, + shape (n_samples, n_row_components, n_column_components) or + shape is (n_row_components, n_column_components) + New data where n_samples is the number of samples + and n_row_components, n_column_components are the number of + components on the row and column direction respectively. + + Returns + ------- + + X_original : array-like, shape (n_samples, n_row, n_column) + + or shape is (n_row, n_column) according to X + + Notes + ----- + + If whitening is enabled, inverse_transform does not compute the + exact inverse operation as transform. + """ + + if np.size(X.shape) == 2: + return (self.row_components_.dot(X). + dot(self.column_components_.T) + self.mean_) + + elif np.size(X.shape) == 3: + + return (X.dot(self.column_components_.T).transpose((0, 2, 1)). + dot(self.row_components_.T).transpose((0, 2, 1)) + + self.mean_) diff --git a/sklearn/decomposition/tests/test_pca_2d.py b/sklearn/decomposition/tests/test_pca_2d.py new file mode 100644 index 0000000000000..7bb3e650e7a6b --- /dev/null +++ b/sklearn/decomposition/tests/test_pca_2d.py @@ -0,0 +1,121 @@ +import numpy as np + + +from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_array_equal + +from sklearn import datasets +from sklearn.decomposition import PCA2D + + +digits = datasets.load_digits() + + +def test_correct_shapes(): + + rng = np.random.RandomState(0) + + X = rng.randn(50, 30, 40) + + X_copy = np.copy(X) + + pca2d = PCA2D(n_row_components=10, n_column_components=12) + + # Testing shapes after transformation + + pca2d.fit(X) + + assert_array_equal(pca2d.row_components_.shape, (30, 10)) + + assert_array_equal(pca2d.column_components_.shape, (40, 12)) + + Xr = pca2d.transform(X) + + assert_array_equal(Xr.shape, (50, 10, 12)) + + Xr_2d = pca2d.transform(X[0, :, :]) + + assert_array_equal(Xr_2d.shape, (10, 12)) + + # Testing shapes after inverse transformation + + # Testing first that the data X has not been changed by the transformation + + assert_array_almost_equal(X, X_copy, 1e-10) + + X_inv = pca2d.inverse_transform(Xr) + + assert_array_equal(X_inv.shape, X.shape) + + X_inv_2d = pca2d.inverse_transform(Xr_2d) + + assert_array_equal(X_inv_2d.shape, X[0, :, :].shape) + + +def test_logic(): + + # Testing that after (only row and column)transformation there is no + # correlation between the new features + X = digits.images + + # after row transformation + + pca2d = PCA2D(n_row_components=4, n_column_components=0, epsilon=0) + pca2d.fit(X) + Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the row dimension + X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] + X_cov = X_cov - np.diag(np.diag(X_cov)) + assert_array_almost_equal(X_cov, np.zeros((4, 4))) + + # after column transformation + pca2d = PCA2D(n_row_components=0, n_column_components=4, epsilon=0) + pca2d.fit(X) + Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the column dimension + X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] + X_cov = X_cov - np.diag(np.diag(X_cov)) + 
assert_array_almost_equal(X_cov, np.zeros((4, 4))) + + +def test_logic_whitening(): + + # Testing that after transformation and whitening (row and column) + # we have unit variance for the new features + + X = digits.images + # after row transformation + pca2d = PCA2D(n_row_components=4, n_column_components=0, + epsilon=0, row_whiten=True) + pca2d.fit(X) + Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the row dimension + X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] + assert_array_almost_equal(X_cov, np.eye(4, 4)) + + # after column transformation + pca2d = PCA2D(n_row_components=0, n_column_components=4, + epsilon=0, column_whiten=True) + pca2d.fit(X) + Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the column dimension + X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] + assert_array_almost_equal(X_cov, np.eye(4, 4)) + + +def test_pca2d_efficiency(): + + X = np.arange(2000 * 20 * 20).reshape(2000, 20, 20) + # Testing performance when 90% of the variance is kept (row and column + pca2d = PCA2D(n_row_components=0.9, n_column_components=0.9) + pca2d.fit(X) + Xr = pca2d.transform(X) + + # Due to the high correlation between the features, only one is + # needed to get 90% of the variance + assert_array_equal(Xr.shape, (2000, 1, 1)) + + X_ori = pca2d.inverse_transform(Xr) + + # X_ori should be approximately equal to X + assert_array_almost_equal(X_ori, X) From 02cc22082e916f74504192dea963f0e8f947e03a Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Tue, 1 Jan 2013 00:13:29 +0100 Subject: [PATCH 2/8] minor corrections in 2DPCA module --- examples/applications/face_recognition_pca_2d.py | 2 +- sklearn/decomposition/__init__.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/applications/face_recognition_pca_2d.py b/examples/applications/face_recognition_pca_2d.py index 9e834fad6f51d..066645801563b 100644 --- a/examples/applications/face_recognition_pca_2d.py +++ b/examples/applications/face_recognition_pca_2d.py @@ -84,7 +84,7 @@ # introspect the images arrays to find the shapes n_samples, h, w = olivetti_faces.images.shape -# we use the 2 data directly +# we use the 2d data directly X = olivetti_faces.images diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index 21204ca7bd12f..12c3d88703d23 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -13,6 +13,7 @@ DictionaryLearning, MiniBatchDictionaryLearning,\ SparseCoder from .factor_analysis import FactorAnalysis +from .pca_2d import PCA2D __all__ = ['DictionaryLearning', 'FastICA', @@ -30,4 +31,5 @@ 'dict_learning_online', 'fastica', 'sparse_encode', - 'FactorAnalysis'] + 'FactorAnalysis', + 'PCA2D'] From 8e065614591156b53516c83b5c99f679b378da55 Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Tue, 1 Jan 2013 21:14:03 +0100 Subject: [PATCH 3/8] corrections in 2DPCA module --- sklearn/decomposition/pca_2d.py | 8 +++++++- sklearn/decomposition/tests/test_pca_2d.py | 12 ++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py index d86ac831fcee2..00b5cbedecf96 100644 --- a/sklearn/decomposition/pca_2d.py +++ b/sklearn/decomposition/pca_2d.py @@ -132,12 +132,16 @@ def fit(self, X, y=None): """ # Converting the data to 3 dimensions + assert_all_finite(X) X = np.atleast_3d(X) n_samples, n_row, n_column = X.shape # Making sure the type of the data is float - X = as_float_array(X, 
copy=self.copy) + #X = as_float_array(X, copy=self.copy) + X.astype(float) + if self.copy: + X = np.copy(X) # Center data self.mean_ = np.mean(X, axis=0) @@ -220,6 +224,7 @@ def transform(self, X): """ # X -= self.mean_ + assert_all_finite(X) if np.size(X.shape) == 2: return (self.row_components_.T.dot(X - self.mean_). dot(self.column_components_)) @@ -257,6 +262,7 @@ def inverse_transform(self, X): exact inverse operation as transform. """ + assert_all_finite(X) if np.size(X.shape) == 2: return (self.row_components_.dot(X). dot(self.column_components_.T) + self.mean_) diff --git a/sklearn/decomposition/tests/test_pca_2d.py b/sklearn/decomposition/tests/test_pca_2d.py index 7bb3e650e7a6b..87b3d52a26056 100644 --- a/sklearn/decomposition/tests/test_pca_2d.py +++ b/sklearn/decomposition/tests/test_pca_2d.py @@ -41,7 +41,7 @@ def test_correct_shapes(): # Testing first that the data X has not been changed by the transformation - assert_array_almost_equal(X, X_copy, 1e-10) + assert_array_almost_equal(X, X_copy, 2) X_inv = pca2d.inverse_transform(Xr) @@ -66,7 +66,7 @@ def test_logic(): # Computing the covariance matrix of Xr on the row dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] X_cov = X_cov - np.diag(np.diag(X_cov)) - assert_array_almost_equal(X_cov, np.zeros((4, 4))) + assert_array_almost_equal(X_cov, np.zeros((4, 4)), 2) # after column transformation pca2d = PCA2D(n_row_components=0, n_column_components=4, epsilon=0) @@ -75,7 +75,7 @@ def test_logic(): # Computing the covariance matrix of Xr on the column dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] X_cov = X_cov - np.diag(np.diag(X_cov)) - assert_array_almost_equal(X_cov, np.zeros((4, 4))) + assert_array_almost_equal(X_cov, np.zeros((4, 4)), 2) def test_logic_whitening(): @@ -91,7 +91,7 @@ def test_logic_whitening(): Xr = pca2d.transform(X) # Computing the covariance matrix of Xr on the row dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] - assert_array_almost_equal(X_cov, np.eye(4, 4)) + assert_array_almost_equal(X_cov, np.eye(4, 4), 2) # after column transformation pca2d = PCA2D(n_row_components=0, n_column_components=4, @@ -100,7 +100,7 @@ def test_logic_whitening(): Xr = pca2d.transform(X) # Computing the covariance matrix of Xr on the column dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] - assert_array_almost_equal(X_cov, np.eye(4, 4)) + assert_array_almost_equal(X_cov, np.eye(4, 4), 2) def test_pca2d_efficiency(): @@ -118,4 +118,4 @@ def test_pca2d_efficiency(): X_ori = pca2d.inverse_transform(Xr) # X_ori should be approximately equal to X - assert_array_almost_equal(X_ori, X) + assert_array_almost_equal(X_ori, X, 2) From 697fd4e07aca2b8ef129ad56a139dff933f969e9 Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Tue, 1 Jan 2013 21:29:36 +0100 Subject: [PATCH 4/8] corrections in 2DPCA module --- sklearn/decomposition/pca_2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py index 00b5cbedecf96..1f6f204af15ad 100644 --- a/sklearn/decomposition/pca_2d.py +++ b/sklearn/decomposition/pca_2d.py @@ -8,7 +8,7 @@ import numpy as np from scipy import linalg from ..base import BaseEstimator, TransformerMixin -from ..utils import as_float_array +from ..utils import as_float_array, assert_all_finite class PCA2D(BaseEstimator, TransformerMixin): From 3c5efffa3d6c870f6e1090013625ea2e29d18c0b Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Wed, 2 
Jan 2013 09:08:04 +0100 Subject: [PATCH 5/8] corrections in 2DPCA module --- sklearn/decomposition/pca_2d.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py index 1f6f204af15ad..28aa03e1c6824 100644 --- a/sklearn/decomposition/pca_2d.py +++ b/sklearn/decomposition/pca_2d.py @@ -7,6 +7,7 @@ import numpy as np from scipy import linalg +from scipy.sparse import issparse from ..base import BaseEstimator, TransformerMixin from ..utils import as_float_array, assert_all_finite @@ -131,6 +132,8 @@ def fit(self, X, y=None): Returns the instance itself. """ + if issparse(X): + raise ValueError("sparse matrices are not currently supported") # Converting the data to 3 dimensions assert_all_finite(X) X = np.atleast_3d(X) @@ -141,7 +144,7 @@ def fit(self, X, y=None): #X = as_float_array(X, copy=self.copy) X.astype(float) if self.copy: - X = np.copy(X) + X = X.copy() # Center data self.mean_ = np.mean(X, axis=0) @@ -224,6 +227,8 @@ def transform(self, X): """ # X -= self.mean_ + if issparse(X): + raise ValueError("sparse matrices are not currently supported") assert_all_finite(X) if np.size(X.shape) == 2: return (self.row_components_.T.dot(X - self.mean_). @@ -262,6 +267,8 @@ def inverse_transform(self, X): exact inverse operation as transform. """ + if issparse(X): + raise ValueError("sparse matrices are not currently supported") assert_all_finite(X) if np.size(X.shape) == 2: return (self.row_components_.dot(X). From c9217e03e1142f67309f4acde6665b6bb44ed8ba Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Wed, 2 Jan 2013 09:55:25 +0100 Subject: [PATCH 6/8] corrections in 2DPCA module --- sklearn/decomposition/pca_2d.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py index 28aa03e1c6824..7f673e6a5342d 100644 --- a/sklearn/decomposition/pca_2d.py +++ b/sklearn/decomposition/pca_2d.py @@ -132,19 +132,26 @@ def fit(self, X, y=None): Returns the instance itself. """ + # Checking if X is not sparse if issparse(X): raise ValueError("sparse matrices are not currently supported") - # Converting the data to 3 dimensions + + + # Converting the data to 3 dimensions array + X = np.asarray(X, np.float64) assert_all_finite(X) X = np.atleast_3d(X) + + # Copy the data if necessary + if self.copy: + X = X.copy() + n_samples, n_row, n_column = X.shape # Making sure the type of the data is float #X = as_float_array(X, copy=self.copy) - X.astype(float) - if self.copy: - X = X.copy() + #X = X.astype(np.float64) # Center data self.mean_ = np.mean(X, axis=0) @@ -229,7 +236,10 @@ def transform(self, X): # X -= self.mean_ if issparse(X): raise ValueError("sparse matrices are not currently supported") + + X = np.asarray(X, np.float64) assert_all_finite(X) + if np.size(X.shape) == 2: return (self.row_components_.T.dot(X - self.mean_). dot(self.column_components_)) @@ -269,7 +279,10 @@ def inverse_transform(self, X): if issparse(X): raise ValueError("sparse matrices are not currently supported") + + X = np.asarray(X, np.float64) assert_all_finite(X) + if np.size(X.shape) == 2: return (self.row_components_.dot(X). 
dot(self.column_components_.T) + self.mean_) From 493d783466f723df889e032afce4639522921339 Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Wed, 2 Jan 2013 15:58:50 +0100 Subject: [PATCH 7/8] corrections to 2D PCA module --- sklearn/decomposition/pca_2d.py | 53 +++++++++++----------- sklearn/decomposition/tests/test_pca_2d.py | 20 ++++---- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py index 7f673e6a5342d..0a5dd6945ed60 100644 --- a/sklearn/decomposition/pca_2d.py +++ b/sklearn/decomposition/pca_2d.py @@ -134,24 +134,22 @@ def fit(self, X, y=None): # Checking if X is not sparse if issparse(X): - raise ValueError("sparse matrices are not currently supported") - - + raise TypeError("sparse matrices are not currently supported") + # Converting the data to 3 dimensions array X = np.asarray(X, np.float64) assert_all_finite(X) X = np.atleast_3d(X) - + # Copy the data if necessary if self.copy: X = X.copy() - n_samples, n_row, n_column = X.shape # Making sure the type of the data is float - #X = as_float_array(X, copy=self.copy) - #X = X.astype(np.float64) + # X = as_float_array(X, copy=self.copy) + # X = X.astype(np.float64) # Center data self.mean_ = np.mean(X, axis=0) @@ -220,14 +218,12 @@ def transform(self, X): X: array-like, shape (n_samples, n_row, n_column) Training matrix where n_samples is the number of samples and n_row x n_column is the dimension of the features. - Or the shape can also be (n_row, n_column) Returns ------- X_new : array-like, shape (n_samples, n_row_components, n_column_components) - or shape is (n_row_components, n_column_components) according to X. @@ -235,16 +231,19 @@ def transform(self, X): # X -= self.mean_ if issparse(X): - raise ValueError("sparse matrices are not currently supported") - + raise TypeError("sparse matrices are not currently supported") + X = np.asarray(X, np.float64) assert_all_finite(X) - - if np.size(X.shape) == 2: - return (self.row_components_.T.dot(X - self.mean_). - dot(self.column_components_)) + X = np.atleast_3d(X) + + # Disabling this features + # if X.ndim == 2: + # return (self.row_components_.T.dot(X - self.mean_). + # dot(self.column_components_)) - elif np.size(X.shape) == 3: + # elif X.ndim == 3: + if X.ndim == 3: return ((X - self.mean_).dot(self.column_components_). transpose((0, 2, 1)).dot(self.row_components_). transpose((0, 2, 1))) @@ -257,8 +256,7 @@ def inverse_transform(self, X): ---------- X: array-like, - shape (n_samples, n_row_components, n_column_components) or - shape is (n_row_components, n_column_components) + shape (n_samples, n_row_components, n_column_components) New data where n_samples is the number of samples and n_row_components, n_column_components are the number of components on the row and column direction respectively. @@ -268,7 +266,7 @@ def inverse_transform(self, X): X_original : array-like, shape (n_samples, n_row, n_column) - or shape is (n_row, n_column) according to X + Notes ----- @@ -278,16 +276,19 @@ def inverse_transform(self, X): """ if issparse(X): - raise ValueError("sparse matrices are not currently supported") - + raise TypeError("sparse matrices are not currently supported") + X = np.asarray(X, np.float64) assert_all_finite(X) - - if np.size(X.shape) == 2: - return (self.row_components_.dot(X). - dot(self.column_components_.T) + self.mean_) + X = np.atleast_3d(X) + + # Disabling this features + # if X.ndim == 2: + # return (self.row_components_.dot(X). 
+ # dot(self.column_components_.T) + self.mean_) - elif np.size(X.shape) == 3: + # elif X.ndim == 3: + if X.ndim == 3: return (X.dot(self.column_components_.T).transpose((0, 2, 1)). dot(self.row_components_.T).transpose((0, 2, 1)) + diff --git a/sklearn/decomposition/tests/test_pca_2d.py b/sklearn/decomposition/tests/test_pca_2d.py index 87b3d52a26056..587fb1452fc85 100644 --- a/sklearn/decomposition/tests/test_pca_2d.py +++ b/sklearn/decomposition/tests/test_pca_2d.py @@ -33,23 +33,23 @@ def test_correct_shapes(): assert_array_equal(Xr.shape, (50, 10, 12)) - Xr_2d = pca2d.transform(X[0, :, :]) + #Xr_2d = pca2d.transform(X[0, :, :]) - assert_array_equal(Xr_2d.shape, (10, 12)) + #assert_array_equal(Xr_2d.shape, (10, 12)) # Testing shapes after inverse transformation # Testing first that the data X has not been changed by the transformation - assert_array_almost_equal(X, X_copy, 2) + assert_array_almost_equal(X, X_copy) X_inv = pca2d.inverse_transform(Xr) assert_array_equal(X_inv.shape, X.shape) - X_inv_2d = pca2d.inverse_transform(Xr_2d) + #X_inv_2d = pca2d.inverse_transform(Xr_2d) - assert_array_equal(X_inv_2d.shape, X[0, :, :].shape) + #assert_array_equal(X_inv_2d.shape, X[0, :, :].shape) def test_logic(): @@ -66,7 +66,7 @@ def test_logic(): # Computing the covariance matrix of Xr on the row dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] X_cov = X_cov - np.diag(np.diag(X_cov)) - assert_array_almost_equal(X_cov, np.zeros((4, 4)), 2) + assert_array_almost_equal(X_cov, np.zeros((4, 4))) # after column transformation pca2d = PCA2D(n_row_components=0, n_column_components=4, epsilon=0) @@ -75,7 +75,7 @@ def test_logic(): # Computing the covariance matrix of Xr on the column dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] X_cov = X_cov - np.diag(np.diag(X_cov)) - assert_array_almost_equal(X_cov, np.zeros((4, 4)), 2) + assert_array_almost_equal(X_cov, np.zeros((4, 4))) def test_logic_whitening(): @@ -91,7 +91,7 @@ def test_logic_whitening(): Xr = pca2d.transform(X) # Computing the covariance matrix of Xr on the row dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] - assert_array_almost_equal(X_cov, np.eye(4, 4), 2) + assert_array_almost_equal(X_cov, np.eye(4, 4)) # after column transformation pca2d = PCA2D(n_row_components=0, n_column_components=4, @@ -100,7 +100,7 @@ def test_logic_whitening(): Xr = pca2d.transform(X) # Computing the covariance matrix of Xr on the column dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] - assert_array_almost_equal(X_cov, np.eye(4, 4), 2) + assert_array_almost_equal(X_cov, np.eye(4, 4)) def test_pca2d_efficiency(): @@ -118,4 +118,4 @@ def test_pca2d_efficiency(): X_ori = pca2d.inverse_transform(Xr) # X_ori should be approximately equal to X - assert_array_almost_equal(X_ori, X, 2) + assert_array_almost_equal(X_ori, X) From 66862ad308b35c9d53c5a017c3beea38abcddeb8 Mon Sep 17 00:00:00 2001 From: TOSSOU Aristide Date: Thu, 3 Jan 2013 10:25:51 +0100 Subject: [PATCH 8/8] style correction in PCA2D --- .../applications/face_recognition_pca_2d.py | 7 -- sklearn/decomposition/pca_2d.py | 80 +++++-------------- sklearn/decomposition/tests/test_pca_2d.py | 37 ++------- 3 files changed, 27 insertions(+), 97 deletions(-) diff --git a/examples/applications/face_recognition_pca_2d.py b/examples/applications/face_recognition_pca_2d.py index 066645801563b..4ab9106947e6c 100644 --- a/examples/applications/face_recognition_pca_2d.py +++ 
b/examples/applications/face_recognition_pca_2d.py
@@ -12,7 +12,6 @@
 
 Expected results for the 40 people in the dataset::
 
-
 precision    recall  f1-score   support
 
      Face 0       1.00      0.40      0.57         5
@@ -66,7 +65,6 @@
 from time import time
 import numpy as np
 
-
 from sklearn.grid_search import GridSearchCV
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix
@@ -87,15 +85,11 @@
 # we use the 2d data directly
 X = olivetti_faces.images
 
-
 # the label to predict is the id of the person
 y = olivetti_faces.target
 
 target_names = np.array(["Face %d" % x for x in range(0,
                         np.size(olivetti_faces.target), 1)])
-# target_names = np.array(map(str,
-#                        (np.arange(np.size(olivetti_faces.target)).reshape(
-#                        olivetti_faces.target.shape))))
 n_classes = target_names.shape[0]
 
 print "Total dataset size:"
@@ -174,6 +168,5 @@
 
 print classification_report(y_test, y_pred, target_names=target_names)
 print confusion_matrix(y_test, y_pred, labels=range(n_classes))
-
 print "Size of the components retained %d X %d" % (pca.n_row_components_,
                                                    pca.n_column_components_)
diff --git a/sklearn/decomposition/pca_2d.py b/sklearn/decomposition/pca_2d.py
index 0a5dd6945ed60..ccbf3fda7b276 100644
--- a/sklearn/decomposition/pca_2d.py
+++ b/sklearn/decomposition/pca_2d.py
@@ -2,11 +2,11 @@
 """
 
 # Author : Aristide TOSSOU
-#
 # License: BSD Style
 
 import numpy as np
 from scipy import linalg
+
 from scipy.sparse import issparse
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import as_float_array, assert_all_finite
@@ -17,12 +17,7 @@ class PCA2D(BaseEstimator, TransformerMixin):
     This technique is based on 2D matrices as opposed to standard PCA which
     is based on 1D vectors. It considers simultaneously the row and column
     directions. 2D PCA is computed by examining the covariance of the data.
-    It can easily get higher accuracy than 1D PCA.
-
-    This implements the method described in `Daoqiang Zhang, Zhi-Hua Zhou, :
-    Two-directional two-dimensional PCA for efficient face representation and
-    recognition, Neurocomputing, Volume 69, Issues 1-3, December 2005,
-    Pages 224-231`
+    It can easily achieve better generalisation than 1D PCA.
 
     This implementation uses the scipy.linalg implementation of singular
     value decomposition. It works on dense matrices. The time complexity is
@@ -33,13 +28,11 @@ class PCA2D(BaseEstimator, TransformerMixin):
     More formally, it can be used as long as none of the terms in the
     complexity is too large (less than 10**10).
 
-    In most case, it is then significantly faster than standard 1D PCA
+    In most cases, it is then significantly faster than standard 1D PCA
     which has a complexity of O(n_row**3 * n_column**3).
 
-
     Parameters
     ----------
-
     n_row_components : int, float or None
         Number of components in the row direction to keep
@@ -66,7 +59,6 @@ class PCA2D(BaseEstimator, TransformerMixin):
         improve the predictive accuracy of the downstream estimators by
         making their data respect some hard-wired assumptions.
 
-
     column_whiten : bool, optional
         When True (False by default) the `column_components` vectors are
         divided by the square root of singular values to ensure
@@ -89,13 +81,27 @@ class PCA2D(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-
     `row_components_` : array, [n_row, n_row_components]
         components with maximum variance on the row direction
 
     `column_components_` : array, [n_column, n_column_components]
         components with maximum variance on the column direction
 
+    Notes
+    -----
+    **References:**
+
+    1.
`Daoqiang Zhang, Zhi-Hua Zhou, : Two-directional two-dimensional + PCA for efficient face representation and recognition, Neurocomputing, + Volume 69, Issues 1-3, December 2005,Pages 224-231` + (http://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/neucom05a.pdf) + + 2. `Yang, Jian and Zhang, David and Frangi, Alejandro F. and Yang, + Jing Y. :Two-dimensional PCA: a new approach to appearance-based face + representation and recognition, IEEE Transactions on Pattern Analysis + and Machine Intelligence, Vol. 26, No. 1. (2004), pp. 131-137` + http://repository.lib.polyu.edu.hk/jspui/bitstream/10397/190/1/137.pdf + See also -------- PCA @@ -103,9 +109,6 @@ class PCA2D(BaseEstimator, TransformerMixin): RandomizedPCA KernelPCA SparsePCA - - - """ def __init__(self, n_row_components=None, n_column_components=None, row_whiten=False, column_whiten=False, epsilon=1e-5, @@ -119,9 +122,9 @@ def __init__(self, n_row_components=None, n_column_components=None, def fit(self, X, y=None): """ Fit the model from data in X + Parameters ---------- - X: array-like, shape (n_samples, n_row, n_column) Training matrix where n_samples is the number of samples and n_row x n_column is the dimension of the features @@ -131,12 +134,11 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - # Checking if X is not sparse if issparse(X): raise TypeError("sparse matrices are not currently supported") - # Converting the data to 3 dimensions array + # Converting the data to 3 dimensions float array X = np.asarray(X, np.float64) assert_all_finite(X) X = np.atleast_3d(X) @@ -144,19 +146,13 @@ def fit(self, X, y=None): # Copy the data if necessary if self.copy: X = X.copy() - n_samples, n_row, n_column = X.shape - # Making sure the type of the data is float - # X = as_float_array(X, copy=self.copy) - # X = X.astype(np.float64) - # Center data self.mean_ = np.mean(X, axis=0) X -= self.mean_ # As we don't want to change the default n_row_components let's copy it - self.n_row_components_ = np.copy(self.n_row_components) self.n_column_components_ = np.copy(self.n_column_components) @@ -165,14 +161,10 @@ def fit(self, X, y=None): self.n_row_components_ > 0): U, S, V = linalg.svd(np.tensordot(X, X, axes=([0, 2], [0, 2])) / n_samples, full_matrices=False) - if self.n_row_components_ < 1: self.n_row_components_ = np.sum((S / np.sum(S)).cumsum() < self.n_row_components_) + 1 - if self.row_whiten: - # self.row_reg = np.diag(1/np.sqrt(S)+self.epsilon) - # To save time we can multiply the whitening matrix # by U beforehand self.row_components_ = (U[:, :self.n_row_components_]. @@ -182,7 +174,6 @@ def fit(self, X, y=None): else: self.row_components_ = U[:, :self.n_row_components_] else: - self.row_components_ = np.eye(n_row, n_row) # Computing 2DPCA @@ -190,15 +181,11 @@ def fit(self, X, y=None): self.n_column_components_ > 0): U, S, V = linalg.svd(np.tensordot(X, X, axes=([0, 1], [0, 1])) / n_samples, full_matrices=False) - if self.n_column_components_ < 1: self.n_column_components_ = np.sum((S / np.sum(S)).cumsum() < self.n_column_components_ ) + 1 - if self.column_whiten: - # self.column_reg = np.diag(1/np.sqrt(S)+self.epsilon) - self.column_components_ = (U[:, :self.n_column_components_]. dot(np.diag(1. / np.sqrt(S[:self. n_column_components_] + @@ -206,30 +193,24 @@ def fit(self, X, y=None): else: self.column_components_ = U[:, :self.n_column_components_] else: - self.column_components_ = np.eye(n_column, n_column) return self def transform(self, X): """Apply the dimensionality reduction on X. 
+ Parameters ---------- - X: array-like, shape (n_samples, n_row, n_column) Training matrix where n_samples is the number of samples and n_row x n_column is the dimension of the features. Returns ------- - X_new : array-like, shape (n_samples, n_row_components, n_column_components) according to X. - - """ - - # X -= self.mean_ if issparse(X): raise TypeError("sparse matrices are not currently supported") @@ -237,12 +218,6 @@ def transform(self, X): assert_all_finite(X) X = np.atleast_3d(X) - # Disabling this features - # if X.ndim == 2: - # return (self.row_components_.T.dot(X - self.mean_). - # dot(self.column_components_)) - - # elif X.ndim == 3: if X.ndim == 3: return ((X - self.mean_).dot(self.column_components_). transpose((0, 2, 1)).dot(self.row_components_). @@ -254,7 +229,6 @@ def inverse_transform(self, X): Parameters ---------- - X: array-like, shape (n_samples, n_row_components, n_column_components) New data where n_samples is the number of samples @@ -263,18 +237,13 @@ def inverse_transform(self, X): Returns ------- - X_original : array-like, shape (n_samples, n_row, n_column) - - Notes ----- - If whitening is enabled, inverse_transform does not compute the exact inverse operation as transform. """ - if issparse(X): raise TypeError("sparse matrices are not currently supported") @@ -282,14 +251,7 @@ def inverse_transform(self, X): assert_all_finite(X) X = np.atleast_3d(X) - # Disabling this features - # if X.ndim == 2: - # return (self.row_components_.dot(X). - # dot(self.column_components_.T) + self.mean_) - - # elif X.ndim == 3: if X.ndim == 3: - return (X.dot(self.column_components_.T).transpose((0, 2, 1)). dot(self.row_components_.T).transpose((0, 2, 1)) + self.mean_) diff --git a/sklearn/decomposition/tests/test_pca_2d.py b/sklearn/decomposition/tests/test_pca_2d.py index 587fb1452fc85..bef7de3660d5e 100644 --- a/sklearn/decomposition/tests/test_pca_2d.py +++ b/sklearn/decomposition/tests/test_pca_2d.py @@ -1,68 +1,43 @@ import numpy as np - - from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal from sklearn import datasets from sklearn.decomposition import PCA2D - digits = datasets.load_digits() def test_correct_shapes(): - rng = np.random.RandomState(0) - X = rng.randn(50, 30, 40) - X_copy = np.copy(X) - pca2d = PCA2D(n_row_components=10, n_column_components=12) # Testing shapes after transformation - pca2d.fit(X) - assert_array_equal(pca2d.row_components_.shape, (30, 10)) - assert_array_equal(pca2d.column_components_.shape, (40, 12)) - Xr = pca2d.transform(X) - assert_array_equal(Xr.shape, (50, 10, 12)) - #Xr_2d = pca2d.transform(X[0, :, :]) - - #assert_array_equal(Xr_2d.shape, (10, 12)) - # Testing shapes after inverse transformation - # Testing first that the data X has not been changed by the transformation - assert_array_almost_equal(X, X_copy) - X_inv = pca2d.inverse_transform(Xr) - assert_array_equal(X_inv.shape, X.shape) - #X_inv_2d = pca2d.inverse_transform(Xr_2d) - - #assert_array_equal(X_inv_2d.shape, X[0, :, :].shape) - def test_logic(): - # Testing that after (only row and column)transformation there is no # correlation between the new features X = digits.images # after row transformation - pca2d = PCA2D(n_row_components=4, n_column_components=0, epsilon=0) pca2d.fit(X) Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the row dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] X_cov = X_cov - np.diag(np.diag(X_cov)) @@ -72,6 +47,7 @@ def test_logic(): pca2d = 
PCA2D(n_row_components=0, n_column_components=4, epsilon=0) pca2d.fit(X) Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the column dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] X_cov = X_cov - np.diag(np.diag(X_cov)) @@ -79,16 +55,16 @@ def test_logic(): def test_logic_whitening(): - # Testing that after transformation and whitening (row and column) # we have unit variance for the new features - X = digits.images + # after row transformation pca2d = PCA2D(n_row_components=4, n_column_components=0, epsilon=0, row_whiten=True) pca2d.fit(X) Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the row dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 2], [0, 2])) / X.shape[0] assert_array_almost_equal(X_cov, np.eye(4, 4)) @@ -98,15 +74,15 @@ def test_logic_whitening(): epsilon=0, column_whiten=True) pca2d.fit(X) Xr = pca2d.transform(X) + # Computing the covariance matrix of Xr on the column dimension X_cov = np.tensordot(Xr, Xr, axes=([0, 1], [0, 1])) / X.shape[0] assert_array_almost_equal(X_cov, np.eye(4, 4)) def test_pca2d_efficiency(): - X = np.arange(2000 * 20 * 20).reshape(2000, 20, 20) - # Testing performance when 90% of the variance is kept (row and column + # Testing performance when 90% of the variance is kept (row and column) pca2d = PCA2D(n_row_components=0.9, n_column_components=0.9) pca2d.fit(X) Xr = pca2d.transform(X) @@ -114,7 +90,6 @@ def test_pca2d_efficiency(): # Due to the high correlation between the features, only one is # needed to get 90% of the variance assert_array_equal(Xr.shape, (2000, 1, 1)) - X_ori = pca2d.inverse_transform(Xr) # X_ori should be approximately equal to X
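
For quick reference, a minimal usage sketch of the estimator added by this
series. It assumes the final state of the branch (PCA2D exported from
sklearn.decomposition, with transform and inverse_transform operating on 3D
arrays of shape (n_samples, n_row, n_column)); the array sizes and
retained-variance fractions below are illustrative only::

    import numpy as np
    from sklearn.decomposition import PCA2D

    # a stack of 100 random 16 x 16 "images": shape (n_samples, n_row, n_column)
    rng = np.random.RandomState(0)
    X = rng.randn(100, 16, 16)

    # with 0 < n_*_components < 1, fit keeps just enough components to
    # retain that fraction of the variance in each direction
    pca = PCA2D(n_row_components=0.95, n_column_components=0.95)
    pca.fit(X)

    Xr = pca.transform(X)
    print(Xr.shape)      # (100, pca.n_row_components_, pca.n_column_components_)

    # back to the original space; approximate once components are
    # discarded or whitening is enabled
    X_back = pca.inverse_transform(Xr)
    print(X_back.shape)  # (100, 16, 16)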