ENH RandomizedPCA collapsed into PCA · scikit-learn/scikit-learn@1195df9 · GitHub

Commit 1195df9
Author: giorgiop

ENH RandomizedPCA collapsed into PCA

1 parent c0bc2f8 · commit 1195df9

File tree

15 files changed: +1090 −604 lines

doc/modules/pipeline.rst

Lines changed: 25 additions & 16 deletions

@@ -38,13 +38,15 @@ is an estimator object::

     >>> from sklearn.svm import SVC
     >>> from sklearn.decomposition import PCA
     >>> estimators = [('reduce_dim', PCA()), ('svm', SVC())]
-    >>> clf = Pipeline(estimators)
+    >>> clf = Pipeline(estimators)
     >>> clf # doctest: +NORMALIZE_WHITESPACE
-    Pipeline(steps=[('reduce_dim', PCA(copy=True, n_components=None,
-        whiten=False)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None,
-        coef0=0.0, decision_function_shape=None, degree=3, gamma='auto',
-        kernel='rbf', max_iter=-1, probability=False, random_state=None,
-        shrinking=True, tol=0.001, verbose=False))])
+    Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power=4,
+        n_components=None, random_state=None, svd_solver='auto', tol=0.0,
+        whiten=False)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None,
+        coef0=0.0, decision_function_shape=None, degree=3, gamma='auto',
+        kernel='rbf', max_iter=-1, probability=False, random_state=None,
+        shrinking=True, tol=0.001, verbose=False))])
+

 The utility function :func:`make_pipeline` is a shorthand
 for constructing pipelines;

@@ -63,23 +65,27 @@ filling in the names automatically::

 The estimators of a pipeline are stored as a list in the ``steps`` attribute::

     >>> clf.steps[0]
-    ('reduce_dim', PCA(copy=True, n_components=None, whiten=False))
+    ('reduce_dim', PCA(copy=True, iterated_power=4, n_components=None, random_state=None,
+      svd_solver='auto', tol=0.0, whiten=False))

 and as a ``dict`` in ``named_steps``::

     >>> clf.named_steps['reduce_dim']
-    PCA(copy=True, n_components=None, whiten=False)
+    PCA(copy=True, iterated_power=4, n_components=None, random_state=None,
+      svd_solver='auto', tol=0.0, whiten=False)

 Parameters of the estimators in the pipeline can be accessed using the
 ``<estimator>__<parameter>`` syntax::

     >>> clf.set_params(svm__C=10) # doctest: +NORMALIZE_WHITESPACE
-    Pipeline(steps=[('reduce_dim', PCA(copy=True, n_components=None,
+    Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power=4,
+        n_components=None, random_state=None, svd_solver='auto', tol=0.0,
         whiten=False)), ('svm', SVC(C=10, cache_size=200, class_weight=None,
         coef0=0.0, decision_function_shape=None, degree=3, gamma='auto',
         kernel='rbf', max_iter=-1, probability=False, random_state=None,
         shrinking=True, tol=0.001, verbose=False))])

+
 This is particularly important for doing grid searches::

     >>> from sklearn.model_selection import GridSearchCV

@@ -150,19 +156,22 @@ and ``value`` is an estimator object::

     >>> from sklearn.decomposition import PCA
     >>> from sklearn.decomposition import KernelPCA
     >>> estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
-    >>> combined = FeatureUnion(estimators)
+    >>> combined = FeatureUnion(estimators)
     >>> combined # doctest: +NORMALIZE_WHITESPACE
-    FeatureUnion(n_jobs=1, transformer_list=[('linear_pca', PCA(copy=True,
-        n_components=None, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0,
-        coef0=1, degree=3, eigen_solver='auto', fit_inverse_transform=False,
-        gamma=None, kernel='linear', kernel_params=None, max_iter=None,
-        n_components=None, n_jobs=1, random_state=None, remove_zero_eig=False, tol=0))],
+    FeatureUnion(n_jobs=1, transformer_list=[('linear_pca', PCA(copy=True,
+        iterated_power=4, n_components=None, random_state=None,
+        svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca',
+        KernelPCA(alpha=1.0, coef0=1, degree=3, eigen_solver='auto',
+        fit_inverse_transform=False, gamma=None, kernel='linear',
+        kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
+        random_state=None, remove_zero_eig=False, tol=0))],
         transformer_weights=None)

+
 Like pipelines, feature unions have a shorthand constructor called
 :func:`make_union` that does not require explicit naming of the components.

-
+
 .. topic:: Examples:

 * :ref:`example_feature_stacker.py`
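
The updated doctests above expose the new ``svd_solver`` parameter in the Pipeline repr. As an illustration only (not part of the diff; the toy data and names below are made up), the new parameter is reachable through the same ``<estimator>__<parameter>`` syntax this document describes:

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC

    rng = np.random.RandomState(0)
    X, y = rng.randn(50, 20), rng.randint(0, 2, 50)

    # Same pipeline as in the updated doctests above.
    clf = Pipeline([('reduce_dim', PCA()), ('svm', SVC())])

    # The new svd_solver parameter is set like any other nested
    # estimator parameter, via <estimator>__<parameter>.
    clf.set_params(reduce_dim__svd_solver='randomized',
                   reduce_dim__n_components=5)
    clf.fit(X, y)
    print(clf.named_steps['reduce_dim'].svd_solver)  # 'randomized'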

doc/tutorial/statistical_inference/unsupervised_learning.rst

Lines changed: 2 additions & 1 deletion

@@ -275,7 +275,8 @@ data by projecting on a principal subspace.

     >>> from sklearn import decomposition
     >>> pca = decomposition.PCA()
     >>> pca.fit(X)
-    PCA(copy=True, n_components=None, whiten=False)
+    PCA(copy=True, iterated_power=4, n_components=None, random_state=None,
+      svd_solver='auto', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_) # doctest: +SKIP
     [ 2.18565811e+00 1.19346747e+00 8.43026679e-32]
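
The tutorial fits PCA on a signal with only two useful dimensions, which is why the third explained variance above is numerically zero. A standalone sketch in the same spirit (synthetic data, illustrative only; not part of the diff):

    import numpy as np
    from sklearn import decomposition

    # The third axis is an exact linear combination of the first two,
    # so the data matrix has rank 2.
    rng = np.random.RandomState(0)
    x1, x2 = rng.normal(size=(2, 100))
    X = np.c_[x1, x2, x1 + 2 * x2]

    pca = decomposition.PCA()       # svd_solver='auto' by default
    pca.fit(X)
    print(pca.explained_variance_)  # third value is ~0: rank-2 data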

doc/whats_new.rst

Lines changed: 28 additions & 8 deletions

@@ -22,6 +22,15 @@ New features

   :class:`feature_selection.SelectPercentile` as score functions.
   By `Andrea Bravi`_ and `Nikolay Mayorov`_.

+- Class :class:`decomposition.RandomizedPCA` is now factored into
+  :class:`decomposition.PCA` and is available by calling it with the
+  parameter ``svd_solver='randomized'``. The default number of ``n_iter``
+  for ``'randomized'`` has changed to 4. The old behavior of PCA is
+  recovered by ``svd_solver='full'``. An additional solver calls ``arpack``
+  and performs truncated (non-randomized) SVD. By default, the best solver
+  is selected depending on the size of the input and the number of
+  components requested.
+  (`#5299 <https://github.com/scikit-learn/scikit-learn/pull/5299>`_) by `Giorgio Patrini`_.

 - The Gaussian Process module has been reimplemented and now offers classification
   and regression estimators through :class:`gaussian_process.GaussianProcessClassifier`
   and :class:`gaussian_process.GaussianProcessRegressor`. Among other things, the new
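
To make the migration described in this entry concrete, a hedged before/after sketch (not part of the diff; the data is synthetic, and ``RandomizedPCA`` is shown commented out since it is the pre-change API):

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(42)
    X = rng.randn(200, 50)

    # Before this commit:
    #     from sklearn.decomposition import RandomizedPCA
    #     pca = RandomizedPCA(n_components=5, random_state=42)

    # After this commit, the randomized solver lives inside PCA:
    pca = PCA(n_components=5, svd_solver='randomized', random_state=42)

    # The old exact behavior is kept under svd_solver='full'; a truncated
    # non-randomized SVD goes through ARPACK; 'auto' chooses for you.
    pca_full = PCA(n_components=5, svd_solver='full')
    pca_arpack = PCA(n_components=5, svd_solver='arpack', random_state=42)

    for est in (pca, pca_full, pca_arpack):
        est.fit(X)
        print(est.explained_variance_ratio_.sum())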
@@ -114,17 +123,26 @@ Bug fixes

 - :class:`StratifiedKFold` now raises error if all n_labels for individual classes is less than n_folds.
   (`#6182 <https://github.com/scikit-learn/scikit-learn/pull/6182>`_) by `Devashish Deshpande`_.

-- :class:`RandomizedPCA` default number of `iterated_power` is 2 instead of 3.
-  This is a speed up with a minor precision decrease. (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+- :class:`RandomizedPCA` default number of `iterated_power` is 4 instead of 3.
+  (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.

-- :func:`randomized_svd` performs 2 power iterations by default, instead of 0.
-  In practice this is often enough for obtaining a good approximation of the
-  true eigenvalues/vectors in the presence of noise. (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+- :func:`utils.extmath.randomized_svd` performs 4 power iterations by default, instead of 0.
+  In practice this is enough for obtaining a good approximation of the
+  true eigenvalues/vectors in the presence of noise. When `n_components` is
+  small (< .1 * min(X.shape)), `n_iter` is set to 7, unless the user specifies
+  a higher number. This improves precision with few components.
+  (`#5299 <https://github.com/scikit-learn/scikit-learn/pull/5299>`_) by `Giorgio Patrini`_.

-- :func:`randomized_range_finder` is more numerically stable when many
+- :func:`utils.extmath.randomized_range_finder` is more numerically stable when many
   power iterations are requested, since it applies LU normalization by default.
   If `n_iter < 2` numerical issues are unlikely, thus no normalization is applied.
-  Other normalization options are available: 'none', 'LU' and 'QR'. (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+  Other normalization options are available: 'none', 'LU' and 'QR'.
+  (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+
+- The whiten/non-whiten inconsistency between components of :class:`decomposition.PCA`
+  and :class:`decomposition.RandomizedPCA` (now factored into PCA, see the
+  New features) is fixed. `components_` are stored with no whitening.
+  (`#5299 <https://github.com/scikit-learn/scikit-learn/pull/5299>`_) by `Giorgio Patrini`_.

 - Fixed bug in :func:`manifold.spectral_embedding` where diagonal of unnormalized
   Laplacian matrix was incorrectly set to 1. (`#4995 <https://github.com/scikit-learn/scikit-learn/pull/4995>`_) By `Peter Fischer`_.
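
Two of the fixes above can be checked numerically. A sketch, assuming the post-change defaults described in these entries (data, sizes, and tolerances are illustrative):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.utils.extmath import randomized_svd

    # Low-rank data plus noise, so the top singular values are meaningful.
    rng = np.random.RandomState(0)
    X = np.dot(rng.randn(100, 5), rng.randn(5, 80)) + 0.01 * rng.randn(100, 80)

    # Power iterations trade speed for accuracy; per the entry above,
    # 7 iterations are used when n_components is small.
    U, s, V = randomized_svd(X, n_components=5, n_iter=7, random_state=0)
    s_exact = np.linalg.svd(X, compute_uv=False)[:5]
    print(np.max(np.abs(s - s_exact)))  # tiny: power iterations converged

    # components_ are now stored without whitening even when whiten=True,
    # so their rows remain unit-norm right singular vectors.
    pca = PCA(n_components=5, whiten=True, svd_solver='full').fit(X)
    print(np.allclose(np.linalg.norm(pca.components_, axis=1), 1.0))  # True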
@@ -213,7 +231,8 @@ Changelog

 New features
 ............
-- All the Scaler classes but :class:`RobustScaler` can be fitted online by
+
+- All the Scaler classes but :class:`preprocessing.RobustScaler` can be fitted online by
   calling `partial_fit`. By `Giorgio Patrini`_.

 - The new class :class:`ensemble.VotingClassifier` implements a

@@ -445,6 +464,7 @@ Enhancements

 Bug fixes
 .........
+
 - Fixed non-determinism in :class:`dummy.DummyClassifier` with sparse
   multi-label output. By `Andreas Müller`_.

examples/decomposition/plot_pca_vs_fa_model_selection.py

Lines changed: 6 additions & 4 deletions

@@ -23,7 +23,6 @@

 by Thomas P. Minka is also compared.

 """
-print(__doc__)

 # Authors: Alexandre Gramfort
 #          Denis A. Engemann

@@ -38,6 +37,8 @@

 from sklearn.model_selection import cross_val_score
 from sklearn.model_selection import GridSearchCV

+print(__doc__)
+
 ###############################################################################
 # Create the data

@@ -61,7 +62,7 @@


 def compute_scores(X):
-    pca = PCA()
+    pca = PCA(svd_solver='full')
     fa = FactorAnalysis()

     pca_scores, fa_scores = [], []

@@ -90,7 +91,7 @@ def lw_score(X):

 n_components_pca = n_components[np.argmax(pca_scores)]
 n_components_fa = n_components[np.argmax(fa_scores)]

-pca = PCA(n_components='mle')
+pca = PCA(svd_solver='full', n_components='mle')
 pca.fit(X)
 n_components_pca_mle = pca.n_components_

@@ -105,7 +106,8 @@ def lw_score(X):

 plt.axvline(n_components_pca, color='b',
             label='PCA CV: %d' % n_components_pca, linestyle='--')
 plt.axvline(n_components_fa, color='r',
-            label='FactorAnalysis CV: %d' % n_components_fa, linestyle='--')
+            label='FactorAnalysis CV: %d' % n_components_fa,
+            linestyle='--')
 plt.axvline(n_components_pca_mle, color='k',
             label='PCA MLE: %d' % n_components_pca_mle, linestyle='--')
111113
