[MRG+2] fix warning and behavior in randomized_svd wrt power iterations by amueller · Pull Request #7311 · scikit-learn/scikit-learn · GitHub
[MRG+2] fix warning and behavior in randomized_svd wrt power iterations #7311


Closed
14 changes: 7 additions & 7 deletions sklearn/decomposition/pca.py
@@ -166,7 +166,7 @@ class PCA(_BasePCA):
.. versionadded:: 0.18.0
-    iterated_power : int >= 0, optional (default 4)
+    iterated_power : int >= 0, or 'auto', (default 'auto')
Number of iterations for the power method computed by
svd_solver == 'randomized'.
@@ -240,21 +240,21 @@ class PCA(_BasePCA):
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> pca = PCA(n_components=2)
>>> pca.fit(X)
-    PCA(copy=True, iterated_power=4, n_components=2, random_state=None,
+    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
>>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
[ 0.99244... 0.00755...]
>>> pca = PCA(n_components=2, svd_solver='full')
>>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    PCA(copy=True, iterated_power=4, n_components=2, random_state=None,
+    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='full', tol=0.0, whiten=False)
>>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
[ 0.99244... 0.00755...]
>>> pca = PCA(n_components=1, svd_solver='arpack')
>>> pca.fit(X)
-    PCA(copy=True, iterated_power=4, n_components=1, random_state=None,
+    PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
svd_solver='arpack', tol=0.0, whiten=False)
>>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
[ 0.99244...]
@@ -268,7 +268,7 @@ class PCA(_BasePCA):
"""

def __init__(self, n_components=None, copy=True, whiten=False,
-                 svd_solver='auto', tol=0.0, iterated_power=4,
+                 svd_solver='auto', tol=0.0, iterated_power='auto',
random_state=None):
self.n_components = n_components
self.copy = copy
@@ -535,8 +535,8 @@ class RandomizedPCA(BaseEstimator, TransformerMixin):
fit(X).transform(X) will not yield the expected results,
use fit_transform(X) instead.
-    iterated_power : int, optional
-        Number of iterations for the power method. 2 by default.
+    iterated_power : int, default=2
+        Number of iterations for the power method.
.. versionchanged:: 0.18
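As an illustration of the estimator-side change, here is a short usage sketch (not part of this diff): with svd_solver='randomized', the new iterated_power='auto' default lets scikit-learn choose the number of power iterations, while the previous default of 4 can still be requested explicitly.

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(100, 20)

# New default: the number of power iterations is chosen automatically
# when the randomized solver is used.
pca_auto = PCA(n_components=5, svd_solver='randomized', random_state=0).fit(X)

# The previous default (4 iterations) can still be requested explicitly.
pca_fixed = PCA(n_components=5, svd_solver='randomized',
                iterated_power=4, random_state=0).fit(X)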
2 changes: 1 addition & 1 deletion sklearn/pipeline.py
@@ -796,7 +796,7 @@ def make_union(*transformers):
>>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE
FeatureUnion(n_jobs=1,
transformer_list=[('pca',
-                             PCA(copy=True, iterated_power=4,
+                             PCA(copy=True, iterated_power='auto',
n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)),
('truncatedsvd',
25 changes: 8 additions & 17 deletions sklearn/utils/extmath.py
@@ -267,7 +267,7 @@ def randomized_range_finder(A, size, n_iter,
return Q


-def randomized_svd(M, n_components, n_oversamples=10, n_iter=None,
+def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
power_iteration_normalizer='auto', transpose='auto',
flip_sign=True, random_state=0):
"""Computes a truncated randomized SVD
@@ -287,11 +287,11 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto',
number can improve speed but can negatively impact the quality of
approximation of singular vectors and singular values.

-    n_iter: int (default is 4)
+    n_iter: int or 'auto' (default is 'auto')
        Number of power iterations. It can be used to deal with very noisy
-        problems. When `n_components` is small (< .1 * min(X.shape)) `n_iter`
-        is set to 7, unless the user specifies a higher number. This improves
-        precision with few components.
+        problems. When 'auto', it is set to 4, unless `n_components` is small
+        (< .1 * min(X.shape)) `n_iter` in which case is set to 7.
A contributor commented:
maybe i'm being slow, but i'm not sure what this sentence means.

@ogrisel (Member) replied on Sep 1, 2016:

Maybe the following would be clearer:

When `n_iter='auto'`, it is effectively set to 4 when `n_components` is larger than
`0.1 * min(X.shape)` and set to 7 otherwise.

This improves precision with few components.
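To make the rule under discussion concrete, here is a minimal standalone sketch of the resolution logic; the helper name resolve_n_iter is illustrative and not part of the library:

def resolve_n_iter(n_iter, n_components, shape):
    """Return the effective number of power iterations under the 'auto' rule."""
    if n_iter == 'auto':
        # 7 was found to be a good compromise for PCA, see issue #5299
        return 7 if n_components < .1 * min(shape) else 4
    return n_iter

print(resolve_n_iter('auto', 5, (1000, 500)))    # 7: 5 < 0.1 * 500
print(resolve_n_iter('auto', 100, (1000, 500)))  # 4: 100 >= 0.1 * 500
print(resolve_n_iter(3, 5, (1000, 500)))         # 3: an explicit value is kept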

.. versionchanged:: 0.18

@@ -349,26 +349,17 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter=None,
n_random = n_components + n_oversamples
n_samples, n_features = M.shape

-    if n_iter is None:
-        n_iter = 4
-        n_iter_specified = False
-    else:
-        n_iter_specified = True
+    if n_iter is 'auto':
+        # Checks if the number of iterations is explicitely specified
+        # Adjust n_iter. 7 was found a good compromise for PCA. See #5299
+        n_iter = 7 if n_components < .1 * min(M.shape) else 4

if transpose == 'auto':
transpose = n_samples < n_features
if transpose:
# this implementation is a bit faster with smaller shape[1]
M = M.T

-    # Adjust n_iter. 7 was found a good compromise for PCA. See #5299
-    if n_components < .1 * min(M.shape) and n_iter < 7:
-        if n_iter_specified:
-            warnings.warn("The number of power iterations is increased to "
-                          "7 to achieve higher precision.")
-        n_iter = 7

Q = randomized_range_finder(M, n_random, n_iter,
power_iteration_normalizer, random_state)
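Assuming the signature shown above, a short usage sketch of the updated function (not part of this diff); note that with this patch an explicit n_iter no longer triggers the precision warning:

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
M = rng.randn(200, 50)

# n_iter='auto' (the new default) picks the iteration count internally;
# here n_components=10 >= 0.1 * min(M.shape), so 4 iterations are used.
U, s, Vt = randomized_svd(M, n_components=10, random_state=0)

# An explicit integer is used as-is and no warning is emitted.
U, s, Vt = randomized_svd(M, n_components=10, n_iter=3, random_state=0)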
