From 54041979093d006d1b7b325bd4a24cdd90b95be5 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 31 Aug 2016 14:40:51 -0400 Subject: [PATCH 1/2] fix warning and behavior in randomized_svd wrt power iterations --- sklearn/utils/extmath.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index ab67c89e09525..8fe8f87860969 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -25,7 +25,7 @@ from ..externals.six.moves import xrange from .sparsefuncs_fast import csr_row_norms from .validation import check_array -from ..exceptions import NonBLASDotWarning +from ..exceptions import NonBLASDotWarning, ChangedBehaviorWarning def norm(x): @@ -351,10 +351,14 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter=None, if n_iter is None: # Checks if the number of iterations is explicitely specified - n_iter = 4 - n_iter_specified = False - else: - n_iter_specified = True + # Adjust n_iter. 7 was found a good compromise for PCA. See #5299 + if n_components < .1 * min(M.shape) and n_iter < 7: + n_iter = 7 + warnings.warn("The default number of power iterations is increased from 4" + "to 7 in version 0.18 to achieve higher precision.", + ChangedBehaviorWarning) + else: + n_iter = 4 if transpose == 'auto': transpose = n_samples < n_features @@ -362,13 +366,6 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter=None, # this implementation is a bit faster with smaller shape[1] M = M.T - # Adjust n_iter. 7 was found a good compromise for PCA. 
See #5299 - if n_components < .1 * min(M.shape) and n_iter < 7: - if n_iter_specified: - warnings.warn("The number of power iterations is increased to " - "7 to achieve higher precision.") - n_iter = 7 - Q = randomized_range_finder(M, n_random, n_iter, power_iteration_normalizer, random_state) From 970ace98fbfb9c49f1d3d80a47bc7b09e55c3482 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 31 Aug 2016 16:46:03 -0400 Subject: [PATCH 2/2] change default iterated_power to auto. --- sklearn/decomposition/pca.py | 14 +++++++------- sklearn/pipeline.py | 2 +- sklearn/utils/extmath.py | 22 ++++++++-------------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index b4643e6a8c58f..881a4a593cfd2 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -166,7 +166,7 @@ class PCA(_BasePCA): .. versionadded:: 0.18.0 - iterated_power : int >= 0, optional (default 4) + iterated_power : int >= 0, or 'auto', (default 'auto') Number of iterations for the power method computed by svd_solver == 'randomized'. @@ -240,21 +240,21 @@ class PCA(_BasePCA): >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) >>> pca = PCA(n_components=2) >>> pca.fit(X) - PCA(copy=True, iterated_power=4, n_components=2, random_state=None, + PCA(copy=True, iterated_power='auto', n_components=2, random_state=None, svd_solver='auto', tol=0.0, whiten=False) >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS [ 0.99244... 0.00755...] >>> pca = PCA(n_components=2, svd_solver='full') >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - PCA(copy=True, iterated_power=4, n_components=2, random_state=None, + PCA(copy=True, iterated_power='auto', n_components=2, random_state=None, svd_solver='full', tol=0.0, whiten=False) >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS [ 0.99244... 0.00755...] 
>>> pca = PCA(n_components=1, svd_solver='arpack') >>> pca.fit(X) - PCA(copy=True, iterated_power=4, n_components=1, random_state=None, + PCA(copy=True, iterated_power='auto', n_components=1, random_state=None, svd_solver='arpack', tol=0.0, whiten=False) >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS [ 0.99244...] @@ -268,7 +268,7 @@ class PCA(_BasePCA): """ def __init__(self, n_components=None, copy=True, whiten=False, - svd_solver='auto', tol=0.0, iterated_power=4, + svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): self.n_components = n_components self.copy = copy @@ -535,8 +535,8 @@ class RandomizedPCA(BaseEstimator, TransformerMixin): fit(X).transform(X) will not yield the expected results, use fit_transform(X) instead. - iterated_power : int, optional - Number of iterations for the power method. 2 by default. + iterated_power : int, default=2 + Number of iterations for the power method. .. versionchanged:: 0.18 diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 91e4fef0ec4d8..6c98ea70efad8 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -796,7 +796,7 @@ def make_union(*transformers): >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE FeatureUnion(n_jobs=1, transformer_list=[('pca', - PCA(copy=True, iterated_power=4, + PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('truncatedsvd', diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 8fe8f87860969..be349e1bc73bf 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -25,7 +25,7 @@ from ..externals.six.moves import xrange from .sparsefuncs_fast import csr_row_norms from .validation import check_array -from ..exceptions import NonBLASDotWarning, ChangedBehaviorWarning +from ..exceptions import NonBLASDotWarning def norm(x): @@ -267,7 +267,7 @@ def randomized_range_finder(A, size, n_iter, return Q -def randomized_svd(M, 
n_components, n_oversamples=10, n_iter=None, +def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', flip_sign=True, random_state=0): """Computes a truncated randomized SVD @@ -287,11 +287,11 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter=None, number can improve speed but can negatively impact the quality of approximation of singular vectors and singular values. - n_iter: int (default is 4) + n_iter: int or 'auto' (default is 'auto') Number of power iterations. It can be used to deal with very noisy - problems. When `n_components` is small (< .1 * min(X.shape)) `n_iter` - is set to 7, unless the user specifies a higher number. This improves - precision with few components. + problems. When 'auto', it is set to 4, unless `n_components` is small + (< .1 * min(X.shape)), in which case `n_iter` is set to 7. + This improves precision with few components. .. versionchanged:: 0.18 @@ -349,16 +349,10 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter=None, n_random = n_components + n_oversamples n_samples, n_features = M.shape - if n_iter is None: + if n_iter == 'auto': # Checks if the number of iterations is explicitely specified # Adjust n_iter. 7 was found a good compromise for PCA. See #5299 - if n_components < .1 * min(M.shape) and n_iter < 7: - n_iter = 7 - warnings.warn("The default number of power iterations is increased from 4" - "to 7 in version 0.18 to achieve higher precision.", - ChangedBehaviorWarning) - else: - n_iter = 4 + n_iter = 7 if n_components < .1 * min(M.shape) else 4 if transpose == 'auto': transpose = n_samples < n_features