[MRG] Creation of a v0 ARPACK initialization function by summer-bebop · Pull Request #11524 · scikit-learn/scikit-learn


Closed · wants to merge 16 commits
10 changes: 7 additions & 3 deletions benchmarks/bench_plot_randomized_svd.py
@@ -74,7 +74,7 @@
 from collections import defaultdict
 import os.path

-from sklearn.utils import gen_batches
+from sklearn.utils import gen_batches, _init_arpack_v0
 from sklearn.utils.validation import check_random_state
 from sklearn.utils.extmath import randomized_svd
 from sklearn.datasets.samples_generator import (make_low_rank_matrix,
@@ -257,7 +257,7 @@ def svd_timing(X, n_comps, n_iter, n_oversamples,
     return U, mu, V, call_time


-def norm_diff(A, norm=2, msg=True):
+def norm_diff(A, norm=2, msg=True, random_state=None):
     """
     Compute the norm diff with the original matrix, when randomized
     SVD is called with *params.
@@ -269,7 +269,11 @@ def norm_diff(A, norm=2, msg=True):
         print("... computing %s norm ..." % norm)
     if norm == 2:
         # s = sp.linalg.norm(A, ord=2)  # slow
-        value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False)
+        v0 = _init_arpack_v0(min(A.shape), random_state)
+        value = sp.sparse.linalg.svds(A,
+                                      k=1,
+                                      return_singular_vectors=False,
+                                      v0=v0)
     else:
         if sp.sparse.issparse(A):
             value = sp.sparse.linalg.norm(A, ord=norm)
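For context, the pattern the benchmark now uses: the spectral (2-)norm of a matrix equals its largest singular value, so `svds` with `k=1` and an explicit starting vector suffices. A minimal self-contained sketch, with the helper inlined as a stand-in and all data and seeds purely illustrative:

```python
import numpy as np
import scipy.sparse.linalg


def _init_arpack_v0(size, random_state):
    # Inlined stand-in for the helper added by this PR: uniform on [-1, 1].
    rng = np.random.RandomState(random_state)
    return rng.uniform(-1, 1, size)


A = np.random.RandomState(0).rand(50, 30)
v0 = _init_arpack_v0(min(A.shape), random_state=42)
# The largest singular value equals the spectral norm of A.
s_max = scipy.sparse.linalg.svds(A, k=1, return_singular_vectors=False, v0=v0)
print(s_max[0], np.linalg.norm(A, ord=2))  # the two values agree closely
```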
8 changes: 8 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -116,6 +116,14 @@ Support for Python 3.4 and below has been officially dropped.
   and :class:`tree.ExtraTreeRegressor`.
   :issue:`12300` by :user:`Adrin Jalali <adrinjalali>`.

+:mod:`sklearn.utils`
+....................
+- New :func:`utils._init_arpack_v0`, meant to be used whenever ``eigs``,

Member: This creates a private method. We only need to document user-facing changes in what's new.

Contributor Author: All right then, I'll remove that.

+  ``eigsh`` or ``svds`` from ``scipy.sparse.linalg`` is called. It initializes
+  the ``v0`` parameter with values sampled from the uniform distribution on
+  [-1, 1] to avoid the convergence issues that other initializations can
+  cause. :issue:`11524` by :user:`Ivan Panico <FollowKenny>`

 Multiple modules
 ................

15 changes: 7 additions & 8 deletions sklearn/cluster/bicluster.py
@@ -15,7 +15,7 @@
 from . import KMeans, MiniBatchKMeans
 from ..base import BaseEstimator, BiclusterMixin
 from ..externals import six
-from ..utils import check_random_state
+from ..utils.arpack import _init_arpack_v0

 from ..utils.extmath import (make_nonnegative, randomized_svd,
                              safe_sparse_dot)
@@ -138,22 +138,21 @@ def _svd(self, array, n_components, n_discard):
                                       **kwargs)

         elif self.svd_method == 'arpack':
-            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
+            v0 = _init_arpack_v0(min(array.shape), self.random_state)
+            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs, v0=v0)
             if np.any(np.isnan(vt)):
                 # some eigenvalues of A * A.T are negative, causing
                 # sqrt() to be np.nan. This causes some vectors in vt
                 # to be np.nan.
                 A = safe_sparse_dot(array.T, array)
-                random_state = check_random_state(self.random_state)
-                # initialize with [-1,1] as in ARPACK
-                v0 = random_state.uniform(-1, 1, A.shape[0])
+                # We have to reinitialize v0 differently because the shape
+                # here can be different from the previous init
+                v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
                 vt = v.T
             if np.any(np.isnan(u)):
                 A = safe_sparse_dot(array, array.T)
-                random_state = check_random_state(self.random_state)
-                # initialize with [-1,1] as in ARPACK
-                v0 = random_state.uniform(-1, 1, A.shape[0])
+                v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)

         assert_all_finite(u)
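The two different sizes passed to `_init_arpack_v0` above are easy to miss: `svds` wants a starting vector of length `min(array.shape)`, while the `eigsh` fallback runs on the square Gram matrix whose side is `array.shape[1]`. A hedged sketch with illustrative shapes (the NaN branch is shown for structure; random dense data will not normally trigger it):

```python
import numpy as np
from scipy.sparse.linalg import svds, eigsh

rng = np.random.RandomState(0)
array = rng.rand(25, 40)                    # min(array.shape) == 25

v0 = rng.uniform(-1, 1, min(array.shape))   # length 25 for svds
u, _, vt = svds(array, k=3, v0=v0)

if np.any(np.isnan(vt)):                    # fallback path from the diff above
    A = array.T @ array                     # shape (40, 40)
    v0 = rng.uniform(-1, 1, A.shape[0])     # re-drawn with the new size, 40
    _, v = eigsh(A, v0=v0)
    vt = v.T
```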
19 changes: 16 additions & 3 deletions sklearn/cross_decomposition/pls_.py
@@ -14,6 +14,7 @@

 from ..base import BaseEstimator, RegressorMixin, TransformerMixin
 from ..utils import check_array, check_consistent_length
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import svd_flip
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 from ..exceptions import ConvergenceWarning
@@ -761,6 +762,15 @@ class PLSSVD(BaseEstimator, TransformerMixin):
     copy : boolean, default True
         Whether to copy X and Y, or perform in-place computations.

+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator to use when shuffling

Member: Not used when shuffling the data. Please fix.

+        the data. If int, random_state is the seed used by the random number
+        generator; If RandomState instance, random_state is the random number
+        generator; If None, the random number generator is the RandomState
+        instance used by `np.random`.
+
+        .. versionadded:: 0.21

     Attributes
     ----------
     x_weights_ : array, [p, n_components]
@@ -789,7 +799,7 @@ class PLSSVD(BaseEstimator, TransformerMixin):
     ... [11.9, 12.3]])
     >>> plsca = PLSSVD(n_components=2)
     >>> plsca.fit(X, Y)
-    PLSSVD(copy=True, n_components=2, scale=True)
+    PLSSVD(copy=True, n_components=2, random_state=None, scale=True)
     >>> X_c, Y_c = plsca.transform(X, Y)
     >>> X_c.shape, Y_c.shape
     ((4, 2), (4, 2))
@@ -800,10 +810,12 @@
     CCA
     """

-    def __init__(self, n_components=2, scale=True, copy=True):
+    def __init__(self, n_components=2, scale=True,
+                 copy=True, random_state=None):
         self.n_components = n_components
         self.scale = scale
         self.copy = copy
+        self.random_state = random_state

     def fit(self, X, Y):
         """Fit model to data.
@@ -844,7 +856,8 @@ def fit(self, X, Y):
         if self.n_components >= np.min(C.shape):
             U, s, V = svd(C, full_matrices=False)
         else:
-            U, s, V = svds(C, k=self.n_components)
+            v0 = _init_arpack_v0(min(C.shape), self.random_state)

Member: This may change the result (as per my comment in truncated svd below).

+            U, s, V = svds(C, k=self.n_components, v0=v0)
         # Deterministic output
         U, V = svd_flip(U, V)
         V = V.T
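On this PR's branch, the new parameter makes the ARPACK path of `PLSSVD` seedable. A usage sketch, assuming this branch (the `random_state` argument does not exist on released versions; the arpack path is only taken when `n_components < min(C.shape)`, hence `n_components=1` here; data reused from the docstring example):

```python
import numpy as np
from sklearn.cross_decomposition import PLSSVD

X = np.array([[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]])
Y = np.array([[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]])

# C = X_centered.T @ Y_centered is 3x2, so n_components=1 < min(C.shape)=2
# routes fit() through svds with the seeded v0.
plsca = PLSSVD(n_components=1, random_state=0)
X_c, Y_c = plsca.fit(X, Y).transform(X, Y)
print(X_c.shape, Y_c.shape)  # (4, 1) (4, 1)
```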
6 changes: 2 additions & 4 deletions sklearn/decomposition/kernel_pca.py
@@ -7,7 +7,7 @@
 from scipy import linalg
 from scipy.sparse.linalg import eigsh

-from ..utils import check_random_state
+from ..utils.arpack import _init_arpack_v0
 from ..utils.validation import check_is_fitted, check_array
 from ..exceptions import NotFittedError
 from ..base import BaseEstimator, TransformerMixin
@@ -201,9 +201,7 @@ def _fit_transform(self, K):
             self.lambdas_, self.alphas_ = linalg.eigh(
                 K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
         elif eigen_solver == 'arpack':
-            random_state = check_random_state(self.random_state)
-            # initialize with [-1,1] as in ARPACK
-            v0 = random_state.uniform(-1, 1, K.shape[0])
+            v0 = _init_arpack_v0(K.shape[0], self.random_state)
             self.lambdas_, self.alphas_ = eigsh(K, n_components,
                                                 which="LA",
                                                 tol=self.tol,
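Standalone illustration of the arpack branch above: `eigsh` with `which="LA"` returns the `n_components` largest (algebraic) eigenvalues of the kernel matrix, and a seeded uniform `v0` pins ARPACK's starting point. The toy kernel below is illustrative, not KernelPCA's centered kernel:

```python
import numpy as np
from scipy.sparse.linalg import eigsh

rng = np.random.RandomState(0)
X = rng.rand(30, 5)
K = X @ X.T                                    # toy (uncentered) linear kernel

v0 = np.random.RandomState(42).uniform(-1, 1, K.shape[0])
lambdas, alphas = eigsh(K, k=4, which="LA", v0=v0)
print(lambdas)                                 # 4 largest eigenvalues, ascending
```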
4 changes: 2 additions & 2 deletions sklearn/decomposition/pca.py
@@ -24,6 +24,7 @@
 from .base import _BasePCA
 from ..utils import check_random_state
 from ..utils import check_array
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
@@ -508,8 +509,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         X -= self.mean_

         if svd_solver == 'arpack':
-            # random init solution, as ARPACK does it internally
-            v0 = random_state.uniform(-1, 1, size=min(X.shape))
+            v0 = _init_arpack_v0(min(X.shape), self.random_state)
             U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
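The reversal mentioned in the comment above, as a sketch: `svds` returns singular values in ascending order, whereas `scipy.linalg.svd` (and `randomized_svd`) return them descending, so the outputs are flipped. Shapes and seeds are illustrative:

```python
import numpy as np
from scipy.linalg import svd
from scipy.sparse.linalg import svds

X = np.random.RandomState(0).rand(20, 8)
v0 = np.random.RandomState(42).uniform(-1, 1, min(X.shape))

U, S, V = svds(X, k=3, v0=v0)
S, U, V = S[::-1], U[:, ::-1], V[::-1]        # flip to descending order

S_ref = svd(X, full_matrices=False)[1][:3]    # reference, already descending
print(np.allclose(S, S_ref))                  # True
```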
4 changes: 3 additions & 1 deletion sklearn/decomposition/truncated_svd.py
@@ -12,6 +12,7 @@

 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array, check_random_state
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
 from ..utils.sparsefuncs import mean_variance_axis

@@ -160,7 +161,8 @@ def fit_transform(self, X, y=None):
         random_state = check_random_state(self.random_state)

         if self.algorithm == "arpack":
-            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
+            v0 = _init_arpack_v0(min(X.shape), self.random_state)

Member: This is going to change the results of the function.

  1. Is it beneficial and worth breaking backwards compatibility?
  2. If so, it needs to be documented in the change log.

Contributor Author (@summer-bebop, Nov 15, 2018):

  1. Without this initialization, v0 is initialized with randn. In ARPACK it is initialized with a uniform distribution on [-1, 1]. The goal is also to generalize the initialization of the calls to the ARPACK library and provide a random state.
  2. What should be documented? For each impacted algorithm, something of this flavour: "Initialization of v0 is now uniform on [-1, 1]"?

+            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
             Sigma = Sigma[::-1]
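To make the reproducibility point from the review concrete: without `v0`, ARPACK draws its starting vector internally, so repeated runs need not match bit for bit; an explicit seeded `v0` pins the result. A small sketch (names and seeds illustrative):

```python
import numpy as np
from scipy.sparse.linalg import svds

X = np.random.RandomState(0).rand(50, 20)


def seeded_singular_values(seed):
    v0 = np.random.RandomState(seed).uniform(-1, 1, min(X.shape))
    return svds(X, k=5, v0=v0)[1]


# Same seed, same starting vector, identical output.
print(np.array_equal(seeded_singular_values(42), seeded_singular_values(42)))
```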
5 changes: 2 additions & 3 deletions sklearn/manifold/locally_linear.py
@@ -11,6 +11,7 @@

 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_random_state, check_array
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 from ..utils.validation import FLOAT_DTYPES
@@ -157,9 +158,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
         eigen_solver = 'dense'

     if eigen_solver == 'arpack':
-        random_state = check_random_state(random_state)
-        # initialize with [-1,1] as in ARPACK
-        v0 = random_state.uniform(-1, 1, M.shape[0])
+        v0 = _init_arpack_v0(M.shape[0], random_state)
         try:
             eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                 tol=tol, maxiter=max_iter,
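For orientation, the arpack branch of `null_space` in standalone form: shift-invert around `sigma=0.0` targets the eigenvalues of `M` closest to zero, and the bottom `k_skip` eigenvectors are then discarded, as in the diff above. The matrix here is an illustrative stand-in for the LLE weight matrix, not the real thing:

```python
import numpy as np
from scipy.sparse.linalg import eigsh

rng = np.random.RandomState(0)
B = rng.rand(30, 30)
M = B.T @ B                                  # symmetric positive semi-definite

k, k_skip = 2, 1
v0 = np.random.RandomState(42).uniform(-1, 1, M.shape[0])
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0, v0=v0)
embedding = eigen_vectors[:, k_skip:]        # drop the bottom k_skip vectors
print(eigen_values)                          # the 3 eigenvalues closest to 0
```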
6 changes: 4 additions & 2 deletions sklearn/manifold/spectral_embedding_.py
@@ -17,7 +17,9 @@

 from ..base import BaseEstimator
 from ..externals import six
-from ..utils import check_random_state, check_array, check_symmetric
+from ..utils import (check_random_state, check_array,
+                     check_symmetric)
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import _deterministic_vector_sign_flip
 from ..metrics.pairwise import rbf_kernel
 from ..neighbors import kneighbors_graph
@@ -267,7 +269,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
             # We are computing the opposite of the laplacian inplace so as
             # to spare a memory allocation of a possibly very large array
             laplacian *= -1
-            v0 = random_state.uniform(-1, 1, laplacian.shape[0])
+            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
             lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                            sigma=1.0, which='LM',
                                            tol=eigen_tol, v0=v0)
32 changes: 32 additions & 0 deletions sklearn/utils/arpack.py
@@ -0,0 +1,32 @@
+from .validation import check_random_state
+
+
+def _init_arpack_v0(size, random_state):
+    """Initialize the starting vector for iteration in ARPACK functions.
+
+    Initialize an ndarray with values sampled from the uniform distribution
+    on [-1, 1]. This initialization model has been chosen to be consistent
+    with the ARPACK one as another initialization can lead to convergence
+    issues.
+
+    Parameters
+    ----------
+    size : int
+        The size of the eigenvalue vector to be initialized.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator used to generate the
+        starting vector. If int, random_state is the seed used by the random
+        number generator; If RandomState instance, random_state is the random
+        number generator; If None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    Returns
+    -------
+    v0 : array of shape (size,)
+        The initialized vector.
+    """
+
+    random_state = check_random_state(random_state)
+    v0 = random_state.uniform(-1, 1, size)
+    return v0
15 changes: 15 additions & 0 deletions sklearn/utils/tests/test_arpack.py
@@ -0,0 +1,15 @@
+import numpy as np
+
+from sklearn.utils.arpack import _init_arpack_v0
+
+
+def test_init_arpack_v0():
+    v0s = []
+    for i in range(100):
+        v0s.append(_init_arpack_v0(1000, i))
+        if i > 0:
+            assert not any(np.equal(v0s[i], v0s[i - 1]))
+
+    v0 = np.concatenate(v0s)
+    assert np.allclose(np.mean(v0), 0, atol=1e-2)
+    assert np.allclose(np.std(v0), 1 / np.sqrt(3), atol=1e-3)
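The constants asserted in the test follow from the moments of the uniform distribution: on [a, b] the variance is (b - a)^2 / 12, so on [-1, 1] it is 4/12 = 1/3 and the standard deviation is sqrt(1/3) ≈ 0.5774, while the mean is 0. A quick check (sample size illustrative):

```python
import numpy as np

# Empirical moments of U(-1, 1) against the closed-form values.
samples = np.random.RandomState(0).uniform(-1, 1, 100000)
print(np.mean(samples))                  # close to 0
print(np.std(samples), np.sqrt(1 / 3))   # both close to 0.5774
```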