[MRG] Creation of a v0 ARPACK initialization function by summer-bebop · Pull Request #11524 · scikit-learn/scikit-learn


Closed · wants to merge 16 commits
10 changes: 7 additions & 3 deletions benchmarks/bench_plot_randomized_svd.py
@@ -74,7 +74,7 @@
 from collections import defaultdict
 import os.path

-from sklearn.utils import gen_batches
+from sklearn.utils import gen_batches, _init_arpack_v0
 from sklearn.utils.validation import check_random_state
 from sklearn.utils.extmath import randomized_svd
 from sklearn.datasets.samples_generator import (make_low_rank_matrix,
@@ -257,7 +257,7 @@ def svd_timing(X, n_comps, n_iter, n_oversamples,
     return U, mu, V, call_time


-def norm_diff(A, norm=2, msg=True):
+def norm_diff(A, norm=2, msg=True, random_state=None):
     """
     Compute the norm diff with the original matrix, when randomized
     SVD is called with *params.
@@ -269,7 +269,11 @@ def norm_diff(A, norm=2, msg=True):
         print("... computing %s norm ..." % norm)
     if norm == 2:
         # s = sp.linalg.norm(A, ord=2)  # slow
-        value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False)
+        v0 = _init_arpack_v0(min(A.shape), random_state)
+        value = sp.sparse.linalg.svds(A,
+                                      k=1,
+                                      return_singular_vectors=False,
+                                      v0=v0)
     else:
         if sp.sparse.issparse(A):
             value = sp.sparse.linalg.norm(A, ord=norm)
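For context, the pattern the benchmark now uses: the spectral (2-)norm of a matrix equals its largest singular value, so `svds` with `k=1` and an explicit starting vector suffices. A minimal self-contained sketch, with the helper inlined as a stand-in and all data and seeds purely illustrative:

```python
import numpy as np
import scipy.sparse.linalg


def _init_arpack_v0(size, random_state):
    # Inlined stand-in for the helper added by this PR: uniform on [-1, 1].
    rng = np.random.RandomState(random_state)
    return rng.uniform(-1, 1, size)


A = np.random.RandomState(0).rand(50, 30)
v0 = _init_arpack_v0(min(A.shape), random_state=42)
# The largest singular value equals the spectral norm of A.
s_max = scipy.sparse.linalg.svds(A, k=1, return_singular_vectors=False, v0=v0)
print(s_max[0], np.linalg.norm(A, ord=2))  # the two values agree closely
```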
8 changes: 8 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -116,6 +116,14 @@ Support for Python 3.4 and below has been officially dropped.
   and :class:`tree.ExtraTreeRegressor`.
   :issue:`12300` by :user:`Adrin Jalali <adrinjalali>`.

+:mod:`sklearn.utils`
+....................
+- New :func:`utils._init_arpack_v0`, meant to be used whenever ``eigs``,

Member: This creates a private method. We only need to document user-facing changes in what's new.

Contributor Author: All right then, I'll remove that.

+  ``eigsh`` or ``svds`` from ``scipy.sparse.linalg`` is called. It initializes
+  the ``v0`` parameter with values sampled from the uniform distribution on
+  [-1, 1] to avoid the convergence issues that other initializations can
+  cause. :issue:`11524` by :user:`Ivan Panico <FollowKenny>`

 Multiple modules
 ................

15 changes: 7 additions & 8 deletions sklearn/cluster/bicluster.py
@@ -15,7 +15,7 @@
 from . import KMeans, MiniBatchKMeans
 from ..base import BaseEstimator, BiclusterMixin
 from ..externals import six
-from ..utils import check_random_state
+from ..utils.arpack import _init_arpack_v0

 from ..utils.extmath import (make_nonnegative, randomized_svd,
                              safe_sparse_dot)
@@ -138,22 +138,21 @@ def _svd(self, array, n_components, n_discard):
                                       **kwargs)

         elif self.svd_method == 'arpack':
-            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
+            v0 = _init_arpack_v0(min(array.shape), self.random_state)
+            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs, v0=v0)
             if np.any(np.isnan(vt)):
                 # some eigenvalues of A * A.T are negative, causing
                 # sqrt() to be np.nan. This causes some vectors in vt
                 # to be np.nan.
                 A = safe_sparse_dot(array.T, array)
-                random_state = check_random_state(self.random_state)
-                # initialize with [-1,1] as in ARPACK
-                v0 = random_state.uniform(-1, 1, A.shape[0])
+                # We have to reinitialize v0 differently because the shape
+                # here can be different from the previous init
+                v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
                 vt = v.T
             if np.any(np.isnan(u)):
                 A = safe_sparse_dot(array, array.T)
-                random_state = check_random_state(self.random_state)
-                # initialize with [-1,1] as in ARPACK
-                v0 = random_state.uniform(-1, 1, A.shape[0])
+                v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)

         assert_all_finite(u)
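The two different sizes passed to `_init_arpack_v0` above are easy to miss: `svds` wants a starting vector of length `min(array.shape)`, while the `eigsh` fallback runs on the square Gram matrix whose side is `array.shape[1]`. A hedged sketch with illustrative shapes (the NaN branch is shown for structure; random dense data will not normally trigger it):

```python
import numpy as np
from scipy.sparse.linalg import svds, eigsh

rng = np.random.RandomState(0)
array = rng.rand(25, 40)                    # min(array.shape) == 25

v0 = rng.uniform(-1, 1, min(array.shape))   # length 25 for svds
u, _, vt = svds(array, k=3, v0=v0)

if np.any(np.isnan(vt)):                    # fallback path from the diff above
    A = array.T @ array                     # shape (40, 40)
    v0 = rng.uniform(-1, 1, A.shape[0])     # re-drawn with the new size, 40
    _, v = eigsh(A, v0=v0)
    vt = v.T
```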
19 changes: 16 additions & 3 deletions sklearn/cross_decomposition/pls_.py
@@ -14,6 +14,7 @@

 from ..base import BaseEstimator, RegressorMixin, TransformerMixin
 from ..utils import check_array, check_consistent_length
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import svd_flip
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 from ..exceptions import ConvergenceWarning
@@ -761,6 +762,15 @@ class PLSSVD(BaseEstimator, TransformerMixin):
     copy : boolean, default True
         Whether to copy X and Y, or perform in-place computations.

+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator to use when shuffling

Member: Not used when shuffling the data. Please fix.

+        the data. If int, random_state is the seed used by the random number
+        generator; If RandomState instance, random_state is the random number
+        generator; If None, the random number generator is the RandomState
+        instance used by `np.random`.
+
+        .. versionadded:: 0.21

     Attributes
     ----------
     x_weights_ : array, [p, n_components]
@@ -789,7 +799,7 @@ class PLSSVD(BaseEstimator, TransformerMixin):
     ... [11.9, 12.3]])
     >>> plsca = PLSSVD(n_components=2)
     >>> plsca.fit(X, Y)
-    PLSSVD(copy=True, n_components=2, scale=True)
+    PLSSVD(copy=True, n_components=2, random_state=None, scale=True)
     >>> X_c, Y_c = plsca.transform(X, Y)
     >>> X_c.shape, Y_c.shape
     ((4, 2), (4, 2))
@@ -800,10 +810,12 @@
     CCA
     """

-    def __init__(self, n_components=2, scale=True, copy=True):
+    def __init__(self, n_components=2, scale=True,
+                 copy=True, random_state=None):
         self.n_components = n_components
         self.scale = scale
         self.copy = copy
+        self.random_state = random_state

     def fit(self, X, Y):
         """Fit model to data.
@@ -844,7 +856,8 @@ def fit(self, X, Y):
         if self.n_components >= np.min(C.shape):
             U, s, V = svd(C, full_matrices=False)
         else:
-            U, s, V = svds(C, k=self.n_components)
+            v0 = _init_arpack_v0(min(C.shape), self.random_state)

Member: This may change the result (as per my comment in truncated svd below).

+            U, s, V = svds(C, k=self.n_components, v0=v0)
         # Deterministic output
         U, V = svd_flip(U, V)
         V = V.T
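On this PR's branch, the new parameter makes the ARPACK path of `PLSSVD` seedable. A usage sketch, assuming this branch (the `random_state` argument does not exist on released versions; the arpack path is only taken when `n_components < min(C.shape)`, hence `n_components=1` here; data reused from the docstring example):

```python
import numpy as np
from sklearn.cross_decomposition import PLSSVD

X = np.array([[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]])
Y = np.array([[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]])

# C = X_centered.T @ Y_centered is 3x2, so n_components=1 < min(C.shape)=2
# routes fit() through svds with the seeded v0.
plsca = PLSSVD(n_components=1, random_state=0)
X_c, Y_c = plsca.fit(X, Y).transform(X, Y)
print(X_c.shape, Y_c.shape)  # (4, 1) (4, 1)
```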
6 changes: 2 additions & 4 deletions sklearn/decomposition/kernel_pca.py
@@ -7,7 +7,7 @@
 from scipy import linalg
 from scipy.sparse.linalg import eigsh

-from ..utils import check_random_state
+from ..utils.arpack import _init_arpack_v0
 from ..utils.validation import check_is_fitted, check_array
 from ..exceptions import NotFittedError
 from ..base import BaseEstimator, TransformerMixin
@@ -201,9 +201,7 @@ def _fit_transform(self, K):
             self.lambdas_, self.alphas_ = linalg.eigh(
                 K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
         elif eigen_solver == 'arpack':
-            random_state = check_random_state(self.random_state)
-            # initialize with [-1,1] as in ARPACK
-            v0 = random_state.uniform(-1, 1, K.shape[0])
+            v0 = _init_arpack_v0(K.shape[0], self.random_state)
             self.lambdas_, self.alphas_ = eigsh(K, n_components,
                                                 which="LA",
                                                 tol=self.tol,
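Standalone illustration of the arpack branch above: `eigsh` with `which="LA"` returns the `n_components` largest (algebraic) eigenvalues of the kernel matrix, and a seeded uniform `v0` pins ARPACK's starting point. The toy kernel below is illustrative, not KernelPCA's centered kernel:

```python
import numpy as np
from scipy.sparse.linalg import eigsh

rng = np.random.RandomState(0)
X = rng.rand(30, 5)
K = X @ X.T                                    # toy (uncentered) linear kernel

v0 = np.random.RandomState(42).uniform(-1, 1, K.shape[0])
lambdas, alphas = eigsh(K, k=4, which="LA", v0=v0)
print(lambdas)                                 # 4 largest eigenvalues, ascending
```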
4 changes: 2 additions & 2 deletions sklearn/decomposition/pca.py
@@ -24,6 +24,7 @@
 from .base import _BasePCA
 from ..utils import check_random_state
 from ..utils import check_array
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
@@ -508,8 +509,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         X -= self.mean_

         if svd_solver == 'arpack':
-            # random init solution, as ARPACK does it internally
-            v0 = random_state.uniform(-1, 1, size=min(X.shape))
+            v0 = _init_arpack_v0(min(X.shape), self.random_state)
             U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
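The reversal mentioned in the comment above, as a sketch: `svds` returns singular values in ascending order, whereas `scipy.linalg.svd` (and `randomized_svd`) return them descending, so the outputs are flipped. Shapes and seeds are illustrative:

```python
import numpy as np
from scipy.linalg import svd
from scipy.sparse.linalg import svds

X = np.random.RandomState(0).rand(20, 8)
v0 = np.random.RandomState(42).uniform(-1, 1, min(X.shape))

U, S, V = svds(X, k=3, v0=v0)
S, U, V = S[::-1], U[:, ::-1], V[::-1]        # flip to descending order

S_ref = svd(X, full_matrices=False)[1][:3]    # reference, already descending
print(np.allclose(S, S_ref))                  # True
```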
4 changes: 3 additions & 1 deletion sklearn/decomposition/truncated_svd.py
@@ -12,6 +12,7 @@

 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array, check_random_state
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
 from ..utils.sparsefuncs import mean_variance_axis

@@ -160,7 +161,8 @@ def fit_transform(self, X, y=None):
         random_state = check_random_state(self.random_state)

         if self.algorithm == "arpack":
-            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
+            v0 = _init_arpack_v0(min(X.shape), self.random_state)

Member: This is going to change the results of the function.

  1. Is it beneficial and worth breaking backwards compatibility?
  2. If so, it needs to be documented in the change log.

Contributor Author (@summer-bebop, Nov 15, 2018):

  1. Without this initialization, v0 is initialized with randn. In ARPACK it is initialized with a uniform distribution on [-1, 1]. The goal is also to generalize the initialization of the calls to the ARPACK library and provide a random state.
  2. What should be documented? For each impacted algorithm, something of this flavour: "Initialization of v0 is now uniform on [-1, 1]"?

+            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
             Sigma = Sigma[::-1]
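To make the reproducibility point from the review concrete: without `v0`, ARPACK draws its starting vector internally, so repeated runs need not match bit for bit; an explicit seeded `v0` pins the result. A small sketch (names and seeds illustrative):

```python
import numpy as np
from scipy.sparse.linalg import svds

X = np.random.RandomState(0).rand(50, 20)


def seeded_singular_values(seed):
    v0 = np.random.RandomState(seed).uniform(-1, 1, min(X.shape))
    return svds(X, k=5, v0=v0)[1]


# Same seed, same starting vector, identical output.
print(np.array_equal(seeded_singular_values(42), seeded_singular_values(42)))
```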
5 changes: 2 additions & 3 deletions sklearn/manifold/locally_linear.py
@@ -11,6 +11,7 @@

 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_random_state, check_array
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 from ..utils.validation import FLOAT_DTYPES
@@ -157,9 +158,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
         eigen_solver = 'dense'

     if eigen_solver == 'arpack':
-        random_state = check_random_state(random_state)
-        # initialize with [-1,1] as in ARPACK
-        v0 = random_state.uniform(-1, 1, M.shape[0])
+        v0 = _init_arpack_v0(M.shape[0], random_state)
         try:
             eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                 tol=tol, maxiter=max_iter,
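For orientation, the arpack branch of `null_space` in standalone form: shift-invert around `sigma=0.0` targets the eigenvalues of `M` closest to zero, and the bottom `k_skip` eigenvectors are then discarded, as in the diff above. The matrix here is an illustrative stand-in for the LLE weight matrix, not the real thing:

```python
import numpy as np
from scipy.sparse.linalg import eigsh

rng = np.random.RandomState(0)
B = rng.rand(30, 30)
M = B.T @ B                                  # symmetric positive semi-definite

k, k_skip = 2, 1
v0 = np.random.RandomState(42).uniform(-1, 1, M.shape[0])
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0, v0=v0)
embedding = eigen_vectors[:, k_skip:]        # drop the bottom k_skip vectors
print(eigen_values)                          # the 3 eigenvalues closest to 0
```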
6 changes: 4 additions & 2 deletions sklearn/manifold/spectral_embedding_.py
@@ -17,7 +17,9 @@

 from ..base import BaseEstimator
 from ..externals import six
-from ..utils import check_random_state, check_array, check_symmetric
+from ..utils import (check_random_state, check_array,
+                     check_symmetric)
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import _deterministic_vector_sign_flip
 from ..metrics.pairwise import rbf_kernel
 from ..neighbors import kneighbors_graph
@@ -267,7 +269,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
             # We are computing the opposite of the laplacian inplace so as
             # to spare a memory allocation of a possibly very large array
             laplacian *= -1
-            v0 = random_state.uniform(-1, 1, laplacian.shape[0])
+            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
             lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                            sigma=1.0, which='LM',
                                            tol=eigen_tol, v0=v0)
32 changes: 32 additions & 0 deletions sklearn/utils/arpack.py
@@ -0,0 +1,32 @@
+from .validation import check_random_state
+
+
+def _init_arpack_v0(size, random_state):
+    """Initialize the starting vector for iteration in ARPACK functions.
+
+    Initialize an ndarray with values sampled from the uniform distribution
+    on [-1, 1]. This initialization model has been chosen to be consistent
+    with the ARPACK one as another initialization can lead to convergence
+    issues.
+
+    Parameters
+    ----------
+    size : int
+        The size of the eigenvalue vector to be initialized.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator used to generate the
+        starting vector. If int, random_state is the seed used by the random
+        number generator; If RandomState instance, random_state is the random
+        number generator; If None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    Returns
+    -------
+    v0 : array of shape (size,)
+        The initialized vector.
+    """
+
+    random_state = check_random_state(random_state)
+    v0 = random_state.uniform(-1, 1, size)
+    return v0
15 changes: 15 additions & 0 deletions sklearn/utils/tests/test_arpack.py
@@ -0,0 +1,15 @@
+import numpy as np
+
+from sklearn.utils.arpack import _init_arpack_v0
+
+
+def test_init_arpack_v0():
+    v0s = []
+    for i in range(100):
+        v0s.append(_init_arpack_v0(1000, i))
+        if i > 0:
+            assert not any(np.equal(v0s[i], v0s[i - 1]))
+
+    v0 = np.concatenate(v0s)
+    assert np.allclose(np.mean(v0), 0, atol=1e-2)
+    assert np.allclose(np.std(v0), 1 / np.sqrt(3), atol=1e-3)
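The constants asserted in the test follow from the moments of the uniform distribution: on [a, b] the variance is (b - a)^2 / 12, so on [-1, 1] it is 4/12 = 1/3 and the standard deviation is sqrt(1/3) ≈ 0.5774, while the mean is 0. A quick check (sample size illustrative):

```python
import numpy as np

# Empirical moments of U(-1, 1) against the closed-form values.
samples = np.random.RandomState(0).uniform(-1, 1, 100000)
print(np.mean(samples))                  # close to 0
print(np.std(samples), np.sqrt(1 / 3))   # both close to 0.5774
```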