From b3ae54652a98f274edc2194df36f4c33d3e06a63 Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Sat, 14 Jul 2018 22:05:13 +0200 Subject: [PATCH 01/16] Arpack initialization modif --- sklearn/cluster/bicluster.py | 10 +++----- sklearn/decomposition/kernel_pca.py | 6 ++--- sklearn/manifold/locally_linear.py | 6 ++--- sklearn/manifold/spectral_embedding_.py | 5 ++-- sklearn/utils/__init__.py | 31 +++++++++++++++++++++++++ 5 files changed, 41 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index 18260a0f3b1c2..34b028a856bc3 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -15,7 +15,7 @@ from . import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin from ..externals import six -from ..utils import check_random_state +from ..utils import init_arpack from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) @@ -144,16 +144,12 @@ def _svd(self, array, n_components, n_discard): # sqrt() to be np.nan. This causes some vectors in vt # to be np.nan. A = safe_sparse_dot(array.T, array) - random_state = check_random_state(self.random_state) - # initialize with [-1,1] as in ARPACK - v0 = random_state.uniform(-1, 1, A.shape[0]) + v0 = init_arpack(A.shape[0], self.random_state) _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0) vt = v.T if np.any(np.isnan(u)): A = safe_sparse_dot(array, array.T) - random_state = check_random_state(self.random_state) - # initialize with [-1,1] as in ARPACK - v0 = random_state.uniform(-1, 1, A.shape[0]) + v0 = init_arpack(A.shape[0], self.random_state) _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0) assert_all_finite(u) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 133717e13f677..e6f743e7f2ac5 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -7,7 +7,7 @@ from scipy import linalg from scipy.sparse.linalg import eigsh -from ..utils import check_random_state +from ..utils import check_random_state, init_arpack_v0 from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin @@ -201,9 +201,7 @@ def _fit_transform(self, K): self.lambdas_, self.alphas_ = linalg.eigh( K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1)) elif eigen_solver == 'arpack': - random_state = check_random_state(self.random_state) - # initialize with [-1,1] as in ARPACK - v0 = random_state.uniform(-1, 1, K.shape[0]) + v0 = init_arpack_v0(K.shape[0], self.random_state) self.lambdas_, self.alphas_ = eigsh(K, n_components, which="LA", tol=self.tol, diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index c3afdac2cba08..a3e49e89d1ced 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -10,7 +10,7 @@ from scipy.sparse.linalg import eigsh from ..base import BaseEstimator, TransformerMixin -from ..utils import check_random_state, check_array +from ..utils import check_random_state, check_array, init_arpack_v0 from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES @@ -157,9 +157,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, eigen_solver = 'dense' if eigen_solver == 'arpack': - random_state = check_random_state(random_state) - # initialize with [-1,1] as in ARPACK - v0 = random_state.uniform(-1, 1, M.shape[0]) + v0 = 
init_arpack_v0(M.shape[0], random_state)
         try:
             eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                 tol=tol, maxiter=max_iter,
diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py
index d0c226b51ca5e..ca10e4af71de0 100644
--- a/sklearn/manifold/spectral_embedding_.py
+++ b/sklearn/manifold/spectral_embedding_.py
@@ -17,7 +17,8 @@
 from ..base import BaseEstimator
 from ..externals import six
-from ..utils import check_random_state, check_array, check_symmetric
+from ..utils import (check_random_state, check_array,
+                     check_symmetric, init_arpack_v0)
 from ..utils.extmath import _deterministic_vector_sign_flip
 from ..metrics.pairwise import rbf_kernel
 from ..neighbors import kneighbors_graph
@@ -267,7 +268,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
             # We are computing the opposite of the laplacian inplace so as
             # to spare a memory allocation of a possibly very large array
             laplacian *= -1
-            v0 = random_state.uniform(-1, 1, laplacian.shape[0])
+            v0 = init_arpack_v0(laplacian.shape[0], random_state)
             lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                            sigma=1.0, which='LM',
                                            tol=eigen_tol, v0=v0)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 4c8f3c54101c9..8298d09c70e31 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -608,3 +608,34 @@ def is_scalar_nan(x):
     # Redondant np.floating is needed because numbers can't match np.float32
     # in python 2.
     return bool(isinstance(x, (numbers.Real, np.floating)) and np.isnan(x))
+
+
+def init_arpack_v0(size, random_state):
+    """Initialize the starting vector for iteration in ARPACK functions
+
+    Initialize an ndarray with values sampled from the uniform distribution
+    on [-1, 1]. This initialization is chosen to be consistent with
+    ARPACK's own internal initialization, as other initializations can
+    lead to convergence issues.
+
+    Parameters
+    ----------
+    size : int
+        the size of the starting vector to be initialized
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator used to generate the
+        starting vector. If int, random_state is the seed used by the random
+        number generator; If RandomState instance, random_state is the
+        random number generator; If None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    Returns
+    -------
+    v0 : array of shape (size,)
+        the initialized vector
+    """
+
+    random_state = check_random_state(random_state)
+    v0 = random_state.uniform(-1, 1, size)
+    return v0
From 4099f91f50ee2535ff9ec743c8d116b0e4e2b650 Mon Sep 17 00:00:00 2001
From: Ivan PANICO
Date: Sun, 15 Jul 2018 12:16:55 +0200
Subject: [PATCH 02/16] fixes

---
 sklearn/cluster/bicluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py
index 34b028a856bc3..ba2571e0c9303 100644
--- a/sklearn/cluster/bicluster.py
+++ b/sklearn/cluster/bicluster.py
@@ -15,7 +15,7 @@
 from . 
import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin from ..externals import six -from ..utils import init_arpack +from ..utils import init_arpack_v0 from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) From 0b8eec3bcfd8d01e0ae5a71cc686d34b251abe6b Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Sun, 15 Jul 2018 21:08:30 +0200 Subject: [PATCH 03/16] making function private and correcting typo --- sklearn/cluster/bicluster.py | 6 +++--- sklearn/decomposition/kernel_pca.py | 4 ++-- sklearn/manifold/locally_linear.py | 4 ++-- sklearn/manifold/spectral_embedding_.py | 4 ++-- sklearn/utils/__init__.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index ba2571e0c9303..706b0e96320ee 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -15,7 +15,7 @@ from . import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin from ..externals import six -from ..utils import init_arpack_v0 +from ..utils import _init_arpack_v0 from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) @@ -144,12 +144,12 @@ def _svd(self, array, n_components, n_discard): # sqrt() to be np.nan. This causes some vectors in vt # to be np.nan. A = safe_sparse_dot(array.T, array) - v0 = init_arpack(A.shape[0], self.random_state) + v0 = _init_arpack_v0(A.shape[0], self.random_state) _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0) vt = v.T if np.any(np.isnan(u)): A = safe_sparse_dot(array, array.T) - v0 = init_arpack(A.shape[0], self.random_state) + v0 = _init_arpack_v0(A.shape[0], self.random_state) _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0) assert_all_finite(u) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index e6f743e7f2ac5..219d82f0410f0 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -7,7 +7,7 @@ from scipy import linalg from scipy.sparse.linalg import eigsh -from ..utils import check_random_state, init_arpack_v0 +from ..utils import check_random_state, _init_arpack_v0 from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin @@ -201,7 +201,7 @@ def _fit_transform(self, K): self.lambdas_, self.alphas_ = linalg.eigh( K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1)) elif eigen_solver == 'arpack': - v0 = init_arpack_v0(K.shape[0], self.random_state) + v0 = _init_arpack_v0(K.shape[0], self.random_state) self.lambdas_, self.alphas_ = eigsh(K, n_components, which="LA", tol=self.tol, diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index a3e49e89d1ced..8d475bdb02213 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -10,7 +10,7 @@ from scipy.sparse.linalg import eigsh from ..base import BaseEstimator, TransformerMixin -from ..utils import check_random_state, check_array, init_arpack_v0 +from ..utils import check_random_state, check_array, _init_arpack_v0 from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES @@ -157,7 +157,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, eigen_solver = 'dense' if eigen_solver == 'arpack': - v0 = init_arpack_v0(M.shape[0], random_state) + v0 = _init_arpack_v0(M.shape[0], random_state) try: eigen_values, eigen_vectors = eigsh(M, k + 
k_skip, sigma=0.0, tol=tol, maxiter=max_iter, diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index ca10e4af71de0..5cbf48d641c2a 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -18,7 +18,7 @@ from ..base import BaseEstimator from ..externals import six from ..utils import (check_random_state, check_array, - check_symmetric, init_arpack_v0) + check_symmetric, _init_arpack_v0) from ..utils.extmath import _deterministic_vector_sign_flip from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph @@ -268,7 +268,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, # We are computing the opposite of the laplacian inplace so as # to spare a memory allocation of a possibly very large array laplacian *= -1 - v0 = init_arpack_v0(laplacian.shape[0], random_state) + v0 = _init_arpack_v0(laplacian.shape[0], random_state) lambdas, diffusion_map = eigsh(laplacian, k=n_components, sigma=1.0, which='LM', tol=eigen_tol, v0=v0) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8298d09c70e31..eb39a46b1399d 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -610,7 +610,7 @@ def is_scalar_nan(x): return bool(isinstance(x, (numbers.Real, np.floating)) and np.isnan(x)) -def init_arpack_v0(size, random_state): +def _init_arpack_v0(size, random_state): """Initialize the starting vector for iteration in ARPACK functions Initialize a ndarray with values sampled from the uniform distribution From 68ecfaf5c679160e815b65a91b428afc4c044679 Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Sun, 15 Jul 2018 22:33:56 +0200 Subject: [PATCH 04/16] with the svds correct init --- benchmarks/bench_plot_randomized_svd.py | 8 +++++--- sklearn/cluster/bicluster.py | 11 ++++++++--- sklearn/cross_decomposition/pls_.py | 9 ++++++--- sklearn/decomposition/kernel_pca.py | 2 +- sklearn/decomposition/pca.py | 4 ++-- sklearn/decomposition/truncated_svd.py | 5 +++-- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index 7c14bcaa56b3c..54251ded44b9e 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -74,7 +74,7 @@ from collections import defaultdict import os.path -from sklearn.utils import gen_batches +from sklearn.utils import gen_batches, _init_arpack_v0 from sklearn.utils.validation import check_random_state from sklearn.utils.extmath import randomized_svd from sklearn.datasets.samples_generator import (make_low_rank_matrix, @@ -257,7 +257,7 @@ def svd_timing(X, n_comps, n_iter, n_oversamples, return U, mu, V, call_time -def norm_diff(A, norm=2, msg=True): +def norm_diff(A, norm=2, msg=True, random_state=None): """ Compute the norm diff with the original matrix, when randomized SVD is called with *params. @@ -269,7 +269,9 @@ def norm_diff(A, norm=2, msg=True): print("... computing %s norm ..." 
% norm)
     if norm == 2:
         # s = sp.linalg.norm(A, ord=2)  # slow
-        value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False)
+        v0 = _init_arpack_v0(min(A.shape), random_state)
+        value = sp.sparse.linalg.svds(A, k=1,
+                                      return_singular_vectors=False, v0=v0)
     else:
         if sp.sparse.issparse(A):
             value = sp.sparse.linalg.norm(A, ord=norm)
diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py
index 706b0e96320ee..7541f6dbd96f0 100644
--- a/sklearn/cluster/bicluster.py
+++ b/sklearn/cluster/bicluster.py
@@ -138,18 +138,23 @@ def _svd(self, array, n_components, n_discard):
                                       **kwargs)
 
         elif self.svd_method == 'arpack':
-            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
+            v0 = _init_arpack_v0(min(array.shape), self.random_state)
+            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs, v0=v0)
             if np.any(np.isnan(vt)):
                 # some eigenvalues of A * A.T are negative, causing
                 # sqrt() to be np.nan. This causes some vectors in vt
                 # to be np.nan.
                 A = safe_sparse_dot(array.T, array)
-                v0 = _init_arpack_v0(A.shape[0], self.random_state)
+                if A.shape[0] != min(array.shape):
+                    # We have to reinitialize v0 differently because the shape
+                    # here can be different from the previous init
+                    v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
                 vt = v.T
             if np.any(np.isnan(u)):
                 A = safe_sparse_dot(array, array.T)
-                v0 = _init_arpack_v0(A.shape[0], self.random_state)
+                if A.shape[0] != min(array.shape):
+                    v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
 
         assert_all_finite(u)
diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py
index df7cb22b895f7..129cd72d19625 100644
--- a/sklearn/cross_decomposition/pls_.py
+++ b/sklearn/cross_decomposition/pls_.py
@@ -13,7 +13,7 @@
 from scipy.sparse.linalg import svds
 
 from ..base import BaseEstimator, RegressorMixin, TransformerMixin
-from ..utils import check_array, check_consistent_length
+from ..utils import check_array, check_consistent_length, _init_arpack_v0
 from ..utils.extmath import svd_flip
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 from ..exceptions import ConvergenceWarning
@@ -800,10 +800,12 @@ class PLSSVD(BaseEstimator, TransformerMixin):
     CCA
     """
 
-    def __init__(self, n_components=2, scale=True, copy=True):
+    def __init__(self, n_components=2, scale=True,
+                 copy=True, random_state=None):
         self.n_components = n_components
         self.scale = scale
         self.copy = copy
+        self.random_state = random_state
 
     def fit(self, X, Y):
         """Fit model to data. 
@@ -844,7 +846,8 @@ def fit(self, X, Y): if self.n_components >= np.min(C.shape): U, s, V = svd(C, full_matrices=False) else: - U, s, V = svds(C, k=self.n_components) + v0 = _init_arpack_v0(min(C.shape), self.random_state) + U, s, V = svds(C, k=self.n_components, v0=v0) # Deterministic output U, V = svd_flip(U, V) V = V.T diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 219d82f0410f0..91168cb0cd578 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -7,7 +7,7 @@ from scipy import linalg from scipy.sparse.linalg import eigsh -from ..utils import check_random_state, _init_arpack_v0 +from ..utils import _init_arpack_v0 from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index db183af45af0c..9f38c70ddc7e8 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -24,6 +24,7 @@ from .base import _BasePCA from ..utils import check_random_state from ..utils import check_array +from ..utils import _init_arpack_v0 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted @@ -508,8 +509,7 @@ def _fit_truncated(self, X, n_components, svd_solver): X -= self.mean_ if svd_solver == 'arpack': - # random init solution, as ARPACK does it internally - v0 = random_state.uniform(-1, 1, size=min(X.shape)) + v0 = _init_arpack_v0(min(X.shape), self.random_state) U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0) # svds doesn't abide by scipy.linalg.svd/randomized_svd # conventions, so reverse its outputs. diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index cbaa5e19008fd..a0900d035d202 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -11,7 +11,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array, check_random_state +from ..utils import check_array, check_random_state, _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis @@ -160,7 +160,8 @@ def fit_transform(self, X, y=None): random_state = check_random_state(self.random_state) if self.algorithm == "arpack": - U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol) + v0 = _init_arpack_v0(min(X.shape), self.random_state) + U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0) # svds doesn't abide by scipy.linalg.svd/randomized_svd # conventions, so reverse its outputs. 
Sigma = Sigma[::-1] From 67d5632b28fa2bb87858ec33c4147888a6008e75 Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Sun, 15 Jul 2018 22:40:48 +0200 Subject: [PATCH 05/16] with pep8 flake8 and pytest corrections --- benchmarks/bench_plot_randomized_svd.py | 6 ++++-- sklearn/decomposition/pca.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index 54251ded44b9e..27c9bdb24520a 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -270,8 +270,10 @@ def norm_diff(A, norm=2, msg=True, random_state=None): if norm == 2: # s = sp.linalg.norm(A, ord=2) # slow v0 = _init_arpack_v0(min(A.shape), random_state) - value = sp.sparse.linalg.svds(A, k=1, - return_singular_vectors=False, v0=v0) + value = sp.sparse.linalg.svds(A, + k=1, + return_singular_vectors=False, + v0=v0) else: if sp.sparse.issparse(A): value = sp.sparse.linalg.norm(A, ord=norm) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 9f38c70ddc7e8..0cab26c80e025 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -24,7 +24,7 @@ from .base import _BasePCA from ..utils import check_random_state from ..utils import check_array -from ..utils import _init_arpack_v0 +from ..utils import _init_arpack_v0 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted From b81afe5fa1fee360bfe8c5492152ffdb8cb0be23 Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Mon, 16 Jul 2018 11:06:48 +0200 Subject: [PATCH 06/16] unit test on init_arpack --- sklearn/cross_decomposition/pls_.py | 7 +++++++ sklearn/utils/tests/test_utils.py | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index 129cd72d19625..3f9cb60a084ad 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -761,6 +761,13 @@ class PLSSVD(BaseEstimator, TransformerMixin): copy : boolean, default True Whether to copy X and Y, or perform in-place computations. + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. 
+ Attributes ---------- x_weights_ : array, [p, n_components] diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index ab6f8f0ff1115..6fadae396c04f 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -21,6 +21,7 @@ from sklearn.utils import is_scalar_nan from sklearn.utils.mocking import MockDataFrame from sklearn import config_context +from pytest import approx def test_make_rng(): @@ -275,3 +276,14 @@ def check_warning(*args, **kw): ([np.nan], False)]) def test_is_scalar_nan(value, result): assert is_scalar_nan(value) is result + +def test_init_arpack_v0(): + v0s = [] + for i in range(100): + v0s.append(_init_arpack_v0(1000, i)) + if i > 0: + assert not any(np.equal(v0s[i], v0s[i-1])) + + v0 = np.concatenate(v0s) + assert np.mean(v0) == approx(0, abs=1e-2) + assert np.std(v0) == approx(1/np.sqrt(3), abs=1e-3) From 666c22bba76f1f7d57300eb8a5854062f42effad Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Mon, 16 Jul 2018 11:15:04 +0200 Subject: [PATCH 07/16] pep8 correction --- sklearn/utils/tests/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 6fadae396c04f..3b1259d43688e 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -277,6 +277,7 @@ def check_warning(*args, **kw): def test_is_scalar_nan(value, result): assert is_scalar_nan(value) is result + def test_init_arpack_v0(): v0s = [] for i in range(100): From d3149848f04cf00d47dec40445de086765bd5ab6 Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Mon, 16 Jul 2018 16:37:15 +0200 Subject: [PATCH 08/16] change approx for allclose to pass travis --- sklearn/utils/tests/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 3b1259d43688e..548b84206b6ad 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -286,5 +286,5 @@ def test_init_arpack_v0(): assert not any(np.equal(v0s[i], v0s[i-1])) v0 = np.concatenate(v0s) - assert np.mean(v0) == approx(0, abs=1e-2) - assert np.std(v0) == approx(1/np.sqrt(3), abs=1e-3) + assert np.allclose(np.mean(v0), 0, atol=1e-2) + assert np.allclose(np.std(v0), 1/np.sqrt(3), atol=1e-3) From 128aecc130a9ac0c33deecfa995483f9fbbc9eea Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Mon, 16 Jul 2018 16:38:01 +0200 Subject: [PATCH 09/16] with flake8 --- sklearn/utils/tests/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 548b84206b6ad..1b1bb45d65b15 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -21,7 +21,6 @@ from sklearn.utils import is_scalar_nan from sklearn.utils.mocking import MockDataFrame from sklearn import config_context -from pytest import approx def test_make_rng(): From b450b03b06da42e3e6b8dfe92bfe35831a3d3dd6 Mon Sep 17 00:00:00 2001 From: Ivan PANICO Date: Thu, 19 Jul 2018 02:40:41 +0200 Subject: [PATCH 10/16] double init sadly --- sklearn/cluster/bicluster.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index 7541f6dbd96f0..2903f6699752b 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -145,16 +145,14 @@ def _svd(self, array, n_components, n_discard): # sqrt() to be np.nan. 
This causes some vectors in vt
                 # to be np.nan.
                 A = safe_sparse_dot(array.T, array)
-                if A.shape[0] != min(array.shape):
-                    # We have to reinitialize v0 differently because the shape
-                    # here can be different from the previous init
-                    v0 = _init_arpack_v0(A.shape[0], self.random_state)
+                # We have to reinitialize v0 differently because the shape
+                # here can be different from the previous init
+                v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
                 vt = v.T
             if np.any(np.isnan(u)):
                 A = safe_sparse_dot(array, array.T)
-                if A.shape[0] != min(array.shape):
-                    v0 = _init_arpack_v0(A.shape[0], self.random_state)
+                v0 = _init_arpack_v0(A.shape[0], self.random_state)
                 _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
 
         assert_all_finite(u)
From a16e52f16ff5ec14c638b1330b81dba21f47d1e5 Mon Sep 17 00:00:00 2001
From: Ivan Panico
Date: Wed, 8 Aug 2018 17:17:26 +0200
Subject: [PATCH 11/16] changing _init_arpack_v0 location

---
 sklearn/cluster/bicluster.py            |  2 +-
 sklearn/cross_decomposition/pls_.py     |  3 ++-
 sklearn/decomposition/kernel_pca.py     |  2 +-
 sklearn/decomposition/pca.py            |  2 +-
 sklearn/decomposition/truncated_svd.py  |  3 ++-
 sklearn/manifold/locally_linear.py      |  3 ++-
 sklearn/manifold/spectral_embedding_.py |  3 ++-
 sklearn/utils/__init__.py               | 31 ------------------------
 sklearn/utils/arpack.py                 | 32 +++++++++++++++++++++++++
 sklearn/utils/tests/test_arpack.py      | 15 ++++++++++++
 sklearn/utils/tests/test_utils.py       | 12 ----------
 11 files changed, 58 insertions(+), 50 deletions(-)
 create mode 100644 sklearn/utils/arpack.py
 create mode 100644 sklearn/utils/tests/test_arpack.py

diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py
index 2903f6699752b..1d3cc544792cb 100644
--- a/sklearn/cluster/bicluster.py
+++ b/sklearn/cluster/bicluster.py
@@ -15,7 +15,7 @@
 from . 
import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin from ..externals import six -from ..utils import _init_arpack_v0 +from ..utils.arpack import _init_arpack_v0 from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index 3f9cb60a084ad..d605c0daf8fb1 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -13,7 +13,8 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, RegressorMixin, TransformerMixin -from ..utils import check_array, check_consistent_length, _init_arpack_v0 +from ..utils import check_array, check_consistent_length +from ..utils.arpack import _init_arpack_v0 from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, FLOAT_DTYPES from ..exceptions import ConvergenceWarning diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 91168cb0cd578..819e1a5212e75 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -7,7 +7,7 @@ from scipy import linalg from scipy.sparse.linalg import eigsh -from ..utils import _init_arpack_v0 +from ..utils.arpack import _init_arpack_v0 from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 0cab26c80e025..33eebf43aaa3c 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -24,7 +24,7 @@ from .base import _BasePCA from ..utils import check_random_state from ..utils import check_array -from ..utils import _init_arpack_v0 +from ..utils.arpack import _init_arpack_v0 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index a0900d035d202..51b6778093886 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -11,7 +11,8 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array, check_random_state, _init_arpack_v0 +from ..utils import check_array, check_random_state +from ..utils.arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index 8d475bdb02213..bf12b90827818 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -10,7 +10,8 @@ from scipy.sparse.linalg import eigsh from ..base import BaseEstimator, TransformerMixin -from ..utils import check_random_state, check_array, _init_arpack_v0 +from ..utils import check_random_state, check_array +from ..utils.arpack import _init_arpack_v0 from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 5cbf48d641c2a..aa90e51385273 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -18,7 +18,8 @@ from ..base import BaseEstimator from ..externals import six from ..utils import 
(check_random_state, check_array,
-                     check_symmetric, _init_arpack_v0)
+                     check_symmetric)
+from ..utils.arpack import _init_arpack_v0
 from ..utils.extmath import _deterministic_vector_sign_flip
 from ..metrics.pairwise import rbf_kernel
 from ..neighbors import kneighbors_graph
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index eb39a46b1399d..4c8f3c54101c9 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -608,34 +608,3 @@ def is_scalar_nan(x):
     # Redondant np.floating is needed because numbers can't match np.float32
     # in python 2.
     return bool(isinstance(x, (numbers.Real, np.floating)) and np.isnan(x))
-
-
-def _init_arpack_v0(size, random_state):
-    """Initialize the starting vector for iteration in ARPACK functions
-
-    Initialize an ndarray with values sampled from the uniform distribution
-    on [-1, 1]. This initialization is chosen to be consistent with
-    ARPACK's own internal initialization, as other initializations can
-    lead to convergence issues.
-
-    Parameters
-    ----------
-    size : int
-        the size of the starting vector to be initialized
-
-    random_state : int, RandomState instance or None, optional (default=None)
-        The seed of the pseudo random number generator used to generate the
-        starting vector. If int, random_state is the seed used by the random
-        number generator; If RandomState instance, random_state is the
-        random number generator; If None, the random number generator is the
-        RandomState instance used by `np.random`.
-
-    Returns
-    -------
-    v0 : array of shape (size,)
-        the initialized vector
-    """
-
-    random_state = check_random_state(random_state)
-    v0 = random_state.uniform(-1, 1, size)
-    return v0
diff --git a/sklearn/utils/arpack.py b/sklearn/utils/arpack.py
new file mode 100644
index 0000000000000..0ae3ad54817f4
--- /dev/null
+++ b/sklearn/utils/arpack.py
@@ -0,0 +1,32 @@
+from .validation import check_random_state
+
+
+def _init_arpack_v0(size, random_state):
+    """Initialize the starting vector for iteration in ARPACK functions
+
+    Initialize an ndarray with values sampled from the uniform distribution
+    on [-1, 1]. This initialization is chosen to be consistent with
+    ARPACK's own internal initialization, as other initializations can
+    lead to convergence issues.
+
+    Parameters
+    ----------
+    size : int
+        the size of the starting vector to be initialized
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator used to generate the
+        starting vector. If int, random_state is the seed used by the random
+        number generator; If RandomState instance, random_state is the
+        random number generator; If None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    Returns
+    -------
+    v0 : array of shape (size,)
+        the initialized vector
+    """
+
+    random_state = check_random_state(random_state)
+    v0 = random_state.uniform(-1, 1, size)
+    return v0
diff --git a/sklearn/utils/tests/test_arpack.py b/sklearn/utils/tests/test_arpack.py
new file mode 100644
index 0000000000000..5e723975f8787
--- /dev/null
+++ b/sklearn/utils/tests/test_arpack.py
@@ -0,0 +1,15 @@
+import numpy as np
+
+from sklearn.utils.arpack import _init_arpack_v0
+
+
+def test_init_arpack_v0():
+    v0s = []
+    for i in range(100):
+        v0s.append(_init_arpack_v0(1000, i))
+        if i > 0:
+            assert not any(np.equal(v0s[i], v0s[i-1]))
+
+    v0 = np.concatenate(v0s)
+    assert np.allclose(np.mean(v0), 0, atol=1e-2)
+    assert np.allclose(np.std(v0), 1/np.sqrt(3), atol=1e-3)
diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py
index 1b1bb45d65b15..ab6f8f0ff1115 100644
--- a/sklearn/utils/tests/test_utils.py
+++ b/sklearn/utils/tests/test_utils.py
@@ -275,15 +275,3 @@ def check_warning(*args, **kw):
     ([np.nan], False)])
 def test_is_scalar_nan(value, result):
     assert is_scalar_nan(value) is result
-
-
-def test_init_arpack_v0():
-    v0s = []
-    for i in range(100):
-        v0s.append(_init_arpack_v0(1000, i))
-        if i > 0:
-            assert not any(np.equal(v0s[i], v0s[i-1]))
-
-    v0 = np.concatenate(v0s)
-    assert np.allclose(np.mean(v0), 0, atol=1e-2)
-    assert np.allclose(np.std(v0), 1/np.sqrt(3), atol=1e-3)
From b0281d3bef79283647edcce862be1e1f11516bc5 Mon Sep 17 00:00:00 2001
From: Ivan Panico
Date: Mon, 5 Nov 2018 22:26:36 +0100
Subject: [PATCH 12/16] Fix PLSCA docstring and add versionadded mention in PLSSVD

---
 sklearn/cross_decomposition/pls_.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py
index d605c0daf8fb1..18c81fac42163 100644
--- a/sklearn/cross_decomposition/pls_.py
+++ b/sklearn/cross_decomposition/pls_.py
@@ -715,7 +715,7 @@ class PLSCanonical(_PLS):
     >>> plsca.fit(X, Y)
     ... # doctest: +NORMALIZE_WHITESPACE
     PLSCanonical(algorithm='nipals', copy=True, max_iter=500, n_components=2,
-                 scale=True, tol=1e-06)
+                 random_state=None, scale=True, tol=1e-06)
     >>> X_c, Y_c = plsca.transform(X, Y)
 
     References
@@ -769,6 +769,8 @@ class PLSSVD(BaseEstimator, TransformerMixin):
         generator; If None, the random number generator is the RandomState
         instance used by `np.random`.
 
+        .. versionadded:: 0.21
+
     Attributes
     ----------
     x_weights_ : array, [p, n_components]
From c6ea0bdbcdf70438cdad1c52b548a925f63f7fc6 Mon Sep 17 00:00:00 2001
From: Ivan Panico
Date: Mon, 5 Nov 2018 23:12:31 +0100
Subject: [PATCH 13/16] Add a whats_new.

---
 doc/whats_new/v0.21.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 54d8c024eb5c5..c063630dfe41d 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -116,6 +116,13 @@ Support for Python 3.4 and below has been officially dropped.
   and :class:`tree.ExtraTreeRegressor`.
   :issue:`12300` by :user:`Adrin Jalali `.
 
+:mod:`sklearn.utils`
+....................
+- New :func:`utils._init_arpack_v0`, a helper meant to be used whenever
+  eigs, eigsh or svds from scipy.sparse.linalg is called. It initializes the
+  v0 parameter with values sampled from the uniform distribution on [-1, 1],
+  matching ARPACK's own initialization and avoiding convergence issues.
+
 Multiple modules
 ................
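A note between the patches: what the helper guarantees in practice is run-to-run
reproducibility of ARPACK-based solvers once random_state is fixed. The short
sketch below is an editor's illustration, not part of the patch series; it
assumes numpy, scipy, and a scikit-learn tree that already contains the
sklearn/utils/arpack.py module introduced in PATCH 11.

    import numpy as np
    from scipy.sparse.linalg import svds

    from sklearn.utils.arpack import _init_arpack_v0

    X = np.random.RandomState(0).rand(50, 30)

    # Same seed -> same starting vector -> identical ARPACK iterations,
    # so repeated runs return identical singular values and vectors.
    v0 = _init_arpack_v0(min(X.shape), random_state=42)
    u1, s1, vt1 = svds(X, k=3, v0=v0)

    v0 = _init_arpack_v0(min(X.shape), random_state=42)
    u2, s2, vt2 = svds(X, k=3, v0=v0)

    assert np.allclose(s1, s2)
    assert np.allclose(np.abs(u1), np.abs(u2))  # equal up to sign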
From 054e9b7666879427ff16a0bfcb8332e2a19239c0 Mon Sep 17 00:00:00 2001
From: Ivan Panico
Date: Tue, 6 Nov 2018 00:16:48 +0100
Subject: [PATCH 14/16] fixes the right docstring... And convert some weird tabs into spaces...

---
 sklearn/cross_decomposition/pls_.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py
index 18c81fac42163..95db25d427dbe 100644
--- a/sklearn/cross_decomposition/pls_.py
+++ b/sklearn/cross_decomposition/pls_.py
@@ -715,7 +715,7 @@ class PLSCanonical(_PLS):
     >>> plsca.fit(X, Y)
     ... # doctest: +NORMALIZE_WHITESPACE
     PLSCanonical(algorithm='nipals', copy=True, max_iter=500, n_components=2,
-                 random_state=None, scale=True, tol=1e-06)
+                 scale=True, tol=1e-06)
     >>> X_c, Y_c = plsca.transform(X, Y)
 
     References
@@ -769,7 +769,7 @@ class PLSSVD(BaseEstimator, TransformerMixin):
         generator; If None, the random number generator is the RandomState
         instance used by `np.random`.
 
-        .. versionadded:: 0.21
+        .. versionadded:: 0.21
 
     Attributes
     ----------
@@ -799,7 +799,7 @@ class PLSSVD(BaseEstimator, TransformerMixin):
     ... [11.9, 12.3]])
     >>> plsca = PLSSVD(n_components=2)
     >>> plsca.fit(X, Y)
-    PLSSVD(copy=True, n_components=2, scale=True)
+    PLSSVD(copy=True, n_components=2, random_state=None, scale=True)
     >>> X_c, Y_c = plsca.transform(X, Y)
     >>> X_c.shape, Y_c.shape
     ((4, 2), (4, 2))
From 3f8bfc26fd6222cdf42616d79f79f044afbd8e4f Mon Sep 17 00:00:00 2001
From: Ivan Panico
Date: Tue, 6 Nov 2018 00:23:58 +0100
Subject: [PATCH 15/16] Adds :issue: and :user: entry

---
 doc/whats_new/v0.21.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index c063630dfe41d..0c5ed243e6a1e 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -122,6 +122,7 @@ Support for Python 3.4 and below has been officially dropped.
   eigs, eigsh or svds from scipy.sparse.linalg is called. It initializes the
   v0 parameter with values sampled from the uniform distribution on [-1, 1],
   matching ARPACK's own initialization and avoiding convergence issues.
+  :issue:`5545` by :user:`Giorgio Patrini `
 
 Multiple modules
 ................
From bbc38f9f86abe9eaae22f9ea06762af7340c8ec4 Mon Sep 17 00:00:00 2001
From: Ivan Panico
Date: Tue, 6 Nov 2018 00:35:23 +0100
Subject: [PATCH 16/16] I'm out of words...

---
 doc/whats_new/v0.21.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 0c5ed243e6a1e..342ee57f6e402 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -122,7 +122,7 @@ Support for Python 3.4 and below has been officially dropped.
   eigs, eigsh or svds from scipy.sparse.linalg is called. It initializes the
   v0 parameter with values sampled from the uniform distribution on [-1, 1],
   matching ARPACK's own initialization and avoiding convergence issues.
-  :issue:`5545` by :user:`Giorgio Patrini `
+  :issue:`11524` by :user:`Ivan Panico `
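A closing note on the numbers asserted in test_init_arpack_v0 above: for the
uniform distribution on [-1, 1] the mean is 0 and the variance is
(1 - (-1))**2 / 12 = 1/3, so the expected standard deviation is 1/sqrt(3),
about 0.5774. A quick standalone check of these moments, as an editor's
sketch that needs nothing beyond numpy:

    import numpy as np

    rng = np.random.RandomState(0)
    v0 = rng.uniform(-1, 1, 100000)

    # Var[U(a, b)] = (b - a) ** 2 / 12, hence Var[U(-1, 1)] = 1 / 3
    print(abs(np.mean(v0)))            # close to 0
    print(np.std(v0), 1 / np.sqrt(3))  # both close to 0.5774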