8000 Initialize ARPACK eigsh · scikit-learn/scikit-learn@eee85bb · GitHub
[go: up one dir, main page]

Skip to content

Commit eee85bb

Browse files
committed
Initialize ARPACK eigsh
`v0 = random_state.rand(M.shape[0])` leads to an initial residual vector in ARPACK which is all positive. However, this is not the absolute or squared residual, but a true difference. Thus, it is better to initialize with `v0=random_state.uniform(-1, 1, M.shape[0])` to have an equally distributed sign. This is the way that ARPACK initializes the residuals. The effect of the previous initialization is that eigsh frequently does not converge to the correct eigenvalues, e.g. negative eigenvalues for s.p.d. matrix, which leads to an incorrect null-space. - initialized all occurences of sklearn.utils.arpack.eigsh the same way it would be initialzed by ARPACK - regression test to test behavior of new initialization
1 parent f6b85d9 commit eee85bb

File tree

6 files changed

+55
-9
lines changed

6 files changed

+55
-9
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ Bug fixes
3030
- Fixed bug in :func:`manifold.spectral_embedding` where diagonal of unnormalized
3131
Laplacian matrix was incorrectly set to 1. By `Peter Fischer`_.
3232

33+
- Fixed incorrect initialization of :func:`utils.arpack.eigsh` on all
34+
occurrences. Affects :class:`cluster.SpectralBiclustering`,
35+
:class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`,
36+
and :class:`manifold.SpectralEmbedding`. By `Peter Fischer`_.
3337

3438
API changes summary
3539
-------------------

sklearn/cluster/bicluster.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from . import KMeans, MiniBatchKMeans
1515
from ..base import BaseEstimator, BiclusterMixin
1616
from ..externals import six
17+
from ..utils import check_random_state
1718
from ..utils.arpack import eigsh, svds
1819

1920
from ..utils.extmath import (make_nonnegative, norm, randomized_svd,
@@ -140,12 +141,18 @@ def _svd(self, array, n_components, n_discard):
140141
# some eigenvalues of A * A.T are negative, causing
141142
# sqrt() to be np.nan. This causes some vectors in vt
142143
# to be np.nan.
143-
_, v = eigsh(safe_sparse_dot(array.T, array),
144-
ncv=self.n_svd_vecs)
144+
A = safe_sparse_dot(array.T, array)
145+
random_state = check_random_state(self.random_state)
146+
# initialize with [-1,1] as in ARPACK
147+
v0 = random_state.uniform(-1, 1, A.shape[0])
148+
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
145149
vt = v.T
146150
if np.any(np.isnan(u)):
147-
_, u = eigsh(safe_sparse_dot(array, array.T),
148-
ncv=self.n_svd_vecs)
151+
A = safe_sparse_dot(array, array.T)
152+
random_state = check_random_state(self.random_state)
153+
# initialize with [-1,1] as in ARPACK
154+
v0 = random_state.uniform(-1, 1, A.shape[0])
155+
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
149156

150157
assert_all_finite(u)
151158
assert_all_finite(vt)

sklearn/decomposition/kernel_pca.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
from scipy import linalg
88

9+
from ..utils import check_random_state
910
from ..utils.arpack import eigsh
1011
from ..utils.validation import check_is_fitted, NotFittedError
1112
from ..base import BaseEstimator, TransformerMixin
@@ -102,7 +103,8 @@ class KernelPCA(BaseEstimator, TransformerMixin):
102103
def __init__(self, n_components=None, kernel="linear",
103104
gamma=None, degree=3, coef0=1, kernel_params=None,
104105
alpha=1.0, fit_inverse_transform=False, eigen_solver='auto',
105-
tol=0, max_iter=None, remove_zero_eig=False):
106+
tol=0, max_iter=None, remove_zero_eig=False,
107+
random_state=None):
106108
if fit_inverse_transform and kernel == 'precomputed':
107109
raise ValueError(
108110
"Cannot fit_inverse_transform with a precomputed kernel.")
@@ -119,6 +121,7 @@ def __init__(self, n_components=None, kernel="linear",
119121
self.tol = tol
120122
self.max_iter = max_iter
121123
self._centerer = KernelCenterer()
124+
self.random_state = random_state
122125

123126
@property
124127
def _pairwise(self):
@@ -157,10 +160,14 @@ def _fit_transform(self, K):
157160
self.lambdas_, self.alphas_ = linalg.eigh(
158161
K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
159162
elif eigen_solver == 'arpack':
163+
random_state = check_random_state(self.random_state)
164+
# initialize with [-1,1] as in ARPACK
165+
v0 = random_state.uniform(-1, 1, K.shape[0])
160166
self.lambdas_, self.alphas_ = eigsh(K, n_components,
161167
which="LA",
162168
tol=self.tol,
163-
maxiter=self.max_iter)
169+
maxiter=self.max_iter,
170+
v0=v0)
164171

165172
# sort eigenvectors in descending order
166173
indices = self.lambdas_.argsort()[::-1]

sklearn/manifold/locally_linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
148148

149149
if eigen_solver == 'arpack':
150150
random_state = check_random_state(random_state)
151-
v0 = random_state.rand(M.shape[0])
151+
# initialize with [-1,1] as in ARPACK
152+
v0 = random_state.uniform(-1, 1, M.shape[0])
152153
try:
153154
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
154155
tol=tol, maxiter=max_iter,

sklearn/manifold/spectral_embedding_.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,9 +259,10 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
259259
# We are computing the opposite of the laplacian inplace so as
260260
# to spare a memory allocation of a possibly very large array
261261
laplacian *= -1
262+
v0 = random_state.uniform(-1, 1, laplacian.shape[0])
262263
lambdas, diffusion_map = eigsh(laplacian, k=n_components,
263264
sigma=1.0, which='LM',
264-
tol=eigen_tol)
265+
tol=eigen_tol, v0=v0)
265266
embedding = diffusion_map.T[n_components::-1] * dd
266267
except RuntimeError:
267268
# When submatrices are exactly singular, an LU decomposition

sklearn/utils/tests/test_utils.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import numpy as np
44
import scipy.sparse as sp
55
from scipy.linalg import pinv2
6+
from scipy.linalg import eigh
67
from itertools import chain
78

89
from sklearn.utils.testing import (assert_equal, assert_raises, assert_true,
910
assert_almost_equal, assert_array_equal,
10-
SkipTest, assert_raises_regex)
11+
SkipTest, assert_raises_regex,
12+
assert_greater_equal)
1113

1214
from sklearn.utils import check_random_state
1315
from sklearn.utils import deprecated
@@ -18,7 +20,9 @@
1820
from sklearn.utils import shuffle
1921
from sklearn.utils import gen_even_slices
2022
from sklearn.utils.extmath import pinvh
23+
from sklearn.utils.arpack import eigsh
2124
from sklearn.utils.mocking import MockDataFrame
25+
from sklearn.utils.graph import graph_laplacian
2226

2327

2428
def test_make_rng():
@@ -126,6 +130,28 @@ def test_pinvh_simple_complex():
126130
assert_almost_equal(np.dot(a, a_pinv), np.eye(3))
127131

128132

133+
def test_arpack_eigsh_initialization():
134+
'''
135+
Non-regression test that shows null-space computation is better with
136+
initialization of eigsh from [-1,1] instead of [0,1]
137+
'''
138+
random_state = check_random_state(42)
139+
140+
A = random_state.rand(50, 50)
141+
A = np.dot(A.T, A) # create s.p.d. matrix
142+
A = graph_laplacian(A)
143+
k = 5
144+
145+
# Test if eigsh is working correctly
146+
# New initialization [-1,1] (as in original ARPACK)
147+
# Was [0,1] before, with which this test could fail
148+
v0 = random_state.uniform(-1,1, A.shape[0])
149+
w, _ = eigsh(A, k=k, sigma=0.0, v0=v0)
150+
151+
# Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest
152+
assert_greater_equal(w[0], 0)
153+
154+
129155
def test_column_or_1d():
130156
EXAMPLES = [
131157
("binary", ["spam", "egg", "spam"]),

0 commit comments

Comments
 (0)
0