Merge pull request #5012 from yanlend/patch-2 · scikit-learn/scikit-learn@2492dd0 · GitHub

Commit 2492dd0

Merge pull request #5012 from yanlend/patch-2
[MRG+1] Initialize ARPACK eigsh
2 parents 1863895 + 8ff339e commit 2492dd0
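
The commit applies the same pattern at every affected eigsh call site: draw ARPACK's starting vector v0 from a seedable RNG over [-1, 1] (the range ARPACK itself uses for its default start) instead of an unseeded or [0, 1) start. Below is a minimal standalone sketch of that pattern; it uses scipy.sparse.linalg.eigsh rather than the sklearn.utils.arpack backport, and the helper name eigsh_seeded is hypothetical (the commit inlines the pattern at each call site rather than factoring it out).

import numpy as np
from scipy.sparse.linalg import eigsh
from sklearn.utils import check_random_state

def eigsh_seeded(A, k, random_state=None, **kwargs):
    # Seedable RNG: int seed, RandomState instance, or None.
    random_state = check_random_state(random_state)
    # Starting vector drawn from [-1, 1], as in ARPACK's own default,
    # but reproducible because it comes from the seeded RNG.
    v0 = random_state.uniform(-1, 1, A.shape[0])
    return eigsh(A, k=k, v0=v0, **kwargs)

For example, eigsh_seeded(K, k=5, which='LA', random_state=0) returns the same eigenpairs on every run, whereas leaving v0 unset lets ARPACK pick its own random start.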

File tree

7 files changed: +58 −10 lines

doc/modules/pipeline.rst

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ and ``value`` is an estimator object::
 n_components=None, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0,
 coef0=1, degree=3, eigen_solver='auto', fit_inverse_transform=False,
 gamma=None, kernel='linear', kernel_params=None, max_iter=None,
-n_components=None, remove_zero_eig=False, tol=0))],
+n_components=None, random_state=None, remove_zero_eig=False, tol=0))],
 transformer_weights=None)

 Like pipelines, feature unions have a shorthand constructor called
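
For reference, a sketch of the FeatureUnion that the doctest above prints; the only visible change is the new random_state=None entry in the KernelPCA repr (fitting on data is omitted here):

from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA, KernelPCA

# Same combination as in the doc snippet; random_state only matters when
# KernelPCA ends up using the ARPACK solver at fit time.
estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
combined = FeatureUnion(estimators)
print(combined)  # repr now lists random_state=None, as in the updated doctest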

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,10 @@ Bug fixes
 - Fixed bug in :func:`manifold.spectral_embedding` where diagonal of unnormalized
   Laplacian matrix was incorrectly set to 1. By `Peter Fischer`_.

+- Fixed incorrect initialization of :func:`utils.arpack.eigsh` on all
+  occurrences. Affects :class:`cluster.SpectralBiclustering`,
+  :class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`,
+  and :class:`manifold.SpectralEmbedding`. By `Peter Fischer`_.

 API changes summary
 -------------------

sklearn/cluster/bicluster.py

Lines changed: 11 additions & 4 deletions
@@ -14,6 +14,7 @@
 from . import KMeans, MiniBatchKMeans
 from ..base import BaseEstimator, BiclusterMixin
 from ..externals import six
+from ..utils import check_random_state
 from ..utils.arpack import eigsh, svds

 from ..utils.extmath import (make_nonnegative, norm, randomized_svd,
@@ -140,12 +141,18 @@ def _svd(self, array, n_components, n_discard):
             # some eigenvalues of A * A.T are negative, causing
             # sqrt() to be np.nan. This causes some vectors in vt
             # to be np.nan.
-            _, v = eigsh(safe_sparse_dot(array.T, array),
-                         ncv=self.n_svd_vecs)
+            A = safe_sparse_dot(array.T, array)
+            random_state = check_random_state(self.random_state)
+            # initialize with [-1,1] as in ARPACK
+            v0 = random_state.uniform(-1, 1, A.shape[0])
+            _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
             vt = v.T
         if np.any(np.isnan(u)):
-            _, u = eigsh(safe_sparse_dot(array, array.T),
-                         ncv=self.n_svd_vecs)
+            A = safe_sparse_dot(array, array.T)
+            random_state = check_random_state(self.random_state)
+            # initialize with [-1,1] as in ARPACK
+            v0 = random_state.uniform(-1, 1, A.shape[0])
+            _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)

         assert_all_finite(u)
         assert_all_finite(vt)
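
A usage sketch for the estimator touched above. The eigsh calls in _svd appear to be a fall-back for when the ARPACK SVD produces NaNs, so random_state now also seeds that path; make_checkerboard is only used here to get plausible input data.

from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard

# Synthetic checkerboard data, just to have something to fit.
data, rows, cols = make_checkerboard(shape=(300, 300), n_clusters=(4, 3),
                                     noise=10, random_state=0)
model = SpectralBiclustering(n_clusters=(4, 3), svd_method='arpack',
                             random_state=0)
model.fit(data)
print(model.rows_.shape, model.columns_.shape)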

sklearn/decomposition/kernel_pca.py

Lines changed: 13 additions & 2 deletions
@@ -6,6 +6,7 @@
 import numpy as np
 from scipy import linalg

+from ..utils import check_random_state
 from ..utils.arpack import eigsh
 from ..utils.validation import check_is_fitted
 from ..exceptions import NotFittedError
@@ -76,6 +77,10 @@ class KernelPCA(BaseEstimator, TransformerMixin):
         When n_components is None, this parameter is ignored and components
         with zero eigenvalues are removed regardless.

+    random_state : int seed, RandomState instance, or None, default : None
+        A pseudo random number generator used for the initialization of the
+        residuals when eigen_solver == 'arpack'.
+
     Attributes
     ----------
@@ -103,7 +108,8 @@ class KernelPCA(BaseEstimator, TransformerMixin):
     def __init__(self, n_components=None, kernel="linear",
                  gamma=None, degree=3, coef0=1, kernel_params=None,
                  alpha=1.0, fit_inverse_transform=False, eigen_solver='auto',
-                 tol=0, max_iter=None, remove_zero_eig=False):
+                 tol=0, max_iter=None, remove_zero_eig=False,
+                 random_state=None):
         if fit_inverse_transform and kernel == 'precomputed':
             raise ValueError(
                 "Cannot fit_inverse_transform with a precomputed kernel.")
@@ -120,6 +126,7 @@ def __init__(self, n_components=None, kernel="linear",
         self.tol = tol
         self.max_iter = max_iter
         self._centerer = KernelCenterer()
+        self.random_state = random_state

     @property
     def _pairwise(self):
@@ -158,10 +165,14 @@ def _fit_transform(self, K):
             self.lambdas_, self.alphas_ = linalg.eigh(
                 K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
         elif eigen_solver == 'arpack':
+            random_state = check_random_state(self.random_state)
+            # initialize with [-1,1] as in ARPACK
+            v0 = random_state.uniform(-1, 1, K.shape[0])
             self.lambdas_, self.alphas_ = eigsh(K, n_components,
                                                 which="LA",
                                                 tol=self.tol,
-                                                maxiter=self.max_iter)
+                                                maxiter=self.max_iter,
+                                                v0=v0)

             # sort eigenvectors in descending order
             indices = self.lambdas_.argsort()[::-1]
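
A small usage sketch of the new parameter; the random data below is only illustrative. With eigen_solver='arpack', random_state now seeds the starting vector v0, so repeated fits on the same data give identical results.

import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
X = rng.rand(100, 10)

kpca = KernelPCA(n_components=3, kernel='rbf', eigen_solver='arpack',
                 random_state=0)
X_kpca = kpca.fit_transform(X)  # deterministic given random_state
print(X_kpca.shape)             # (100, 3)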

sklearn/manifold/locally_linear.py

Lines changed: 2 additions & 1 deletion
@@ -148,7 +148,8 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,

     if eigen_solver == 'arpack':
         random_state = check_random_state(random_state)
-        v0 = random_state.rand(M.shape[0])
+        # initialize with [-1,1] as in ARPACK
+        v0 = random_state.uniform(-1, 1, M.shape[0])
         try:
             eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                 tol=tol, maxiter=max_iter,
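
null_space already accepted random_state; only the distribution of v0 changes here, from [0, 1) to [-1, 1]. A usage sketch through the public estimator (the swiss-roll data is just an example input):

from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

X, _ = make_swiss_roll(n_samples=500, random_state=0)
lle = LocallyLinearEmbedding(n_neighbors=12, n_components=2,
                             eigen_solver='arpack', random_state=0)
X_lle = lle.fit_transform(X)  # v0 now drawn from [-1, 1]
print(X_lle.shape)            # (500, 2)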

sklearn/manifold/spectral_embedding_.py

Lines changed: 2 additions & 1 deletion
@@ -259,9 +259,10 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
             # We are computing the opposite of the laplacian inplace so as
             # to spare a memory allocation of a possibly very large array
             laplacian *= -1
+            v0 = random_state.uniform(-1, 1, laplacian.shape[0])
             lambdas, diffusion_map = eigsh(laplacian, k=n_components,
                                            sigma=1.0, which='LM',
-                                           tol=eigen_tol)
+                                           tol=eigen_tol, v0=v0)
             embedding = diffusion_map.T[n_components::-1] * dd
         except RuntimeError:
             # When submatrices are exactly singular, an LU decomposition
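
A usage sketch for this path; whether the ARPACK branch above is taken depends on the eigen_solver setting and on the environment (e.g. pyamg availability), so the seeded v0 only comes into play on that branch. The random input data is illustrative only.

import numpy as np
from sklearn.manifold import SpectralEmbedding

rng = np.random.RandomState(0)
X = rng.rand(100, 5)

se = SpectralEmbedding(n_components=2, affinity='rbf',
                       eigen_solver='arpack', random_state=0)
X_se = se.fit_transform(X)
print(X_se.shape)  # (100, 2)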

sklearn/utils/tests/test_utils.py

Lines changed: 25 additions & 1 deletion
@@ -3,11 +3,13 @@
 import numpy as np
 import scipy.sparse as sp
 from scipy.linalg import pinv2
+from scipy.linalg import eigh
 from itertools import chain

 from sklearn.utils.testing import (assert_equal, assert_raises, assert_true,
                                    assert_almost_equal, assert_array_equal,
-                                   SkipTest, assert_raises_regex)
+                                   SkipTest, assert_raises_regex,
+                                   assert_greater_equal)

 from sklearn.utils import check_random_state
 from sklearn.utils import deprecated
@@ -18,7 +20,9 @@
 from sklearn.utils import shuffle
 from sklearn.utils import gen_even_slices
 from sklearn.utils.extmath import pinvh
+from sklearn.utils.arpack import eigsh
 from sklearn.utils.mocking import MockDataFrame
+from sklearn.utils.graph import graph_laplacian


 def test_make_rng():
@@ -126,6 +130,26 @@ def test_pinvh_simple_complex():
     assert_almost_equal(np.dot(a, a_pinv), np.eye(3))


+def test_arpack_eigsh_initialization():
+    # Non-regression test that shows null-space computation is better with
+    # initialization of eigsh from [-1,1] instead of [0,1]
+    random_state = check_random_state(42)
+
+    A = random_state.rand(50, 50)
+    A = np.dot(A.T, A)  # create s.p.d. matrix
+    A = graph_laplacian(A) + 1e-7 * np.identity(A.shape[0])
+    k = 5
+
+    # Test if eigsh is working correctly
+    # New initialization [-1,1] (as in original ARPACK)
+    # Was [0,1] before, with which this test could fail
+    v0 = random_state.uniform(-1,1, A.shape[0])
+    w, _ = eigsh(A, k=k, sigma=0.0, v0=v0)
+
+    # Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest
+    assert_greater_equal(w[0], 0)
+
+
 def test_column_or_1d():
     EXAMPLES = [
         ("binary", ["spam", "egg", "spam"]),

0 commit comments