8000 FIX: numerical stability in spectral · jwchennlp/scikit-learn@6c519c8 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6c519c8

Browse files
committed
FIX: numerical stability in spectral
This enables me to unskip a skipped test, hurray!
1 parent 354b648 commit 6c519c8

File tree

2 files changed

+46
-30
lines changed

2 files changed

+46
-30
lines changed

sklearn/cluster/spectral.py

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,29 @@
1414
from ..neighbors import kneighbors_graph
1515
from .k_means_ import k_means
1616

17+
def _set_diag(laplacian, value):
18+
from scipy import sparse
19+
n_nodes = laplacian.shape[0]
20+
# We need to put the diagonal at zero
21+
if not sparse.isspmatrix(laplacian):
22+
laplacian.flat[::n_nodes + 1] = value
23+
else:
24+
laplacian = laplacian.tocoo()
25+
diag_idx = (laplacian.row == laplacian.col)
26+
laplacian.data[diag_idx] = value
27+
# If the matrix has a small number of diagonals (as in the
28+
# case of structured matrices comming from images), the
29+
# dia format might be best suited for matvec products:
30+
n_diags = np.unique(laplacian.row - laplacian.col).size
31+
if n_diags <= 7:
32+
# 3 or less outer diagonals on each side
33+
laplacian = laplacian.todia()
34+
else:
35+
# csr has the fastest matvec and is thus best suited to
36+
# arpack
37+
laplacian = laplacian.tocsr()
38+
return laplacian
39+
1740

1841
def spectral_embedding(adjacency, n_components=8, mode=None,
1942
random_state=None):
@@ -65,6 +88,7 @@ def spectral_embedding(adjacency, n_components=8, mode=None,
6588
from scipy import sparse
6689
from ..utils.arpack import eigsh
6790
from scipy.sparse.linalg import lobpcg
91+
from scipy.sparse.linalg.eigen.lobpcg.lobpcg import symeig
6892
try:
6993
from pyamg import smoothed_aggregation_solver
7094
except ImportError:
@@ -87,25 +111,7 @@ def spectral_embedding(adjacency, n_components 8000 =8, mode=None,
87111
or not sparse.isspmatrix(laplacian)
88112
or n_nodes < 5 * n_components):
89113
# lobpcg used with mode='amg' has bugs for low number of nodes
90-
91-
# We need to put the diagonal at zero
92-
if not sparse.isspmatrix(laplacian):
93-
laplacian.flat[::n_nodes + 1] = 0
94-
else:
95-
laplacian = laplacian.tocoo()
96-
diag_idx = (laplacian.row == laplacian.col)
97-
laplacian.data[diag_idx] = 0
98-
# If the matrix has a small number of diagonals (as in the
99-
# case of structured matrices comming from images), the
100-
# dia format might be best suited for matvec products:
101-
n_diags = np.unique(laplacian.row - laplacian.col).size
102-
if n_diags <= 7:
103-
# 3 or less outer diagonals on each side
104-
laplacian = laplacian.todia()
105-
else:
106-
# csr has the fastest matvec and is thus best suited to
107-
# arpack
108-
laplacian = laplacian.tocsr()
114+
laplacian = _set_diag(laplacian, 0)
109115

110116
# Here we'll use shift-invert mode for fast eigenvalues
111117
# (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
@@ -137,22 +143,33 @@ def spectral_embedding(adjacency, n_components=8, mode=None,
137143
ml = smoothed_aggregation_solver(laplacian.tocsr())
138144
M = ml.aspreconditioner()
139145
X = random_state.rand(laplacian.shape[0], n_components)
140-
X[:, 0] = 1. / dd.ravel()
146+
#X[:, 0] = 1. / dd.ravel()
147+
X[:, 0] = dd.ravel()
141148
lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
142149
largest=False)
143150
embedding = diffusion_map.T * dd
144151
if embedding.shape[0] == 1:
145152
raise ValueError
146153
elif mode == "lobpcg":
147-
# We increase the number of eigenvectors requested, as lobpcg
148-
# doesn't behave well in low dimension
149-
X = random_state.rand(laplacian.shape[0], n_components + 4)
150-
X[:, 0] = 1. / dd.ravel()
151-
lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
152-
largest=False, maxiter=2000)
153-
embedding = diffusion_map.T[:n_components] * dd
154-
if embedding.shape[0] == 1:
155-
raise ValueError
154+
laplacian = laplacian.astype(np.float) # lobpcg needs native floats
155+
if n_nodes < 5 * n_components + 1:
156+
# lobpcg will fallback to symeig, so we short circuit it
157+
if sparse.isspmatrix(laplacian):
158+
laplacian = laplacian.todense()
159+
lambdas, diffusion_map = symeig(laplacian)
160+
embedding = diffusion_map.T[:n_components] * dd
161+
else:
162+
laplacian = laplacian.astype(np.float) # lobpcg needs native floats
163+
laplacian = _set_diag(laplacian, 1)
164+
# We increase the number of eigenvectors requested, as lobpcg
165+
# doesn't behave well in low dimension
166+
X = random_state.rand(laplacian.shape[0], n_components + 1)
167+
X[:, 0] = dd.ravel()
168+
lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
169+
largest=False, maxiter=2000)
170+
embedding = diffusion_map.T[:n_components] * dd
171+
if embedding.shape[0] == 1:
172+
raise ValueError
156173
return embedding
157174

158175

sklearn/cluster/tests/test_spectral.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ def test_spectral_unknown_mode():
9090
def test_spectral_clustering_sparse():
9191
# We need a large matrice, or the lobpcg solver will fallback to its
9292
# non-sparse and buggy mode
93-
raise nose.SkipTest("XFailed Test")
9493
S = np.array([[1, 5, 2, 2, 1, 0, 0, 0, 0, 0],
9594
[5, 1, 3, 2, 1, 0, 0, 0, 0, 0],
9695
[2, 3, 1, 1, 1, 0, 0, 0, 0, 0],

0 commit comments

Comments
 (0)
0