8000 MNT initialize weights when using ARPACK solver with a utility (#18302) · franslarsson/scikit-learn@da562b4 · GitHub
[go: up one dir, main page]

Skip to content

Commit da562b4

Browse files
gauravkdesaiIvan PANICOsummer-bebopthomasjpfanglemaitre
authored
MNT initialize weights when using ARPACK solver with a utility (scikit-learn#18302)
Co-authored-by: Ivan PANICO <ivpanico@gmail.com> Co-authored-by: Ivan Panico <iv.panico@gmail.com> Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com> Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent cdde011 commit da562b4

File tree

9 files changed

+98
-28
lines changed

9 files changed

+98
-28
lines changed

benchmarks/bench_plot_randomized_svd.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
from collections import defaultdict
7575
import os.path
7676

77+
from sklearn.utils._arpack import _init_arpack_v0
7778
from sklearn.utils import gen_batches
7879
from sklearn.utils.validation import check_random_state
7980
from sklearn.utils.extmath import randomized_svd
@@ -256,7 +257,7 @@ def svd_timing(X, n_comps, n_iter, n_oversamples,
256257
return U, mu, V, call_time
257258

258259

259-
def norm_diff(A, norm=2, msg=True):
260+
def norm_diff(A, norm=2, msg=True, random_state=None):
260261
"""
261262
Compute the norm diff with the original matrix, when randomized
262263
SVD is called with *params.
@@ -268,7 +269,11 @@ def norm_diff(A, norm=2, msg=True):
268269
print("... computing %s norm ..." % norm)
269270
if norm == 2:
270271
# s = sp.linalg.norm(A, ord=2) # slow
271-
value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False)
272+
v0 = _init_arpack_v0(min(A.shape), random_state)
273+
value = sp.sparse.linalg.svds(A,
274+
k=1,
275+
return_singular_vectors=False,
276+
v0=v0)
272277
else:
273278
if sp.sparse.issparse(A):
274279
value = sp.sparse.linalg.norm(A, ord=norm)
@@ -298,7 +303,7 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps):
298303
all_time = defaultdict(list)
299304
if enable_spectral_norm:
300305
all_spectral = defaultdict(list)
301-
X_spectral_norm = norm_diff(X, norm=2, msg=False)
306+
X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
302307
all_frobenius = defaultdict(list)
303308
X_fro_norm = norm_diff(X, norm='fro', msg=False)
304309

@@ -312,8 +317,9 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps):
312317
all_time[label].append(time)
313318
if enable_spectral_norm:
314319
A = U.dot(np.diag(s).dot(V))
315-
all_spectral[label].append(norm_diff(X - A, norm=2) /
316-
X_spectral_norm)
320+
all_spectral[label].append(
321+
norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
322+
)
317323
f = scalable_frobenius_norm_discrepancy(X, U, s, V)
318324
all_frobenius[label].append(f / X_fro_norm)
319325

@@ -327,8 +333,9 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps):
327333
all_time[label].append(time)
328334
if enable_spectral_norm:
329335
A = U.dot(np.diag(s).dot(V))
330-
all_spectral[label].append(norm_diff(X - A, norm=2) /
331-
X_spectral_norm)
336+
all_spectral[label].append(
337+
norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
338+
)
332339
f = scalable_frobenius_norm_discrepancy(X, U, s, V)
333340
all_frobenius[label].append(f / X_fro_norm)
334341

@@ -353,7 +360,7 @@ def bench_b(power_list):
353360
for rank in ranks:
354361
X = make_low_rank_matrix(effective_rank=rank, **data_params)
355362
if enable_spectral_norm:
356-
X_spectral_norm = norm_diff(X, norm=2, msg=False)
363+
X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
357364
X_fro_norm = norm_diff(X, norm='fro', msg=False)
358365

359366
for n_comp in [int(rank/2), rank, rank*2]:
@@ -364,8 +371,10 @@ def bench_b(power_list):
364371
power_iteration_normalizer='LU')
365372
if enable_spectral_norm:
366373
A = U.dot(np.diag(s).dot(V))
367-
all_spectral[label].append(norm_diff(X - A, norm=2) /
368-
X_spectral_norm)
374+
all_spectral[label].append(
375+
norm_diff(X - A, norm=2, random_state=0) /
376+
X_spectral_norm
377+
)
369378
f = scalable_frobenius_norm_discrepancy(X, U, s, V)
370379
all_frobenius[label].append(f / X_fro_norm)
371380

@@ -388,7 +397,7 @@ def bench_c(datasets, n_comps):
388397
continue
389398

390399
if enable_spectral_norm:
391-
X_spectral_norm = norm_diff(X, norm=2, msg=False)
400+
X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
392401
X_fro_norm = norm_diff(X, norm='fro', msg=False)
393402
n_comps = np.minimum(n_comps, np.min(X.shape))
394403

@@ -401,8 +410,9 @@ def bench_c(datasets, n_comps):
401410
all_time[label].append(time)
402411
if enable_spectral_norm:
403412
A = U.dot(np.diag(s).dot(V))
404-
all_spectral[label].append(norm_diff(X - A, norm=2) /
405-
X_spectral_norm)
413+
all_spectral[label].append(
414+
norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
415+
)
406416
f = scalable_frobenius_norm_discrepancy(X, U, s, V)
407417
all_frobenius[label].append(f / X_fro_norm)
408418

@@ -415,8 +425,9 @@ def bench_c(datasets, n_comps):
415425
all_time[label].append(time)
416426
if enable_spectral_norm:
417427
A = U.dot(np.diag(s).dot(V))
418-
all_spectral[label].append(norm_diff(X - A, norm=2) /
419-
X_spectral_norm)
428+
all_spectral[label].append(
429+
norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
430+
)
420431
f = scalable_frobenius_norm_discrepancy(X, U, s, V)
421432
all_frobenius[label].append(f / X_fro_norm)
422433

doc/whats_new/v0.24.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ random sampling procedures.
2626
between 32-bits and 64-bits data when the kernel has small positive
2727
eigenvalues.
2828

29-
- |Fix| :class:`linear_model.Perceptron` when `penalty='elasticnet'`
29+
- |Fix| :class:`decomposition.TruncatedSVD` becomes deterministic by exposing
30+
a `random_state` parameter.
31+
32+
- |Fix| :class:`linear_model.Perceptron` when `penalty='elasticnet'`.
3033

3134
Details are listed in the changelog below.
3235

@@ -169,6 +172,12 @@ Changelog
169172
`y_std_` attributes were deprecated and will be removed in 0.26.
170173
:pr:`18768` by :user:`Maren Westermann <marenwestermann>`.
171174

175+
- |Fix| :class:`decomposition.TruncatedSVD` becomes deterministic by using the
176+
`random_state`. It controls the weights' initialization of the underlying
177+
ARPACK solver.
178+
:pr:` #18302` by :user:`Gaurav Desai <gauravkdesai>` and
179+
:user:`Ivan Panico <FollowKenny>`.
180+
172181
:mod:`sklearn.datasets`
173182
.......................
174183

sklearn/decomposition/_kernel_pca.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from scipy import linalg
88
from scipy.sparse.linalg import eigsh
99

10-
from ..utils import check_random_state
10+
from ..utils._arpack import _init_arpack_v0
1111
from ..utils.extmath import svd_flip
1212
from ..utils.validation import check_is_fitted, _check_psd_eigenvalues
1313
from ..utils.deprecation import deprecated
@@ -209,9 +209,7 @@ def _fit_transform(self, K):
209209
self.lambdas_, self.alphas_ = linalg.eigh(
210210
K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
211211
elif eigen_solver == 'arpack':
212-
random_state = check_random_state(self.random_state)
213-
# initialize with [-1,1] as in ARPACK
214-
v0 = random_state.uniform(-1, 1, K.shape[0])
212+
v0 = _init_arpack_v0(K.shape[0], self.random_state)
215213
self.lambdas_, self.alphas_ = eigsh(K, n_components,
216214
which="LA",
217215
tol=self.tol,

sklearn/decomposition/_pca.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from ._base import _BasePCA
2323
from ..utils import check_random_state
24+
from ..utils._arpack import _init_arpack_v0
2425
from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
2526
from ..utils.extmath import stable_cumsum
2627
from ..utils.validation import check_is_fitted
@@ -527,8 +528,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
527528
X -= self.mean_
528529

529530
if svd_solver == 'arpack':
530-
# random init solution, as ARPACK does it internally
531-
v0 = random_state.uniform(-1, 1, size=min(X.shape))
531+
v0 = _init_arpack_v0(min(X.shape), random_state)
532532
U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0)
533533
# svds doesn't abide by scipy.linalg.svd/randomized_svd
534534
# conventions, so reverse its outputs.

sklearn/decomposition/_truncated_svd.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from ..base import BaseEstimator, TransformerMixin
1414
from ..utils import check_array, check_random_state
15+
from ..utils._arpack import _init_arpack_v0
1516
from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
1617
from ..utils.sparsefuncs import mean_variance_axis
1718
from ..utils.validation import _deprecate_positional_args
@@ -165,7 +166,8 @@ def fit_transform(self, X, y=None):
165166
random_state = check_random_state(self.random_state)
166167

167168
if self.algorithm == "arpack":
168-
U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
169+
v0 = _init_arpack_v0(min(X.shape), random_state)
170+
U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0)
169171
# svds doesn't abide by scipy.linalg.svd/randomized_svd
170172
# conventions, so reverse its outputs.
171173
Sigma = Sigma[::-1]

sklearn/manifold/_locally_linear.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from ..base import BaseEstimator, TransformerMixin, _UnstableArchMixin
1313
from ..utils import check_random_state, check_array
14+
from ..utils._arpack import _init_arpack_v0
1415
from ..utils.extmath import stable_cumsum
1516
from ..utils.validation import check_is_fitted
1617
from ..utils.validation import FLOAT_DTYPES
@@ -162,9 +163,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
162163
eigen_solver = 'dense'
163164

164165
if eigen_solver == 'arpack':
165-
random_state = check_random_state(random_state)
166-
# initialize with [-1,1] as in ARPACK
167-
v0 = random_state.uniform(-1, 1, M.shape[0])
166+
v0 = _init_arpack_v0(M.shape[0], random_state)
168167
try:
169168
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
170169
tol=tol, maxiter=max_iter,

sklearn/manifold/_spectral_embedding.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@
1515
from scipy.sparse.csgraph import laplacian as csgraph_laplacian
1616

1717
from ..base import BaseEstimator
18-
from ..utils import check_random_state, check_array, check_symmetric
18+
from ..utils import (
19+
check_array,
20+
check_random_state,
21+
check_symmetric,
22+
)
23+
from ..utils._arpack import _init_arpack_v0
1924
from ..utils.extmath import _deterministic_vector_sign_flip
2025
from ..utils.fixes import lobpcg
2126
from ..metrics.pairwise import rbf_kernel
@@ -270,7 +275,7 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None,
270275
# We are computing the opposite of the laplacian inplace so as
271276
# to spare a memory allocation of a possibly very large array
272277
laplacian *= -1
273-
v0 = random_state.uniform(-1, 1, laplacian.shape[0])
278+
v0 = _init_arpack_v0(laplacian.shape[0], random_state)
274279
_, diffusion_map = eigsh(
275280
laplacian, k=n_components, sigma=1.0, which='LM',
276281
tol=eigen_tol, v0=v0)

sklearn/utils/_arpack.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from .validation import check_random_state
2+
3+
4+
def _init_arpack_v0(size, random_state):
5+
"""Initialize the starting vector for iteration in ARPACK functions.
6+
7+
Initialize a ndarray with values sampled from the uniform distribution on
8+
[-1, 1]. This initialization model has been chosen to be consistent with
9+
the ARPACK one as another initialization can lead to convergence issues.
10+
11+
Parameters
12+
----------
13+
size : int
14+
The size of the eigenvalue vector to be initialized.
15+
16+
random_state : int, RandomState instance or None, default=None
17+
The seed of the pseudo random number generator used to generate a
18+
uniform distribution. If int, random_state is the seed used by the
19+
random number generator; If RandomState instance, random_state is the
20+
random number generator; If None, the random number generator is the
21+
RandomState instance used by `np.random`.
22+
23+
Returns
24+
-------
25+
v0 : ndarray of shape (size,)
26+
The initialized vector.
27+
"""
28+
random_state = check_random_state(random_state)
29+
v0 = random_state.uniform(-1, 1, size)
30+
return v0

sklearn/utils/tests/test_arpack.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import pytest
2+
from numpy.testing import assert_allclose
3+
4+
from sklearn.utils import check_random_state
5+
from sklearn.utils._arpack import _init_arpack_v0
6+
7+
8+
@pytest.mark.parametrize("seed", range(100))
9+
def test_init_arpack_v0(seed):
10+
# check that the initalization a sampling from an uniform distribution
11+
# where we can fix the random state
12+
size = 1000
13+
v0 = _init_arpack_v0(size, seed)
14+
15+
rng = check_random_state(seed)
16+
assert_allclose(v0, rng.uniform(-1, 1, size=size))

0 commit comments

Comments
 (0)
0