ENH Change the default `n_init` and `eps` for MDS (#31117) · scikit-learn/scikit-learn@31439d2 · GitHub
Commit 31439d2

dkobak, ogrisel, and antoinebaker authored

ENH Change the default `n_init` and `eps` for MDS (#31117)

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: antoinebaker <antoinebaker@users.noreply.github.com>

1 parent 0173b91 · commit 31439d2

File tree: 8 files changed, +178 −56 lines
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+:class:`manifold.MDS` will switch to use `n_init=1` by default,
+starting from version 1.9.
+By :user:`Dmitry Kobak <dkobak>`
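In the meantime, passing `n_init` explicitly opts in to the new behaviour and avoids the FutureWarning. A minimal sketch mirroring the updated estimator docstring example:

```python
from sklearn.datasets import load_digits
from sklearn.manifold import MDS

X, _ = load_digits(return_X_y=True)

# n_init=1 is the value that becomes the default in 1.9; setting it
# explicitly keeps the code silent across versions.
embedding = MDS(n_components=2, n_init=1)
X_2d = embedding.fit_transform(X[:100])
print(X_2d.shape)  # (100, 2)
```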
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+:class:`manifold.MDS` now uses `eps=1e-6` by default and the convergence
+criterion was adjusted to make sense for both metric and non-metric MDS
+and to follow the reference R implementation. The formula for normalized
+stress was adjusted to follow the original definition by Kruskal.
+By :user:`Dmitry Kobak <dkobak>`
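For reference, the quantities involved, written out from the implementation in this commit ($d_{ij}$: pairwise distances in the embedding, $\hat d_{ij}$: disparities; the notation is ours):

```latex
% Raw stress, accumulating each pair once:
\sigma \;=\; \tfrac{1}{2} \sum_{i \neq j} \bigl(d_{ij} - \hat{d}_{ij}\bigr)^{2}

% Kruskal's Stress-1, returned when normalized_stress is enabled:
\mathrm{Stress\text{-}1} \;=\;
  \sqrt{\frac{\sum_{i \neq j} \bigl(d_{ij} - \hat{d}_{ij}\bigr)^{2}}
             {\sum_{i \neq j} d_{ij}^{2}}}

% Convergence: stop when the stress decrease, normalized by the sum of
% squared embedding distances, drops below eps (default 1e-6):
\frac{\sigma_{\mathrm{old}} - \sigma_{\mathrm{new}}}
     {\tfrac{1}{2}\sum_{i \neq j} d_{ij}^{2}} \;<\; \varepsilon
```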

examples/manifold/plot_compare_methods.py

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ def add_2d_scatter(ax, points, points_color, title=None):
 md_scaling = manifold.MDS(
     n_components=n_components,
     max_iter=50,
-    n_init=4,
+    n_init=1,
     random_state=0,
     normalized_stress=False,
 )

examples/manifold/plot_lle_digits.py

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ def plot_embedding(X, title):
     "LTSA LLE embedding": LocallyLinearEmbedding(
         n_neighbors=n_neighbors, n_components=2, method="ltsa"
     ),
-    "MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, n_jobs=2),
+    "MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, eps=1e-6),
     "Random Trees embedding": make_pipeline(
         RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
         TruncatedSVD(n_components=2),

examples/manifold/plot_mds.py

Lines changed: 26 additions & 5 deletions
@@ -5,14 +5,17 @@
 
 An illustration of the metric and non-metric MDS on generated noisy data.
 
-The reconstructed points using the metric MDS and non metric MDS are slightly
-shifted to avoid overlapping.
-
 """
 
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
+# %%
+# Dataset preparation
+# -------------------
+#
+# We start by uniformly generating 20 points in a 2D space.
+
 import numpy as np
 from matplotlib import pyplot as plt
 from matplotlib.collections import LineCollection
@@ -31,6 +34,11 @@
 # Center the data
 X_true -= X_true.mean()
 
+# %%
+# Now we compute pairwise distances between all points and add
+# a small amount of noise to the distance matrix. We make sure
+# to keep the noisy distance matrix symmetric.
+
 # Compute pairwise Euclidean distances
 distances = euclidean_distances(X_true)
 
@@ -40,10 +48,14 @@
 np.fill_diagonal(noise, 0)
 distances += noise
 
+# %%
+# Here we compute metric and non-metric MDS of the noisy distance matrix.
+
 mds = manifold.MDS(
     n_components=2,
     max_iter=3000,
     eps=1e-9,
+    n_init=1,
     random_state=42,
     dissimilarity="precomputed",
     n_jobs=1,
@@ -62,10 +74,16 @@
 )
 X_nmds = nmds.fit_transform(distances)
 
-# Rescale the data
-X_mds *= np.sqrt((X_true**2).sum()) / np.sqrt((X_mds**2).sum())
+# %%
+# Rescale the non-metric MDS solution to match the spread of the original data.
+
 X_nmds *= np.sqrt((X_true**2).sum()) / np.sqrt((X_nmds**2).sum())
 
+# %%
+# To make the visual comparison easier, we rotate the original data and both MDS
+# solutions to their PCA axes, and flip the horizontal and vertical MDS axes, if
+# needed, to match the original data orientation.
+
 # Rotate the data
 pca = PCA(n_components=2)
 X_true = pca.fit_transform(X_true)
@@ -79,6 +97,9 @@
     if np.corrcoef(X_nmds[:, i], X_true[:, i])[0, 1] < 0:
         X_nmds[:, i] *= -1
 
+# %%
+# Finally, we plot the original data and both MDS reconstructions.
+
 fig = plt.figure(1)
 ax = plt.axes([0.0, 0.0, 1.0, 1.0])

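The alignment trick described in the example comments above can be read as a small standalone helper. This is a sketch distilled from the example code in this diff; the function name `align_to_reference` is ours:

```python
import numpy as np
from sklearn.decomposition import PCA


def align_to_reference(X_ref, X_emb):
    """Rotate both point clouds onto their PCA axes, then flip any axis of
    the embedding that is anti-correlated with the corresponding reference
    axis, so the two solutions can be overlaid in the same plot."""
    X_ref = PCA(n_components=2).fit_transform(X_ref)
    X_emb = PCA(n_components=2).fit_transform(X_emb)
    for i in range(2):
        # Flip an axis when it points the opposite way to the reference axis.
        if np.corrcoef(X_emb[:, i], X_ref[:, i])[0, 1] < 0:
            X_emb[:, i] *= -1
    return X_ref, X_emb
```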
sklearn/manifold/_mds.py

Lines changed: 76 additions & 40 deletions
@@ -27,7 +27,7 @@ def _smacof_single(
     init=None,
     max_iter=300,
     verbose=0,
-    eps=1e-3,
+    eps=1e-6,
     random_state=None,
     normalized_stress=False,
 ):
@@ -59,18 +59,21 @@ def _smacof_single(
     verbose : int, default=0
         Level of verbosity.
 
-    eps : float, default=1e-3
-        Relative tolerance with respect to stress at which to declare
-        convergence. The value of `eps` should be tuned separately depending
-        on whether or not `normalized_stress` is being used.
+    eps : float, default=1e-6
+        The tolerance with respect to stress (normalized by the sum of squared
+        embedding distances) at which to declare convergence.
+
+        .. versionchanged:: 1.7
+            The default value for `eps` has changed from 1e-3 to 1e-6, as a result
+            of a bugfix in the computation of the convergence criterion.
 
     random_state : int, RandomState instance or None, default=None
         Determines the random number generator used to initialize the centers.
         Pass an int for reproducible results across multiple function calls.
         See :term:`Glossary <random_state>`.
 
     normalized_stress : bool, default=False
-        Whether use and return normalized stress value (Stress-1) instead of raw
+        Whether to return normalized stress value (Stress-1) instead of raw
         stress.
 
         .. versionadded:: 1.2
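For orientation, here is a minimal NumPy sketch of the convergence quantity described above and of Kruskal's Stress-1 as the reworked `_smacof_single` computes them below; the helper names are ours, not part of the scikit-learn API:

```python
import numpy as np
from sklearn.metrics import euclidean_distances


def kruskal_stress1(X, disparities):
    """Kruskal's Stress-1 of embedding X with respect to the disparities."""
    distances = euclidean_distances(X)
    raw_stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2
    sum_squared_distances = (distances.ravel() ** 2).sum()
    return np.sqrt(raw_stress / (sum_squared_distances / 2))


def has_converged(old_stress, new_stress, distances, eps=1e-6):
    """Stopping rule: the stress decrease, normalized by the sum of squared
    embedding distances, falls below eps."""
    sum_squared_distances = (distances.ravel() ** 2).sum()
    return (old_stress - new_stress) / (sum_squared_distances / 2) < eps
```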
@@ -168,29 +171,32 @@ def _smacof_single(
         # Compute stress
         distances = euclidean_distances(X)
         stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2
-        if normalized_stress:
-            stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2))
 
-        normalization = np.sqrt((X**2).sum(axis=1)).sum()
         if verbose >= 2:  # pragma: no cover
             print(f"Iteration {it}, stress {stress:.4f}")
         if old_stress is not None:
-            if (old_stress - stress / normalization) < eps:
+            sum_squared_distances = (distances.ravel() ** 2).sum()
+            if ((old_stress - stress) / (sum_squared_distances / 2)) < eps:
                 if verbose:  # pragma: no cover
                     print("Convergence criterion reached.")
                 break
-        old_stress = stress / normalization
+        old_stress = stress
+
+    if normalized_stress:
+        sum_squared_distances = (distances.ravel() ** 2).sum()
+        stress = np.sqrt(stress / (sum_squared_distances / 2))
 
     return X, stress, it + 1
 
 
+# TODO(1.9): change default `n_init` to 1, see PR #31117
 @validate_params(
     {
         "dissimilarities": ["array-like"],
         "metric": ["boolean"],
         "n_components": [Interval(Integral, 1, None, closed="left")],
         "init": ["array-like", None],
-        "n_init": [Interval(Integral, 1, None, closed="left")],
+        "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})],
         "n_jobs": [Integral, None],
         "max_iter": [Interval(Integral, 1, None, closed="left")],
         "verbose": ["verbose"],
@@ -207,11 +213,11 @@ def smacof(
     metric=True,
     n_components=2,
     init=None,
-    n_init=8,
+    n_init="warn",
     n_jobs=None,
     max_iter=300,
     verbose=0,
-    eps=1e-3,
+    eps=1e-6,
     random_state=None,
     return_n_iter=False,
     normalized_stress="auto",
@@ -262,6 +268,9 @@
         determined by the run with the smallest final stress. If ``init`` is
         provided, this option is overridden and a single run is performed.
 
+        .. versionchanged:: 1.9
+            The default value for `n_init` will change from 8 to 1 in version 1.9.
+
     n_jobs : int, default=None
         The number of jobs to use for the computation. If multiple
         initializations are used (``n_init``), each run of the algorithm is
@@ -277,10 +286,13 @@
     verbose : int, default=0
         Level of verbosity.
 
-    eps : float, default=1e-3
-        Relative tolerance with respect to stress at which to declare
-        convergence. The value of `eps` should be tuned separately depending
-        on whether or not `normalized_stress` is being used.
+    eps : float, default=1e-6
+        The tolerance with respect to stress (normalized by the sum of squared
+        embedding distances) at which to declare convergence.
+
+        .. versionchanged:: 1.7
+            The default value for `eps` has changed from 1e-3 to 1e-6, as a result
+            of a bugfix in the computation of the convergence criterion.
 
     random_state : int, RandomState instance or None, default=None
         Determines the random number generator used to initialize the centers.
@@ -290,7 +302,7 @@
     return_n_iter : bool, default=False
         Whether or not to return the number of iterations.
 
-    normalized_stress : bool or "auto" default="auto"
+    normalized_stress : bool or "auto", default="auto"
         Whether to return normalized stress value (Stress-1) instead of raw
         stress. By default, metric MDS returns raw stress while non-metric MDS
         returns normalized stress.
@@ -335,17 +347,24 @@ def smacof(
     >>> import numpy as np
     >>> from sklearn.manifold import smacof
     >>> from sklearn.metrics import euclidean_distances
-    >>> X = np.array([[0, 1, 2], [1, 0, 3],[2, 3, 0]])
+    >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
     >>> dissimilarities = euclidean_distances(X)
-    >>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42)
-    >>> np.round(mds_result, 5)
-    array([[ 0.05352, -1.07253],
-           [ 1.74231, -0.75675],
-           [-1.79583,  1.82928]])
-    >>> np.round(stress, 5).item()
-    0.00128
+    >>> Z, stress = smacof(
+    ...     dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42
+    ... )
+    >>> Z.shape
+    (3, 2)
+    >>> np.round(stress, 6).item()
+    3.2e-05
     """
 
+    if n_init == "warn":
+        warnings.warn(
+            "The default value of `n_init` will change from 8 to 1 in 1.9.",
+            FutureWarning,
+        )
+        n_init = 8
+
     dissimilarities = check_array(dissimilarities)
     random_state = check_random_state(random_state)

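A usage sketch of the transition behaviour added above, assuming a scikit-learn build that includes this commit: leaving `n_init` unset keeps the old behaviour but triggers the FutureWarning, while passing it explicitly does not.

```python
import warnings

import numpy as np
from sklearn.manifold import smacof
from sklearn.metrics import euclidean_distances

dissimilarities = euclidean_distances(np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]]))

# Leaving n_init at its "warn" sentinel keeps the old behaviour (8 inits)
# but emits the FutureWarning introduced above.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    smacof(dissimilarities, n_components=2, random_state=42)
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Passing n_init explicitly opts in to the future default and silences the warning.
Z, stress = smacof(dissimilarities, n_components=2, n_init=1, random_state=42)
```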
@@ -408,6 +427,7 @@ def smacof(
         return best_pos, best_stress
 
 
+# TODO(1.9): change default `n_init` to 1, see PR #31117
 class MDS(BaseEstimator):
     """Multidimensional scaling.
 
@@ -428,16 +448,22 @@ class MDS(BaseEstimator):
         initializations. The final results will be the best output of the runs,
         determined by the run with the smallest final stress.
 
+        .. versionchanged:: 1.9
+            The default value for `n_init` will change from 4 to 1 in version 1.9.
+
     max_iter : int, default=300
         Maximum number of iterations of the SMACOF algorithm for a single run.
 
     verbose : int, default=0
         Level of verbosity.
 
-    eps : float, default=1e-3
-        Relative tolerance with respect to stress at which to declare
-        convergence. The value of `eps` should be tuned separately depending
-        on whether or not `normalized_stress` is being used.
+    eps : float, default=1e-6
+        The tolerance with respect to stress (normalized by the sum of squared
+        embedding distances) at which to declare convergence.
+
+        .. versionchanged:: 1.7
+            The default value for `eps` has changed from 1e-3 to 1e-6, as a result
+            of a bugfix in the computation of the convergence criterion.
 
     n_jobs : int, default=None
         The number of jobs to use for the computation. If multiple
@@ -464,9 +490,9 @@ class MDS(BaseEstimator):
         ``fit_transform``.
 
     normalized_stress : bool or "auto" default="auto"
-        Whether use and return normalized stress value (Stress-1) instead of raw
-        stress. By default, metric MDS uses raw stress while non-metric MDS uses
-        normalized stress.
+        Whether to return normalized stress value (Stress-1) instead of raw
+        stress. By default, metric MDS returns raw stress while non-metric MDS
+        returns normalized stress.
 
         .. versionadded:: 1.2
 
@@ -539,7 +565,7 @@ class MDS(BaseEstimator):
     >>> X, _ = load_digits(return_X_y=True)
     >>> X.shape
     (1797, 64)
-    >>> embedding = MDS(n_components=2, normalized_stress='auto')
+    >>> embedding = MDS(n_components=2, n_init=1)
     >>> X_transformed = embedding.fit_transform(X[:100])
     >>> X_transformed.shape
     (100, 2)
@@ -554,7 +580,7 @@ class MDS(BaseEstimator):
     _parameter_constraints: dict = {
         "n_components": [Interval(Integral, 1, None, closed="left")],
         "metric": ["boolean"],
-        "n_init": [Interval(Integral, 1, None, closed="left")],
+        "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})],
         "max_iter": [Interval(Integral, 1, None, closed="left")],
         "verbose": ["verbose"],
         "eps": [Interval(Real, 0.0, None, closed="left")],
@@ -569,10 +595,10 @@ def __init__(
         n_components=2,
         *,
         metric=True,
-        n_init=4,
+        n_init="warn",
         max_iter=300,
         verbose=0,
-        eps=1e-3,
+        eps=1e-6,
         n_jobs=None,
         random_state=None,
         dissimilarity="euclidean",
@@ -646,10 +672,20 @@ def fit_transform(self, X, y=None, init=None):
         X_new : ndarray of shape (n_samples, n_components)
             X transformed in the new space.
         """
+
+        if self.n_init == "warn":
+            warnings.warn(
+                "The default value of `n_init` will change from 4 to 1 in 1.9.",
+                FutureWarning,
+            )
+            self._n_init = 4
+        else:
+            self._n_init = self.n_init
+
         X = validate_data(self, X)
         if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
             warnings.warn(
-                "The MDS API has changed. ``fit`` now constructs an"
+                "The MDS API has changed. ``fit`` now constructs a"
                 " dissimilarity matrix from data. To use a custom "
                 "dissimilarity matrix, set "
                 "``dissimilarity='precomputed'``."
@@ -665,7 +701,7 @@ def fit_transform(self, X, y=None, init=None):
             metric=self.metric,
             n_components=self.n_components,
             init=init,
-            n_init=self.n_init,
+            n_init=self._n_init,
             n_jobs=self.n_jobs,
             max_iter=self.max_iter,
             verbose=self.verbose,

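To close the loop at the estimator level, a short usage sketch under the assumption of a build that includes this commit, mirroring the precomputed-dissimilarity path used in plot_mds.py:

```python
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

rng = np.random.default_rng(0)
X = rng.uniform(size=(20, 2))
dissimilarities = euclidean_distances(X)

# eps now defaults to 1e-6; n_init is passed explicitly to avoid the
# FutureWarning about its default changing to 1 in version 1.9.
mds = MDS(
    n_components=2,
    n_init=1,
    dissimilarity="precomputed",
    random_state=42,
)
embedding = mds.fit_transform(dissimilarities)
print(embedding.shape)  # (20, 2)
print(mds.stress_)      # final (raw) stress of the best run
```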
0 commit comments