6
6
# License: BSD
7
7
8
8
import numpy as np
9
+ from scipy import linalg
9
10
from joblib import Parallel , delayed , effective_n_jobs
10
11
11
12
import warnings
14
15
from ..metrics import euclidean_distances
15
16
from ..utils import check_random_state , check_array , check_symmetric
16
17
from ..isotonic import IsotonicRegression
18
+ from sklearn .utils .validation import _check_psd_eigenvalues
17
19
18
20
19
21
def _smacof_single (dissimilarities , metric = True , n_components = 2 , init = None ,
@@ -119,7 +121,7 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
119
121
if verbose >= 2 :
120
122
print ('it: %d, stress %s' % (it , stress ))
121
123
if old_stress is not None :
122
- if (old_stress - stress / dis ) < eps :
124
+ if (old_stress - stress / dis ) < eps :
123
125
if verbose :
124
126
print ('breaking at iteration %d with stress %s' % (it ,
125
127
stress ))
@@ -272,6 +274,71 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8,
272
274
return best_pos , best_stress
273
275
274
276
277
+ def svd_scaler (dissimilarities , n_components = 2 ):
278
+ """
279
+ Computes multidimensional scaling using SVD algorithm
280
+
281
+ Parameters
282
+ ----------
283
+ dissimilarities : ndarray, shape (n_samples, n_samples)
284
+ Pairwise dissimilarities between the points. Must be euclidean.
285
+ n_components : int, optional, default: 2
286
+ Number of dimension in which to immerse the dissimilarities.
287
+
288
+ Returns
289
+ ----------
290
+ X : ndarray, shape (n_samples, n_components)
291
+ Coordinates of the points in a ``n_components``-space.
292
+
293
+ stress : float
294
+ The final value of the stress (sum of squared distance of the
295
+ disparities and the distances for all constrained points).
296
+
297
+ References
298
+ ----------
299
+ "An Introduction to MDS" Florian Wickelmaier
300
+ Sound Quality Research Unit, Aalborg University, Denmark (2003)
301
+
302
+ "Multidimensional Scaling" Chapman Hall
303
+ 2nd edition, Boca Raton (2001)
304
+
305
+ """
306
+
307
+ dissimilarities = check_symmetric (dissimilarities , raise_exception = True )
308
+
309
+ n_samples = dissimilarities .shape [0 ]
310
+
311
+ # Centering matrix
312
+ H = np .eye (* dissimilarities .shape ) - (1. / n_samples ) * \
313
+ np .ones (dissimilarities .shape )
314
+
315
+ # Double centered matrix
316
+ K = - 0.5 * np .dot (H , np .dot (dissimilarities ** 2 , H ))
317
+
318
+ w , V = linalg .eigh (K , check_finite = False )
319
+
320
+ # ``dissimilarities`` is Euclidean iff ``K`` is positive semi-definite.
321
+ # For detail see "Multidimensional Scaling" Chapman Hall p 397
322
+ try :
323
+ w = _check_psd_eigenvalues (w )
324
+ except ValueError :
325
+ raise ValueError ("Dissimilarity matrix must be euclidean. "
326
+ "Make sure to pass an euclidean matrix, or use "
327
+ "dissimilarity='euclidean'." )
328
+
329
+ # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors.
330
+ # Eigenvalues should be in descending order by convention.
331
+ w = w [:- n_components - 1 :- 1 ]
332
+ V = V [:, :- n_components - 1 :- 1 ]
333
+
334
+ X = np .sqrt (w ) * V
335
+
336
+ dist = euclidean_distances (X )
337
+ stress = ((dissimilarities .ravel () - dist .ravel ()) ** 2 ).sum () * 0.5
338
+
339
+ return X , stress
340
+
341
+
275
342
class MDS (BaseEstimator ):
276
343
"""Multidimensional scaling
277
344
@@ -285,21 +352,29 @@ class MDS(BaseEstimator):
285
352
metric : boolean, optional, default: True
286
353
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
287
354
355
+ If ``method=='svd'``, metric must be set to True.
356
+
288
357
n_init : int, optional, default: 4
289
358
Number of times the SMACOF algorithm will be run with different
290
359
initializations. The final results will be the best output of the runs,
291
360
determined by the run with the smallest final stress.
292
361
362
+ Ignored if ``method=='svd'``.
363
+
293
364
max_iter : int, optional, default: 300
294
365
Maximum number of iterations of the SMACOF algorithm for a single run.
295
366
367
+ Ignored if ``method=='svd'``.
368
+
296
369
verbose : int, optional, default: 0
297
370
Level of verbosity.
298
371
299
372
eps : float, optional, default: 1e-3
300
373
Relative tolerance with respect to stress at which to declare
301
374
convergence.
302
375
376
+ Ignored if ``method=='svd'``.
377
+
303
378
n_jobs : int or None, optional (default=None)
304
379
The number of jobs to use for the computation. If multiple
305
380
initializations are used (``n_init``), each run of the algorithm is
@@ -309,6 +384,8 @@ class MDS(BaseEstimator):
309
384
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
310
385
for more details.
311
386
387
+ Ignored if ``method=='svd'``.
388
+
312
389
random_state : int, RandomState instance, default=None
313
390
Determines the random number generator used to initialize the centers.
314
391
Pass an int for reproducible results across multiple function calls.
@@ -324,6 +401,11 @@ class MDS(BaseEstimator):
324
401
Pre-computed dissimilarities are passed directly to ``fit`` and
325
402
``fit_transform``.
326
403
404
+ method: {'smacof', 'svd'}, default ='smacof'
405
+ The method used for solving the MDS problem.
406
+
407
+ .. versionadded:: 0.23
408
+
327
409
Attributes
328
410
----------
329
411
embedding_ : array-like, shape (n_samples, n_components)
@@ -333,6 +415,12 @@ class MDS(BaseEstimator):
333
415
The final value of the stress (sum of squared distance of the
334
416
disparities and the distances for all constrained points).
335
417
418
+ n_iter_ : int
419
+ The number of iterations of SMACOF algorithm corresponding
420
+ to the best stress.
421
+
422
+ It is set to ``None`` if ``method=='svd'``.
423
+
336
424
Examples
337
425
--------
338
426
>>> from sklearn.datasets import load_digits
@@ -357,12 +445,15 @@ class MDS(BaseEstimator):
357
445
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
358
446
359
447
"""
448
+
360
449
def __init__ (self , n_components = 2 , metric = True , n_init = 4 ,
361
450
max_iter = 300 , verbose = 0 , eps = 1e-3 , n_jobs = None ,
362
- random_state = None , dissimilarity = "euclidean" ):
451
+ random_state = None , dissimilarity = "euclidean" ,
452
+ method = "smacof" ):
363
453
self .n_components = n_components
364
454
self .dissimilarity = dissimilarity
365
455
self .metric = metric
456
+ self .method = method
366
457
self .n_init = n_init
367
458
self .max_iter = max_iter
368
459
self .eps = eps
@@ -387,6 +478,7 @@ def fit(self, X, y=None, init=None):
387
478
y : Ignored
388
479
389
480
init : ndarray, shape (n_samples,), optional, default: None
481
+ Ignored if ``method=='svd'``.
390
482
Starting configuration of the embedding to initialize the SMACOF
391
483
algorithm. By default, the algorithm is initialized with a randomly
392
484
chosen array.
@@ -407,6 +499,7 @@ def fit_transform(self, X, y=None, init=None):
407
499
y : Ignored
408
500
409
501
init : ndarray, shape (n_samples,), optional, default: None
502
+ Ignored if ``method=='svd'``.
410
503
Starting configuration of the embedding to initialize the SMACOF
411
504
algorithm. By default, the algorithm is initialized with a randomly
412
505
chosen array.
@@ -423,14 +516,26 @@ def fit_transform(self, X, y=None, init=None):
423
516
elif self .dissimilarity == "euclidean" :
424
517
self .dissimilarity_matrix_ = euclidean_distances (X )
425
518
else :
426
- raise ValueError ("Proximity must be 'precomputed' or 'euclidean'."
427
- " Got %s instead" % str (self .dissimilarity ))
428
-
429
- self .embedding_ , self .stress_ , self .n_iter_ = smacof (
430
- self .dissimilarity_matrix_ , metric = self .metric ,
431
- n_components = self .n_components , init = init , n_init = self .n_init ,
432
- n_jobs = self .n_jobs , max_iter = self .max_iter , verbose = self .verbose ,
433
- eps = self .eps , random_state = self .random_state ,
434
- return_n_iter = True )
519
+ raise ValueError (
520
+ "Dissimilarity matrix must be 'precomputed' or 'euclidean'."
521
+ " Got %s instead" % str (self .dissimilarity ))
522
+
523
+ if self .method == "smacof" :
524
+ self .embedding_ , self .stress_ , self .n_iter_ = smacof (
525
+ self .dissimilarity_matrix_ , metric = self .metric ,
526
+ n_components = self .n_components , init = init ,
527
+ n_init = self .n_init , n_jobs = self .n_jobs ,
528
+ max_iter = self .max_iter , verbose = self .verbose ,
529
+ eps = self .eps , random_state = self .random_state ,
530
+ return_n_iter = True )
531
+ elif self .method == "svd" :
532
+ if not self .metric :
533
+ raise ValueError ("Using SVD requires metric=True" )
534
+ self .embedding_ , self .stress_ = svd_scaler (
535
+ self .dissimilarity_matrix_ , n_components = self .n_components )
<
10000
td data-grid-cell-id="diff-a2d46f243976aa28653d89f341032c83e311199cc659e84d7731ec626a4bb2ef-434-536-0" data-selected="false" role="gridcell" style="background-color:var(--diffBlob-additionNum-bgColor, var(--diffBlob-addition-bgColor-num));text-align:center" tabindex="-1" valign="top" class="focusable-grid-cell diff-line-number position-relative left-side">
536
+ self .n_iter_ = None
537
+ else :
538
+ raise ValueError ("Method must be 'smacof' or 'svd'."
539
+ " Got %s instead" % str (self .method ))
435
540
436
541
return self .embedding_
0 commit comments