scikit-learn · panpiort8 · Mar 3, 2020 · Mar 5, 2020 · Mar 5, 2020 · Mar 7, 2020
diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
@@ -439,6 +439,22 @@ should then correspond exactly to the distance between point :math:`i` and
 
 Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`.
 
+If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate
+solver of calculating results. See :ref:`multidimensional_scaling_solver` for details.
+
+.. _multidimensional_scaling_solver:
+
+Solver
+-------------
+
+Metric :class:`MDS` offers two different algorithms (solvers) to calculate
+results: SMACOF and SVD-based. The SMACOF solver (Scaling by MAjorizing a
+COmplicated Function) minimizes its objective function (stress) in an iterative
+manner. The SVD-based solver performs series of transformations (including
+Singular Value Decomposition) to give exact result. The SVD-based solver is
+thus much faster and more accurate, but also less general - it requires metric
+of :math:`S` to be Euclidean.
+
 Nonmetric MDS
 -------------
 
@@ -472,6 +488,11 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized.
     <https://link.springer.com/article/10.1007%2FBF02289565>`_
     Kruskal, J. Psychometrika, 29, (1964)
 
+  * `"An Introduction to MDS"
+    <https://www.researchgate.net/publication/228775338_An_introduction_to_MDS>`_
+    Florian Wickelmaier, Sound Quality Research Unit, Aalborg University, Denmark (2003)
+
+
 .. _t_sne:
 
 t-distributed Stochastic Neighbor Embedding (t-SNE)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
@@ -259,6 +259,16 @@ Changelog
   of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`.
   :pr:`16266` by :user:`Rushabh Vasani <rushabh-v>`.
 
+:mod:`sklearn.manifold`
+...........................
+
+- |Feature| Support of multidimensional scaling solver,
+  which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`.
+  User can choose whether to use SVD- or SMACOF-based solver via parameter
+  `solver`. The SVD-based solver is faster and more accurate, but works
+  only for euclidean dissimilarity matrices.
+  :pr:`16067` by :user:`Piotr Gaiński <panpiort8>`.
+
 :mod:`sklearn.metrics`
 ......................
 

diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py
@@ -6,6 +6,7 @@
 # License: BSD
 
 import numpy as np
+from scipy import linalg
 from joblib import Parallel, delayed, effective_n_jobs
 
 import warnings
@@ -14,6 +15,7 @@
 from ..metrics import euclidean_distances
 from ..utils import check_random_state, check_array, check_symmetric
 from ..isotonic import IsotonicRegression
+from sklearn.utils.validation import _check_psd_eigenvalues
 
 
 def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
@@ -119,7 +121,7 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
         if verbose >= 2:
             print('it: %d, stress %s' % (it, stress))
         if old_stress is not None:
-            if(old_stress - stress / dis) < eps:
+            if (old_stress - stress / dis) < eps:
                 if verbose:
                     print('breaking at iteration %d with stress %s' % (it,
                                                                        stress))
@@ -272,6 +274,72 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8,
         return best_pos, best_stress
 
 
+def svd_scaler(dissimilarities, n_components=2):
+    """
+    Computes multidimensional scaling using SVD algorithm
+
+    Parameters
+    ----------
+    dissimilarities : ndarray, shape (n_samples, n_samples)
+        Pairwise dissimilarities between the points. Must be euclidean.
+    n_components : int, optional, default: 2
+        Number of dimension in which to immerse the dissimilarities.
+
+    Returns
+    ----------
+    embedding : ndarray, shape (n_samples, n_components)
+        Coordinates of the points in a ``n_components``-space.
+
+    stress : float
+        The final value of the stress (sum of squared distance of the
+        disparities and the distances for all constrained points).
+
+    References
+    ----------
+    "An Introduction to MDS" Florian Wickelmaier
+    Sound Quality Research Unit, Aalborg University, Denmark (2003)
+
+    "Metric and Euclidean properties of dissimilarity coefficients"
+    J. C. Gower and P. Legendre, Journal of Classification 3, 5–48 (1986)
+
+    """
+
+    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
+
+    n_samples = dissimilarities.shape[0]
+
+    # Centering matrix
+    J = np.eye(*dissimilarities.shape) - (1. / n_samples) * (
+        np.ones(dissimilarities.shape))
+
+    # Double centered matrix
+    B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J))
-    B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J))
+    B = -0.5 * np.dot(J, np.dot(dissimilarities, J))
-    B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J))
+    B = -0.5 * np.dot(J, np.dot(dissimilarities, J))
+
+    w, V = linalg.eigh(B, check_finite=False)
+
+    # ``dissimilarities`` is Euclidean iff ``B`` is positive semi-definite.
+    # See "Metric and Euclidean properties of dissimilarity coefficients"
+    # for details
+    try:
+        w = _check_psd_eigenvalues(w)
+    except ValueError:
+        raise ValueError("Dissimilarity matrix must be euclidean. "
+                         "Make sure to pass an euclidean matrix, or use "
+                         "dissimilarity='euclidean'.")
+
+    # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors.
-    # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors.
+    # Get ``n_components`` greatest eigenvalues and corresponding eigenvectors.
-    # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors.
+    # Get ``n_components`` greatest eigenvalues and corresponding eigenvectors.
+    # Eigenvalues should be in descending order by convention.
+    w = w[::-1][:n_components]
+    V = V[:, ::-1][:, :n_components]
+
+    embedding = np.sqrt(w) * V
+
+    dist = euclidean_distances(embedding)
+    stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5
+
+    return embedding, stress
+
+
 class MDS(BaseEstimator):
     """Multidimensional scaling
 
@@ -285,21 +353,29 @@ class MDS(BaseEstimator):
     metric : boolean, optional, default: True
         If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
 
+        If  ``solver=='svd'``, metric must be set to True.
+
     n_init : int, optional, default: 4
         Number of times the SMACOF algorithm will be run with different
         initializations. The final results will be the best output of the runs,
         determined by the run with the smallest final stress.
 
+        Ignored if  ``solver=='svd'``.
+
     max_iter : int, optional, default: 300
         Maximum number of iterations of the SMACOF algorithm for a single run.
 
+        Ignored if  ``solver=='svd'``.
+
     verbose : int, optional, default: 0
         Level of verbosity.
 
     eps : float, optional, default: 1e-3
         Relative tolerance with respect to stress at which to declare
         convergence.
 
+        Ignored if  ``solver=='svd'``.
+
     n_jobs : int or None, optional (default=None)
         The number of jobs to use for the computation. If multiple
         initializations are used (``n_init``), each run of the algorithm is
@@ -309,6 +385,8 @@ class MDS(BaseEstimator):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
+        Ignored if  ``solver=='svd'``.
+
     random_state : int, RandomState instance, default=None
         Determines the random number generator used to initialize the centers.
         Pass an int for reproducible results across multiple function calls.
@@ -324,6 +402,11 @@ class MDS(BaseEstimator):
             Pre-computed dissimilarities are passed directly to ``fit`` and
             ``fit_transform``.
 
+    solver : {'smacof', 'svd'}, default ='smacof'
+        The solver used for solving the MDS problem.
+
+        .. versionadded:: 0.23
+
     Attributes
     ----------
     embedding_ : array-like, shape (n_samples, n_components)
@@ -333,6 +416,12 @@ class MDS(BaseEstimator):
         The final value of the stress (sum of squared distance of the
         disparities and the distances for all constrained points).
 
+    n_iter_ : int
+        The number of iterations of SMACOF algorithm corresponding
+        to the best stress.
+
+        It is set to ``None`` if ``solver=='svd'``.
+
     Examples
     --------
     >>> from sklearn.datasets import load_digits
@@ -357,12 +446,15 @@ class MDS(BaseEstimator):
     hypothesis" Kruskal, J. Psychometrika, 29, (1964)
 
     """
+
     def __init__(self, n_components=2, metric=True, n_init=4,
                  max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
-                 random_state=None, dissimilarity="euclidean"):
+                 random_state=None, dissimilarity="euclidean",
+                 solver="smacof"):
         self.n_components = n_components
         self.dissimilarity = dissimilarity
         self.metric = metric
+        self.solver = solver
         self.n_init = n_init
         self.max_iter = max_iter
         self.eps = eps
@@ -390,6 +482,8 @@ def fit(self, X, y=None, init=None):
             Starting configuration of the embedding to initialize the SMACOF
             algorithm. By default, the algorithm is initialized with a randomly
             chosen array.
+
+            Ignored if  ``solver=='svd'``.
         """
         self.fit_transform(X, init=init)
         return self
@@ -410,6 +504,8 @@ def fit_transform(self, X, y=None, init=None):
             Starting configuration of the embedding to initialize the SMACOF
             algorithm. By default, the algorithm is initialized with a randomly
             chosen array.
+
+            Ignored if  ``solver=='svd'``.
         """
         X = self._validate_data(X)
         if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
@@ -423,14 +519,26 @@ def fit_transform(self, X, y=None, init=None):
         elif self.dissimilarity == "euclidean":
             self.dissimilarity_matrix_ = euclidean_distances(X)
         else:
-            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
-                             " Got %s instead" % str(self.dissimilarity))
-
-        self.embedding_, self.stress_, self.n_iter_ = smacof(
-            self.dissimilarity_matrix_, metric=self.metric,
-            n_components=self.n_components, init=init, n_init=self.n_init,
-            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
-            eps=self.eps, random_state=self.random_state,
-            return_n_iter=True)
+            raise ValueError(
+                "Dissimilarity matrix must be 'precomputed' or 'euclidean'."
+                " Got %s instead" % str(self.dissimilarity))
+
+        if self.solver == "smacof":
+            self.embedding_, self.stress_, self.n_iter_ = smacof(
+                self.dissimilarity_matrix_, metric=self.metric,
+                n_components=self.n_components, init=init,
+                n_init=self.n_init, n_jobs=self.n_jobs,
+                max_iter=self.max_iter, verbose=self.verbose,
+                eps=self.eps, random_state=self.random_state,
+                return_n_iter=True)
+        elif self.solver == "svd":
+            if not self.metric:
+                raise ValueError("Using SVD requires metric=True")
+            self.embedding_, self.stress_ = svd_scaler(
+                self.dissimilarity_matrix_, n_components=self.n_components)
+            self.n_iter_ = None
+        else:
+            raise ValueError("Solver must be 'smacof' or 'svd'."
+                             " Got %s instead" % str(self.solver))
 
         return self.embedding_
diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py
@@ -55,10 +55,80 @@ def test_smacof_error():
         mds.smacof(sim, init=Z, n_init=1)
 
 
+def test_svd_error():
+    # Non symmetric (dis)similarity matrix:
+    sim = np.array([[0, 5, 9, 4],
+                    [5, 0, 2, 2],
+                    [3, 2, 0, 1],
+                    [4, 2, 1, 0]])
+
+    with pytest.raises(ValueError, match="Array must be symmetric"):
+        mds.svd_scaler(sim)
+
+    # Non squared (dis)similarity matrix:
+    sim = np.array([[0, 5, 9, 4],
+                    [5, 0, 2, 2],
+                    [4, 2, 1, 0]])
+
+    with pytest.raises(ValueError,
+                       match="array must be 2-dimensional and square"):
+        mds.svd_scaler(sim)
+
+    # Non Euclidean (dis)similarity matrix:
+    sim = np.array([[0, 12, 3, 4],
+                    [12, 0, 2, 2],
+                    [3, 2, 0, 1],
+                    [4, 2, 1, 0]])
+
+    with pytest.raises(ValueError,
+                       match="Dissimilarity matrix must be euclidean"):
+        mds.svd_scaler(sim)
+
+
+def test_MDS_error():
+    # Bad solver name
+    sim = np.ones((2, 2))
+    mdc_clf = mds.MDS(solver='bad name')
+    with pytest.raises(ValueError, match="Solver must be 'smacof' or 'svd'"):
+        mdc_clf.fit(sim)
+
+    # SVD with metric=False
+    sim = np.ones((2, 2))
+    mdc_clf = mds.MDS(metric=False, solver='svd')
+    with pytest.raises(ValueError, match="Using SVD requires metric=True"):
+        mdc_clf.fit(sim)
+
+
 def test_MDS():
     sim = np.array([[0, 5, 3, 4],
                     [5, 0, 2, 2],
                     [3, 2, 0, 1],
                     [4, 2, 1, 0]])
     mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
     mds_clf.fit(sim)
+
+
+def test_MDS_svd():
+    # Test svd using example data from "An Introduction to MDS"
+    # Florian Wickelmaier, p 11
+    sim = np.array([[0, 93, 82, 133],
+                    [93, 0, 52, 60],
+                    [82, 52, 0, 111],
+                    [133, 60, 111, 0]])
+
+    mds_clf = mds.MDS(metric=True, solver="svd", dissimilarity='precomputed')
+    mds_clf.fit(sim)
+
+    X_true_1 = np.array([[-62.831, -32.97448],
+                         [18.403, 12.02697],
+                         [-24.960, 39.71091],
+                         [69.388, -18.76340]])
+    X_true_2 = np.copy(X_true_1)
+    X_true_2[:, 0] *= -1
+
+    # Signs of columns are dependent on signs of computed eigenvectors
+    # which are arbitrary and meaningless
+    assert (np.allclose(mds_clf.embedding_, X_true_1)
+            or np.allclose(mds_clf.embedding_, -X_true_1)
+            or np.allclose(mds_clf.embedding_, X_true_2)
+            or np.allclose(mds_clf.embedding_, -X_true_2))