Implement SVD-based method in MDS class · scikit-learn/scikit-learn@a081cf5 · GitHub
Commit a081cf5

Pan Jan committed

Implement SVD-based method in MDS class

1 parent 72b3041 commit a081cf5

File tree

  doc/modules/manifold.rst
  doc/whats_new/v0.23.rst
  sklearn/manifold/_mds.py
  sklearn/manifold/tests/test_mds.py

4 files changed: +240 -11 lines changed

doc/modules/manifold.rst

Lines changed: 21 additions & 0 deletions
@@ -439,6 +439,9 @@ should then correspond exactly to the distance between point :math:`i` and
 Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`.

+If the metric of :math:`S` is Euclidean, the user can choose a faster and more accurate
+method of computing the results. See :ref:`multidimensional_scaling_method` for details.
+
 Nonmetric MDS
 -------------

@@ -457,6 +460,19 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized.
    :align: center
    :scale: 60

+.. _multidimensional_scaling_method:
+
+Method
+------
+
+Metric :class:`MDS` offers two different algorithms (methods) to compute the
+results: SMACOF and an SVD-based one. The SMACOF method (Scaling by MAjorizing
+a COmplicated Function) minimizes the objective function (stress) in an
+iterative manner. The SVD-based method performs a series of transformations
+(including a Singular Value Decomposition) to give the exact result. It is
+thus much faster and more accurate, but also less general: it requires the
+metric of :math:`S` to be Euclidean.
+

 .. topic:: References:

@@ -472,6 +488,11 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized.
     <https://link.springer.com/article/10.1007%2FBF02289565>`_
     Kruskal, J. Psychometrika, 29, (1964)

+  * `"An Introduction to MDS"
+    <https://www.researchgate.net/publication/228775338_An_introduction_to_MDS>`_
+    Florian Wickelmaier, Sound Quality Research Unit, Aalborg University, Denmark (2003)
+
 .. _t_sne:

 t-distributed Stochastic Neighbor Embedding (t-SNE)
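
Before moving on to the next file: the ``method`` option documented in the manifold guide above can be exercised with a short script. This is only a sketch, not part of the commit; it assumes a scikit-learn build that contains this change, and the random data and checks are purely illustrative of the claim that the SVD-based method reproduces Euclidean input exactly while SMACOF approximates it iteratively.

import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

rng = np.random.RandomState(42)
X = rng.rand(100, 2)                  # data whose metric is Euclidean

mds_svd = MDS(n_components=2, method="svd")
emb_svd = mds_svd.fit_transform(X)

mds_smacof = MDS(n_components=2, method="smacof", random_state=0)
emb_smacof = mds_smacof.fit_transform(X)

# The SVD-based solution reproduces the original distances (up to rounding);
# SMACOF approximates them iteratively.
print(np.allclose(euclidean_distances(emb_svd), euclidean_distances(X)))
print(mds_svd.stress_, mds_smacof.stress_)

The printed stresses give a quick feel for the relative accuracy of the two methods on this data.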

doc/whats_new/v0.23.rst

Lines changed: 10 additions & 0 deletions
@@ -259,6 +259,16 @@ Changelog
   of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`.
   :pr:`16266` by :user:`Rushabh Vasani <rushabh-v>`.

+:mod:`sklearn.manifold`
+.......................
+
+- |Feature| Added support for a multidimensional scaling method based on
+  Singular Value Decomposition (SVD) in :class:`manifold.MDS`. The user can
+  choose between the SVD- and SMACOF-based methods via the parameter
+  `method`. The SVD-based method is faster and more accurate, but works
+  only for Euclidean dissimilarity matrices.
+  :pr:`16067` by :user:`Piotr Gaiński <panpiort8>`.
+
 :mod:`sklearn.metrics`
 ......................

sklearn/manifold/_mds.py

Lines changed: 116 additions & 11 deletions
@@ -6,6 +6,7 @@
 # License: BSD

 import numpy as np
+from scipy import linalg
 from joblib import Parallel, delayed, effective_n_jobs

 import warnings
@@ -14,6 +15,7 @@
 from ..metrics import euclidean_distances
 from ..utils import check_random_state, check_array, check_symmetric
 from ..isotonic import IsotonicRegression
+from ..utils.validation import _check_psd_eigenvalues


 def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
@@ -119,7 +121,7 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
         if verbose >= 2:
             print('it: %d, stress %s' % (it, stress))
         if old_stress is not None:
-            if(old_stress - stress / dis) < eps:
+            if (old_stress - stress / dis) < eps:
                 if verbose:
                     print('breaking at iteration %d with stress %s' % (it,
                                                                        stress))
@@ -272,6 +274,71 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8,
     return best_pos, best_stress


+def svd_scaler(dissimilarities, n_components=2):
+    """
+    Computes multidimensional scaling using the SVD algorithm.
+
+    Parameters
+    ----------
+    dissimilarities : ndarray, shape (n_samples, n_samples)
+        Pairwise dissimilarities between the points. Must be Euclidean.
+    n_components : int, optional, default: 2
+        Number of dimensions in which to immerse the dissimilarities.
+
+    Returns
+    -------
+    X : ndarray, shape (n_samples, n_components)
+        Coordinates of the points in a ``n_components``-space.
+
+    stress : float
+        The final value of the stress (sum of squared distance of the
+        disparities and the distances for all constrained points).
+
+    References
+    ----------
+    "An Introduction to MDS" Florian Wickelmaier
+    Sound Quality Research Unit, Aalborg University, Denmark (2003)
+
+    "Multidimensional Scaling" Chapman & Hall
+    2nd edition, Boca Raton (2001)
+
+    """
+
+    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
+
+    n_samples = dissimilarities.shape[0]
+
+    # Centering matrix
+    H = np.eye(*dissimilarities.shape) - (1. / n_samples) * \
+        np.ones(dissimilarities.shape)
+
+    # Double centered matrix
+    K = -0.5 * np.dot(H, np.dot(dissimilarities ** 2, H))
+
+    w, V = linalg.eigh(K, check_finite=False)
+
+    # ``dissimilarities`` is Euclidean iff ``K`` is positive semi-definite.
+    # For details see "Multidimensional Scaling" Chapman & Hall p. 397
+    try:
+        w = _check_psd_eigenvalues(w)
+    except ValueError:
+        raise ValueError("Dissimilarity matrix must be euclidean. "
+                         "Make sure to pass an euclidean matrix, or use "
+                         "dissimilarity='euclidean'.")
+
+    # Get the ``n_components`` greatest eigenvalues and corresponding
+    # eigenvectors. Eigenvalues should be in descending order by convention.
+    w = w[:-n_components - 1:-1]
+    V = V[:, :-n_components - 1:-1]
+
+    X = np.sqrt(w) * V
+
+    dist = euclidean_distances(X)
+    stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5
+
+    return X, stress
+
+
 class MDS(BaseEstimator):
     """Multidimensional scaling
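
A note on the Euclidean check in ``svd_scaler`` above: ``dissimilarities`` is Euclidean iff the double-centered matrix ``K`` is positive semi-definite, because for Euclidean distances ``K`` equals the Gram matrix of the centered coordinates. A small self-contained NumPy sketch of that identity (illustrative only, not part of the diff; the data is made up):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(6, 3)                                   # arbitrary points
D = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1))

n = D.shape[0]
H = np.eye(n) - np.ones((n, n)) / n                  # centering matrix, as in svd_scaler
K = -0.5 * H @ (D ** 2) @ H                          # double-centered matrix

Xc = X - X.mean(axis=0)
print(np.allclose(K, Xc @ Xc.T))                     # True: K is the centered Gram matrix
print(np.linalg.eigvalsh(K).min() >= -1e-10)         # True: hence positive semi-definite

For a non-Euclidean dissimilarity matrix, some eigenvalues of ``K`` become significantly negative, which is exactly what ``_check_psd_eigenvalues`` rejects above.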
@@ -285,21 +352,29 @@ class MDS(BaseEstimator):
     metric : boolean, optional, default: True
         If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.

+        If ``method=='svd'``, ``metric`` must be set to ``True``.
+
     n_init : int, optional, default: 4
         Number of times the SMACOF algorithm will be run with different
         initializations. The final results will be the best output of the runs,
         determined by the run with the smallest final stress.

+        Ignored if ``method=='svd'``.
+
     max_iter : int, optional, default: 300
         Maximum number of iterations of the SMACOF algorithm for a single run.

+        Ignored if ``method=='svd'``.
+
     verbose : int, optional, default: 0
         Level of verbosity.

     eps : float, optional, default: 1e-3
         Relative tolerance with respect to stress at which to declare
         convergence.

+        Ignored if ``method=='svd'``.
+
     n_jobs : int or None, optional (default=None)
         The number of jobs to use for the computation. If multiple
         initializations are used (``n_init``), each run of the algorithm is
@@ -309,6 +384,8 @@ class MDS(BaseEstimator):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.

+        Ignored if ``method=='svd'``.
+
     random_state : int, RandomState instance, default=None
         Determines the random number generator used to initialize the centers.
         Pass an int for reproducible results across multiple function calls.
@@ -324,6 +401,11 @@ class MDS(BaseEstimator):
         Pre-computed dissimilarities are passed directly to ``fit`` and
         ``fit_transform``.

+    method : {'smacof', 'svd'}, default='smacof'
+        The method used for solving the MDS problem.
+
+        .. versionadded:: 0.23
+
     Attributes
     ----------
     embedding_ : array-like, shape (n_samples, n_components)
@@ -333,6 +415,12 @@ class MDS(BaseEstimator):
         The final value of the stress (sum of squared distance of the
         disparities and the distances for all constrained points).

+    n_iter_ : int
+        The number of iterations of the SMACOF algorithm corresponding
+        to the best stress.
+
+        It is set to ``None`` if ``method=='svd'``.
+
     Examples
     --------
     >>> from sklearn.datasets import load_digits
@@ -357,12 +445,15 @@ class MDS(BaseEstimator):
     hypothesis" Kruskal, J. Psychometrika, 29, (1964)

     """
+
     def __init__(self, n_components=2, metric=True, n_init=4,
                  max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
-                 random_state=None, dissimilarity="euclidean"):
+                 random_state=None, dissimilarity="euclidean",
+                 method="smacof"):
         self.n_components = n_components
         self.dissimilarity = dissimilarity
         self.metric = metric
+        self.method = method
         self.n_init = n_init
         self.max_iter = max_iter
         self.eps = eps
@@ -387,6 +478,7 @@ def fit(self, X, y=None, init=None):
         y : Ignored

         init : ndarray, shape (n_samples,), optional, default: None
+            Ignored if ``method=='svd'``.
             Starting configuration of the embedding to initialize the SMACOF
             algorithm. By default, the algorithm is initialized with a randomly
             chosen array.
@@ -407,6 +499,7 @@ def fit_transform(self, X, y=None, init=None):
         y : Ignored

         init : ndarray, shape (n_samples,), optional, default: None
+            Ignored if ``method=='svd'``.
             Starting configuration of the embedding to initialize the SMACOF
             algorithm. By default, the algorithm is initialized with a randomly
             chosen array.
@@ -423,14 +516,26 @@ def fit_transform(self, X, y=None, init=None):
         elif self.dissimilarity == "euclidean":
             self.dissimilarity_matrix_ = euclidean_distances(X)
         else:
-            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
-                             " Got %s instead" % str(self.dissimilarity))
-
-        self.embedding_, self.stress_, self.n_iter_ = smacof(
-            self.dissimilarity_matrix_, metric=self.metric,
-            n_components=self.n_components, init=init, n_init=self.n_init,
-            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
-            eps=self.eps, random_state=self.random_state,
-            return_n_iter=True)
+            raise ValueError(
+                "Dissimilarity matrix must be 'precomputed' or 'euclidean'."
+                " Got %s instead" % str(self.dissimilarity))
+
+        if self.method == "smacof":
+            self.embedding_, self.stress_, self.n_iter_ = smacof(
+                self.dissimilarity_matrix_, metric=self.metric,
+                n_components=self.n_components, init=init,
+                n_init=self.n_init, n_jobs=self.n_jobs,
+                max_iter=self.max_iter, verbose=self.verbose,
+                eps=self.eps, random_state=self.random_state,
+                return_n_iter=True)
+        elif self.method == "svd":
+            if not self.metric:
+                raise ValueError("Using SVD requires metric=True")
+            self.embedding_, self.stress_ = svd_scaler(
+                self.dissimilarity_matrix_, n_components=self.n_components)
+            self.n_iter_ = None
+        else:
+            raise ValueError("Method must be 'smacof' or 'svd'."
+                             " Got %s instead" % str(self.method))

         return self.embedding_
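
To see the new dispatch in ``fit_transform`` end to end, the snippet below is a hypothetical quick check rather than anything shipped with the commit; it assumes a build that includes this change.

import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

# Precomputed Euclidean dissimilarities from random 3-D points.
D = euclidean_distances(np.random.RandomState(1).rand(10, 3))

est = MDS(n_components=3, method="svd", dissimilarity="precomputed").fit(D)
print(est.stress_)   # close to zero: the Euclidean input is recovered exactly
print(est.n_iter_)   # None: SMACOF was never run

# method='svd' is only defined for metric MDS, so this raises a ValueError.
try:
    MDS(method="svd", metric=False).fit(np.random.rand(10, 3))
except ValueError as exc:
    print(exc)       # "Using SVD requires metric=True"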

sklearn/manifold/tests/test_mds.py

Lines changed: 93 additions & 0 deletions
@@ -55,10 +55,103 @@ def test_smacof_error():
     mds.smacof(sim, init=Z, n_init=1)


+def test_svd():
+    # Test svd_scaler using example data from "An Introduction to MDS"
+    # Florian Wickelmaier, p. 11
+    sim = np.array([[0, 93, 82, 133],
+                    [93, 0, 52, 60],
+                    [82, 52, 0, 111],
+                    [133, 60, 111, 0]])
+
+    X, stress = mds.svd_scaler(sim, n_components=2)
+    X_true_1 = np.array([[-62.831, -32.97448],
+                         [18.403, 12.02697],
+                         [-24.960, 39.71091],
+                         [69.388, -18.76340]])
+    X_true_2 = np.copy(X_true_1)
+    X_true_2[:, 0] *= -1
+
+    # Signs of columns depend on the signs of the computed eigenvectors,
+    # which are arbitrary and meaningless
+    assert (np.allclose(X, X_true_1)
+            or np.allclose(X, -X_true_1)
+            or np.allclose(X, X_true_2)
+            or np.allclose(X, -X_true_2))
+
+
+def test_svd_error():
+    # Non-symmetric (dis)similarity matrix:
+    sim = np.array([[0, 5, 9, 4],
+                    [5, 0, 2, 2],
+                    [3, 2, 0, 1],
+                    [4, 2, 1, 0]])
+
+    with pytest.raises(ValueError):
+        mds.svd_scaler(sim)
+
+    # Non-square (dis)similarity matrix:
+    sim = np.array([[0, 5, 9, 4],
+                    [5, 0, 2, 2],
+                    [4, 2, 1, 0]])
+
+    with pytest.raises(ValueError):
+        mds.svd_scaler(sim)
+
+    # Non-Euclidean (dis)similarity matrix:
+    sim = np.array([[0, 12, 3, 4],
+                    [12, 0, 2, 2],
+                    [3, 2, 0, 1],
+                    [4, 2, 1, 0]])
+
+    with pytest.raises(ValueError,
+                       match="Dissimilarity matrix must be euclidean"):
+        mds.svd_scaler(sim)
+
+
+def test_MDS_error():
+    # Bad method name
+    sim = np.ones((2, 2))
+    mdc_clf = mds.MDS(method='bad name')
+    with pytest.raises(ValueError):
+        mdc_clf.fit(sim)
+
+    # SVD with metric=False
+    sim = np.ones((2, 2))
+    mdc_clf = mds.MDS(metric=False, method='svd')
+    with pytest.raises(ValueError):
+        mdc_clf.fit(sim)
+
+
 def test_MDS():
     sim = np.array([[0, 5, 3, 4],
                     [5, 0, 2, 2],
                     [3, 2, 0, 1],
                     [4, 2, 1, 0]])
     mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
     mds_clf.fit(sim)
+
+
+def test_MDS_svd():
+    # Test MDS with method='svd' using example data from "An Introduction
+    # to MDS", Florian Wickelmaier, p. 11
+    sim = np.array([[0, 93, 82, 133],
+                    [93, 0, 52, 60],
+                    [82, 52, 0, 111],
+                    [133, 60, 111, 0]])
+
+    mds_clf = mds.MDS(metric=True, method="svd", dissimilarity='precomputed')
+    mds_clf.fit(sim)
+
+    X_true_1 = np.array([[-62.831, -32.97448],
+                         [18.403, 12.02697],
+                         [-24.960, 39.71091],
+                         [69.388, -18.76340]])
+    X_true_2 = np.copy(X_true_1)
+    X_true_2[:, 0] *= -1
+
+    # Signs of columns depend on the signs of the computed eigenvectors,
+    # which are arbitrary and meaningless
+    assert (np.allclose(mds_clf.embedding_, X_true_1)
+            or np.allclose(mds_clf.embedding_, -X_true_1)
+            or np.allclose(mds_clf.embedding_, X_true_2)
+            or np.allclose(mds_clf.embedding_, -X_true_2))
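
The four-way ``np.allclose`` comparison above exists because classical scaling fixes the axes only up to a per-column sign flip of the eigenvectors. A sign-insensitive way to state the same check, sketched here for illustration only (it imports the private module under the same ``mds`` alias the test file is assumed to use), is to compare the pairwise distances reconstructed from the two embeddings, which are unaffected by sign flips:

import numpy as np
from sklearn.manifold import _mds as mds   # assumed alias, mirroring test_mds.py
from sklearn.metrics import euclidean_distances

sim = np.array([[0, 93, 82, 133],
                [93, 0, 52, 60],
                [82, 52, 0, 111],
                [133, 60, 111, 0]])
X, _ = mds.svd_scaler(sim, n_components=2)

X_true = np.array([[-62.831, -32.97448],
                   [18.403, 12.02697],
                   [-24.960, 39.71091],
                   [69.388, -18.76340]])

# Distances between embedded points are invariant to per-column sign flips,
# so no enumeration of sign combinations is needed.
assert np.allclose(euclidean_distances(X), euclidean_distances(X_true),
                   atol=1e-2)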
