diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index bd69e05a59604..07bfb5fa510ec 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -439,6 +439,22 @@ should then correspond exactly to the distance between point :math:`i` and Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. +If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate +solver of calculating results. See :ref:`multidimensional_scaling_solver` for details. + +.. _multidimensional_scaling_solver: + +Solver +------------- + +Metric :class:`MDS` offers two different algorithms (solvers) to calculate +results: SMACOF and SVD-based. The SMACOF solver (Scaling by MAjorizing a +COmplicated Function) minimizes its objective function (stress) in an iterative +manner. The SVD-based solver performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based solver is +thus much faster and more accurate, but also less general - it requires metric +of :math:`S` to be Euclidean. + Nonmetric MDS ------------- @@ -472,6 +488,11 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. `_ Kruskal, J. Psychometrika, 29, (1964) + * `"An Introduction to MDS" + `_ + Florian Wickelmaier, Sound Quality Research Unit, Aalborg University, Denmark (2003) + + .. _t_sne: t-distributed Stochastic Neighbor Embedding (t-SNE) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 96702dae01235..4da94f5936a10 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -259,6 +259,16 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +:mod:`sklearn.manifold` +........................... + +- |Feature| Support of multidimensional scaling solver, + which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`. + User can choose whether to use SVD- or SMACOF-based solver via parameter + `solver`. The SVD-based solver is faster and more accurate, but works + only for euclidean dissimilarity matrices. + :pr:`16067` by :user:`Piotr Gaiński `. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ca8c08ed69f98..a3559fa0c7e40 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -6,6 +6,7 @@ # License: BSD import numpy as np +from scipy import linalg from joblib import Parallel, delayed, effective_n_jobs import warnings @@ -14,6 +15,7 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression +from sklearn.utils.validation import _check_psd_eigenvalues def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, @@ -119,7 +121,7 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, if verbose >= 2: print('it: %d, stress %s' % (it, stress)) if old_stress is not None: - if(old_stress - stress / dis) < eps: + if (old_stress - stress / dis) < eps: if verbose: print('breaking at iteration %d with stress %s' % (it, stress)) @@ -272,6 +274,72 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, return best_pos, best_stress +def svd_scaler(dissimilarities, n_components=2): + """ + Computes multidimensional scaling using SVD algorithm + + Parameters + ---------- + dissimilarities : ndarray, shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be euclidean. + n_components : int, optional, default: 2 + Number of dimension in which to immerse the dissimilarities. + + Returns + ---------- + embedding : ndarray, shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + + References + ---------- + "An Introduction to MDS" Florian Wickelmaier + Sound Quality Research Unit, Aalborg University, Denmark (2003) + + "Metric and Euclidean properties of dissimilarity coefficients" + J. C. Gower and P. Legendre, Journal of Classification 3, 5–48 (1986) + + """ + + dissimilarities = check_symmetric(dissimilarities, raise_exception=True) + + n_samples = dissimilarities.shape[0] + + # Centering matrix + J = np.eye(*dissimilarities.shape) - (1. / n_samples) * ( + np.ones(dissimilarities.shape)) + + # Double centered matrix + B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J)) + + w, V = linalg.eigh(B, check_finite=False) + + # ``dissimilarities`` is Euclidean iff ``B`` is positive semi-definite. + # See "Metric and Euclidean properties of dissimilarity coefficients" + # for details + try: + w = _check_psd_eigenvalues(w) + except ValueError: + raise ValueError("Dissimilarity matrix must be euclidean. " + "Make sure to pass an euclidean matrix, or use " + "dissimilarity='euclidean'.") + + # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors. + # Eigenvalues should be in descending order by convention. + w = w[::-1][:n_components] + V = V[:, ::-1][:, :n_components] + + embedding = np.sqrt(w) * V + + dist = euclidean_distances(embedding) + stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5 + + return embedding, stress + + class MDS(BaseEstimator): """Multidimensional scaling @@ -285,14 +353,20 @@ class MDS(BaseEstimator): metric : boolean, optional, default: True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. + If ``solver=='svd'``, metric must be set to True. + n_init : int, optional, default: 4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. + Ignored if ``solver=='svd'``. + max_iter : int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run. + Ignored if ``solver=='svd'``. + verbose : int, optional, default: 0 Level of verbosity. @@ -300,6 +374,8 @@ class MDS(BaseEstimator): Relative tolerance with respect to stress at which to declare convergence. + Ignored if ``solver=='svd'``. + n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple initializations are used (``n_init``), each run of the algorithm is @@ -309,6 +385,8 @@ class MDS(BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. + Ignored if ``solver=='svd'``. + random_state : int, RandomState instance, default=None Determines the random number generator used to initialize the centers. Pass an int for reproducible results across multiple function calls. @@ -324,6 +402,11 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. + solver : {'smacof', 'svd'}, default ='smacof' + The solver used for solving the MDS problem. + + .. versionadded:: 0.23 + Attributes ---------- embedding_ : array-like, shape (n_samples, n_components) @@ -333,6 +416,12 @@ class MDS(BaseEstimator): The final value of the stress (sum of squared distance of the disparities and the distances for all constrained points). + n_iter_ : int + The number of iterations of SMACOF algorithm corresponding + to the best stress. + + It is set to ``None`` if ``solver=='svd'``. + Examples -------- >>> from sklearn.datasets import load_digits @@ -357,12 +446,15 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ + def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, - random_state=None, dissimilarity="euclidean"): + random_state=None, dissimilarity="euclidean", + solver="smacof"): self.n_components = n_components self.dissimilarity = dissimilarity self.metric = metric + self.solver = solver self.n_init = n_init self.max_iter = max_iter self.eps = eps @@ -390,6 +482,8 @@ def fit(self, X, y=None, init=None): Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. + + Ignored if ``solver=='svd'``. """ self.fit_transform(X, init=init) return self @@ -410,6 +504,8 @@ def fit_transform(self, X, y=None, init=None): Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. + + Ignored if ``solver=='svd'``. """ X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": @@ -423,14 +519,26 @@ def fit_transform(self, X, y=None, init=None): elif self.dissimilarity == "euclidean": self.dissimilarity_matrix_ = euclidean_distances(X) else: - raise ValueError("Proximity must be 'precomputed' or 'euclidean'." - " Got %s instead" % str(self.dissimilarity)) - - self.embedding_, self.stress_, self.n_iter_ = smacof( - self.dissimilarity_matrix_, metric=self.metric, - n_components=self.n_components, init=init, n_init=self.n_init, - n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, - eps=self.eps, random_state=self.random_state, - return_n_iter=True) + raise ValueError( + "Dissimilarity matrix must be 'precomputed' or 'euclidean'." + " Got %s instead" % str(self.dissimilarity)) + + if self.solver == "smacof": + self.embedding_, self.stress_, self.n_iter_ = smacof( + self.dissimilarity_matrix_, metric=self.metric, + n_components=self.n_components, init=init, + n_init=self.n_init, n_jobs=self.n_jobs, + max_iter=self.max_iter, verbose=self.verbose, + eps=self.eps, random_state=self.random_state, + return_n_iter=True) + elif self.solver == "svd": + if not self.metric: + raise ValueError("Using SVD requires metric=True") + self.embedding_, self.stress_ = svd_scaler( + self.dissimilarity_matrix_, n_components=self.n_components) + self.n_iter_ = None + else: + raise ValueError("Solver must be 'smacof' or 'svd'." + " Got %s instead" % str(self.solver)) return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 4349aeeefdedc..421256811164c 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -55,6 +55,50 @@ def test_smacof_error(): mds.smacof(sim, init=Z, n_init=1) +def test_svd_error(): + # Non symmetric (dis)similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError, match="Array must be symmetric"): + mds.svd_scaler(sim) + + # Non squared (dis)similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError, + match="array must be 2-dimensional and square"): + mds.svd_scaler(sim) + + # Non Euclidean (dis)similarity matrix: + sim = np.array([[0, 12, 3, 4], + [12, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError, + match="Dissimilarity matrix must be euclidean"): + mds.svd_scaler(sim) + + +def test_MDS_error(): + # Bad solver name + sim = np.ones((2, 2)) + mdc_clf = mds.MDS(solver='bad name') + with pytest.raises(ValueError, match="Solver must be 'smacof' or 'svd'"): + mdc_clf.fit(sim) + + # SVD with metric=False + sim = np.ones((2, 2)) + mdc_clf = mds.MDS(metric=False, solver='svd') + with pytest.raises(ValueError, match="Using SVD requires metric=True"): + mdc_clf.fit(sim) + + def test_MDS(): sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], @@ -62,3 +106,29 @@ def test_MDS(): [4, 2, 1, 0]]) mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed") mds_clf.fit(sim) + + +def test_MDS_svd(): + # Test svd using example data from "An Introduction to MDS" + # Florian Wickelmaier, p 11 + sim = np.array([[0, 93, 82, 133], + [93, 0, 52, 60], + [82, 52, 0, 111], + [133, 60, 111, 0]]) + + mds_clf = mds.MDS(metric=True, solver="svd", dissimilarity='precomputed') + mds_clf.fit(sim) + + X_true_1 = np.array([[-62.831, -32.97448], + [18.403, 12.02697], + [-24.960, 39.71091], + [69.388, -18.76340]]) + X_true_2 = np.copy(X_true_1) + X_true_2[:, 0] *= -1 + + # Signs of columns are dependent on signs of computed eigenvectors + # which are arbitrary and meaningless + assert (np.allclose(mds_clf.embedding_, X_true_1) + or np.allclose(mds_clf.embedding_, -X_true_1) + or np.allclose(mds_clf.embedding_, X_true_2) + or np.allclose(mds_clf.embedding_, -X_true_2))