From 27868c60c3e00701452faf1ec216ec48475b902b Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Tue, 3 Mar 2020 11:05:17 +0100 Subject: [PATCH 1/4] Implement SVD-based method in MDS class --- doc/modules/manifold.rst | 21 +++++ doc/whats_new/v0.23.rst | 10 +++ sklearn/manifold/_mds.py | 127 ++++++++++++++++++++++++++--- sklearn/manifold/tests/test_mds.py | 93 +++++++++++++++++++++ 4 files changed, 240 insertions(+), 11 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index bd69e05a59604..9d08304c9d474 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -439,6 +439,9 @@ should then correspond exactly to the distance between point :math:`i` and Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. +If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate +method of calculating results. See :ref:`multidimensional_scaling_method` for details. + Nonmetric MDS ------------- @@ -457,6 +460,19 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. :align: center :scale: 60 +.. _multidimensional_scaling_method: + +Method +------------- + +Metric :class:`MDS` offers two different algorithms (methods) to calculate +results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a +COmplicated Function) minimizes objective function (stress) in iterative +manner. The SVD-based method performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based method is +thus much faster and more accurate, but also less general - it requires metric +of :math:`S` to be Euclidean. + .. topic:: References: @@ -472,6 +488,11 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. `_ Kruskal, J. Psychometrika, 29, (1964) + * `"An Introduction to MDS" + `_ + Florian Wickelmaier, Sound Quality Research Unit, Aalborg University, Denmark (2003) + + .. _t_sne: t-distributed Stochastic Neighbor Embedding (t-SNE) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 96702dae01235..125c8f7f61f0a 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -259,6 +259,16 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +:mod:`sklearn.manifold` +........................... + +- |Feature| Support of multidimensional scaling method, + which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`. + User can choose whether to use SVD- or SMACOF-based method via parameter + `method`. The SVD-based method is faster and more accurate, but works + only for euclidean dissimilarity matrices. + :pr:`16067` by :user:`Piotr Gaiński `. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ca8c08ed69f98..d181b43ce44b3 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -6,6 +6,7 @@ # License: BSD import numpy as np +from scipy import linalg from joblib import Parallel, delayed, effective_n_jobs import warnings @@ -14,6 +15,7 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression +from sklearn.utils.validation import _check_psd_eigenvalues def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, @@ -119,7 +121,7 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, if verbose >= 2: print('it: %d, stress %s' % (it, stress)) if old_stress is not None: - if(old_stress - stress / dis) < eps: + if (old_stress - stress / dis) < eps: if verbose: print('breaking at iteration %d with stress %s' % (it, stress)) @@ -272,6 +274,71 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, return best_pos, best_stress +def svd_scaler(dissimilarities, n_components=2): + """ + Computes multidimensional scaling using SVD algorithm + + Parameters + ---------- + dissimilarities : ndarray, shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be euclidean. + n_components : int, optional, default: 2 + Number of dimension in which to immerse the dissimilarities. + + Returns + ---------- + X : ndarray, shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + + References + ---------- + "An Introduction to MDS" Florian Wickelmaier + Sound Quality Research Unit, Aalborg University, Denmark (2003) + + "Multidimensional Scaling" Chapman Hall + 2nd edition, Boca Raton (2001) + + """ + + dissimilarities = check_symmetric(dissimilarities, raise_exception=True) + + n_samples = dissimilarities.shape[0] + + # Centering matrix + H = np.eye(*dissimilarities.shape) - (1. / n_samples) * \ + np.ones(dissimilarities.shape) + + # Double centered matrix + K = -0.5 * np.dot(H, np.dot(dissimilarities ** 2, H)) + + w, V = linalg.eigh(K, check_finite=False) + + # ``dissimilarities`` is Euclidean iff ``K`` is positive semi-definite. + # For detail see "Multidimensional Scaling" Chapman Hall p 397 + try: + w = _check_psd_eigenvalues(w) + except ValueError: + raise ValueError("Dissimilarity matrix must be euclidean. " + "Make sure to pass an euclidean matrix, or use " + "dissimilarity='euclidean'.") + + # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors. + # Eigenvalues should be in descending order by convention. + w = w[:-n_components-1:-1] + V = V[:, :-n_components-1:-1] + + X = np.sqrt(w) * V + + dist = euclidean_distances(X) + stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5 + + return X, stress + + class MDS(BaseEstimator): """Multidimensional scaling @@ -285,14 +352,20 @@ class MDS(BaseEstimator): metric : boolean, optional, default: True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. + If ``method=='svd'``, metric must be set to True. + n_init : int, optional, default: 4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. + Ignored if ``method=='svd'``. + max_iter : int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run. + Ignored if ``method=='svd'``. + verbose : int, optional, default: 0 Level of verbosity. @@ -300,6 +373,8 @@ class MDS(BaseEstimator): Relative tolerance with respect to stress at which to declare convergence. + Ignored if ``method=='svd'``. + n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple initializations are used (``n_init``), each run of the algorithm is @@ -309,6 +384,8 @@ class MDS(BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. + Ignored if ``method=='svd'``. + random_state : int, RandomState instance, default=None Determines the random number generator used to initialize the centers. Pass an int for reproducible results across multiple function calls. @@ -324,6 +401,11 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. + method: {'smacof', 'svd'}, default ='smacof' + The method used for solving the MDS problem. + + .. versionadded:: 0.23 + Attributes ---------- embedding_ : array-like, shape (n_samples, n_components) @@ -333,6 +415,12 @@ class MDS(BaseEstimator): The final value of the stress (sum of squared distance of the disparities and the distances for all constrained points). + n_iter_ : int + The number of iterations of SMACOF algorithm corresponding + to the best stress. + + It is set to ``None`` if ``method=='svd'``. + Examples -------- >>> from sklearn.datasets import load_digits @@ -357,12 +445,15 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ + def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, - random_state=None, dissimilarity="euclidean"): + random_state=None, dissimilarity="euclidean", + method="smacof"): self.n_components = n_components self.dissimilarity = dissimilarity self.metric = metric + self.method = method self.n_init = n_init self.max_iter = max_iter self.eps = eps @@ -387,6 +478,7 @@ def fit(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None + Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. @@ -407,6 +499,7 @@ def fit_transform(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None + Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. @@ -423,14 +516,26 @@ def fit_transform(self, X, y=None, init=None): elif self.dissimilarity == "euclidean": self.dissimilarity_matrix_ = euclidean_distances(X) else: - raise ValueError("Proximity must be 'precomputed' or 'euclidean'." - " Got %s instead" % str(self.dissimilarity)) - - self.embedding_, self.stress_, self.n_iter_ = smacof( - self.dissimilarity_matrix_, metric=self.metric, - n_components=self.n_components, init=init, n_init=self.n_init, - n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, - eps=self.eps, random_state=self.random_state, - return_n_iter=True) + raise ValueError( + "Dissimilarity matrix must be 'precomputed' or 'euclidean'." + " Got %s instead" % str(self.dissimilarity)) + + if self.method == "smacof": + self.embedding_, self.stress_, self.n_iter_ = smacof( + self.dissimilarity_matrix_, metric=self.metric, + n_components=self.n_components, init=init, + n_init=self.n_init, n_jobs=self.n_jobs, + max_iter=self.max_iter, verbose=self.verbose, + eps=self.eps, random_state=self.random_state, + return_n_iter=True) + elif self.method == "svd": + if not self.metric: + raise ValueError("Using SVD requires metric=True") + self.embedding_, self.stress_ = svd_scaler( + self.dissimilarity_matrix_, n_components=self.n_components) + self.n_iter_ = None + else: + raise ValueError("Method must be 'smacof' or 'svd'." + " Got %s instead" % str(self.method)) return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 4349aeeefdedc..e2749d71caf9d 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -55,6 +55,73 @@ def test_smacof_error(): mds.smacof(sim, init=Z, n_init=1) +def test_svd(): + # Test svd using example data from "An Introduction to MDS" + # Florian Wickelmaier, p 11 + sim = np.array([[0, 93, 82, 133], + [93, 0, 52, 60], + [82, 52, 0, 111], + [133, 60, 111, 0]]) + + X, stress = mds.svd_scaler(sim, n_components=2) + X_true_1 = np.array([[-62.831, -32.97448], + [18.403, 12.02697], + [-24.960, 39.71091], + [69.388, -18.76340]]) + X_true_2 = np.copy(X_true_1) + X_true_2[:, 0] *= -1 + + # Signs of columns are dependent on signs of computed eigenvectors + # which are arbitrary and meaningless + assert(np.allclose(X, X_true_1) + or np.allclose(X, -X_true_1) + or np.allclose(X, X_true_2) + or np.allclose(X, -X_true_2)) + + +def test_svd_error(): + # Non symmetric (dis)similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.svd_scaler(sim) + + # Non squared (dis)similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.svd_scaler(sim) + + # Non Euclidean (dis)similarity matrix: + sim = np.array([[0, 12, 3, 4], + [12, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError, + match="Dissimilarity matrix must be euclidean"): + mds.svd_scaler(sim) + + +def test_MDS_error(): + # Bad method name + sim = np.ones((2, 2)) + mdc_clf = mds.MDS(method='bad name') + with pytest.raises(ValueError): + mdc_clf.fit(sim) + + # SVD with metric=False + sim = np.ones((2, 2)) + mdc_clf = mds.MDS(metric=False, method='svd') + with pytest.raises(ValueError): + mdc_clf.fit(sim) + + def test_MDS(): sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], @@ -62,3 +129,29 @@ def test_MDS(): [4, 2, 1, 0]]) mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed") mds_clf.fit(sim) + + +def test_MDS_svd(): + # Test svd using example data from "An Introduction to MDS" + # Florian Wickelmaier, p 11 + sim = np.array([[0, 93, 82, 133], + [93, 0, 52, 60], + [82, 52, 0, 111], + [133, 60, 111, 0]]) + + mds_clf = mds.MDS(metric=True, method="svd", dissimilarity='precomputed') + mds_clf.fit(sim) + + X_true_1 = np.array([[-62.831, -32.97448], + [18.403, 12.02697], + [-24.960, 39.71091], + [69.388, -18.76340]]) + X_true_2 = np.copy(X_true_1) + X_true_2[:, 0] *= -1 + + # Signs of columns are dependent on signs of computed eigenvectors + # which are arbitrary and meaningless + assert (np.allclose(mds_clf.embedding_, X_true_1) + or np.allclose(mds_clf.embedding_, -X_true_1) + or np.allclose(mds_clf.embedding_, X_true_2) + or np.allclose(mds_clf.embedding_, -X_true_2)) From cba27008b54f954fb9de602f61b8480002e0a547 Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Thu, 5 Mar 2020 14:22:35 +0100 Subject: [PATCH 2/4] Make requested changes --- doc/modules/manifold.rst | 26 +++++++++++++------------- sklearn/manifold/_mds.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index 9d08304c9d474..a99a76b5d15b9 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -442,6 +442,19 @@ Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate method of calculating results. See :ref:`multidimensional_scaling_method` for details. +.. _multidimensional_scaling_method: + +Method +------------- + +Metric :class:`MDS` offers two different algorithms (methods) to calculate +results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a +COmplicated Function) minimizes its objective function (stress) in an iterative +manner. The SVD-based method performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based method is +thus much faster and more accurate, but also less general - it requires metric +of :math:`S` to be Euclidean. + Nonmetric MDS ------------- @@ -460,19 +473,6 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. :align: center :scale: 60 -.. _multidimensional_scaling_method: - -Method -------------- - -Metric :class:`MDS` offers two different algorithms (methods) to calculate -results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a -COmplicated Function) minimizes objective function (stress) in iterative -manner. The SVD-based method performs series of transformations (including -Singular Value Decomposition) to give exact result. The SVD-based method is -thus much faster and more accurate, but also less general - it requires metric -of :math:`S` to be Euclidean. - .. topic:: References: diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d181b43ce44b3..ec1634e32cfed 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -401,7 +401,7 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - method: {'smacof', 'svd'}, default ='smacof' + method : {'smacof', 'svd'}, default ='smacof' The method used for solving the MDS problem. .. versionadded:: 0.23 From 03954cf586ca4ddb71b17ca14ca3a20e1857643e Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Thu, 5 Mar 2020 14:34:11 +0100 Subject: [PATCH 3/4] Change name 'method' to 'solver' --- doc/modules/manifold.rst | 14 ++++++------ doc/whats_new/v0.23.rst | 6 +++--- sklearn/manifold/_mds.py | 34 ++++++++++++++++-------------- sklearn/manifold/tests/test_mds.py | 8 +++---- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index a99a76b5d15b9..07bfb5fa510ec 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -440,18 +440,18 @@ should then correspond exactly to the distance between point :math:`i` and Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate -method of calculating results. See :ref:`multidimensional_scaling_method` for details. +solver of calculating results. See :ref:`multidimensional_scaling_solver` for details. -.. _multidimensional_scaling_method: +.. _multidimensional_scaling_solver: -Method +Solver ------------- -Metric :class:`MDS` offers two different algorithms (methods) to calculate -results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a +Metric :class:`MDS` offers two different algorithms (solvers) to calculate +results: SMACOF and SVD-based. The SMACOF solver (Scaling by MAjorizing a COmplicated Function) minimizes its objective function (stress) in an iterative -manner. The SVD-based method performs series of transformations (including -Singular Value Decomposition) to give exact result. The SVD-based method is +manner. The SVD-based solver performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based solver is thus much faster and more accurate, but also less general - it requires metric of :math:`S` to be Euclidean. diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 125c8f7f61f0a..4da94f5936a10 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -262,10 +262,10 @@ Changelog :mod:`sklearn.manifold` ........................... -- |Feature| Support of multidimensional scaling method, +- |Feature| Support of multidimensional scaling solver, which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`. - User can choose whether to use SVD- or SMACOF-based method via parameter - `method`. The SVD-based method is faster and more accurate, but works + User can choose whether to use SVD- or SMACOF-based solver via parameter + `solver`. The SVD-based solver is faster and more accurate, but works only for euclidean dissimilarity matrices. :pr:`16067` by :user:`Piotr Gaiński `. diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ec1634e32cfed..ada22ece07a3c 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -352,19 +352,19 @@ class MDS(BaseEstimator): metric : boolean, optional, default: True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. - If ``method=='svd'``, metric must be set to True. + If ``solver=='svd'``, metric must be set to True. n_init : int, optional, default: 4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. max_iter : int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. verbose : int, optional, default: 0 Level of verbosity. @@ -373,7 +373,7 @@ class MDS(BaseEstimator): Relative tolerance with respect to stress at which to declare convergence. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple @@ -384,7 +384,7 @@ class MDS(BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. random_state : int, RandomState instance, default=None Determines the random number generator used to initialize the centers. @@ -401,8 +401,8 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - method : {'smacof', 'svd'}, default ='smacof' - The method used for solving the MDS problem. + solver : {'smacof', 'svd'}, default ='smacof' + The solver used for solving the MDS problem. .. versionadded:: 0.23 @@ -419,7 +419,7 @@ class MDS(BaseEstimator): The number of iterations of SMACOF algorithm corresponding to the best stress. - It is set to ``None`` if ``method=='svd'``. + It is set to ``None`` if ``solver=='svd'``. Examples -------- @@ -449,11 +449,11 @@ class MDS(BaseEstimator): def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean", - method="smacof"): + solver="smacof"): self.n_components = n_components self.dissimilarity = dissimilarity self.metric = metric - self.method = method + self.solver = solver self.n_init = n_init self.max_iter = max_iter self.eps = eps @@ -478,10 +478,11 @@ def fit(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None - Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. + + Ignored if ``solver=='svd'``. """ self.fit_transform(X, init=init) return self @@ -499,10 +500,11 @@ def fit_transform(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None - Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. + + Ignored if ``solver=='svd'``. """ X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": @@ -520,7 +522,7 @@ def fit_transform(self, X, y=None, init=None): "Dissimilarity matrix must be 'precomputed' or 'euclidean'." " Got %s instead" % str(self.dissimilarity)) - if self.method == "smacof": + if self.solver == "smacof": self.embedding_, self.stress_, self.n_iter_ = smacof( self.dissimilarity_matrix_, metric=self.metric, n_components=self.n_components, init=init, @@ -528,14 +530,14 @@ def fit_transform(self, X, y=None, init=None): max_iter=self.max_iter, verbose=self.verbose, eps=self.eps, random_state=self.random_state, return_n_iter=True) - elif self.method == "svd": + elif self.solver == "svd": if not self.metric: raise ValueError("Using SVD requires metric=True") self.embedding_, self.stress_ = svd_scaler( self.dissimilarity_matrix_, n_components=self.n_components) self.n_iter_ = None else: - raise ValueError("Method must be 'smacof' or 'svd'." - " Got %s instead" % str(self.method)) + raise ValueError("Solver must be 'smacof' or 'svd'." + " Got %s instead" % str(self.solver)) return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index e2749d71caf9d..ce70245d4a128 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -109,15 +109,15 @@ def test_svd_error(): def test_MDS_error(): - # Bad method name + # Bad solver name sim = np.ones((2, 2)) - mdc_clf = mds.MDS(method='bad name') + mdc_clf = mds.MDS(solver='bad name') with pytest.raises(ValueError): mdc_clf.fit(sim) # SVD with metric=False sim = np.ones((2, 2)) - mdc_clf = mds.MDS(metric=False, method='svd') + mdc_clf = mds.MDS(metric=False, solver='svd') with pytest.raises(ValueError): mdc_clf.fit(sim) @@ -139,7 +139,7 @@ def test_MDS_svd(): [82, 52, 0, 111], [133, 60, 111, 0]]) - mds_clf = mds.MDS(metric=True, method="svd", dissimilarity='precomputed') + mds_clf = mds.MDS(metric=True, solver="svd", dissimilarity='precomputed') mds_clf.fit(sim) X_true_1 = np.array([[-62.831, -32.97448], From 3047960ed9b8e12da76afb5f5d3673e22ce93e51 Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Sat, 7 Mar 2020 12:22:36 +0100 Subject: [PATCH 4/4] Make another set of requested changes --- sklearn/manifold/_mds.py | 29 +++++++++++++------------- sklearn/manifold/tests/test_mds.py | 33 +++++------------------------- 2 files changed, 20 insertions(+), 42 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ada22ece07a3c..a3559fa0c7e40 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -287,7 +287,7 @@ def svd_scaler(dissimilarities, n_components=2): Returns ---------- - X : ndarray, shape (n_samples, n_components) + embedding : ndarray, shape (n_samples, n_components) Coordinates of the points in a ``n_components``-space. stress : float @@ -299,8 +299,8 @@ def svd_scaler(dissimilarities, n_components=2): "An Introduction to MDS" Florian Wickelmaier Sound Quality Research Unit, Aalborg University, Denmark (2003) - "Multidimensional Scaling" Chapman Hall - 2nd edition, Boca Raton (2001) + "Metric and Euclidean properties of dissimilarity coefficients" + J. C. Gower and P. Legendre, Journal of Classification 3, 5–48 (1986) """ @@ -309,16 +309,17 @@ def svd_scaler(dissimilarities, n_components=2): n_samples = dissimilarities.shape[0] # Centering matrix - H = np.eye(*dissimilarities.shape) - (1. / n_samples) * \ - np.ones(dissimilarities.shape) + J = np.eye(*dissimilarities.shape) - (1. / n_samples) * ( + np.ones(dissimilarities.shape)) # Double centered matrix - K = -0.5 * np.dot(H, np.dot(dissimilarities ** 2, H)) + B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J)) - w, V = linalg.eigh(K, check_finite=False) + w, V = linalg.eigh(B, check_finite=False) - # ``dissimilarities`` is Euclidean iff ``K`` is positive semi-definite. - # For detail see "Multidimensional Scaling" Chapman Hall p 397 + # ``dissimilarities`` is Euclidean iff ``B`` is positive semi-definite. + # See "Metric and Euclidean properties of dissimilarity coefficients" + # for details try: w = _check_psd_eigenvalues(w) except ValueError: @@ -328,15 +329,15 @@ def svd_scaler(dissimilarities, n_components=2): # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors. # Eigenvalues should be in descending order by convention. - w = w[:-n_components-1:-1] - V = V[:, :-n_components-1:-1] + w = w[::-1][:n_components] + V = V[:, ::-1][:, :n_components] - X = np.sqrt(w) * V + embedding = np.sqrt(w) * V - dist = euclidean_distances(X) + dist = euclidean_distances(embedding) stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5 - return X, stress + return embedding, stress class MDS(BaseEstimator): diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index ce70245d4a128..421256811164c 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -55,30 +55,6 @@ def test_smacof_error(): mds.smacof(sim, init=Z, n_init=1) -def test_svd(): - # Test svd using example data from "An Introduction to MDS" - # Florian Wickelmaier, p 11 - sim = np.array([[0, 93, 82, 133], - [93, 0, 52, 60], - [82, 52, 0, 111], - [133, 60, 111, 0]]) - - X, stress = mds.svd_scaler(sim, n_components=2) - X_true_1 = np.array([[-62.831, -32.97448], - [18.403, 12.02697], - [-24.960, 39.71091], - [69.388, -18.76340]]) - X_true_2 = np.copy(X_true_1) - X_true_2[:, 0] *= -1 - - # Signs of columns are dependent on signs of computed eigenvectors - # which are arbitrary and meaningless - assert(np.allclose(X, X_true_1) - or np.allclose(X, -X_true_1) - or np.allclose(X, X_true_2) - or np.allclose(X, -X_true_2)) - - def test_svd_error(): # Non symmetric (dis)similarity matrix: sim = np.array([[0, 5, 9, 4], @@ -86,7 +62,7 @@ def test_svd_error(): [3, 2, 0, 1], [4, 2, 1, 0]]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Array must be symmetric"): mds.svd_scaler(sim) # Non squared (dis)similarity matrix: @@ -94,7 +70,8 @@ def test_svd_error(): [5, 0, 2, 2], [4, 2, 1, 0]]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="array must be 2-dimensional and square"): mds.svd_scaler(sim) # Non Euclidean (dis)similarity matrix: @@ -112,13 +89,13 @@ def test_MDS_error(): # Bad solver name sim = np.ones((2, 2)) mdc_clf = mds.MDS(solver='bad name') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Solver must be 'smacof' or 'svd'"): mdc_clf.fit(sim) # SVD with metric=False sim = np.ones((2, 2)) mdc_clf = mds.MDS(metric=False, solver='svd') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Using SVD requires metric=True"): mdc_clf.fit(sim)