From 27868c60c3e00701452faf1ec216ec48475b902b Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Tue, 3 Mar 2020 11:05:17 +0100 Subject: [PATCH 01/27] Implement SVD-based method in MDS class --- doc/modules/manifold.rst | 21 +++++ doc/whats_new/v0.23.rst | 10 +++ sklearn/manifold/_mds.py | 127 ++++++++++++++++++++++++++--- sklearn/manifold/tests/test_mds.py | 93 +++++++++++++++++++++ 4 files changed, 240 insertions(+), 11 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index bd69e05a59604..9d08304c9d474 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -439,6 +439,9 @@ should then correspond exactly to the distance between point :math:`i` and Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. +If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate +method of calculating results. See :ref:`multidimensional_scaling_method` for details. + Nonmetric MDS ------------- @@ -457,6 +460,19 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. :align: center :scale: 60 +.. _multidimensional_scaling_method: + +Method +------------- + +Metric :class:`MDS` offers two different algorithms (methods) to calculate +results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a +COmplicated Function) minimizes objective function (stress) in iterative +manner. The SVD-based method performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based method is +thus much faster and more accurate, but also less general - it requires metric +of :math:`S` to be Euclidean. + .. topic:: References: @@ -472,6 +488,11 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. `_ Kruskal, J. Psychometrika, 29, (1964) + * `"An Introduction to MDS" + `_ + Florian Wickelmaier, Sound Quality Research Unit, Aalborg University, Denmark (2003) + + .. _t_sne: t-distributed Stochastic Neighbor Embedding (t-SNE) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 96702dae01235..125c8f7f61f0a 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -259,6 +259,16 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +:mod:`sklearn.manifold` +........................... + +- |Feature| Support of multidimensional scaling method, + which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`. + User can choose whether to use SVD- or SMACOF-based method via parameter + `method`. The SVD-based method is faster and more accurate, but works + only for euclidean dissimilarity matrices. + :pr:`16067` by :user:`Piotr Gaiński `. + :mod:`sklearn.metrics` ...................... 
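As a minimal sketch of what the ``svd_scaler`` introduced below computes — classical (Torgerson) scaling: square the dissimilarities, double-center them, eigendecompose, and keep the leading eigenpairs — the same result can be reproduced with plain NumPy/SciPy. The ``classical_mds`` helper and the toy data here are illustrative assumptions, not the PR's implementation::

    import numpy as np
    from scipy import linalg

    def classical_mds(dissimilarities, n_components=2):
        # Classical (Torgerson) scaling: double-center the squared
        # dissimilarities and eigendecompose the resulting Gram matrix.
        D = np.asarray(dissimilarities, dtype=float)
        n = D.shape[0]
        J = np.eye(n) - np.ones((n, n)) / n          # centering matrix
        B = -0.5 * J @ (D ** 2) @ J                  # double-centered matrix
        w, V = linalg.eigh(B)                        # eigenvalues in ascending order
        idx = np.argsort(w)[::-1][:n_components]     # keep the largest eigenpairs
        w, V = w[idx], V[:, idx]
        w = np.clip(w, 0, None)                      # guard against tiny negative noise
        return V * np.sqrt(w)

    # Toy input whose pairwise distances are Euclidean by construction.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(5, 2))
    D = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))
    emb = classical_mds(D, n_components=2)
    D_hat = np.sqrt(((emb[:, None, :] - emb[None, :, :]) ** 2).sum(-1))
    print(np.allclose(D, D_hat))                     # True: the distances are recovered exactly

When the input matrix is not Euclidean, the double-centered matrix acquires clearly negative eigenvalues; that is the property the patch checks via ``_check_psd_eigenvalues`` before raising the "Dissimilarity matrix must be euclidean" error.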
diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ca8c08ed69f98..d181b43ce44b3 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -6,6 +6,7 @@ # License: BSD import numpy as np +from scipy import linalg from joblib import Parallel, delayed, effective_n_jobs import warnings @@ -14,6 +15,7 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression +from sklearn.utils.validation import _check_psd_eigenvalues def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, @@ -119,7 +121,7 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, if verbose >= 2: print('it: %d, stress %s' % (it, stress)) if old_stress is not None: - if(old_stress - stress / dis) < eps: + if (old_stress - stress / dis) < eps: if verbose: print('breaking at iteration %d with stress %s' % (it, stress)) @@ -272,6 +274,71 @@ def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, return best_pos, best_stress +def svd_scaler(dissimilarities, n_components=2): + """ + Computes multidimensional scaling using SVD algorithm + + Parameters + ---------- + dissimilarities : ndarray, shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be euclidean. + n_components : int, optional, default: 2 + Number of dimension in which to immerse the dissimilarities. + + Returns + ---------- + X : ndarray, shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + + References + ---------- + "An Introduction to MDS" Florian Wickelmaier + Sound Quality Research Unit, Aalborg University, Denmark (2003) + + "Multidimensional Scaling" Chapman Hall + 2nd edition, Boca Raton (2001) + + """ + + dissimilarities = check_symmetric(dissimilarities, raise_exception=True) + + n_samples = dissimilarities.shape[0] + + # Centering matrix + H = np.eye(*dissimilarities.shape) - (1. / n_samples) * \ + np.ones(dissimilarities.shape) + + # Double centered matrix + K = -0.5 * np.dot(H, np.dot(dissimilarities ** 2, H)) + + w, V = linalg.eigh(K, check_finite=False) + + # ``dissimilarities`` is Euclidean iff ``K`` is positive semi-definite. + # For detail see "Multidimensional Scaling" Chapman Hall p 397 + try: + w = _check_psd_eigenvalues(w) + except ValueError: + raise ValueError("Dissimilarity matrix must be euclidean. " + "Make sure to pass an euclidean matrix, or use " + "dissimilarity='euclidean'.") + + # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors. + # Eigenvalues should be in descending order by convention. + w = w[:-n_components-1:-1] + V = V[:, :-n_components-1:-1] + + X = np.sqrt(w) * V + + dist = euclidean_distances(X) + stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5 + + return X, stress + + class MDS(BaseEstimator): """Multidimensional scaling @@ -285,14 +352,20 @@ class MDS(BaseEstimator): metric : boolean, optional, default: True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. + If ``method=='svd'``, metric must be set to True. + n_init : int, optional, default: 4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. + Ignored if ``method=='svd'``. 
+ max_iter : int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run. + Ignored if ``method=='svd'``. + verbose : int, optional, default: 0 Level of verbosity. @@ -300,6 +373,8 @@ class MDS(BaseEstimator): Relative tolerance with respect to stress at which to declare convergence. + Ignored if ``method=='svd'``. + n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple initializations are used (``n_init``), each run of the algorithm is @@ -309,6 +384,8 @@ class MDS(BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. + Ignored if ``method=='svd'``. + random_state : int, RandomState instance, default=None Determines the random number generator used to initialize the centers. Pass an int for reproducible results across multiple function calls. @@ -324,6 +401,11 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. + method: {'smacof', 'svd'}, default ='smacof' + The method used for solving the MDS problem. + + .. versionadded:: 0.23 + Attributes ---------- embedding_ : array-like, shape (n_samples, n_components) @@ -333,6 +415,12 @@ class MDS(BaseEstimator): The final value of the stress (sum of squared distance of the disparities and the distances for all constrained points). + n_iter_ : int + The number of iterations of SMACOF algorithm corresponding + to the best stress. + + It is set to ``None`` if ``method=='svd'``. + Examples -------- >>> from sklearn.datasets import load_digits @@ -357,12 +445,15 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ + def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, - random_state=None, dissimilarity="euclidean"): + random_state=None, dissimilarity="euclidean", + method="smacof"): self.n_components = n_components self.dissimilarity = dissimilarity self.metric = metric + self.method = method self.n_init = n_init self.max_iter = max_iter self.eps = eps @@ -387,6 +478,7 @@ def fit(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None + Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. @@ -407,6 +499,7 @@ def fit_transform(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None + Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. @@ -423,14 +516,26 @@ def fit_transform(self, X, y=None, init=None): elif self.dissimilarity == "euclidean": self.dissimilarity_matrix_ = euclidean_distances(X) else: - raise ValueError("Proximity must be 'precomputed' or 'euclidean'." - " Got %s instead" % str(self.dissimilarity)) - - self.embedding_, self.stress_, self.n_iter_ = smacof( - self.dissimilarity_matrix_, metric=self.metric, - n_components=self.n_components, init=init, n_init=self.n_init, - n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, - eps=self.eps, random_state=self.random_state, - return_n_iter=True) + raise ValueError( + "Dissimilarity matrix must be 'precomputed' or 'euclidean'." 
+ " Got %s instead" % str(self.dissimilarity)) + + if self.method == "smacof": + self.embedding_, self.stress_, self.n_iter_ = smacof( + self.dissimilarity_matrix_, metric=self.metric, + n_components=self.n_components, init=init, + n_init=self.n_init, n_jobs=self.n_jobs, + max_iter=self.max_iter, verbose=self.verbose, + eps=self.eps, random_state=self.random_state, + return_n_iter=True) + elif self.method == "svd": + if not self.metric: + raise ValueError("Using SVD requires metric=True") + self.embedding_, self.stress_ = svd_scaler( + self.dissimilarity_matrix_, n_components=self.n_components) + self.n_iter_ = None + else: + raise ValueError("Method must be 'smacof' or 'svd'." + " Got %s instead" % str(self.method)) return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 4349aeeefdedc..e2749d71caf9d 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -55,6 +55,73 @@ def test_smacof_error(): mds.smacof(sim, init=Z, n_init=1) +def test_svd(): + # Test svd using example data from "An Introduction to MDS" + # Florian Wickelmaier, p 11 + sim = np.array([[0, 93, 82, 133], + [93, 0, 52, 60], + [82, 52, 0, 111], + [133, 60, 111, 0]]) + + X, stress = mds.svd_scaler(sim, n_components=2) + X_true_1 = np.array([[-62.831, -32.97448], + [18.403, 12.02697], + [-24.960, 39.71091], + [69.388, -18.76340]]) + X_true_2 = np.copy(X_true_1) + X_true_2[:, 0] *= -1 + + # Signs of columns are dependent on signs of computed eigenvectors + # which are arbitrary and meaningless + assert(np.allclose(X, X_true_1) + or np.allclose(X, -X_true_1) + or np.allclose(X, X_true_2) + or np.allclose(X, -X_true_2)) + + +def test_svd_error(): + # Non symmetric (dis)similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.svd_scaler(sim) + + # Non squared (dis)similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.svd_scaler(sim) + + # Non Euclidean (dis)similarity matrix: + sim = np.array([[0, 12, 3, 4], + [12, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + with pytest.raises(ValueError, + match="Dissimilarity matrix must be euclidean"): + mds.svd_scaler(sim) + + +def test_MDS_error(): + # Bad method name + sim = np.ones((2, 2)) + mdc_clf = mds.MDS(method='bad name') + with pytest.raises(ValueError): + mdc_clf.fit(sim) + + # SVD with metric=False + sim = np.ones((2, 2)) + mdc_clf = mds.MDS(metric=False, method='svd') + with pytest.raises(ValueError): + mdc_clf.fit(sim) + + def test_MDS(): sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], @@ -62,3 +129,29 @@ def test_MDS(): [4, 2, 1, 0]]) mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed") mds_clf.fit(sim) + + +def test_MDS_svd(): + # Test svd using example data from "An Introduction to MDS" + # Florian Wickelmaier, p 11 + sim = np.array([[0, 93, 82, 133], + [93, 0, 52, 60], + [82, 52, 0, 111], + [133, 60, 111, 0]]) + + mds_clf = mds.MDS(metric=True, method="svd", dissimilarity='precomputed') + mds_clf.fit(sim) + + X_true_1 = np.array([[-62.831, -32.97448], + [18.403, 12.02697], + [-24.960, 39.71091], + [69.388, -18.76340]]) + X_true_2 = np.copy(X_true_1) + X_true_2[:, 0] *= -1 + + # Signs of columns are dependent on signs of computed eigenvectors + # which are arbitrary and meaningless + assert (np.allclose(mds_clf.embedding_, X_true_1) + or np.allclose(mds_clf.embedding_, -X_true_1) + or 
np.allclose(mds_clf.embedding_, X_true_2) + or np.allclose(mds_clf.embedding_, -X_true_2)) From cba27008b54f954fb9de602f61b8480002e0a547 Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Thu, 5 Mar 2020 14:22:35 +0100 Subject: [PATCH 02/27] Make requested changes --- doc/modules/manifold.rst | 26 +++++++++++++------------- sklearn/manifold/_mds.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index 9d08304c9d474..a99a76b5d15b9 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -442,6 +442,19 @@ Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate method of calculating results. See :ref:`multidimensional_scaling_method` for details. +.. _multidimensional_scaling_method: + +Method +------------- + +Metric :class:`MDS` offers two different algorithms (methods) to calculate +results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a +COmplicated Function) minimizes its objective function (stress) in an iterative +manner. The SVD-based method performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based method is +thus much faster and more accurate, but also less general - it requires metric +of :math:`S` to be Euclidean. + Nonmetric MDS ------------- @@ -460,19 +473,6 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. :align: center :scale: 60 -.. _multidimensional_scaling_method: - -Method -------------- - -Metric :class:`MDS` offers two different algorithms (methods) to calculate -results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a -COmplicated Function) minimizes objective function (stress) in iterative -manner. The SVD-based method performs series of transformations (including -Singular Value Decomposition) to give exact result. The SVD-based method is -thus much faster and more accurate, but also less general - it requires metric -of :math:`S` to be Euclidean. - .. topic:: References: diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d181b43ce44b3..ec1634e32cfed 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -401,7 +401,7 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - method: {'smacof', 'svd'}, default ='smacof' + method : {'smacof', 'svd'}, default ='smacof' The method used for solving the MDS problem. .. versionadded:: 0.23 From 03954cf586ca4ddb71b17ca14ca3a20e1857643e Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Thu, 5 Mar 2020 14:34:11 +0100 Subject: [PATCH 03/27] Change name 'method' to 'solver' --- doc/modules/manifold.rst | 14 ++++++------ doc/whats_new/v0.23.rst | 6 +++--- sklearn/manifold/_mds.py | 34 ++++++++++++++++-------------- sklearn/manifold/tests/test_mds.py | 8 +++---- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index a99a76b5d15b9..07bfb5fa510ec 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -440,18 +440,18 @@ should then correspond exactly to the distance between point :math:`i` and Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate -method of calculating results. See :ref:`multidimensional_scaling_method` for details. 
+solver of calculating results. See :ref:`multidimensional_scaling_solver` for details. -.. _multidimensional_scaling_method: +.. _multidimensional_scaling_solver: -Method +Solver ------------- -Metric :class:`MDS` offers two different algorithms (methods) to calculate -results: SMACOF and SVD-based. The SMACOF method (Scaling by MAjorizing a +Metric :class:`MDS` offers two different algorithms (solvers) to calculate +results: SMACOF and SVD-based. The SMACOF solver (Scaling by MAjorizing a COmplicated Function) minimizes its objective function (stress) in an iterative -manner. The SVD-based method performs series of transformations (including -Singular Value Decomposition) to give exact result. The SVD-based method is +manner. The SVD-based solver performs series of transformations (including +Singular Value Decomposition) to give exact result. The SVD-based solver is thus much faster and more accurate, but also less general - it requires metric of :math:`S` to be Euclidean. diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 125c8f7f61f0a..4da94f5936a10 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -262,10 +262,10 @@ Changelog :mod:`sklearn.manifold` ........................... -- |Feature| Support of multidimensional scaling method, +- |Feature| Support of multidimensional scaling solver, which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`. - User can choose whether to use SVD- or SMACOF-based method via parameter - `method`. The SVD-based method is faster and more accurate, but works + User can choose whether to use SVD- or SMACOF-based solver via parameter + `solver`. The SVD-based solver is faster and more accurate, but works only for euclidean dissimilarity matrices. :pr:`16067` by :user:`Piotr Gaiński `. diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ec1634e32cfed..ada22ece07a3c 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -352,19 +352,19 @@ class MDS(BaseEstimator): metric : boolean, optional, default: True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. - If ``method=='svd'``, metric must be set to True. + If ``solver=='svd'``, metric must be set to True. n_init : int, optional, default: 4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. max_iter : int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. verbose : int, optional, default: 0 Level of verbosity. @@ -373,7 +373,7 @@ class MDS(BaseEstimator): Relative tolerance with respect to stress at which to declare convergence. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple @@ -384,7 +384,7 @@ class MDS(BaseEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. - Ignored if ``method=='svd'``. + Ignored if ``solver=='svd'``. random_state : int, RandomState instance, default=None Determines the random number generator used to initialize the centers. @@ -401,8 +401,8 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. 
- method : {'smacof', 'svd'}, default ='smacof' - The method used for solving the MDS problem. + solver : {'smacof', 'svd'}, default ='smacof' + The solver used for solving the MDS problem. .. versionadded:: 0.23 @@ -419,7 +419,7 @@ class MDS(BaseEstimator): The number of iterations of SMACOF algorithm corresponding to the best stress. - It is set to ``None`` if ``method=='svd'``. + It is set to ``None`` if ``solver=='svd'``. Examples -------- @@ -449,11 +449,11 @@ class MDS(BaseEstimator): def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean", - method="smacof"): + solver="smacof"): self.n_components = n_components self.dissimilarity = dissimilarity self.metric = metric - self.method = method + self.solver = solver self.n_init = n_init self.max_iter = max_iter self.eps = eps @@ -478,10 +478,11 @@ def fit(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None - Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. + + Ignored if ``solver=='svd'``. """ self.fit_transform(X, init=init) return self @@ -499,10 +500,11 @@ def fit_transform(self, X, y=None, init=None): y : Ignored init : ndarray, shape (n_samples,), optional, default: None - Ignored if ``method=='svd'``. Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly chosen array. + + Ignored if ``solver=='svd'``. """ X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": @@ -520,7 +522,7 @@ def fit_transform(self, X, y=None, init=None): "Dissimilarity matrix must be 'precomputed' or 'euclidean'." " Got %s instead" % str(self.dissimilarity)) - if self.method == "smacof": + if self.solver == "smacof": self.embedding_, self.stress_, self.n_iter_ = smacof( self.dissimilarity_matrix_, metric=self.metric, n_components=self.n_components, init=init, @@ -528,14 +530,14 @@ def fit_transform(self, X, y=None, init=None): max_iter=self.max_iter, verbose=self.verbose, eps=self.eps, random_state=self.random_state, return_n_iter=True) - elif self.method == "svd": + elif self.solver == "svd": if not self.metric: raise ValueError("Using SVD requires metric=True") self.embedding_, self.stress_ = svd_scaler( self.dissimilarity_matrix_, n_components=self.n_components) self.n_iter_ = None else: - raise ValueError("Method must be 'smacof' or 'svd'." - " Got %s instead" % str(self.method)) + raise ValueError("Solver must be 'smacof' or 'svd'." 
+ " Got %s instead" % str(self.solver)) return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index e2749d71caf9d..ce70245d4a128 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -109,15 +109,15 @@ def test_svd_error(): def test_MDS_error(): - # Bad method name + # Bad solver name sim = np.ones((2, 2)) - mdc_clf = mds.MDS(method='bad name') + mdc_clf = mds.MDS(solver='bad name') with pytest.raises(ValueError): mdc_clf.fit(sim) # SVD with metric=False sim = np.ones((2, 2)) - mdc_clf = mds.MDS(metric=False, method='svd') + mdc_clf = mds.MDS(metric=False, solver='svd') with pytest.raises(ValueError): mdc_clf.fit(sim) @@ -139,7 +139,7 @@ def test_MDS_svd(): [82, 52, 0, 111], [133, 60, 111, 0]]) - mds_clf = mds.MDS(metric=True, method="svd", dissimilarity='precomputed') + mds_clf = mds.MDS(metric=True, solver="svd", dissimilarity='precomputed') mds_clf.fit(sim) X_true_1 = np.array([[-62.831, -32.97448], From 3047960ed9b8e12da76afb5f5d3673e22ce93e51 Mon Sep 17 00:00:00 2001 From: Pan Jan Date: Sat, 7 Mar 2020 12:22:36 +0100 Subject: [PATCH 04/27] Make another set of requested changes --- sklearn/manifold/_mds.py | 29 +++++++++++++------------- sklearn/manifold/tests/test_mds.py | 33 +++++------------------------- 2 files changed, 20 insertions(+), 42 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ada22ece07a3c..a3559fa0c7e40 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -287,7 +287,7 @@ def svd_scaler(dissimilarities, n_components=2): Returns ---------- - X : ndarray, shape (n_samples, n_components) + embedding : ndarray, shape (n_samples, n_components) Coordinates of the points in a ``n_components``-space. stress : float @@ -299,8 +299,8 @@ def svd_scaler(dissimilarities, n_components=2): "An Introduction to MDS" Florian Wickelmaier Sound Quality Research Unit, Aalborg University, Denmark (2003) - "Multidimensional Scaling" Chapman Hall - 2nd edition, Boca Raton (2001) + "Metric and Euclidean properties of dissimilarity coefficients" + J. C. Gower and P. Legendre, Journal of Classification 3, 5–48 (1986) """ @@ -309,16 +309,17 @@ def svd_scaler(dissimilarities, n_components=2): n_samples = dissimilarities.shape[0] # Centering matrix - H = np.eye(*dissimilarities.shape) - (1. / n_samples) * \ - np.ones(dissimilarities.shape) + J = np.eye(*dissimilarities.shape) - (1. / n_samples) * ( + np.ones(dissimilarities.shape)) # Double centered matrix - K = -0.5 * np.dot(H, np.dot(dissimilarities ** 2, H)) + B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J)) - w, V = linalg.eigh(K, check_finite=False) + w, V = linalg.eigh(B, check_finite=False) - # ``dissimilarities`` is Euclidean iff ``K`` is positive semi-definite. - # For detail see "Multidimensional Scaling" Chapman Hall p 397 + # ``dissimilarities`` is Euclidean iff ``B`` is positive semi-definite. + # See "Metric and Euclidean properties of dissimilarity coefficients" + # for details try: w = _check_psd_eigenvalues(w) except ValueError: @@ -328,15 +329,15 @@ def svd_scaler(dissimilarities, n_components=2): # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors. # Eigenvalues should be in descending order by convention. 
- w = w[:-n_components-1:-1] - V = V[:, :-n_components-1:-1] + w = w[::-1][:n_components] + V = V[:, ::-1][:, :n_components] - X = np.sqrt(w) * V + embedding = np.sqrt(w) * V - dist = euclidean_distances(X) + dist = euclidean_distances(embedding) stress = ((dissimilarities.ravel() - dist.ravel()) ** 2).sum() * 0.5 - return X, stress + return embedding, stress class MDS(BaseEstimator): diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index ce70245d4a128..421256811164c 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -55,30 +55,6 @@ def test_smacof_error(): mds.smacof(sim, init=Z, n_init=1) -def test_svd(): - # Test svd using example data from "An Introduction to MDS" - # Florian Wickelmaier, p 11 - sim = np.array([[0, 93, 82, 133], - [93, 0, 52, 60], - [82, 52, 0, 111], - [133, 60, 111, 0]]) - - X, stress = mds.svd_scaler(sim, n_components=2) - X_true_1 = np.array([[-62.831, -32.97448], - [18.403, 12.02697], - [-24.960, 39.71091], - [69.388, -18.76340]]) - X_true_2 = np.copy(X_true_1) - X_true_2[:, 0] *= -1 - - # Signs of columns are dependent on signs of computed eigenvectors - # which are arbitrary and meaningless - assert(np.allclose(X, X_true_1) - or np.allclose(X, -X_true_1) - or np.allclose(X, X_true_2) - or np.allclose(X, -X_true_2)) - - def test_svd_error(): # Non symmetric (dis)similarity matrix: sim = np.array([[0, 5, 9, 4], @@ -86,7 +62,7 @@ def test_svd_error(): [3, 2, 0, 1], [4, 2, 1, 0]]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Array must be symmetric"): mds.svd_scaler(sim) # Non squared (dis)similarity matrix: @@ -94,7 +70,8 @@ def test_svd_error(): [5, 0, 2, 2], [4, 2, 1, 0]]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="array must be 2-dimensional and square"): mds.svd_scaler(sim) # Non Euclidean (dis)similarity matrix: @@ -112,13 +89,13 @@ def test_MDS_error(): # Bad solver name sim = np.ones((2, 2)) mdc_clf = mds.MDS(solver='bad name') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Solver must be 'smacof' or 'svd'"): mdc_clf.fit(sim) # SVD with metric=False sim = np.ones((2, 2)) mdc_clf = mds.MDS(metric=False, solver='svd') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Using SVD requires metric=True"): mdc_clf.fit(sim) From c8ec454388a19fa382449dd692803d8502576357 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 29 Jan 2022 15:19:38 -0500 Subject: [PATCH 05/27] Added temporary benchmark notebook --- tmp_bench/bench.ipynb | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tmp_bench/bench.ipynb diff --git a/tmp_bench/bench.ipynb b/tmp_bench/bench.ipynb new file mode 100644 index 0000000000000..4476edae9861f --- /dev/null +++ b/tmp_bench/bench.ipynb @@ -0,0 +1,79 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.manifold import _mds as mds\n", + "from time import time\n", + "\n", + "def generate_euclidean(size):\n", + " Y = np.random.randint(-100, 100, (size, 2))\n", + " M = mds.euclidean_distances(Y)\n", + " return M\n", + "\n", + "def bench(func, args):\n", + " res = np.empty((len(args), 2))\n", + " start = time()\n", + " for i, arg in enumerate(args):\n", + " res[i] = func(arg)[1]\n", + " dt = time()-start\n", + " return {'dt':dt, 'avg stress': np.average(res), 'max 
stress':np.max(res), 'stress variance':np.var(res)}\n", + "\n", + "headers = ['dt', 'avg stress', 'max stress', 'stress variance']\n", + "params = [(5, 100), (50, 50), (250, 25), (1000, 5)]\n", + "#params = [(4, 1)]\n", + "table_svd = []\n", + "table_smacof = []\n", + "table_diff = []\n", + "\n", + "for i, (size, reps) in enumerate(params):\n", + " head = f'{size}x{size} ({reps})'\n", + " args = [generate_euclidean(size) for j in range(reps)]\n", + " print('benching tuple {} ({}/{})'.format((size, reps), i+1, len(params)))\n", + " stats_svd = bench(mds.svd_scaler, args)\n", + " stats_smacof = bench(mds.smacof, args)\n", + " table_svd.append(stats_svd)\n", + " table_smacof.append(stats_smacof)\n", + "\n", + "df_svd = pd.DataFrame(table_svd, columns=headers)\n", + "df_smacof = pd.DataFrame(table_smacof, columns=headers)\n", + "print('SVD:')\n", + "print(df_svd)\n", + "print('\\nSMACOF:')\n", + "print(df_smacof)\n", + "print('\\nSMACOF/SVD:')\n", + "print(df_svd/df_smacof)\n" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "146110dc17f39bf2eaf4afb2b459894be2a7189440f0c9917d0223fc286b6bb2" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('scikit-dev': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From bdc70da74b143d8f06d9694435334507b91e1d7d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 29 Jan 2022 17:03:24 -0500 Subject: [PATCH 06/27] Updated solver selection logic --- sklearn/manifold/_mds.py | 14 +++++++++----- tmp_bench/bench.ipynb | 26 ++++++++++++++++++++------ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 40127f7103640..f7060d3110fe1 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -365,7 +365,7 @@ def svd_scaler(dissimilarities, n_components=2): "dissimilarity='euclidean'." ) - # Get ``n_compontent`` greatest eigenvalues and corresponding eigenvectors. + # Get ``n_components`` greatest eigenvalues and corresponding eigenvectors. # Eigenvalues should be in descending order by convention. w = w[::-1][:n_components] V = V[:, ::-1][:, :n_components] @@ -435,10 +435,12 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - solver : {'smacof', 'svd'}, default ='smacof' - The solver used for solving the MDS problem. + solver : {'auto', 'smacof', 'svd'}, default = 'auto' + The solver used for solving the MDS problem. When set to 'auto', MDS + will use the ``svd`` solver when ``metric==True`` and the ``smacof`` + solver when ``metric==False`` - .. versionadded:: 0.23 + .. versionadded:: 1.1 Attributes ---------- @@ -518,7 +520,7 @@ def __init__( n_jobs=None, random_state=None, dissimilarity="euclidean", - solver="smacof", + solver="auto", ): self.n_components = n_components self.dissimilarity = dissimilarity @@ -603,6 +605,8 @@ def fit_transform(self, X, y=None, init=None): "dissimilarity matrix, set " "``dissimilarity='precomputed'``." 
) + if self.solver == "auto": + self.solver = "svd" if self.metric else "smacof" if self.dissimilarity == "precomputed": self.dissimilarity_matrix_ = X diff --git a/tmp_bench/bench.ipynb b/tmp_bench/bench.ipynb index 4476edae9861f..e9dbdf8a5fa3f 100644 --- a/tmp_bench/bench.ipynb +++ b/tmp_bench/bench.ipynb @@ -2,20 +2,34 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.manifold import _mds as mds\n", - "from time import time\n", - "\n", + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ "def generate_euclidean(size):\n", " Y = np.random.randint(-100, 100, (size, 2))\n", " M = mds.euclidean_distances(Y)\n", - " return M\n", - "\n", + " return M" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "def bench(func, args):\n", " res = np.empty((len(args), 2))\n", " start = time()\n", @@ -47,7 +61,7 @@ "print('\\nSMACOF:')\n", "print(df_smacof)\n", "print('\\nSMACOF/SVD:')\n", - "print(df_svd/df_smacof)\n" + "print(df_svd/df_smacof)" ] } ], From 8573ed3f28354f4f01c1a2af855a31e0f58c1aa4 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 30 Jan 2022 12:36:12 -0500 Subject: [PATCH 07/27] Updated documentation and improved solver selection --- sklearn/manifold/_mds.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index f7060d3110fe1..08791c6444961 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -390,17 +390,17 @@ class MDS(BaseEstimator): metric : bool, default=True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. - If ``solver=='svd'``, metric must be set to True. + If ``solver=='svd'``, metric must be set to True. n_init : int, optional, default: 4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='svd'``. max_iter : int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='svd'``. verbose : int, optional, default: 0 Level of verbosity. @@ -408,7 +408,7 @@ class MDS(BaseEstimator): eps : float, default=1e-3 Relative tolerance with respect to stress at which to declare convergence. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='svd'``. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple @@ -418,7 +418,7 @@ class MDS(BaseEstimator): ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='svd'``. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -438,7 +438,7 @@ class MDS(BaseEstimator): solver : {'auto', 'smacof', 'svd'}, default = 'auto' The solver used for solving the MDS problem. When set to 'auto', MDS will use the ``svd`` solver when ``metric==True`` and the ``smacof`` - solver when ``metric==False`` + solver when ``metric==False``. .. 
versionadded:: 1.1 @@ -563,7 +563,7 @@ def fit(self, X, y=None, init=None): init : ndarray of shape (n_samples,), default=None Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly - chosen array. Ignored if ``solver=='svd'``. + chosen array. Ignored if ``solver=='svd'``. Returns ------- @@ -605,8 +605,9 @@ def fit_transform(self, X, y=None, init=None): "dissimilarity matrix, set " "``dissimilarity='precomputed'``." ) + solver = self.solver if self.solver == "auto": - self.solver = "svd" if self.metric else "smacof" + solver = "svd" if self.metric else "smacof" if self.dissimilarity == "precomputed": self.dissimilarity_matrix_ = X @@ -619,7 +620,7 @@ def fit_transform(self, X, y=None, init=None): % str(self.dissimilarity) ) - if self.solver == "smacof": + if solver == "smacof": self.embedding_, self.stress_, self.n_iter_ = smacof( self.dissimilarity_matrix_, metric=self.metric, @@ -633,7 +634,7 @@ def fit_transform(self, X, y=None, init=None): random_state=self.random_state, return_n_iter=True, ) - elif self.solver == "svd": + elif solver == "svd": if not self.metric: raise ValueError("Using SVD requires metric=True") self.embedding_, self.stress_ = svd_scaler( @@ -642,7 +643,8 @@ def fit_transform(self, X, y=None, init=None): self.n_iter_ = None else: raise ValueError( - "Solver must be 'smacof' or 'svd'. Got %s instead" % str(self.solver) + "Solver must be 'smacof', 'svd' or 'auto'. Got %s instead" + % str(self.solver) ) return self.embedding_ From 4c1b098e35a7b84bd53b639c7eadaf76a1b82adb Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 1 Feb 2022 22:55:46 -0500 Subject: [PATCH 08/27] Updated docs that included old `default: ...` format --- sklearn/manifold/_mds.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 08791c6444961..60898c47d02b6 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -317,7 +317,7 @@ def svd_scaler(dissimilarities, n_components=2): ---------- dissimilarities : ndarray, shape (n_samples, n_samples) Pairwise dissimilarities between the points. Must be euclidean. - n_components : int, optional, default: 2 + n_components : int, optional, default=2 Number of dimension in which to immerse the dissimilarities. Returns @@ -392,17 +392,17 @@ class MDS(BaseEstimator): If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. If ``solver=='svd'``, metric must be set to True. - n_init : int, optional, default: 4 + n_init : int, optional, default=4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. Ignored if ``solver=='svd'``. - max_iter : int, optional, default: 300 + max_iter : int, optional, default=300 Maximum number of iterations of the SMACOF algorithm for a single run. Ignored if ``solver=='svd'``. - verbose : int, optional, default: 0 + verbose : int, optional, default=0 Level of verbosity. 
eps : float, default=1e-3 From 94a7bfa8f76a0baff4858054edfea22c69b82386 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 1 Feb 2022 23:10:36 -0500 Subject: [PATCH 09/27] Updated whats_new --- doc/whats_new/v0.23.rst | 10 ---------- doc/whats_new/v1.1.rst | 7 +++++++ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 97acd69d6eafa..ebf63eac5b8a3 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -562,16 +562,6 @@ Changelog :pr:`17021` by :user:`Alex Gramfort ` and :user:`Mathurin Massias `. -:mod:`sklearn.manifold` -........................... - -- |Feature| Support of multidimensional scaling solver, - which uses Singular Value Decomposition (SVD) in :class:`manifold.MDS`. - User can choose whether to use SVD- or SMACOF-based solver via parameter - `solver`. The SVD-based solver is faster and more accurate, but works - only for euclidean dissimilarity matrices. - :pr:`16067` by :user:`Piotr Gaiński `. - :mod:`sklearn.metrics` ...................... diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 5fd88b4132c3a..20187f8e2910c 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -440,6 +440,13 @@ Changelog in eigen_solvers `lobpcg` and `amg` to improve their numerical stability. :pr:`21565` by :user:`Andrew Knyazev `. +- |Feature| :class:`manifold.MDS` now supports a Singular Value Decomposition + (SVD). Users can choose whether to use SVD or SMACOF-based solvers via the + `solver` parameter. The SVD-based solver is faster and more accurate, but only + works for euclidean dissimilarity matrices in the metric-MDS context. + :pr:`16067` by :user:`Piotr Gaiński ` and + :pr:`22330` by :user:`Meekail Zain `. + :mod:`sklearn.model_selection` .............................. From 48a003bbb7defbe3df0b8da88e1374e304a06d29 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 2 Feb 2022 10:24:56 -0500 Subject: [PATCH 10/27] Undo `solver='auto'` change --- sklearn/manifold/_mds.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 60898c47d02b6..d19fdbb602970 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -520,7 +520,7 @@ def __init__( n_jobs=None, random_state=None, dissimilarity="euclidean", - solver="auto", + solver="smacof", ): self.n_components = n_components self.dissimilarity = dissimilarity @@ -605,9 +605,6 @@ def fit_transform(self, X, y=None, init=None): "dissimilarity matrix, set " "``dissimilarity='precomputed'``." 
) - solver = self.solver - if self.solver == "auto": - solver = "svd" if self.metric else "smacof" if self.dissimilarity == "precomputed": self.dissimilarity_matrix_ = X @@ -620,7 +617,7 @@ def fit_transform(self, X, y=None, init=None): % str(self.dissimilarity) ) - if solver == "smacof": + if self.solver == "smacof": self.embedding_, self.stress_, self.n_iter_ = smacof( self.dissimilarity_matrix_, metric=self.metric, @@ -634,7 +631,7 @@ def fit_transform(self, X, y=None, init=None): random_state=self.random_state, return_n_iter=True, ) - elif solver == "svd": + elif self.solver == "svd": if not self.metric: raise ValueError("Using SVD requires metric=True") self.embedding_, self.stress_ = svd_scaler( From ca6066035f5422c8169757cd92ec1e9218500115 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 2 Feb 2022 10:27:49 -0500 Subject: [PATCH 11/27] Updated solver error message --- sklearn/manifold/_mds.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d19fdbb602970..153fc152d12c8 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -640,8 +640,7 @@ def fit_transform(self, X, y=None, init=None): self.n_iter_ = None else: raise ValueError( - "Solver must be 'smacof', 'svd' or 'auto'. Got %s instead" - % str(self.solver) + "Solver must be 'smacof' or 'svd'. Got %s instead" % str(self.solver) ) return self.embedding_ From 48b9d21783091ba27375d024895419720d72f898 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 2 Feb 2022 11:00:43 -0500 Subject: [PATCH 12/27] Corrected solver parameter documentation --- sklearn/manifold/_mds.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 153fc152d12c8..bd7332b877d86 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -435,10 +435,8 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - solver : {'auto', 'smacof', 'svd'}, default = 'auto' - The solver used for solving the MDS problem. When set to 'auto', MDS - will use the ``svd`` solver when ``metric==True`` and the ``smacof`` - solver when ``metric==False``. + solver : {'smacof', 'svd'}, default = 'smacof' + The solver used for solving the MDS problem. .. 
versionadded:: 1.1 From 6042fa23f741d8cc5394bf9d04885ad725e930cb Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 15 Feb 2022 13:30:10 -0500 Subject: [PATCH 13/27] Removed depreciated tests --- sklearn/manifold/tests/test_mds.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index c20bc4de533f0..fa5a7ca1908d5 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -3,7 +3,6 @@ import pytest from sklearn.manifold import _mds as mds -from sklearn.utils._testing import ignore_warnings def test_smacof(): @@ -108,26 +107,3 @@ def test_MDS_svd(): or np.allclose(mds_clf.embedding_, X_true_2) or np.allclose(mds_clf.embedding_, -X_true_2) ) - - -# TODO: Remove in 1.1 -def test_MDS_pairwise_deprecated(): - mds_clf = mds.MDS(metric="precomputed") - msg = r"Attribute `_pairwise` was deprecated in version 0\.24" - with pytest.warns(FutureWarning, match=msg): - mds_clf._pairwise - - -# TODO: Remove in 1.1 -@ignore_warnings(category=FutureWarning) -@pytest.mark.parametrize( - "dissimilarity, expected_pairwise", - [ - ("precomputed", True), - ("euclidean", False), - ], -) -def test_MDS_pairwise(dissimilarity, expected_pairwise): - # _pairwise attribute is set correctly - mds_clf = mds.MDS(dissimilarity=dissimilarity) - assert mds_clf._pairwise == expected_pairwise From 874739a33797acd176acd6896ddb33427d1d8dd0 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 16 Feb 2022 00:35:23 -0500 Subject: [PATCH 14/27] Linting w/ new version of Black --- sklearn/manifold/_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d0f2260afbd76..3dc784a745e6d 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -348,7 +348,7 @@ def svd_scaler(dissimilarities, n_components=2): ) # Double centered matrix - B = -0.5 * np.dot(J, np.dot(dissimilarities ** 2, J)) + B = -0.5 * np.dot(J, np.dot(dissimilarities**2, J)) w, V = linalg.eigh(B, check_finite=False) From 2b6e84ccb7cf269346805ca347f56f38ff948068 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Sun, 27 Mar 2022 14:37:07 -0400 Subject: [PATCH 15/27] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/manifold/_mds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 3dc784a745e6d..bb4b032bd9f80 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -391,11 +391,11 @@ class MDS(BaseEstimator): If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. If ``solver=='svd'``, metric must be set to True. - n_init : int, optional, default=4 + n_init : int, default=4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. - Ignored if ``solver=='svd'``. + Used only when ``solver=='smacof'``. max_iter : int, optional, default=300 Maximum number of iterations of the SMACOF algorithm for a single run. 
From e347dc9809e5606219469ff996f8d6f96b88cba9 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 27 Mar 2022 14:47:22 -0400 Subject: [PATCH 16/27] Renamed solver from `svd` to `eigh` and added public reference --- doc/modules/manifold.rst | 2 +- sklearn/manifold/_mds.py | 28 ++++++++++++++-------------- sklearn/manifold/tests/test_mds.py | 22 +++++++++++----------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index bc3458899c5b6..25e08092f25e8 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -508,7 +508,7 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. Kruskal, J. Psychometrika, 29, (1964) * `"An Introduction to MDS" - `_ + `_ Florian Wickelmaier, Sound Quality Research Unit, Aalborg University, Denmark (2003) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index bb4b032bd9f80..750d54aa7e9df 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -308,9 +308,9 @@ def smacof( return best_pos, best_stress -def svd_scaler(dissimilarities, n_components=2): +def eigh_scaler(dissimilarities, n_components=2): """ - Computes multidimensional scaling using SVD algorithm + Computes multidimensional scaling using eigh solver. Parameters ---------- @@ -389,7 +389,7 @@ class MDS(BaseEstimator): metric : bool, default=True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. - If ``solver=='svd'``, metric must be set to True. + If ``solver=='eigh'``, metric must be set to True. n_init : int, default=4 Number of times the SMACOF algorithm will be run with different @@ -399,7 +399,7 @@ class MDS(BaseEstimator): max_iter : int, optional, default=300 Maximum number of iterations of the SMACOF algorithm for a single run. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='eigh'``. verbose : int, optional, default=0 Level of verbosity. @@ -407,7 +407,7 @@ class MDS(BaseEstimator): eps : float, default=1e-3 Relative tolerance with respect to stress at which to declare convergence. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='eigh'``. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple @@ -417,7 +417,7 @@ class MDS(BaseEstimator): ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - Ignored if ``solver=='svd'``. + Ignored if ``solver=='eigh'``. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -434,7 +434,7 @@ class MDS(BaseEstimator): Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - solver : {'smacof', 'svd'}, default = 'smacof' + solver : {'smacof', 'eigh'}, default = 'smacof' The solver used for solving the MDS problem. .. versionadded:: 1.1 @@ -469,7 +469,7 @@ class MDS(BaseEstimator): n_iter_ : int The number of iterations corresponding to the best stress. - It is set to ``None`` if ``solver=='svd'``. + It is set to ``None`` if ``solver=='eigh'``. See Also -------- @@ -550,7 +550,7 @@ def fit(self, X, y=None, init=None): init : ndarray of shape (n_samples,), default=None Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly - chosen array. Ignored if ``solver=='svd'``. + chosen array. Ignored if ``solver=='eigh'``. 
Returns ------- @@ -577,7 +577,7 @@ def fit_transform(self, X, y=None, init=None): init : ndarray of shape (n_samples,), default=None Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly - chosen array. Ignored if ``solver=='svd'``. + chosen array. Ignored if ``solver=='eigh'``. Returns ------- @@ -618,16 +618,16 @@ def fit_transform(self, X, y=None, init=None): random_state=self.random_state, return_n_iter=True, ) - elif self.solver == "svd": + elif self.solver == "eigh": if not self.metric: - raise ValueError("Using SVD requires metric=True") - self.embedding_, self.stress_ = svd_scaler( + raise ValueError("Using eigh requires metric=True") + self.embedding_, self.stress_ = eigh_scaler( self.dissimilarity_matrix_, n_components=self.n_components ) self.n_iter_ = None else: raise ValueError( - "Solver must be 'smacof' or 'svd'. Got %s instead" % str(self.solver) + "Solver must be 'smacof' or 'eigh'. Got %s instead" % str(self.solver) ) return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index fa5a7ca1908d5..ce1c3c2681379 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -38,37 +38,37 @@ def test_smacof_error(): mds.smacof(sim, init=Z, n_init=1) -def test_svd_error(): +def test_eigh_error(): # Non symmetric (dis)similarity matrix: sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) with pytest.raises(ValueError, match="Array must be symmetric"): - mds.svd_scaler(sim) + mds.eigh_scaler(sim) # Non squared (dis)similarity matrix: sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]]) with pytest.raises(ValueError, match="array must be 2-dimensional and square"): - mds.svd_scaler(sim) + mds.eigh_scaler(sim) # Non Euclidean (dis)similarity matrix: sim = np.array([[0, 12, 3, 4], [12, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) with pytest.raises(ValueError, match="Dissimilarity matrix must be euclidean"): - mds.svd_scaler(sim) + mds.eigh_scaler(sim) def test_MDS_error(): # Bad solver name sim = np.ones((2, 2)) mdc_clf = mds.MDS(solver="bad name") - with pytest.raises(ValueError, match="Solver must be 'smacof' or 'svd'"): + with pytest.raises(ValueError, match="Solver must be 'smacof' or 'eigh'"): mdc_clf.fit(sim) - # SVD with metric=False + # eigh with metric=False sim = np.ones((2, 2)) - mdc_clf = mds.MDS(metric=False, solver="svd") - with pytest.raises(ValueError, match="Using SVD requires metric=True"): + mdc_clf = mds.MDS(metric=False, solver="eigh") + with pytest.raises(ValueError, match="Using eigh requires metric=True"): mdc_clf.fit(sim) @@ -78,14 +78,14 @@ def test_MDS(): mds_clf.fit(sim) -def test_MDS_svd(): - # Test svd using example data from "An Introduction to MDS" +def test_MDS_eigh(): + # Test eigh using example data from "An Introduction to MDS" # Florian Wickelmaier, p 11 sim = np.array( [[0, 93, 82, 133], [93, 0, 52, 60], [82, 52, 0, 111], [133, 60, 111, 0]] ) - mds_clf = mds.MDS(metric=True, solver="svd", dissimilarity="precomputed") + mds_clf = mds.MDS(metric=True, solver="eigh", dissimilarity="precomputed") mds_clf.fit(sim) X_true_1 = np.array( From 5e33d52044561463e3fc41e8057f2890f7229ae2 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Mon, 30 May 2022 17:25:20 -0400 Subject: [PATCH 17/27] Reconciled changelogs --- doc/whats_new/v1.1.rst | 187 ++++++++++++++++++++++++++--------------- doc/whats_new/v1.2.rst | 9 ++ 2 files changed, 127 insertions(+), 69 deletions(-) diff --git 
a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 771a2c3f01f9f..4c46c0d631f76 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -2,31 +2,92 @@ .. currentmodule:: sklearn +.. _changes_1_1_2: + +Version 1.1.2 +============= + +**In Development** + +Changelog +--------- + +:mod:`sklearn.cluster` +...................... + +- |Fix| Fixed a bug in :class:`cluster.Birch` that could trigger an error when splitting + a node if there are duplicates in the dataset. + :pr:`23395` by :user:`Jérémie du Boisberranger `. + .. _changes_1_1_1: Version 1.1.1 ============= -**In Development** +**May 2022** Changelog --------- +- |Enhancement| The error message is improved when importing + :class:`model_selection.HalvingGridSearchCV`, + :class:`model_selection.HalvingRandomSearchCV`, or + :class:`impute.IterativeImputer` without importing the experimental flag. + :pr:`23194` by `Thomas Fan`_. + +- |Enhancement| Added an extension in doc/conf.py to automatically generate + the list of estimators that handle NaN values. + :pr:`23198` by :user:`Lise Kleiber `, :user:`Zhehao Liu ` + and :user:`Chiara Marmo `. + +:mod:`sklearn.datasets` +....................... + +- |Fix| Avoid timeouts in :func:`datasets.fetch_openml` by not passing a + `timeout` argument, :pr:`23358` by :user:`Loïc Estève `. + :mod:`sklearn.decomposition` ............................ - |Fix| Avoid spurious warning in :class:`decomposition.IncrementalPCA` when - `n_samples == n_components`. :pr:`23264` by :user:`Lucy Liu ` + `n_samples == n_components`. :pr:`23264` by :user:`Lucy Liu `. + +:mod:`sklearn.feature_selection` +................................ + +- |Fix| The `partial_fit` method of :class:`feature_selection.SelectFromModel` + now conducts validation for `max_features` and `feature_names_in` parameters. + :pr:`23299` by :user:`Long Bao `. :mod:`sklearn.metrics` ...................... -- |Fix| Fixes `metrics.precision_recall_curve` to compute precision-recall at 100% +- |Fix| Fixes :func:`metrics.precision_recall_curve` to compute precision-recall at 100% recall. The Precision-Recall curve now displays the last point corresponding to a classifier that always predicts the positive class: recall=100% and precision=class balance. :pr:`23214` by :user:`Stéphane Collot ` and :user:`Max Baak `. +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.PolynomialFeatures` with ``degree`` equal to 0 + will raise error when ``include_bias`` is set to False, and outputs a single + constant array when ``include_bias`` is set to True. + :pr:`23370` by :user:`Zhehao Liu `. + +:mod:`sklearn.tree` +................... + +- |Fix| Fixes performance regression with low cardinality features for + :class:`tree.DecisionTreeClassifier`, + :class:`tree.DecisionTreeRegressor`, + :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, + :class:`ensemble.GradientBoostingClassifier`, and + :class:`ensemble.GradientBoostingRegressor`. + :pr:`23410` by :user:`Loïc Estève `. + :mod:`sklearn.utils` .................... @@ -201,17 +262,6 @@ Changelog :func:`sklearn.set_config`. :pr:`22856` by :user:`Jérémie du Boisberranger `. -- |Enhancement| The error message is improved when importing - :class:`model_selection.HalvingGridSearchCV`, - :class:`model_selection.HalvingRandomSearchCV`, or - :class:`impute.IterativeImputer` without importing the experimental flag. - :pr:`23194` by `Thomas Fan`_. 
- -- |Enhancement| Added an extension in doc/conf.py to automatically generate - the list of estimators that handle NaN values. - :pr:`23198` by `Lise Kleiber `_, :user:`Zhehao Liu ` - and :user:`Chiara Marmo `. - :mod:`sklearn.calibration` .......................... @@ -912,12 +962,6 @@ Changelog backward compatibility, but this alias will be removed in 1.3. :pr:`21177` by :user:`Julien Jerphanion `. -- |Feature| :class:`manifold.MDS` now supports a Singular Value Decomposition - (SVD). Users can choose whether to use SVD or SMACOF-based solvers via the - `solver` parameter. The SVD-based solver is faster and more accurate, but only - works for euclidean dissimilarity matrices in the metric-MDS context. - :pr:`16067` by :user:`Piotr Gaiński ` and - :pr:`22330` by :user:`Meekail Zain `. :mod:`sklearn.mixture` ...................... @@ -1189,55 +1233,60 @@ the project since version 1.0, including: 2357juan, Abhishek Gupta, adamgonzo, Adam Li, adijohar, Aditya Kumawat, Aditya Raghuwanshi, Aditya Singh, Adrian Trujillo Duron, Adrin Jalali, ahmadjubair33, AJ Druck, aj-white, Alan Peixinho, Alberto Mario Ceballos-Arroyo, Alek -Lefebvre, Alex, Alexandre Gramfort, alexanmv, almeidayoel, Amanda Dsouza, Aman -Sharma, Amar pratap singh, Amit, amrcode, András Simon, Andreas Mueller, -Andrew Knyazev, Andriy, Angus L'Herrou, Ankit Sharma, Anne Ducout, Arisa, Arth, -arthurmello, Arturo Amor, ArturoAmor, Atharva Patil, aufarkari, Aurélien -Geron, avm19, Ayan Bag, baam, Behrouz B, Ben3940, Benjamin Bossan, Bharat -Raghunathan, Bijil Subhash, bmreiniger, Brandon Truth, Brenden Kadota, Brian -Sun, cdrig, Chalmer Lowe, Chiara Marmo, Chitteti Srinath Reddy, Chloe-Agathe -Azencott, Christian Lorentzen, Christian Ritter, christopherlim98, Christoph T. -Weidemann, Christos Aridas, Claudio Salvatore Arcidiacono, combscCode, Daniela -Fernandes, Dave Eargle, David Poznik, Dea María Léon, Dennis Osei, DessyVV, -Dev514, Dimitri Papadopoulos Orfanos, Diwakar Gupta, Dr. Felix M. 
Riese, drskd, -Emiko Sano, Emmanouil Gionanidis, EricEllwanger, Erich Schubert, Eric Larson, -Eric Ndirangu, Estefania Barreto-Ojeda, eyast, Fatima GASMI, Federico Luna, -Felix Glushchenkov, fkaren27, Fortune Uwha, FPGAwesome, francoisgoupil, Frans -Larsson, Gabor Berei, Gabor Kertesz, Gabriel Stefanini Vicente, Gabriel S -Vicente, Gael Varoquaux, GAURAV CHOUDHARY, Gauthier I, genvalen, -Geoffrey-Paris, Giancarlo Pablo, glennfrutiz, gpapadok, Guillaume Lemaitre, -Guillermo Tomás Fernández Martín, Gustavo Oliveira, Haidar Almubarak, Hannah -Bohle, Haoyin Xu, Haya, Helder Geovane Gomes de Lima, henrymooresc, Hideaki -Imamura, Himanshu Kumar, Hind-M, hmasdev, hvassard, i-aki-y, iasoon, Inclusive -Coding Bot, Ingela, iofall, Ishan Kumar, Jack Liu, Jake Cowton, jalexand3r, J -Alexander, Jauhar, Jaya Surya Kommireddy, Jay Stanley, Jeff Hale, je-kr, -JElfner, Jenny Vo, Jérémie du Boisberranger, Jihane, Jirka Borovec, Joel -Nothman, Jon Haitz Legarreta Gorroño, Jordan Silke, Jorge Ciprián, Jorge -Loayza, Joseph Chazalon, Joseph Schwartz-Messing, JSchuerz, Juan Carlos Alfaro -Jiménez, Juan Martin Loyola, Julien Jerphanion, katotten, Kaushik Roy -Chowdhury, Ken4git, Kevin Doucet, KimAYoung, Koushik Joshi, Kranthi Sedamaki, -krumetoft, lesnee, Logan Thomas, Loic Esteve, Louis Wagner, LucieClair, Lucy -Liu, Luiz Eduardo Amaral, Magali, MaggieChege, Mai, mandjevant, Mandy Gu, -Manimaran, MarcoM, Maren Westermann, Maria Boerner, MarieS-WiMLDS, Martel -Corentin, mathurinm, Matías, matjansen, Matteo Francia, Maxwell, Meekail Zain, -Megabyte, Mehrdad Moradizadeh, melemo2, Michael I Chen, michalkrawczyk, -Micky774, milana2, millawell, Ming-Yang Ho, Mitzi, miwojc, Mizuki, mlant, -Mohamed Haseeb, Mohit Sharma, Moonkyung94, mpoemsl, MrinalTyagi, Mr. Leu, -msabatier, murata-yu, N, Nadirhan Şahin, NartayXD, nastegiano, nathansquan, +Lefebvre, Alex, Alexandr, Alexandre Gramfort, alexanmv, almeidayoel, Amanda +Dsouza, Aman Sharma, Amar pratap singh, Amit, amrcode, András Simon, Andreas +Grivas, Andreas Mueller, Andrew Knyazev, Andriy, Angus L'Herrou, Ankit Sharma, +Anne Ducout, Arisa, Arth, arthurmello, Arturo Amor, ArturoAmor, Atharva Patil, +aufarkari, Aurélien Geron, avm19, Ayan Bag, baam, Bardiya Ak, Behrouz B, +Ben3940, Benjamin Bossan, Bharat Raghunathan, Bijil Subhash, bmreiniger, +Brandon Truth, Brenden Kadota, Brian Sun, cdrig, Chalmer Lowe, Chiara Marmo, +Chitteti Srinath Reddy, Chloe-Agathe Azencott, Christian Lorentzen, Christian +Ritter, christopherlim98, Christoph T. Weidemann, Christos Aridas, Claudio +Salvatore Arcidiacono, combscCode, Daniela Fernandes, darioka, Darren Nguyen, +Dave Eargle, David Gilbertson, David Poznik, Dea María Léon, Dennis Osei, +DessyVV, Dev514, Dimitri Papadopoulos Orfanos, Diwakar Gupta, Dr. Felix M. 
+Riese, drskd, Emiko Sano, Emmanouil Gionanidis, EricEllwanger, Erich Schubert, +Eric Larson, Eric Ndirangu, ErmolaevPA, Estefania Barreto-Ojeda, eyast, Fatima +GASMI, Federico Luna, Felix Glushchenkov, fkaren27, Fortune Uwha, FPGAwesome, +francoisgoupil, Frans Larsson, ftorres16, Gabor Berei, Gabor Kertesz, Gabriel +Stefanini Vicente, Gabriel S Vicente, Gael Varoquaux, GAURAV CHOUDHARY, +Gauthier I, genvalen, Geoffrey-Paris, Giancarlo Pablo, glennfrutiz, gpapadok, +Guillaume Lemaitre, Guillermo Tomás Fernández Martín, Gustavo Oliveira, Haidar +Almubarak, Hannah Bohle, Hansin Ahuja, Haoyin Xu, Haya, Helder Geovane Gomes de +Lima, henrymooresc, Hideaki Imamura, Himanshu Kumar, Hind-M, hmasdev, hvassard, +i-aki-y, iasoon, Inclusive Coding Bot, Ingela, iofall, Ishan Kumar, Jack Liu, +Jake Cowton, jalexand3r, J Alexander, Jauhar, Jaya Surya Kommireddy, Jay +Stanley, Jeff Hale, je-kr, JElfner, Jenny Vo, Jérémie du Boisberranger, Jihane, +Jirka Borovec, Joel Nothman, Jon Haitz Legarreta Gorroño, Jordan Silke, Jorge +Ciprián, Jorge Loayza, Joseph Chazalon, Joseph Schwartz-Messing, Jovan +Stojanovic, JSchuerz, Juan Carlos Alfaro Jiménez, Juan Martin Loyola, Julien +Jerphanion, katotten, Kaushik Roy Chowdhury, Ken4git, Kenneth Prabakaran, +kernc, Kevin Doucet, KimAYoung, Koushik Joshi, Kranthi Sedamaki, krishna kumar, +krumetoft, lesnee, Lisa Casino, Logan Thomas, Loic Esteve, Louis Wagner, +LucieClair, Lucy Liu, Luiz Eduardo Amaral, Magali, MaggieChege, Mai, +mandjevant, Mandy Gu, Manimaran, MarcoM, Marco Wurps, Maren Westermann, Maria +Boerner, MarieS-WiMLDS, Martel Corentin, martin-kokos, mathurinm, Matías, +matjansen, Matteo Francia, Maxwell, Meekail Zain, Megabyte, Mehrdad +Moradizadeh, melemo2, Michael I Chen, michalkrawczyk, Micky774, milana2, +millawell, Ming-Yang Ho, Mitzi, miwojc, Mizuki, mlant, Mohamed Haseeb, Mohit +Sharma, Moonkyung94, mpoemsl, MrinalTyagi, Mr. Leu, msabatier, murata-yu, N, +Nadirhan Şahin, Naipawat Poolsawat, NartayXD, nastegiano, nathansquan, nat-salt, Nicki Skafte Detlefsen, Nicolas Hug, Niket Jain, Nikhil Suresh, Nikita Titov, Nikolay Kondratyev, Ohad Michel, Oleksandr Husak, Olivier Grisel, -partev, Patrick Ferreira, Paul, pelennor, PierreAttard, Pieter Gijsbers, Pinky, -poloso, Pramod Anantharam, puhuk, Purna Chandra Mansingh, QuadV, Rahil Parikh, -Randall Boyes, randomgeek78, Raz Hoshia, Reshama Shaikh, Ricardo Ferreira, -Richard Taylor, Rileran, Rishabh, Robin Thibaut, Roman Feldbauer, Roman -Yurchak, Ross Barnowski, rsnegrin, Sachin Yadav, sakinaOuisrani, Sam Adam Day, -Sanjay Marreddi, Sebastian Pujalte, SEELE, Seyedsaman (Sam) Emami, ShanDeng123, -Shao Yang Hong, sharmadharmpal, shaymerNaturalint, Shubhraneel Pal, siavrez, -slishak, Smile, spikebh, sply88, Sultan Orazbayev, Sumit Saha, Sven Eschlbeck, -Swapnil Jha, Sylvain Marié, Takeshi Oura, Tamires Santana, Tenavi, teunpe, -Theis Ferré Hjortkjær, Thiruvenkadam, Thomas J. 
Fan, t-jakubek, Tom Dupré la -Tour, TONY GEORGE, Tyler Martin, Tyler Reddy, Udit Gupta, Ugo Marchand, Varun -Agrawal, Venkatachalam N, Vera Komeyer, victoirelouis, Vikas Vishwakarma, -Vikrant khedkar, Vladimir Chernyy, Vladimir Kim, WeijiaDu, Xiao Yuan, Yar Khine -Phyo, Ying Xiong, yiyangq, Yosshi999, Yuki Koyama, Zach Deane-Mayer, Zeel B Patel, +partev, Patrick Ferreira, Paul, pelennor, PierreAttard, Piet Brömmel, Pieter +Gijsbers, Pinky, poloso, Pramod Anantharam, puhuk, Purna Chandra Mansingh, +QuadV, Rahil Parikh, Randall Boyes, randomgeek78, Raz Hoshia, Reshama Shaikh, +Ricardo Ferreira, Richard Taylor, Rileran, Rishabh, Robin Thibaut, Rocco Meli, +Roman Feldbauer, Roman Yurchak, Ross Barnowski, rsnegrin, Sachin Yadav, +sakinaOuisrani, Sam Adam Day, Sanjay Marreddi, Sebastian Pujalte, SEELE, SELEE, +Seyedsaman (Sam) Emami, ShanDeng123, Shao Yang Hong, sharmadharmpal, +shaymerNaturalint, Shuangchi He, Shubhraneel Pal, siavrez, slishak, Smile, +spikebh, sply88, Srinath Kailasa, Stéphane Collot, Sultan Orazbayev, Sumit +Saha, Sven Eschlbeck, Sven Stehle, Swapnil Jha, Sylvain Marié, Takeshi Oura, +Tamires Santana, Tenavi, teunpe, Theis Ferré Hjortkjær, Thiruvenkadam, Thomas +J. Fan, t-jakubek, toastedyeast, Tom Dupré la Tour, Tom McTiernan, TONY GEORGE, +Tyler Martin, Tyler Reddy, Udit Gupta, Ugo Marchand, Varun Agrawal, +Venkatachalam N, Vera Komeyer, victoirelouis, Vikas Vishwakarma, Vikrant +khedkar, Vladimir Chernyy, Vladimir Kim, WeijiaDu, Xiao Yuan, Yar Khine Phyo, +Ying Xiong, yiyangq, Yosshi999, Yuki Koyama, Zach Deane-Mayer, Zeel B Patel, zempleni, zhenfisher, 赵丰 (Zhao Feng) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 5c2056791ace2..c5eef1bdaea94 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -85,6 +85,15 @@ Changelog matrices in a variety of estimators and avoid an `EfficiencyWarning`. :pr:`23139` by `Tom Dupre la Tour`_. +:mod:`sklearn.manifold`: + +- |Feature| :class:`manifold.MDS` now supports a Singular Value Decomposition + (SVD). Users can choose whether to use SVD or SMACOF-based solvers via the + `solver` parameter. The SVD-based solver is faster and more accurate, but only + works for euclidean dissimilarity matrices in the metric-MDS context. + :pr:`16067` by :user:`Piotr Gaiński ` and + :pr:`22330` by :user:`Meekail Zain `. 
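As a concrete illustration of the changelog entry above, a minimal usage sketch follows. It
assumes the ``solver`` values ``'smacof'`` and ``'eigh'`` used by the accompanying code changes
and a precomputed Euclidean dissimilarity matrix; it is an illustrative sketch of the intended
API rather than an excerpt from the patch::

    import numpy as np
    from sklearn.manifold import MDS
    from sklearn.metrics import euclidean_distances

    rng = np.random.RandomState(0)
    points = rng.rand(10, 3)
    D = euclidean_distances(points)  # Euclidean by construction

    # Default iterative SMACOF solver.
    X_smacof = MDS(
        n_components=2, dissimilarity="precomputed", random_state=0
    ).fit_transform(D)

    # Direct solver added by this series; requires metric=True (the default)
    # and a Euclidean dissimilarity matrix.
    X_eigh = MDS(
        n_components=2, dissimilarity="precomputed", solver="eigh"
    ).fit_transform(D)

    print(X_smacof.shape, X_eigh.shape)  # (10, 2) (10, 2)

Because the ``'eigh'`` path computes the embedding from a single eigendecomposition, the
SMACOF-specific parameters (``n_init``, ``max_iter``, ``eps``, ``n_jobs``) are ignored when it
is selected.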
+ Code and Documentation Contributors ----------------------------------- From 0e03893c5d6ce2610fc019be866e70acad250eb7 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Tue, 31 May 2022 16:17:10 -0400 Subject: [PATCH 18/27] Improved test --- sklearn/manifold/tests/test_mds.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index ce1c3c2681379..66d962ce11afa 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -101,9 +101,7 @@ def test_MDS_eigh(): # Signs of columns are dependent on signs of computed eigenvectors # which are arbitrary and meaningless - assert ( - np.allclose(mds_clf.embedding_, X_true_1) - or np.allclose(mds_clf.embedding_, -X_true_1) - or np.allclose(mds_clf.embedding_, X_true_2) - or np.allclose(mds_clf.embedding_, -X_true_2) - ) + match = False + for X_possible in (X_true_1, -X_true_1, X_true_2, -X_true_2): + match = match or np.allclose(mds_clf.embedding_, X_possible) + assert match From 2dd78facee7fc6549d15052a9e7be25c119cc698 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Jun 2022 16:31:49 -0400 Subject: [PATCH 19/27] Improved documentation and testing --- sklearn/manifold/_mds.py | 17 ++++++++--------- sklearn/manifold/tests/test_mds.py | 9 +++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 750d54aa7e9df..0657ab8584c0e 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -389,25 +389,24 @@ class MDS(BaseEstimator): metric : bool, default=True If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. - If ``solver=='eigh'``, metric must be set to True. + If `solver=='eigh'`, metric must be set to `True`. n_init : int, default=4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. - Used only when ``solver=='smacof'``. + Ignored if `solver=='eigh'`. max_iter : int, optional, default=300 Maximum number of iterations of the SMACOF algorithm for a single run. - Ignored if ``solver=='eigh'``. + Ignored if `solver=='eigh'`. verbose : int, optional, default=0 Level of verbosity. eps : float, default=1e-3 Relative tolerance with respect to stress at which to declare - convergence. - Ignored if ``solver=='eigh'``. + convergence. Ignored if `solver=='eigh'`. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. If multiple @@ -416,8 +415,7 @@ class MDS(BaseEstimator): ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` - for more details. - Ignored if ``solver=='eigh'``. + for more details. Ignored if `solver=='eigh'`. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -435,9 +433,10 @@ class MDS(BaseEstimator): ``fit_transform``. solver : {'smacof', 'eigh'}, default = 'smacof' - The solver used for solving the MDS problem. + The solver used for solving the MDS problem. The `eigh` solver is only + usable when `metric=False` but is often significantly faster. - .. versionadded:: 1.1 + .. 
versionadded:: 1.2 Attributes ---------- diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 66d962ce11afa..5d6966f196922 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -105,3 +105,12 @@ def test_MDS_eigh(): for X_possible in (X_true_1, -X_true_1, X_true_2, -X_true_2): match = match or np.allclose(mds_clf.embedding_, X_possible) assert match + +def test_nonmetric_mds_eigh_error(): + sim = np.ones((2, 2)) + mds_clf = mds.MDS(metric=False, solver="eigh") + msg = "Using eigh requires metric=True" + with pytest.raises(ValueError, match=msg): + mds_clf.fit(sim) + with pytest.raises(ValueError, match=msg): + mds_clf.fit_transform(sim) From 0f49a13f31da1cc50217ce21a97226549af9c4fb Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Jun 2022 17:08:21 -0400 Subject: [PATCH 20/27] Significant improvements to user guide entry --- doc/modules/manifold.rst | 50 +++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index 25e08092f25e8..afa016e17dab8 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -429,12 +429,12 @@ countries. There exists two types of MDS algorithm: metric and non metric. In the scikit-learn, the class :class:`MDS` implements both. In Metric MDS, the input -similarity matrix arises from a metric (and thus respects the triangular +dissimilarity matrix arises from a metric (and thus respects the triangular inequality), the distances between output two points are then set to be as -close as possible to the similarity or dissimilarity data. In the non-metric -version, the algorithms will try to preserve the order of the distances, and -hence seek for a monotonic relationship between the distances in the embedded -space and the similarities/dissimilarities. +close as possible to the dissimilarity data. In the non-metric version, the +algorithms will try to preserve the order of the distances, and hence seek for +a monotonic relationship between the distances in the embedded space and the +dissimilarities. .. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_010.png :target: ../auto_examples/manifold/plot_lle_digits.html @@ -442,9 +442,9 @@ space and the similarities/dissimilarities. :scale: 50 -Let :math:`S` be the similarity matrix, and :math:`X` the coordinates of the +Let :math:`D` be the dissimilarity matrix, and :math:`X` the coordinates of the :math:`n` input points. Disparities :math:`\hat{d}_{ij}` are transformation of -the similarities chosen in some optimal ways. The objective, called the +the dissimilarities chosen in some optimal ways. The objective, called the stress, is then defined by :math:`\sum_{i < j} d_{ij}(X) - \hat{d}_{ij}(X)` @@ -452,14 +452,15 @@ Metric MDS ---------- The simplest metric :class:`MDS` model, called *absolute MDS*, disparities are defined by -:math:`\hat{d}_{ij} = S_{ij}`. With absolute MDS, the value :math:`S_{ij}` +:math:`\hat{d}_{ij} = D_{ij}`. With absolute MDS, the value :math:`D_{ij}` should then correspond exactly to the distance between point :math:`i` and :math:`j` in the embedding point. -Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. +Most commonly, disparities are set to :math:`\hat{d}_{ij} = b D_{ij}`. -If the metric of :math:`S` is Euclidean, user can choose to use faster and more accurate -solver of calculating results. See :ref:`multidimensional_scaling_solver` for details. 
+If the dissimilarity matrix :math:`D` is "Euclidean", user can choose to use +faster and more accurate solver of calculating results. See +:ref:`multidimensional_scaling_solver` for details. .. _multidimensional_scaling_solver: @@ -471,17 +472,30 @@ results: SMACOF and SVD-based. The SMACOF solver (Scaling by MAjorizing a COmplicated Function) minimizes its objective function (stress) in an iterative manner. The SVD-based solver performs series of transformations (including Singular Value Decomposition) to give exact result. The SVD-based solver is -thus much faster and more accurate, but also less general - it requires metric -of :math:`S` to be Euclidean. +thus much faster and more accurate, but also less general - it requires :math:`D` +to be "Euclidean". Specifically, we refer to :math:`D` as "Euclidean" if there +exists a set of points :math:`p_1,\dots,p_n` such that the dissimilarities can +be generated by taking the euclidean distance between such points, i.e. + +.. math:: + D_{i,j}=\|p_i-p_j\|_2^2 + +Let the matrix :math:`\Delta` be the matrix of transformed dissimilarities, such +that :math:`\Delta_{i,j}=\frac{-1}{d_{i,j}^2}`. Then define the vector of ones as +:math:`J=[1,\dots,1]^\intercal`. We then say that the matrix :math:`D` is "Euclidean" +iff the matrix :math:`L=(I-Js^\intercal)\Delta (I-sJ^\intercal)` is positive +semi-definite for some vector :math:`s` such that :math:`s^\intercal J=1` (e.g. +:math:`s=e_i` or :math:`s=\frac{1}{n}J`). Saying that :math:`L` is positive +semi-definite is equivalent to saying that all eigenvalues of :math:`L` are non-negative. Nonmetric MDS ------------- Non metric :class:`MDS` focuses on the ordination of the data. If -:math:`S_{ij} < S_{jk}`, then the embedding should enforce :math:`d_{ij} < +:math:`D_{ij} < D_{jk}`, then the embedding should enforce :math:`d_{ij} < d_{jk}`. A simple algorithm to enforce that is to use a monotonic regression -of :math:`d_{ij}` on :math:`S_{ij}`, yielding disparities :math:`\hat{d}_{ij}` -in the same order as :math:`S_{ij}`. +of :math:`d_{ij}` on :math:`D_{ij}`, yielding disparities :math:`\hat{d}_{ij}` +in the same order as :math:`D_{ij}`. A trivial solution to this problem is to set all the points on the origin. In order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. @@ -495,6 +509,10 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. .. topic:: References: + * `"Metric and Euclidean properties of dissimilarity coefficients" + `_ + Gower, J.C., Legendre, P. Journal of Classification 3, 5-48 (1986) + * `"Modern Multidimensional Scaling - Theory and Applications" `_ Borg, I.; Groenen P. Springer Series in Statistics (1997) From 3570a596caefa5d5c4d893a6fa30351404e48e38 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Jun 2022 17:09:49 -0400 Subject: [PATCH 21/27] Minor text correction --- sklearn/manifold/_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 0657ab8584c0e..19c539f3cf106 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -352,7 +352,7 @@ def eigh_scaler(dissimilarities, n_components=2): w, V = linalg.eigh(B, check_finite=False) - # ``dissimilarities`` is Euclidean iff ``B`` is positive semi-definite. + # `dissimilarities` is Euclidean iff `B` is positive semi-definite. 
# See "Metric and Euclidean properties of dissimilarity coefficients" # for details try: From 51a684fbc43493c44f9cf762389b447cb2f3f084 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Jun 2022 17:20:23 -0400 Subject: [PATCH 22/27] Extended documentation --- doc/modules/manifold.rst | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index afa016e17dab8..1d458ed83e43d 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -478,16 +478,27 @@ exists a set of points :math:`p_1,\dots,p_n` such that the dissimilarities can be generated by taking the euclidean distance between such points, i.e. .. math:: - D_{i,j}=\|p_i-p_j\|_2^2 + D_{i,j}=\|p_i-p_j\|_2^2 Let the matrix :math:`\Delta` be the matrix of transformed dissimilarities, such that :math:`\Delta_{i,j}=\frac{-1}{d_{i,j}^2}`. Then define the vector of ones as :math:`J=[1,\dots,1]^\intercal`. We then say that the matrix :math:`D` is "Euclidean" -iff the matrix :math:`L=(I-Js^\intercal)\Delta (I-sJ^\intercal)` is positive -semi-definite for some vector :math:`s` such that :math:`s^\intercal J=1` (e.g. +iff the matrix + +.. math:: + L=(I-Js^\intercal)\Delta (I-sJ^\intercal) + +is positive semi-definite for some vector :math:`s` such that :math:`s^\intercal J=1` (e.g. :math:`s=e_i` or :math:`s=\frac{1}{n}J`). Saying that :math:`L` is positive semi-definite is equivalent to saying that all eigenvalues of :math:`L` are non-negative. +.. note:: + While technically :math:`D` is "Euclidean" if there exists *some* vector :math:`s` + satisfying the above equation, it is shown that it is sufficient to test an + arbitrary vector :math:`s` satisfying the above conditions; if there exists + a single valid choice of :math:`s` solving the above, then *every* valid choice + of :math:`s` must solve the above. 
+ Nonmetric MDS ------------- From 63a9f1caed66ac40a153b00ef97f8f59de03285c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Jun 2022 18:29:19 -0400 Subject: [PATCH 23/27] Linting --- sklearn/manifold/tests/test_mds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 5d6966f196922..9f67b968f6484 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -106,6 +106,7 @@ def test_MDS_eigh(): match = match or np.allclose(mds_clf.embedding_, X_possible) assert match + def test_nonmetric_mds_eigh_error(): sim = np.ones((2, 2)) mds_clf = mds.MDS(metric=False, solver="eigh") From e8eef3656199c2b8094660931e0d2e848636cc8e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 30 Jun 2022 18:45:25 -0400 Subject: [PATCH 24/27] Reconciled param validation changes and streamlined tests --- sklearn/manifold/_mds.py | 8 ++------ sklearn/manifold/tests/test_mds.py | 16 +--------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index fc41b37b65070..d37c08cb91678 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -518,6 +518,7 @@ class MDS(BaseEstimator): "n_jobs": [None, Integral], "random_state": ["random_state"], "dissimilarity": [StrOptions({"euclidean", "precomputed"})], + "solver": [StrOptions({"smacof", "eigh"})], } def __init__( @@ -630,14 +631,9 @@ def fit_transform(self, X, y=None, init=None): ) elif self.solver == "eigh": if not self.metric: - raise ValueError("Using eigh requires metric=True") + raise ValueError("Using the eigh solver requires metric=True") self.embedding_, self.stress_ = eigh_scaler( self.dissimilarity_matrix_, n_components=self.n_components ) self.n_iter_ = None - else: - raise ValueError( - "Solver must be 'smacof' or 'eigh'. 
Got %s instead" % str(self.solver) - ) - return self.embedding_ diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 9f67b968f6484..77c3f2544d7f2 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -58,20 +58,6 @@ def test_eigh_error(): mds.eigh_scaler(sim) -def test_MDS_error(): - # Bad solver name - sim = np.ones((2, 2)) - mdc_clf = mds.MDS(solver="bad name") - with pytest.raises(ValueError, match="Solver must be 'smacof' or 'eigh'"): - mdc_clf.fit(sim) - - # eigh with metric=False - sim = np.ones((2, 2)) - mdc_clf = mds.MDS(metric=False, solver="eigh") - with pytest.raises(ValueError, match="Using eigh requires metric=True"): - mdc_clf.fit(sim) - - def test_MDS(): sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed") @@ -110,7 +96,7 @@ def test_MDS_eigh(): def test_nonmetric_mds_eigh_error(): sim = np.ones((2, 2)) mds_clf = mds.MDS(metric=False, solver="eigh") - msg = "Using eigh requires metric=True" + msg = "Using the eigh solver requires metric=True" with pytest.raises(ValueError, match=msg): mds_clf.fit(sim) with pytest.raises(ValueError, match=msg): From aa074a4efd9c72441dd82ea21927fea4a1fa8d6a Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 30 Jun 2022 19:30:14 -0400 Subject: [PATCH 25/27] Reverting unnecessary diff --- sklearn/manifold/_mds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d37c08cb91678..5a0c3c99e1160 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -524,6 +524,7 @@ class MDS(BaseEstimator): def __init__( self, n_components=2, + *, metric=True, n_init=4, max_iter=300, From 595ca9dd7b8f3196d596513f2aa31d7a32b1417e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 30 Jun 2022 19:42:08 -0400 Subject: [PATCH 26/27] Added clarifying comment to test --- sklearn/manifold/tests/test_mds.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 77c3f2544d7f2..a2f9943fd4729 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -66,7 +66,8 @@ def test_MDS(): def test_MDS_eigh(): # Test eigh using example data from "An Introduction to MDS" - # Florian Wickelmaier, p 11 + # Florian Wickelmaier, p 11. Validated against R implementation + # (cmdscale) as well. 
sim = np.array( [[0, 93, 82, 133], [93, 0, 52, 60], [82, 52, 0, 111], [133, 60, 111, 0]] ) From 12fbbe12b13f3b93256744c5022c634d1715c9ef Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 6 Jul 2022 15:45:04 -0400 Subject: [PATCH 27/27] Adopted suggestion from @thisirs --- sklearn/manifold/_mds.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 5a0c3c99e1160..c1fe3f548606a 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -343,15 +343,12 @@ def eigh_scaler(dissimilarities, n_components=2): dissimilarities = check_symmetric(dissimilarities, raise_exception=True) - n_samples = dissimilarities.shape[0] - - # Centering matrix - J = np.eye(*dissimilarities.shape) - (1.0 / n_samples) * ( - np.ones(dissimilarities.shape) - ) - - # Double centered matrix - B = -0.5 * np.dot(J, np.dot(dissimilarities**2, J)) + # Centering + B = dissimilarities**2 + B = B.astype(np.float64) + B -= np.mean(B, axis=0) + B -= np.mean(B, axis=1, keepdims=True) + B *= -0.5 w, V = linalg.eigh(B, check_finite=False)
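As a quick sanity check of this last refactoring, the in-place mean subtraction can be compared
against the explicit centering-matrix product it replaces. The snippet below is an illustration
only and assumes nothing beyond NumPy and ``euclidean_distances``::

    import numpy as np
    from sklearn.metrics import euclidean_distances

    rng = np.random.RandomState(0)
    D = euclidean_distances(rng.rand(8, 4))
    n = D.shape[0]

    # Previous formulation: explicit centering matrix applied on both sides.
    H = np.eye(n) - np.ones((n, n)) / n
    B_explicit = -0.5 * H @ (D**2) @ H

    # New formulation: subtract column means, then row means, then scale.
    B = D**2
    B = B.astype(np.float64)
    B -= np.mean(B, axis=0)
    B -= np.mean(B, axis=1, keepdims=True)
    B *= -0.5

    print(np.allclose(B, B_explicit))  # True

The two agree to floating-point precision, while the new form avoids two dense matrix products.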