FEAT Large Margin Nearest Neighbor implementation #8602
@@ -843,3 +843,214 @@ added space complexity in the operation.

`Wikipedia entry on Neighborhood Components Analysis
<https://en.wikipedia.org/wiki/Neighbourhood_components_analysis>`_

.. _lmnn:

Large Margin Nearest Neighbor
=============================

.. sectionauthor:: John Chiotellis <johnyc.code@gmail.com>

Large Margin Nearest Neighbor (LMNN, :class:`LargeMarginNearestNeighbor`) is
a metric learning algorithm which aims to improve the accuracy of
nearest neighbors classification compared to the standard Euclidean distance.

.. |lmnn_illustration_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_illustration_001.png
   :target: ../auto_examples/neighbors/plot_lmnn_illustration.html
   :scale: 50

.. |lmnn_illustration_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_illustration_002.png
   :target: ../auto_examples/neighbors/plot_lmnn_illustration.html
   :scale: 50

.. centered:: |lmnn_illustration_1| |lmnn_illustration_2|


For each training sample, the algorithm fixes :math:`k` "target neighbors",
namely the :math:`k`-nearest training samples (as measured by the Euclidean
distance) that share the same label. Given these target neighbors, LMNN
learns a linear transformation of the data by optimizing a trade-off between
two goals. The first one is to make each (transformed) point closer to its
target neighbors than to any differently-labeled point by a large margin,
thereby enclosing the target neighbors in a sphere around the reference
sample. Data samples from different classes that violate this margin are
called "impostors". The second goal is to minimize the distances of each
sample to its target neighbors, which can be seen as a form of regularization.
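
As an illustration only (this is not how the estimator exposes or computes
them internally), the target neighbors could be found along these lines with
:class:`NearestNeighbors`, by searching for the :math:`k` nearest points
within each class::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.neighbors import NearestNeighbors

    X, y = load_iris(return_X_y=True)
    k = 3
    target_neighbors = np.empty((X.shape[0], k), dtype=int)
    for label in np.unique(y):
        idx = np.flatnonzero(y == label)             # samples of this class
        nn = NearestNeighbors(n_neighbors=k + 1).fit(X[idx])
        # the nearest neighbor of each point is the point itself, so drop it
        same_class = nn.kneighbors(X[idx], return_distance=False)[:, 1:]
        target_neighbors[idx] = idx[same_class]      # map back to global indices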

Classification
--------------

Combined with a nearest neighbors classifier (:class:`KNeighborsClassifier`),
this method is attractive for classification because it can naturally
handle multi-class problems without any increase in the model size, and only
a single parameter (``n_neighbors``) has to be selected by the user before
training.

Large Margin Nearest Neighbor classification has been shown to work well in
practice for data sets of varying size and difficulty. In contrast to
related methods such as Linear Discriminant Analysis, LMNN does not make any
assumptions about the class distributions. The nearest neighbor classification
can naturally produce highly irregular decision boundaries.

To use this model for classification, one needs to combine a :class:`LargeMarginNearestNeighbor`
instance that learns the optimal transformation with a :class:`KNeighborsClassifier`
instance that performs the classification in the embedded space. Here is an
example using the two classes:

    >>> from sklearn.neighbors import LargeMarginNearestNeighbor
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import train_test_split
    >>> X, y = load_iris(return_X_y=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...     stratify=y, test_size=0.7, random_state=42)
    >>> lmnn = LargeMarginNearestNeighbor(n_neighbors=3, random_state=42)
    >>> lmnn.fit(X_train, y_train)
    LargeMarginNearestNeighbor(...)
    >>> # Apply the learned transformation when using KNeighborsClassifier
    >>> knn = KNeighborsClassifier(n_neighbors=3)
    >>> knn.fit(lmnn.transform(X_train), y_train)
    KNeighborsClassifier(...)
    >>> print(knn.score(lmnn.transform(X_test), y_test))
    0.971428...

Alternatively, one can create a :class:`sklearn.pipeline.Pipeline` instance
that automatically applies the transformation when fitting or predicting:

    >>> from sklearn.pipeline import Pipeline
    >>> lmnn = LargeMarginNearestNeighbor(n_neighbors=3, random_state=42)
    >>> knn = KNeighborsClassifier(n_neighbors=3)
    >>> lmnn_pipe = Pipeline([('lmnn', lmnn), ('knn', knn)])
    >>> lmnn_pipe.fit(X_train, y_train)
    Pipeline(...)
    >>> print(lmnn_pipe.score(X_test, y_test))
    0.971428...

.. |lmnn_classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_classification_001.png
   :target: ../auto_examples/neighbors/plot_lmnn_classification.html
   :scale: 50

.. |lmnn_classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_classification_002.png
   :target: ../auto_examples/neighbors/plot_lmnn_classification.html
   :scale: 50

.. centered:: |lmnn_classification_1| |lmnn_classification_2|


The plot shows decision boundaries for nearest neighbor classification and
large margin nearest neighbor classification.

.. _lmnn_dim_reduction:

Dimensionality reduction
------------------------

[PR review comments on this heading]
- This should be referenced in
- ...should be in sklearn/decomposition? I think decomposition implies unsupervised methods, no?
- Just mention it there, but I'm a bit ambivalent. Often people use decomposition despite available supervision, so informing them of supervised alternatives seems helpful.
- Similar cases probably include linear discriminant analysis, CCA, PLS, principal component regression, ...

:class:`LargeMarginNearestNeighbor` can be used to perform supervised
dimensionality reduction. The input data are mapped to a linear subspace
consisting of the directions which minimize the LMNN objective. Unlike
unsupervised methods which aim to maximize the uncorrelatedness (PCA) or even
the independence (ICA) of the components, LMNN aims to find components that
maximize the nearest neighbors classification accuracy of the transformed
inputs. The desired output dimensionality can be set using the parameter
``n_components``. For instance, the following shows a comparison of
dimensionality reduction with Principal Component Analysis
(:class:`sklearn.decomposition.PCA`), Linear Discriminant Analysis
(:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) and
Large Margin Nearest Neighbor (:class:`LargeMarginNearestNeighbor`) on the
Olivetti faces dataset, a dataset of size :math:`n_{samples} = 400` and
:math:`n_{features} = 64 \times 64 = 4096`. The data set is split into a
training and a test set of equal size. For evaluation, the 3-nearest neighbor
classification accuracy is computed on the 2-dimensional embedding found by
each method. Each data sample belongs to one of 40 classes.
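
A rough sketch of such a comparison (not the example script itself; the
estimator's parameters ``n_neighbors`` and ``n_components`` are assumed as
described above, and running LMNN directly on the 4096 raw pixel features can
be slow)::

    from sklearn.datasets import fetch_olivetti_faces
    from sklearn.decomposition import PCA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier, LargeMarginNearestNeighbor
    from sklearn.pipeline import make_pipeline

    faces = fetch_olivetti_faces()
    X, y = faces.data, faces.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.5, random_state=42)

    reducers = [PCA(n_components=2),
                LinearDiscriminantAnalysis(n_components=2),
                LargeMarginNearestNeighbor(n_neighbors=3, n_components=2,
                                           random_state=42)]
    for reducer in reducers:
        # reduce to 2 dimensions, then classify with 3-nearest neighbors
        model = make_pipeline(reducer, KNeighborsClassifier(n_neighbors=3))
        model.fit(X_train, y_train)
        print(type(reducer).__name__, model.score(X_test, y_test))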

.. |lmnn_dim_reduction_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_dim_reduction_001.png
   :target: ../auto_examples/neighbors/plot_lmnn_dim_reduction.html
   :width: 32%

.. |lmnn_dim_reduction_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_dim_reduction_002.png
   :target: ../auto_examples/neighbors/plot_lmnn_dim_reduction.html
   :width: 32%

.. |lmnn_dim_reduction_3| image:: ../auto_examples/neighbors/images/sphx_glr_plot_lmnn_dim_reduction_003.png
   :target: ../auto_examples/neighbors/plot_lmnn_dim_reduction.html
   :width: 32%

.. centered:: |lmnn_dim_reduction_1| |lmnn_dim_reduction_2| |lmnn_dim_reduction_3|


Mathematical formulation
------------------------

LMNN learns a linear transformation matrix :math:`L` of
size ``(n_components, n_features)``. The objective function consists of
two competing terms, the pull loss that pulls target neighbors closer to
their reference sample and the push loss that pushes impostors away:

.. math::
    \varepsilon_{\text{pull}} (L) = \sum_{i, j \rightsquigarrow i} ||L(x_i - x_j)||^2,

.. math::
    \varepsilon_{\text{push}} (L) = \sum_{i, j \rightsquigarrow i} \sum_{l}
    (1 - y_{il}) [1 + ||L(x_i - x_j)||^2 - ||L(x_i - x_l)||^2]_+,

where :math:`y_{il} = 1` if :math:`y_i = y_l` and :math:`0` otherwise,
:math:`[x]_+ = \max(0, x)` is the hinge loss, and :math:`j \rightsquigarrow i`
means that the :math:`j^{th}` sample is a target neighbor of the
:math:`i^{th}` sample.

LMNN solves the following (nonconvex) minimization problem:

.. math::
    \min_L \varepsilon(L) = (1 - \mu) \varepsilon_{\text{pull}} (L) +
    \mu \varepsilon_{\text{push}} (L) \text{, } \quad \mu \in [0,1].

The parameter :math:`\mu` (``push_loss_weight``) calibrates the trade-off
between penalizing large distances to target neighbors and penalizing margin
violations by impostors. In practice, the two terms are usually weighted
equally (:math:`\mu = 0.5`).
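
For illustration, a naive (and slow) NumPy evaluation of this objective could
look as follows, assuming ``target_neighbors`` holds, for each sample, the
indices of its :math:`k` target neighbors as in the earlier sketch; the
estimator itself uses a much more efficient formulation::

    import numpy as np

    def lmnn_objective(L, X, y, target_neighbors, push_loss_weight=0.5):
        """Naively evaluate the LMNN objective for a transformation L."""
        LX = X @ L.T                                   # transformed samples
        pull_loss, push_loss = 0.0, 0.0
        for i in range(X.shape[0]):
            d_all = np.sum((LX[i] - LX) ** 2, axis=1)  # squared distances from sample i
            for j in target_neighbors[i]:
                d_ij = d_all[j]
                pull_loss += d_ij
                # margin violations by differently labeled samples ("impostors")
                margins = 1 + d_ij - d_all[y != y[i]]
                push_loss += np.sum(np.maximum(margins, 0))   # hinge loss
        mu = push_loss_weight
        return (1 - mu) * pull_loss + mu * push_loss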


Mahalanobis distance
^^^^^^^^^^^^^^^^^^^^

LMNN can be seen as learning a (squared) Mahalanobis distance metric:

.. math::
    ||L(x_i - x_j)||^2 = (x_i - x_j)^T M (x_i - x_j),

where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size
``(n_features, n_features)``. The objective function of LMNN can be
rewritten and solved with respect to :math:`M` directly. This results in a
convex but constrained problem (since :math:`M` must be symmetric positive
semi-definite). See the journal paper in the References for more details.
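
This identity is easy to verify numerically (illustrative only, with an
arbitrary random transformation)::

    import numpy as np

    rng = np.random.RandomState(0)
    L = rng.randn(2, 5)               # (n_components, n_features)
    x_i, x_j = rng.randn(5), rng.randn(5)

    M = L.T @ L                       # (n_features, n_features), PSD by construction
    d = x_i - x_j
    assert np.isclose(np.sum((L @ d) ** 2), d @ M @ d)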


Implementation
--------------

This implementation closely follows the MATLAB implementation found at
https://bitbucket.org/mlcircus/lmnn, which solves the unconstrained problem.
It finds a linear transformation :math:`L` by optimizing with L-BFGS instead
of solving the constrained problem for the globally optimal distance metric.
Unlike the paper, this implementation optimizes the *squared* hinge loss
(which makes the objective differentiable).
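
As an illustration only (the estimator's internal solver is more involved and
supplies the analytic gradient), the unconstrained problem could be handed to
SciPy's L-BFGS solver roughly like this, reusing the hypothetical
``lmnn_objective`` sketched above::

    import numpy as np
    from scipy.optimize import minimize

    def fit_lmnn_sketch(X, y, target_neighbors, n_components, random_state=0):
        n_features = X.shape[1]
        rng = np.random.RandomState(random_state)
        L0 = rng.randn(n_components, n_features)     # random initial transformation

        def objective(L_flat):
            L = L_flat.reshape(n_components, n_features)
            return lmnn_objective(L, X, y, target_neighbors)

        # No analytic gradient here, so L-BFGS falls back to finite differences;
        # the actual implementation passes the gradient, which is much faster.
        result = minimize(objective, L0.ravel(), method='L-BFGS-B')
        return result.x.reshape(n_components, n_features)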

See the examples below and the docstring of :meth:`LargeMarginNearestNeighbor.fit`
for further information.

.. topic:: Examples:

    * :ref:`sphx_glr_auto_examples_neighbors_plot_lmnn_classification.py`
    * :ref:`sphx_glr_auto_examples_neighbors_plot_lmnn_dim_reduction.py`


.. topic:: References:

    * `"Distance Metric Learning for Large Margin Nearest Neighbor Classification"
      <http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf>`_,
      Weinberger, Kilian Q., and Lawrence K. Saul, Journal of Machine Learning
      Research, Vol. 10, Feb. 2009, pp. 207-244.

    * `Wikipedia entry on Large Margin Nearest Neighbor
      <https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor>`_

@@ -0,0 +1,85 @@

"""
===========================================================================
Comparing Nearest Neighbors with and without Large Margin Nearest Neighbor
===========================================================================

This example compares nearest neighbors classification with and without
Large Margin Nearest Neighbor.

It will plot the decision boundaries for each class determined by a simple
Nearest Neighbors classifier against the decision boundaries determined by a
Large Margin Nearest Neighbor classifier. The latter aims to find a distance
metric that maximizes the nearest neighbor classification accuracy on a given
training set.
"""

# Author: John Chiotellis <johnyc.code@gmail.com>
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, LargeMarginNearestNeighbor
from sklearn.pipeline import Pipeline


print(__doc__)

n_neighbors = 3

# import some data to play with
iris = datasets.load_iris()

# We only take the first two features. We could avoid this ugly
# slicing by using a two-dimensional dataset.
X = iris.data[:, :2]
y = iris.target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)

h = .01  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

names = ['K-Nearest Neighbors', 'Large Margin Nearest Neighbor']

classifiers = [KNeighborsClassifier(n_neighbors=n_neighbors),
               Pipeline([('lmnn', LargeMarginNearestNeighbor(
                             n_neighbors=n_neighbors, random_state=42)),
                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
                         ])
               ]

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

for name, clf in zip(names, classifiers):

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8, shading='auto')

    # Plot also the training and testing points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("{} (k = {})".format(name, n_neighbors))
    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,
             ha='center', va='center', transform=plt.gca().transAxes)

plt.show()

[PR review comments on the example]
- I think that the example should demo the use of the transform method.
- You could show the feature space before and after learning the metric.
- Sure, I can do that.
- Building on Gael's comment, it would be nice to somehow plot a form of the linear subspace for the sake of intuitive appeal to the user...

[PR review comments on "without any increase in the model size"]
- Not sure this statement here is fully bullet-proof. As a memory-based algorithm, kNN-type procedures use the training data as the 'model', which is why the notion of 'model size' may be less immediately clear in this context.
- That's true, but even in that case, it holds that nothing needs to be modified in the algorithm to handle multi-class problems (in contrast to e.g. SVMs). Should we maybe change "any increase in the model size" to "any change in the algorithm"? I think this paragraph was by @bellet?
- Well, the idea is that the model size of LMNN (as well as kNN) does not depend on the number of classes. In the sense that you can have as many labels as you want in the training data, the number of parameters to learn in LMNN remains the same (and the model size/complexity of kNN remains the same for a fixed training set size).