From 8f894a85cc38af6f9f141959b6e83404fa9b91a5 Mon Sep 17 00:00:00 2001
From: Erich Schubert <kno10@users.noreply.github.com>
Date: Fri, 21 Sep 2018 11:49:38 +0200
Subject: [PATCH 1/3] Add a test for numeric precision #9354

Surprisingly bad precision, isn't it?

Note that the traditional computation sqrt(sum((x-y)**2)) gets the results exact.
---
 sklearn/metrics/tests/test_pairwise.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index e63219a817bed..7465d28f7005c 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -874,3 +874,15 @@ def test_check_preserve_type():
                                                    XB.astype(np.float))
     assert_equal(XA_checked.dtype, np.float)
     assert_equal(XB_checked.dtype, np.float)
+
+def test_euclidean_precision32():
+    """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision"""
+    XA = np.array([[10000]], np.float32)
+    XB = np.array([[10001]], np.float32)
+    assert_equal(euclidean_distances(XA, XB)[0,0], 1)
+
+def test_euclidean_precision64():
+    """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision"""
+    XA = np.array([[100000000]])
+    XB = np.array([[100000001]])
+    assert_equal(euclidean_distances(XA, XB)[0,0], 1)

From 7dcf3b22fcd80e0fee776fbdaebf511849d899de Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@pm.me>
Date: Sun, 23 Sep 2018 17:11:27 +0200
Subject: [PATCH 2/3] Add heuristics for detecting precision issues with
 euclidean distance

---
 sklearn/exceptions.py                  | 11 +++++
 sklearn/metrics/pairwise.py            | 47 ++++++++++++++++++
 sklearn/metrics/tests/test_pairwise.py | 67 +++++++++++++++++++++-----
 3 files changed, 112 insertions(+), 13 deletions(-)

diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py
index 9cf207e40fdd6..7c0f767cb5b66 100644
--- a/sklearn/exceptions.py
+++ b/sklearn/exceptions.py
@@ -96,6 +96,17 @@ class EfficiencyWarning(UserWarning):
     """
 
 
+class NumericalPrecisionWarning(UserWarning):
+    """Warning used to notify the user of possibly inaccurate computation.
+
+    This warning notifies the user that the numerical accuracy may not be
+    optimal due to some reason which may be included as a part of the warning
+    message.
+
+    .. versionadded:: 0.21
+    """
+
+
 class FitFailedWarning(RuntimeWarning):
     """Warning class used if there is an error while fitting the estimator.
 
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 2e56255af0019..45804be53b786 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -18,6 +18,7 @@
 from scipy.sparse import csr_matrix
 from scipy.sparse import issparse
 
+from ..exceptions import NumericalPrecisionWarning
 from ..utils.validation import _num_samples
 from ..utils import check_array
 from ..utils import gen_even_slices
@@ -27,6 +28,7 @@
 from ..utils import Parallel
 from ..utils import delayed
 from ..utils import effective_n_jobs
+from ..utils.fixes import nanpercentile
 
 from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 
@@ -244,6 +246,51 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
     else:
         YY = row_norms(Y, squared=True)[np.newaxis, :]
 
+    if XX.size > 0 and YY.size > 0:
+        # Heuristics to detect possible numerical precision issues, with
+        # the used quadratic expansion that occurs when the distance between
+        # considered vectors is much lower than their norm.
+        #
+        # Here, a warning is raised if all considered vectors are within a ND
+        # sperical shell with a thickness to diameter ratio lower than
+        # the threshold value of 1e-7 in 64 bit. While such a configuration
+        # is generally uncommon, it occurs when a distribution with an
+        # inertia (or mean radius) of I, is shifted by I/threshold out of
+        # of the origin.
+
+        # min and max of sample norms, ignoring outliers
+        XX_min = nanpercentile(XX, 5)**0.5
+        XX_max = nanpercentile(XX, 95)**0.5
+        YY_min = nanpercentile(YY, 5)**0.5
+        YY_max = nanpercentile(YY, 95)**0.5
+        XY_min = min(XX_min, YY_min)
+        XY_max = max(XX_max, YY_max)
+
+        # only float64, float32 dtypes are possible
+        if X.dtype == np.float64:
+            threshold = 1e-7
+        elif X.dtype == np.float32:
+            threshold = 1e-3
+
+        if (abs(XY_max - XY_min) < threshold*XY_max and
+                # ignore null vector
+                XX_max > 0.0 and YY_max > 0
+                # ignore comparison between the vector and itself
+                and XX_max != YY_max):
+
+            warning_message = (
+                    "with the provided data, computing "
+                    "Euclidean distances with the quadratic expansion may "
+                    "lead to numerically inaccurate results. ")
+            if not issparse(X):
+                warning_message += (
+                    "Consider standardizing features by removing the mean, "
+                    "or setting globally "
+                    "euclidean_distances_algorithm='exact' for slower but "
+                    "more precise implementation.")
+
+            warnings.warn(warning_message, NumericalPrecisionWarning)
+
     distances = safe_sparse_dot(X, Y.T, dense_output=True)
     distances *= -2
     distances += XX
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index 7465d28f7005c..55c2b2ffc9122 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -8,6 +8,7 @@
 
 import pytest
 
+import sklearn
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_allclose
@@ -49,8 +50,7 @@
 from sklearn.metrics.pairwise import paired_manhattan_distances
 from sklearn.preprocessing import normalize
 from sklearn.exceptions import DataConversionWarning
-
-import pytest
+from sklearn.exceptions import NumericalPrecisionWarning
 
 
 def test_pairwise_distances():
@@ -875,14 +875,55 @@ def test_check_preserve_type():
     assert_equal(XA_checked.dtype, np.float)
     assert_equal(XB_checked.dtype, np.float)
 
-def test_euclidean_precision32():
-    """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision"""
-    XA = np.array([[10000]], np.float32)
-    XB = np.array([[10001]], np.float32)
-    assert_equal(euclidean_distances(XA, XB)[0,0], 1)
-
-def test_euclidean_precision64():
-    """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision"""
-    XA = np.array([[100000000]])
-    XB = np.array([[100000001]])
-    assert_equal(euclidean_distances(XA, XB)[0,0], 1)
+
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+def test_euclidean_distance_precision(dtype):
+    """Checks for the most problematic cases when computing the
+    the euclidean distance with dot(x,x) - 2 dot(x,y) + dot(y,y)"""
+    if dtype == 'float32':
+        offset = 10000
+    else:
+        offset = 1e8
+
+    XA = np.array([[offset]], dtype)
+    XB = np.array([[offset + 1]], dtype)
+
+    with pytest.warns(NumericalPrecisionWarning):
+        with pytest.raises(AssertionError):
+            assert euclidean_distances(XA, XB)[0, 0] == 1.
+
+    with pytest.warns(NumericalPrecisionWarning):
+        with pytest.raises(AssertionError):
+            assert pairwise_distances(XA, XB)[0, 0] == 1.
+
+    with pytest.warns(None) as record:
+        assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1
+
+        with sklearn.config_context(euclidean_distances_algorithm='exact'):
+            assert euclidean_distances(XA, XB)[0, 0] == 1
+            assert pairwise_distances(XA, XB)[0, 0] == 1.
+    assert len(record) == 0
+
+
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+def test_euclidean_distance_distribution_precision(dtype):
+    """Computing euclidean_distances with the quadratic expansion
+    on data strongly shifted off the origin leads to numerical precision
+    issues"""
+    XA = np.random.RandomState(42).randn(100, 10).astype(dtype)
+    XB = np.random.RandomState(41).randn(200, 10).astype(dtype)
+
+    if dtype == 'float32':
+        offset = 10000
+    else:
+        offset = 1e8
+
+    with pytest.warns(None) as record:
+        euclidean_distances(XA, XB)
+    assert len(record) == 0
+
+    XA += offset
+    XB += offset
+
+    with pytest.warns(NumericalPrecisionWarning):
+        euclidean_distances(XA, XB)

From 28bacf9f0a89e7075e4f36b805a93298c3eeee24 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@pm.me>
Date: Mon, 24 Sep 2018 13:54:37 +0200
Subject: [PATCH 3/3] Fix CI

---
 sklearn/metrics/tests/test_pairwise.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index 55c2b2ffc9122..2e887cc570b3e 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -8,7 +8,6 @@
 
 import pytest
 
-import sklearn
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_allclose
@@ -896,13 +895,16 @@ def test_euclidean_distance_precision(dtype):
         with pytest.raises(AssertionError):
             assert pairwise_distances(XA, XB)[0, 0] == 1.
 
-    with pytest.warns(None) as record:
-        assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1
+    # Note: this needs https://github.com/scikit-learn/scikit-learn/pull/12136
+    # to be merged first.
+    #
+    #  with pytest.warns(None) as record:
+    #      assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1
 
-        with sklearn.config_context(euclidean_distances_algorithm='exact'):
-            assert euclidean_distances(XA, XB)[0, 0] == 1
-            assert pairwise_distances(XA, XB)[0, 0] == 1.
-    assert len(record) == 0
+    #      with sklearn.config_context(euclidean_distances_algorithm='exact'):
+    #          assert euclidean_distances(XA, XB)[0, 0] == 1
+    #          assert pairwise_distances(XA, XB)[0, 0] == 1.
+    #  assert len(record) == 0
 
 
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))