From 8f894a85cc38af6f9f141959b6e83404fa9b91a5 Mon Sep 17 00:00:00 2001 From: Erich Schubert Date: Fri, 21 Sep 2018 11:49:38 +0200 Subject: [PATCH 1/3] Add a test for numeric precision #9354 Surprisingly bad precision, isn't it? Note that the traditional computation sqrt(sum((x-y)**2)) gets the results exact. --- sklearn/metrics/tests/test_pairwise.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e63219a817bed..7465d28f7005c 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -874,3 +874,15 @@ def test_check_preserve_type(): XB.astype(np.float)) assert_equal(XA_checked.dtype, np.float) assert_equal(XB_checked.dtype, np.float) + +def test_euclidean_precision32(): + """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision""" + XA = np.array([[10000]], np.float32) + XB = np.array([[10001]], np.float32) + assert_equal(euclidean_distances(XA, XB)[0,0], 1) + +def test_euclidean_precision64(): + """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision""" + XA = np.array([[100000000]]) + XB = np.array([[100000001]]) + assert_equal(euclidean_distances(XA, XB)[0,0], 1) From 7dcf3b22fcd80e0fee776fbdaebf511849d899de Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 23 Sep 2018 17:11:27 +0200 Subject: [PATCH 2/3] Add heuristics for detecting precision issues with euclidean distance --- sklearn/exceptions.py | 11 +++++ sklearn/metrics/pairwise.py | 47 ++++++++++++++++++ sklearn/metrics/tests/test_pairwise.py | 67 +++++++++++++++++++++----- 3 files changed, 112 insertions(+), 13 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 9cf207e40fdd6..7c0f767cb5b66 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -96,6 +96,17 @@ class EfficiencyWarning(UserWarning): """ +class NumericalPrecisionWarning(UserWarning): + """Warning used to notify the user of possibly inaccurate computation. + + This warning notifies the user that the numerical accuracy may not be + optimal due to some reason which may be included as a part of the warning + message. + + .. versionadded:: 0.21 + """ + + class FitFailedWarning(RuntimeWarning): """Warning class used if there is an error while fitting the estimator. diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2e56255af0019..45804be53b786 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -18,6 +18,7 @@ from scipy.sparse import csr_matrix from scipy.sparse import issparse +from ..exceptions import NumericalPrecisionWarning from ..utils.validation import _num_samples from ..utils import check_array from ..utils import gen_even_slices @@ -27,6 +28,7 @@ from ..utils import Parallel from ..utils import delayed from ..utils import effective_n_jobs +from ..utils.fixes import nanpercentile from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan @@ -244,6 +246,51 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, else: YY = row_norms(Y, squared=True)[np.newaxis, :] + if XX.size > 0 and YY.size > 0: + # Heuristics to detect possible numerical precision issues, with + # the used quadratic expansion that occurs when the distance between + # considered vectors is much lower than their norm. + # + # Here, a warning is raised if all considered vectors are within a ND + # sperical shell with a thickness to diameter ratio lower than + # the threshold value of 1e-7 in 64 bit. While such a configuration + # is generally uncommon, it occurs when a distribution with an + # inertia (or mean radius) of I, is shifted by I/threshold out of + # of the origin. + + # min and max of sample norms, ignoring outliers + XX_min = nanpercentile(XX, 5)**0.5 + XX_max = nanpercentile(XX, 95)**0.5 + YY_min = nanpercentile(YY, 5)**0.5 + YY_max = nanpercentile(YY, 95)**0.5 + XY_min = min(XX_min, YY_min) + XY_max = max(XX_max, YY_max) + + # only float64, float32 dtypes are possible + if X.dtype == np.float64: + threshold = 1e-7 + elif X.dtype == np.float32: + threshold = 1e-3 + + if (abs(XY_max - XY_min) < threshold*XY_max and + # ignore null vector + XX_max > 0.0 and YY_max > 0 + # ignore comparison between the vector and itself + and XX_max != YY_max): + + warning_message = ( + "with the provided data, computing " + "Euclidean distances with the quadratic expansion may " + "lead to numerically inaccurate results. ") + if not issparse(X): + warning_message += ( + "Consider standardizing features by removing the mean, " + "or setting globally " + "euclidean_distances_algorithm='exact' for slower but " + "more precise implementation.") + + warnings.warn(warning_message, NumericalPrecisionWarning) + distances = safe_sparse_dot(X, Y.T, dense_output=True) distances *= -2 distances += XX diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 7465d28f7005c..55c2b2ffc9122 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -8,6 +8,7 @@ import pytest +import sklearn from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose @@ -49,8 +50,7 @@ from sklearn.metrics.pairwise import paired_manhattan_distances from sklearn.preprocessing import normalize from sklearn.exceptions import DataConversionWarning - -import pytest +from sklearn.exceptions import NumericalPrecisionWarning def test_pairwise_distances(): @@ -875,14 +875,55 @@ def test_check_preserve_type(): assert_equal(XA_checked.dtype, np.float) assert_equal(XB_checked.dtype, np.float) -def test_euclidean_precision32(): - """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision""" - XA = np.array([[10000]], np.float32) - XB = np.array([[10001]], np.float32) - assert_equal(euclidean_distances(XA, XB)[0,0], 1) - -def test_euclidean_precision64(): - """dot(x,x) - 2 dot(x,y) + dot(y,y) has catastrophic precision""" - XA = np.array([[100000000]]) - XB = np.array([[100000001]]) - assert_equal(euclidean_distances(XA, XB)[0,0], 1) + +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +def test_euclidean_distance_precision(dtype): + """Checks for the most problematic cases when computing the + the euclidean distance with dot(x,x) - 2 dot(x,y) + dot(y,y)""" + if dtype == 'float32': + offset = 10000 + else: + offset = 1e8 + + XA = np.array([[offset]], dtype) + XB = np.array([[offset + 1]], dtype) + + with pytest.warns(NumericalPrecisionWarning): + with pytest.raises(AssertionError): + assert euclidean_distances(XA, XB)[0, 0] == 1. + + with pytest.warns(NumericalPrecisionWarning): + with pytest.raises(AssertionError): + assert pairwise_distances(XA, XB)[0, 0] == 1. + + with pytest.warns(None) as record: + assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1 + + with sklearn.config_context(euclidean_distances_algorithm='exact'): + assert euclidean_distances(XA, XB)[0, 0] == 1 + assert pairwise_distances(XA, XB)[0, 0] == 1. + assert len(record) == 0 + + +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +def test_euclidean_distance_distribution_precision(dtype): + """Computing euclidean_distances with the quadratic expansion + on data strongly shifted off the origin leads to numerical precision + issues""" + XA = np.random.RandomState(42).randn(100, 10).astype(dtype) + XB = np.random.RandomState(41).randn(200, 10).astype(dtype) + + if dtype == 'float32': + offset = 10000 + else: + offset = 1e8 + + with pytest.warns(None) as record: + euclidean_distances(XA, XB) + assert len(record) == 0 + + XA += offset + XB += offset + + with pytest.warns(NumericalPrecisionWarning): + euclidean_distances(XA, XB) From 28bacf9f0a89e7075e4f36b805a93298c3eeee24 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 24 Sep 2018 13:54:37 +0200 Subject: [PATCH 3/3] Fix CI --- sklearn/metrics/tests/test_pairwise.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 55c2b2ffc9122..2e887cc570b3e 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -8,7 +8,6 @@ import pytest -import sklearn from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose @@ -896,13 +895,16 @@ def test_euclidean_distance_precision(dtype): with pytest.raises(AssertionError): assert pairwise_distances(XA, XB)[0, 0] == 1. - with pytest.warns(None) as record: - assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1 + # Note: this needs https://github.com/scikit-learn/scikit-learn/pull/12136 + # to be merged first. + # + # with pytest.warns(None) as record: + # assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1 - with sklearn.config_context(euclidean_distances_algorithm='exact'): - assert euclidean_distances(XA, XB)[0, 0] == 1 - assert pairwise_distances(XA, XB)[0, 0] == 1. - assert len(record) == 0 + # with sklearn.config_context(euclidean_distances_algorithm='exact'): + # assert euclidean_distances(XA, XB)[0, 0] == 1 + # assert pairwise_distances(XA, XB)[0, 0] == 1. + # assert len(record) == 0 @pytest.mark.parametrize('dtype', ('float32', 'float64'))