diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py
index 9cf207e40fdd6..7c0f767cb5b66 100644
--- a/sklearn/exceptions.py
+++ b/sklearn/exceptions.py
@@ -96,6 +96,17 @@ class EfficiencyWarning(UserWarning):
     """
 
 
+class NumericalPrecisionWarning(UserWarning):
+    """Warning used to notify the user of possibly inaccurate computation.
+
+    This warning notifies the user that the numerical accuracy may not be
+    optimal; the reason may be included as part of the warning
+    message.
+
+    .. versionadded:: 0.21
+    """
+
+
 class FitFailedWarning(RuntimeWarning):
     """Warning class used if there is an error while fitting the estimator.
 
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 2e56255af0019..45804be53b786 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -18,6 +18,7 @@
 from scipy.sparse import csr_matrix
 from scipy.sparse import issparse
 
+from ..exceptions import NumericalPrecisionWarning
 from ..utils.validation import _num_samples
 from ..utils import check_array
 from ..utils import gen_even_slices
@@ -27,6 +28,7 @@
 from ..utils import Parallel
 from ..utils import delayed
 from ..utils import effective_n_jobs
+from ..utils.fixes import nanpercentile
 
 from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 
@@ -244,6 +246,51 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
     else:
         YY = row_norms(Y, squared=True)[np.newaxis, :]
 
+    if XX.size > 0 and YY.size > 0:
+        # Heuristic to detect possible numerical precision issues with
+        # the quadratic expansion, which occur when the distances between
+        # the considered vectors are much smaller than their norms.
+        #
+        # Here, a warning is raised if all considered vectors lie within an
+        # N-D spherical shell whose thickness to diameter ratio is lower
+        # than a threshold of 1e-7 in 64 bit. While such a configuration
+        # is generally uncommon, it occurs when a distribution with an
+        # inertia (or mean radius) of I is shifted by I/threshold away
+        # from the origin.
+
+        # min and max of sample norms, ignoring outliers
+        XX_min = nanpercentile(XX, 5)**0.5
+        XX_max = nanpercentile(XX, 95)**0.5
+        YY_min = nanpercentile(YY, 5)**0.5
+        YY_max = nanpercentile(YY, 95)**0.5
+        XY_min = min(XX_min, YY_min)
+        XY_max = max(XX_max, YY_max)
+
+        # only float64, float32 dtypes are possible
+        if X.dtype == np.float64:
+            threshold = 1e-7
+        elif X.dtype == np.float32:
+            threshold = 1e-3
+
+        if (abs(XY_max - XY_min) < threshold * XY_max and
+                # ignore null vectors
+                XX_max > 0.0 and YY_max > 0.0
+                # ignore comparison between the vector and itself
+                and XX_max != YY_max):
+
+            warning_message = (
+                "With the provided data, computing "
+                "Euclidean distances with the quadratic expansion may "
+                "lead to numerically inaccurate results. ")
+            if not issparse(X):
+                warning_message += (
+                    "Consider standardizing features by removing the mean, "
+                    "or globally setting "
+                    "euclidean_distances_algorithm='exact' for a slower but "
+                    "more precise implementation.")
+
+            warnings.warn(warning_message, NumericalPrecisionWarning)
+
     distances = safe_sparse_dot(X, Y.T, dense_output=True)
     distances *= -2
     distances += XX
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index e63219a817bed..2e887cc570b3e 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -49,8 +49,7 @@
 from sklearn.metrics.pairwise import paired_manhattan_distances
 from sklearn.preprocessing import normalize
 from sklearn.exceptions import DataConversionWarning
-
-import pytest
+from sklearn.exceptions import NumericalPrecisionWarning
 
 
 def test_pairwise_distances():
@@ -874,3 +873,59 @@ def test_check_preserve_type():
                                                    XB.astype(np.float))
     assert_equal(XA_checked.dtype, np.float)
     assert_equal(XB_checked.dtype, np.float)
+
+
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+def test_euclidean_distance_precision(dtype):
+    """Checks for the most problematic cases when computing the
+    euclidean distance with dot(x,x) - 2 dot(x,y) + dot(y,y)"""
+    if dtype == 'float32':
+        offset = 10000
+    else:
+        offset = 1e8
+
+    XA = np.array([[offset]], dtype)
+    XB = np.array([[offset + 1]], dtype)
+
+    with pytest.warns(NumericalPrecisionWarning):
+        with pytest.raises(AssertionError):
+            assert euclidean_distances(XA, XB)[0, 0] == 1.
+
+    with pytest.warns(NumericalPrecisionWarning):
+        with pytest.raises(AssertionError):
+            assert pairwise_distances(XA, XB)[0, 0] == 1.
+
+    # Note: this needs https://github.com/scikit-learn/scikit-learn/pull/12136
+    # to be merged first.
+    #
+    # with pytest.warns(None) as record:
+    #     assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1
+
+    #     with sklearn.config_context(euclidean_distances_algorithm='exact'):
+    #         assert euclidean_distances(XA, XB)[0, 0] == 1
+    #         assert pairwise_distances(XA, XB)[0, 0] == 1.
+    #     assert len(record) == 0
+
+
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+def test_euclidean_distance_distribution_precision(dtype):
+    """Computing euclidean_distances with the quadratic expansion
+    on data strongly shifted off the origin leads to numerical precision
+    issues"""
+    XA = np.random.RandomState(42).randn(100, 10).astype(dtype)
+    XB = np.random.RandomState(41).randn(200, 10).astype(dtype)
+
+    if dtype == 'float32':
+        offset = 10000
+    else:
+        offset = 1e8
+
+    with pytest.warns(None) as record:
+        euclidean_distances(XA, XB)
+    assert len(record) == 0
+
+    XA += offset
+    XB += offset
+
+    with pytest.warns(NumericalPrecisionWarning):
+        euclidean_distances(XA, XB)