diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b4928ed7492f3..90645d4641b8f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,6 +22,7 @@ from ..utils import check_array from ..utils import gen_even_slices from ..utils import gen_batches, get_chunk_n_rows +from ..utils.fixes import _cast_if_needed from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..externals.joblib import Parallel @@ -161,6 +162,121 @@ def check_paired_arrays(X, Y): # Pairwise distances +def _euclidean_distances_float64(X, Y, Y_norm_squared=None, squared=False, + X_norm_squared=None): + """ + Compute the euclidean distances between X and Y when both are arrays of + float64. + """ + if X_norm_squared is not None: + XX = X_norm_squared + else: + XX = row_norms(X, squared=True)[:, np.newaxis] + + if X is Y: # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = Y_norm_squared + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + distances = safe_sparse_dot(X, Y.T, dense_output=True) + distances *= -2 + distances += XX + distances += YY + np.maximum(distances, 0, out=distances) + + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + distances.flat[::distances.shape[0] + 1] = 0.0 + + return distances if squared else np.sqrt(distances, out=distances) + + +def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, + squared=False, X_norm_squared=None): + """ + Compute the euclidean distance matrix between X and Y by casting to float64 + for a greater numerical stability. + + The computation is done by blocks to limit additional memory usage. + """ + # For performance reasons, swap X and Y if I got X_norm_squared but not + # Y_norm_squared + if X_norm_squared is not None and Y_norm_squared is None: + swap = True + X, Y = Y, X + X_norm_squared, Y_norm_squared = None, X_norm_squared.T + else: + swap = False + + # No more than 10MB of additional memory will be used to cast X and Y to + # float64 and to get the float64 result. + maxmem = 10*1024*1024 + itemsize = np.float64(0).itemsize + maxmem /= itemsize + + # Compute the block size that won't use more than maxmem bytes of + # additional (temporary) memory. + # The amount of additional memory required is: + # - a float64 copy of a block of the rows of X if needed; + # - a float64 copy of a block of the rows of Y if needed; + # - the float64 block result; + # - a float64 copy of a block of X_norm_squared if needed; + # - a float64 copy of a block of Y_norm_squared if needed. + # This is a quadratic equation that we solve to compute the block size that + # would use maxmem bytes. + XYmem = 0 + if X.dtype != np.float64: + XYmem += X.shape[1] + if Y.dtype != np.float64: + XYmem += Y.shape[1] # Note that Y.shape[1] == X.shape[1] + if X_norm_squared is None or X_norm_squared.dtype != np.float64: + XYmem += 1 + if Y_norm_squared is None or Y_norm_squared.dtype != np.float64: + XYmem += 1 + + delta = XYmem ** 2 + 4 * maxmem + bs = int((-XYmem + np.sqrt(delta)) // 2) + bs = max(1, bs) + + distances = np.empty((X.shape[0], Y.shape[0]), dtype=outdtype) + + for i in range(0, X.shape[0], bs): + ipbs = min(i + bs, X.shape[0]) + Xc = _cast_if_needed(X[i:ipbs, :], np.float64) + + if X_norm_squared is not None: + Xnc = _cast_if_needed(X_norm_squared[i:ipbs, :], np.float64) + else: + Xnc = row_norms(Xc, squared=True)[:, np.newaxis] + + for j in range(i, Y.shape[0], bs): + jpbs = min(j + bs, Y.shape[0]) + if X is Y and i == j: + Yc = Xc + else: + Yc = _cast_if_needed(Y[j:jpbs, :], np.float64) + + if Y_norm_squared is not None: + Ync = _cast_if_needed(Y_norm_squared[:, j:jpbs], np.float64) + else: + Ync = None + + d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=Ync, + squared=True, X_norm_squared=Xnc) + distances[i:ipbs, j:jpbs] = d + + if X is Y and j > i: + distances[j:jpbs, i:ipbs] = d.T + + if swap: + distances = distances.T + + return distances if squared else np.sqrt(distances, out=distances) + + def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, X_norm_squared=None): """ @@ -223,39 +339,42 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, """ X, Y = check_pairwise_arrays(X, Y) - if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") - else: - XX = row_norms(X, squared=True)[:, np.newaxis] + if X_norm_squared is not None and X_norm_squared.dtype != X.dtype: + warnings.warn("X_norm_squared has a different dtype than X") - if X is Y: # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) + if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype: + warnings.warn("Y_norm_squared has a different dtype than Y") - if YY.shape != (1, Y.shape[0]): + if X_norm_squared is not None: + X_norm_squared = np.atleast_2d(X_norm_squared) + X_norm_squared = check_array(X_norm_squared) + if X_norm_squared.shape == (1, X.shape[0]): + X_norm_squared = X_norm_squared.T + elif X_norm_squared.shape != (X.shape[0], 1): raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - else: - YY = row_norms(Y, squared=True)[np.newaxis, :] + "Incompatible dimensions between X and X_norm_squared") - distances = safe_sparse_dot(X, Y.T, dense_output=True) - distances *= -2 - distances += XX - distances += YY - np.maximum(distances, 0, out=distances) + if Y_norm_squared is not None: + Y_norm_squared = np.atleast_2d(Y_norm_squared) + Y_norm_squared = check_array(Y_norm_squared) + if Y_norm_squared.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions between Y and Y_norm_squared") if X is Y: - # Ensure that distances between vectors and themselves are set to 0.0. - # This may not be the case due to floating point rounding errors. - distances.flat[::distances.shape[0] + 1] = 0.0 + if X_norm_squared is None and Y_norm_squared is not None: + X_norm_squared = Y_norm_squared.T + if X_norm_squared is not None and Y_norm_squared is None: + Y_norm_squared = X_norm_squared.T - return distances if squared else np.sqrt(distances, out=distances) + outdtype = (X[0, 0] + Y[0, 0]).dtype + + if X.dtype == np.float64 and Y.dtype == np.float64: + return _euclidean_distances_float64(X, Y, Y_norm_squared, squared, + X_norm_squared) + else: + return _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared, + squared, X_norm_squared) def _argmin_min_reduce(dist, start): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e63219a817bed..4aa3d3b7c8b25 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -556,6 +556,38 @@ def test_euclidean_distances(): Y_norm_squared=np.zeros_like(Y_norm_sq)) assert_greater(np.max(np.abs(wrong_D - D1)), .01) + # Check euclidean_distances when using float32 for one or both arguments + X32 = X.astype(np.float32) + Y32 = Y.astype(np.float32) + D64 = euclidean_distances(X, Y) + D64_32 = euclidean_distances(X, Y32) + D32_64 = euclidean_distances(X32, Y) + D32_32 = euclidean_distances(X32, Y32) + assert_equal(D64_32.dtype, np.float64) + assert_equal(D32_64.dtype, np.float64) + assert_equal(D32_32.dtype, np.float32) + assert_array_almost_equal(D64_32, D64) + assert_array_almost_equal(D32_64, D64) + assert_array_almost_equal(D32_32, D64) + + # Check that {X,Y}_norm_squared are used with float32 arguments + X32_norm_sq = 0.5 * (X32 ** 2).sum(axis=1).reshape(1, -1) + Y32_norm_sq = 0.5 * (Y32 ** 2).sum(axis=1).reshape(1, -1) + DYN = euclidean_distances(X32, Y32, Y_norm_squared=Y32_norm_sq) + DXN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq) + DXYN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq, + Y_norm_squared=Y32_norm_sq) + assert_greater(np.max(np.abs(DYN - D64)), .01) + assert_greater(np.max(np.abs(DXN - D64)), .01) + assert_greater(np.max(np.abs(DXYN - D64)), .01) + + # Check that the accuracy with float32 is not too bad + X = np.array([[0.9215765222065525, 0.9682577158572608], + [0.9221151778782808, 0.9681831844652774]]) + D64 = euclidean_distances(X) + D32 = euclidean_distances(X.astype(np.float32)) + assert_array_almost_equal(D32, D64) + def test_cosine_distances(): # Check the pairwise Cosine distances computation diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 0117770084177..07d72e63dfd08 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -295,3 +295,15 @@ def _object_dtype_isnan(X): else: def _object_dtype_isnan(X): return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) + + +if sp_version < (1, 0): + # Before scipy 1.0, sp.sparse.csr_matrix.astype didn't have a 'copy' + # argument. + def _cast_if_needed(arr, dtype): + if arr.dtype == dtype: + return arr + return arr.astype(dtype) +else: + def _cast_if_needed(arr, dtype): + return arr.astype(dtype, copy=False)