-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[WIP] Stable and fast float32 implementation of euclidean_distances #11271
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
952795a
53d0fee
578193a
c6bcd5a
a6b7f7c
d976473
7bd4b76
d0f9796
1721349
4e07e8b
3b7ad89
8aecbde
0fd279a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
from ..utils import check_array | ||
from ..utils import gen_even_slices | ||
from ..utils import gen_batches, get_chunk_n_rows | ||
from ..utils.fixes import _cast_if_needed | ||
from ..utils.extmath import row_norms, safe_sparse_dot | ||
from ..preprocessing import normalize | ||
from ..externals.joblib import Parallel | ||
|
@@ -161,6 +162,121 @@ def check_paired_arrays(X, Y): | |
|
||
|
||
# Pairwise distances | ||
def _euclidean_distances_float64(X, Y, Y_norm_squared=None, squared=False, | ||
X_norm_squared=None): | ||
""" | ||
Compute the euclidean distances between X and Y when both are arrays of | ||
float64. | ||
""" | ||
if X_norm_squared is not None: | ||
XX = X_norm_squared | ||
else: | ||
XX = row_norms(X, squared=True)[:, np.newaxis] | ||
|
||
if X is Y: # shortcut in the common case euclidean_distances(X, X) | ||
YY = XX.T | ||
elif Y_norm_squared is not None: | ||
YY = Y_norm_squared | ||
else: | ||
YY = row_norms(Y, squared=True)[np.newaxis, :] | ||
|
||
distances = safe_sparse_dot(X, Y.T, dense_output=True) | ||
distances *= -2 | ||
distances += XX | ||
distances += YY | ||
np.maximum(distances, 0, out=distances) | ||
|
||
if X is Y: | ||
# Ensure that distances between vectors and themselves are set to 0.0. | ||
# This may not be the case due to floating point rounding errors. | ||
distances.flat[::distances.shape[0] + 1] = 0.0 | ||
|
||
return distances if squared else np.sqrt(distances, out=distances) | ||
|
||
|
||
def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, | ||
squared=False, X_norm_squared=None): | ||
""" | ||
Compute the euclidean distance matrix between X and Y by casting to float64 | ||
for a greater numerical stability. | ||
|
||
The computation is done by blocks to limit additional memory usage. | ||
""" | ||
# For performance reasons, swap X and Y if I got X_norm_squared but not | ||
# Y_norm_squared | ||
if X_norm_squared is not None and Y_norm_squared is None: | ||
swap = True | ||
X, Y = Y, X | ||
X_norm_squared, Y_norm_squared = None, X_norm_squared.T | ||
else: | ||
swap = False | ||
|
||
# No more than 10MB of additional memory will be used to cast X and Y to | ||
# float64 and to get the float64 result. | ||
maxmem = 10*1024*1024 | ||
itemsize = np.float64(0).itemsize | ||
maxmem /= itemsize | ||
|
||
# Compute the block size that won't use more than maxmem bytes of | ||
# additional (temporary) memory. | ||
# The amount of additional memory required is: | ||
# - a float64 copy of a block of the rows of X if needed; | ||
# - a float64 copy of a block of the rows of Y if needed; | ||
# - the float64 block result; | ||
# - a float64 copy of a block of X_norm_squared if needed; | ||
# - a float64 copy of a block of Y_norm_squared if needed. | ||
# This is a quadratic equation that we solve to compute the block size that | ||
# would use maxmem bytes. | ||
XYmem = 0 | ||
if X.dtype != np.float64: | ||
XYmem += X.shape[1] | ||
if Y.dtype != np.float64: | ||
XYmem += Y.shape[1] # Note that Y.shape[1] == X.shape[1] | ||
if X_norm_squared is None or X_norm_squared.dtype != np.float64: | ||
XYmem += 1 | ||
if Y_norm_squared is None or Y_norm_squared.dtype != np.float64: | ||
XYmem += 1 | ||
|
||
delta = XYmem ** 2 + 4 * maxmem | ||
bs = int((-XYmem + np.sqrt(delta)) // 2) | ||
bs = max(1, bs) | ||
|
||
distances = np.empty((X.shape[0], Y.shape[0]), dtype=outdtype) | ||
|
||
for i in range(0, X.shape[0], bs): | ||
ipbs = min(i + bs, X.shape[0]) | ||
Xc = _cast_if_needed(X[i:ipbs, :], np.float64) | ||
|
||
if X_norm_squared is not None: | ||
Xnc = _cast_if_needed(X_norm_squared[i:ipbs, :], np.float64) | ||
else: | ||
Xnc = row_norms(Xc, squared=True)[:, np.newaxis] | ||
|
||
for j in range(i, Y.shape[0], bs): | ||
jpbs = min(j + bs, Y.shape[0]) | ||
if X is Y and i == j: | ||
Yc = Xc | ||
else: | ||
Yc = _cast_if_needed(Y[j:jpbs, :], np.float64) | ||
|
||
if Y_norm_squared is not None: | ||
Ync = _cast_if_needed(Y_norm_squared[:, j:jpbs], np.float64) | ||
else: | ||
Ync = None | ||
|
||
d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=Ync, | ||
squared=True, X_norm_squared=Xnc) | ||
distances[i:ipbs, j:jpbs] = d | ||
|
||
if X is Y and j > i: | ||
distances[j:jpbs, i:ipbs] = d.T | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Line not covered by tests, probably need to add a test for this |
||
|
||
if swap: | ||
distances = distances.T | ||
|
||
return distances if squared else np.sqrt(distances, out=distances) | ||
|
||
|
||
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, | ||
X_norm_squared=None): | ||
""" | ||
|
@@ -223,39 +339,42 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, | |
""" | ||
X, Y = check_pairwise_arrays(X, Y) | ||
|
||
if X_norm_squared is not None: | ||
XX = check_array(X_norm_squared) | ||
if XX.shape == (1, X.shape[0]): | ||
XX = XX.T | ||
elif XX.shape != (X.shape[0], 1): | ||
raise ValueError( | ||
"Incompatible dimensions for X and X_norm_squared") | ||
else: | ||
XX = row_norms(X, squared=True)[:, np.newaxis] | ||
if X_norm_squared is not None and X_norm_squared.dtype != X.dtype: | ||
warnings.warn("X_norm_squared has a different dtype than X") | ||
|
||
if X is Y: # shortcut in the common case euclidean_distances(X, X) | ||
YY = XX.T | ||
elif Y_norm_squared is not None: | ||
YY = np.atleast_2d(Y_norm_squared) | ||
if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype: | ||
warnings.warn("Y_norm_squared has a different dtype than Y") | ||
|
||
if YY.shape != (1, Y.shape[0]): | ||
if X_norm_squared is not None: | ||
X_norm_squared = np.atleast_2d(X_norm_squared) | ||
X_norm_squared = check_array(X_norm_squared) | ||
if X_norm_squared.shape == (1, X.shape[0]): | ||
X_norm_squared = X_norm_squared.T | ||
elif X_norm_squared.shape != (X.shape[0], 1): | ||
raise ValueError( | ||
"Incompatible dimensions for Y and Y_norm_squared") | ||
else: | ||
YY = row_norms(Y, squared=True)[np.newaxis, :] | ||
"Incompatible dimensions between X and X_norm_squared") | ||
|
||
distances = safe_sparse_dot(X, Y.T, dense_output=True) | ||
distances *= -2 | ||
distances += XX | ||
distances += YY | ||
np.maximum(distances, 0, out=distances) | ||
if Y_norm_squared is not None: | ||
Y_norm_squared = np.atleast_2d(Y_norm_squared) | ||
Y_norm_squared = check_array(Y_norm_squared) | ||
if Y_norm_squared.shape != (1, Y.shape[0]): | ||
raise ValueError( | ||
"Incompatible dimensions between Y and Y_norm_squared") | ||
|
||
if X is Y: | ||
# Ensure that distances between vectors and themselves are set to 0.0. | ||
# This may not be the case due to floating point rounding errors. | ||
distances.flat[::distances.shape[0] + 1] = 0.0 | ||
if X_norm_squared is None and Y_norm_squared is not None: | ||
X_norm_squared = Y_norm_squared.T | ||
if X_norm_squared is not None and Y_norm_squared is None: | ||
Y_norm_squared = X_norm_squared.T | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also not covered by tests.. |
||
|
||
return distances if squared else np.sqrt(distances, out=distances) | ||
outdtype = (X[0, 0] + Y[0, 0]).dtype | ||
|
||
if X.dtype == np.float64 and Y.dtype == np.float64: | ||
return _euclidean_distances_float64(X, Y, Y_norm_squared, squared, | ||
X_norm_squared) | ||
else: | ||
return _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared, | ||
squared, X_norm_squared) | ||
|
||
|
||
def _argmin_min_reduce(dist, start): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -556,6 +556,38 @@ def test_euclidean_distances(): | |
Y_norm_squared=np.zeros_like(Y_norm_sq)) | ||
assert_greater(np.max(np.abs(wrong_D - D1)), .01) | ||
|
||
# Check euclidean_distances when using float32 for one or both arguments | ||
X32 = X.astype(np.float32) | ||
Y32 = Y.astype(np.float32) | ||
D64 = euclidean_distances(X, Y) | ||
D64_32 = euclidean_distances(X, Y32) | ||
D32_64 = euclidean_distances(X32, Y) | ||
D32_32 = euclidean_distances(X32, Y32) | ||
assert_equal(D64_32.dtype, np.float64) | ||
assert_equal(D32_64.dtype, np.float64) | ||
assert_equal(D32_32.dtype, np.float32) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With the adoption of pytest, we are phasing out use of test helpers |
||
assert_array_almost_equal(D64_32, D64) | ||
assert_array_almost_equal(D32_64, D64) | ||
assert_array_almost_equal(D32_32, D64) | ||
|
||
# Check that {X,Y}_norm_squared are used with float32 arguments | ||
X32_norm_sq = 0.5 * (X32 ** 2).sum(axis=1).reshape(1, -1) | ||
Y32_norm_sq = 0.5 * (Y32 ** 2).sum(axis=1).reshape(1, -1) | ||
DYN = euclidean_distances(X32, Y32, Y_norm_squared=Y32_norm_sq) | ||
DXN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq) | ||
DXYN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq, | ||
Y_norm_squared=Y32_norm_sq) | ||
assert_greater(np.max(np.abs(DYN - D64)), .01) | ||
assert_greater(np.max(np.abs(DXN - D64)), .01) | ||
assert_greater(np.max(np.abs(DXYN - D64)), .01) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
# Check that the accuracy with float32 is not too bad | ||
X = np.array([[0.9215765222065525, 0.9682577158572608], | ||
[0.9221151778782808, 0.9681831844652774]]) | ||
D64 = euclidean_distances(X) | ||
D32 = euclidean_distances(X.astype(np.float32)) | ||
assert_array_almost_equal(D32, D64) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On master I get, In [5]: D64 = euclidean_distances(X)
In [6]: D64
Out[6]:
array([[0. , 0.00054379],
[0.00054379, 0. ]])
In [7]: D32 = euclidean_distances(X.astype(np.float32))
In [8]: D32
Out[8]:
array([[0. , 0.00059802],
[0.00059802, 0. ]], dtype=float32) and |
||
|
||
|
||
def test_cosine_distances(): | ||
# Check the pairwise Cosine distances computation | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if we use a bit more? The performance is decreased?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The performance decreases a bit, yes. But it's only noticeable for a
maxmem
several order of magnitude larger than required. If you look at the first set of plots, you'll see a trend for larger memory sizes.