From 952795a1bf24770c9ac4cbb33303b1acbe695730 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Tue, 12 Jun 2018 16:25:57 +0200 Subject: [PATCH 01/13] TST: float32 inputs to euclidean_distances Signed-off-by: Celelibi --- sklearn/metrics/tests/test_pairwise.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e63219a817bed..789490b2f2b96 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -556,6 +556,20 @@ def test_euclidean_distances(): Y_norm_squared=np.zeros_like(Y_norm_sq)) assert_greater(np.max(np.abs(wrong_D - D1)), .01) + # Check euclidean_distances when using float32 for one or both arguments + X32 = X.astype(np.float32) + Y32 = Y.astype(np.float32) + D64 = euclidean_distances(X, Y) + D64_32 = euclidean_distances(X, Y32) + D32_64 = euclidean_distances(X32, Y) + D32_32 = euclidean_distances(X32, Y32) + assert_equal(D64_32.dtype, np.float64) + assert_equal(D32_64.dtype, np.float64) + assert_equal(D32_32.dtype, np.float32) + assert_array_almost_equal(D64_32, D64) + assert_array_almost_equal(D32_64, D64) + assert_array_almost_equal(D32_32, D64) + def test_cosine_distances(): # Check the pairwise Cosine distances computation From 53d0feeae7872c71278958754fad455d176d77ce Mon Sep 17 00:00:00 2001 From: Celelibi Date: Tue, 12 Jun 2018 16:26:07 +0200 Subject: [PATCH 02/13] TST: Accuracy of euclidean_distances with float32 Signed-off-by: Celelibi --- sklearn/metrics/tests/test_pairwise.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 789490b2f2b96..587f332337000 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -570,6 +570,13 @@ def test_euclidean_distances(): assert_array_almost_equal(D32_64, D64) assert_array_almost_equal(D32_32, D64) + # Check that the accuracy with float32 is not too bad + X = np.array([[0.9215765222065525, 0.9682577158572608], + [0.9221151778782808, 0.9681831844652774]]) + D64 = euclidean_distances(X) + D32 = euclidean_distances(X.astype(np.float32)) + assert_array_almost_equal(D32, D64) + def test_cosine_distances(): # Check the pairwise Cosine distances computation From 578193a30b52a666bb7066d4fea249142fa2a3d8 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Tue, 12 Jun 2018 16:26:13 +0200 Subject: [PATCH 03/13] FIX: More stable euclidean_distances for float32 Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 36 ++++++++++++++++++++++++++++++++++++ sklearn/utils/fixes.py | 12 ++++++++++++ 2 files changed, 48 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b4928ed7492f3..0862228494bc1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,6 +22,7 @@ from ..utils import check_array from ..utils import gen_even_slices from ..utils import gen_batches, get_chunk_n_rows +from ..utils.fixes import _cast_if_needed from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..externals.joblib import Parallel @@ -223,6 +224,41 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, """ X, Y = check_pairwise_arrays(X, Y) + if X_norm_squared is not None and X_norm_squared.dtype != X.dtype: + warnings.warn("X_norm_squared has a different dtype than X") + + if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype: + warnings.warn("Y_norm_squared has a different dtype than Y") + + outdtype = (X[0, 0] + Y[0, 0]).dtype + + if X.dtype != np.float64 or Y.dtype != np.float64: + bs = 1024 + distances = np.zeros((X.shape[0], Y.shape[0]), dtype=outdtype) + + for i in range(0, X.shape[0], bs): + for j in range(i, Y.shape[0], bs): + ipbs = min(i + bs, X.shape[0]) + jpbs = min(j + bs, Y.shape[0]) + + for k in range(0, X.shape[1], bs): + kpbs = min(k + bs, X.shape[1]) + + Xc = _cast_if_needed(X[i:ipbs, k:kpbs], np.float64) + if X is Y: + Yc = None + else: + Yc = _cast_if_needed(Y[j:jpbs, k:kpbs], np.float64) + + d = euclidean_distances(Xc, Yc, Y_norm_squared=None, + squared=True, X_norm_squared=None) + distances[i:ipbs, j:jpbs] += d + + if X is Y and j > i: + distances[j:jpbs, i:ipbs] = distances[i:ipbs, j:jpbs].T + + return distances if squared else np.sqrt(distances, out=distances) + if X_norm_squared is not None: XX = check_array(X_norm_squared) if XX.shape == (1, X.shape[0]): diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 0117770084177..07d72e63dfd08 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -295,3 +295,15 @@ def _object_dtype_isnan(X): else: def _object_dtype_isnan(X): return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) + + +if sp_version < (1, 0): + # Before scipy 1.0, sp.sparse.csr_matrix.astype didn't have a 'copy' + # argument. + def _cast_if_needed(arr, dtype): + if arr.dtype == dtype: + return arr + return arr.astype(dtype) +else: + def _cast_if_needed(arr, dtype): + return arr.astype(dtype, copy=False) From c6bcd5a960970d191242d911a8ac8a93abd9d1aa Mon Sep 17 00:00:00 2001 From: Celelibi Date: Tue, 12 Jun 2018 16:26:35 +0200 Subject: [PATCH 04/13] ENH: Refactor euclidean_distances to avoid recursive calls Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 142 +++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 59 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0862228494bc1..fccc80dc031c9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -162,6 +162,84 @@ def check_paired_arrays(X, Y): # Pairwise distances +def _euclidean_distances_float64(X, Y, Y_norm_squared=None, squared=False, + X_norm_squared=None): + """ + Compute the euclidean distances between X and Y when both are arrays of + float64. + """ + if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError( + "Incompatible dimensions for X and X_norm_squared") + + else: + XX = row_norms(X, squared=True)[:, np.newaxis] + + if X is Y: # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = np.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions for Y and Y_norm_squared") + + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + distances = safe_sparse_dot(X, Y.T, dense_output=True) + distances *= -2 + distances += XX + distances += YY + np.maximum(distances, 0, out=distances) + + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + distances.flat[::distances.shape[0] + 1] = 0.0 + + return distances if squared else np.sqrt(distances, out=distances) + + +def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, + squared=False, X_norm_squared=None): + """ + Compute the euclidean distance matrix between X and Y by casting to float64 + for a greater numerical stability. + + The computation is done by blocks to limit additional memory usage. + """ + bs = 1024 + distances = np.zeros((X.shape[0], Y.shape[0]), dtype=outdtype) + + for i in range(0, X.shape[0], bs): + for j in range(i, Y.shape[0], bs): + ipbs = min(i + bs, X.shape[0]) + jpbs = min(j + bs, Y.shape[0]) + + for k in range(0, X.shape[1], bs): + kpbs = min(k + bs, X.shape[1]) + + Xc = _cast_if_needed(X[i:ipbs, k:kpbs], np.float64) + if X is Y: + Yc = Xc + else: + Yc = _cast_if_needed(Y[j:jpbs, k:kpbs], np.float64) + + d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=None, + squared=True, X_norm_squared=None) + distances[i:ipbs, j:jpbs] += d + + if X is Y and j > i: + distances[j:jpbs, i:ipbs] = distances[i:ipbs, j:jpbs].T + + return distances if squared else np.sqrt(distances, out=distances) + + def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, X_norm_squared=None): """ @@ -232,66 +310,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, outdtype = (X[0, 0] + Y[0, 0]).dtype - if X.dtype != np.float64 or Y.dtype != np.float64: - bs = 1024 - distances = np.zeros((X.shape[0], Y.shape[0]), dtype=outdtype) - - for i in range(0, X.shape[0], bs): - for j in range(i, Y.shape[0], bs): - ipbs = min(i + bs, X.shape[0]) - jpbs = min(j + bs, Y.shape[0]) - - for k in range(0, X.shape[1], bs): - kpbs = min(k + bs, X.shape[1]) - - Xc = _cast_if_needed(X[i:ipbs, k:kpbs], np.float64) - if X is Y: - Yc = None - else: - Yc = _cast_if_needed(Y[j:jpbs, k:kpbs], np.float64) - - d = euclidean_distances(Xc, Yc, Y_norm_squared=None, - squared=True, X_norm_squared=None) - distances[i:ipbs, j:jpbs] += d - - if X is Y and j > i: - distances[j:jpbs, i:ipbs] = distances[i:ipbs, j:jpbs].T - - return distances if squared else np.sqrt(distances, out=distances) - - if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") + if X.dtype == np.float64 and Y.dtype == np.float64: + return _euclidean_distances_float64(X, Y, Y_norm_squared, squared, + X_norm_squared) else: - XX = row_norms(X, squared=True)[:, np.newaxis] - - if X is Y: # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) - - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - else: - YY = row_norms(Y, squared=True)[np.newaxis, :] - - distances = safe_sparse_dot(X, Y.T, dense_output=True) - distances *= -2 - distances += XX - distances += YY - np.maximum(distances, 0, out=distances) - - if X is Y: - # Ensure that distances between vectors and themselves are set to 0.0. - # This may not be the case due to floating point rounding errors. - distances.flat[::distances.shape[0] + 1] = 0.0 - - return distances if squared else np.sqrt(distances, out=distances) + return _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared, + squared, X_norm_squared) def _argmin_min_reduce(dist, start): From a6b7f7c3565dfaa158ab93bd9d9949498a036a64 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Wed, 13 Jun 2018 17:13:44 +0200 Subject: [PATCH 05/13] ENH: Share squared norm if X is Y in euclidean_distances Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index fccc80dc031c9..7052225648536 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -308,6 +308,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype: warnings.warn("Y_norm_squared has a different dtype than Y") + if X is Y: + if X_norm_squared is None and Y_norm_squared is not None: + X_norm_squared = Y_norm_squared.T + if X_norm_squared is not None and Y_norm_squared is None: + Y_norm_squared = X_norm_squared.T + outdtype = (X[0, 0] + Y[0, 0]).dtype if X.dtype == np.float64 and Y.dtype == np.float64: From d976473a15b2187dfc4df15a3efb2d1721ac87b3 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Thu, 14 Jun 2018 03:44:36 +0200 Subject: [PATCH 06/13] ENH: Check squared norm earilier in euclidean_distances Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 7052225648536..7259de42c110f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -169,25 +169,14 @@ def _euclidean_distances_float64(X, Y, Y_norm_squared=None, squared=False, float64. """ if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") - + XX = X_norm_squared else: XX = row_norms(X, squared=True)[:, np.newaxis] if X is Y: # shortcut in the common case euclidean_distances(X, X) YY = XX.T elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) - - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - + YY = Y_norm_squared else: YY = row_norms(Y, squared=True)[np.newaxis, :] @@ -308,6 +297,22 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype: warnings.warn("Y_norm_squared has a different dtype than Y") + if X_norm_squared is not None: + X_norm_squared = np.atleast_2d(X_norm_squared) + X_norm_squared = check_array(X_norm_squared) + if X_norm_squared.shape == (1, X.shape[0]): + X_norm_squared = X_norm_squared.T + elif X_norm_squared.shape != (X.shape[0], 1): + raise ValueError( + "Incompatible dimensions between X and X_norm_squared") + + if Y_norm_squared is not None: + Y_norm_squared = np.atleast_2d(Y_norm_squared) + Y_norm_squared = check_array(Y_norm_squared) + if Y_norm_squared.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions between Y and Y_norm_squared") + if X is Y: if X_norm_squared is None and Y_norm_squared is not None: X_norm_squared = Y_norm_squared.T From 7bd4b766825cbf41b6d838ab977d5fba44e923d2 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Wed, 13 Jun 2018 22:02:14 +0200 Subject: [PATCH 07/13] ENH: Adjust the block size of euclidean_distances to avoid a loop Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 54 +++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 7259de42c110f..0c5ae4e152e29 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -202,29 +202,55 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, The computation is done by blocks to limit additional memory usage. """ - bs = 1024 - distances = np.zeros((X.shape[0], Y.shape[0]), dtype=outdtype) + # No more than 10MB of additional memory will be used to cast X and Y to + # float64 and to get the float64 result. + maxmem = 10*1024*1024 + itemsize = np.float64(0).itemsize + maxmem /= itemsize + + # Compute the block size that won't use more than maxmem bytes of + # additional (temporary) memory. + # The amount of additional memory required is: + # - a float64 copy of a block of the rows of X if needed; + # - a float64 copy of a block of the rows of Y if needed; + # - the float64 block result; + # - a block of X_norm_squared if needed; + # - a block of Y_norm_squared if needed. + # This is a quadratic equation that we solve to compute the block size that + # would use maxmem bytes. + XYmem = 0 + if X.dtype != np.float64: + XYmem += X.shape[1] + if Y.dtype != np.float64: + XYmem += Y.shape[1] # Note that Y.shape[1] == X.shape[1] + if X_norm_squared is None: + XYmem += 1 + if Y_norm_squared is None: + XYmem += 1 + + delta = XYmem ** 2 + 4 * maxmem + bs = int((-XYmem + np.sqrt(delta)) // 2) + bs = max(1, bs) + + distances = np.empty((X.shape[0], Y.shape[0]), dtype=outdtype) for i in range(0, X.shape[0], bs): for j in range(i, Y.shape[0], bs): ipbs = min(i + bs, X.shape[0]) jpbs = min(j + bs, Y.shape[0]) - for k in range(0, X.shape[1], bs): - kpbs = min(k + bs, X.shape[1]) + Xc = _cast_if_needed(X[i:ipbs, :], np.float64) + if X is Y and i == j: + Yc = Xc + else: + Yc = _cast_if_needed(Y[j:jpbs, :], np.float64) - Xc = _cast_if_needed(X[i:ipbs, k:kpbs], np.float64) - if X is Y: - Yc = Xc - else: - Yc = _cast_if_needed(Y[j:jpbs, k:kpbs], np.float64) - - d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=None, - squared=True, X_norm_squared=None) - distances[i:ipbs, j:jpbs] += d + d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=None, + squared=True, X_norm_squared=None) + distances[i:ipbs, j:jpbs] = d if X is Y and j > i: - distances[j:jpbs, i:ipbs] = distances[i:ipbs, j:jpbs].T + distances[j:jpbs, i:ipbs] = d.T return distances if squared else np.sqrt(distances, out=distances) From d0f97967867979106906b11ccf6bc0ee55e1b9ca Mon Sep 17 00:00:00 2001 From: Celelibi Date: Thu, 14 Jun 2018 13:31:35 +0200 Subject: [PATCH 08/13] ENH: cast blocks of X only once in euclidean_distances Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0c5ae4e152e29..1a554849526a9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -235,11 +235,11 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, distances = np.empty((X.shape[0], Y.shape[0]), dtype=outdtype) for i in range(0, X.shape[0], bs): + ipbs = min(i + bs, X.shape[0]) + Xc = _cast_if_needed(X[i:ipbs, :], np.float64) + for j in range(i, Y.shape[0], bs): - ipbs = min(i + bs, X.shape[0]) jpbs = min(j + bs, Y.shape[0]) - - Xc = _cast_if_needed(X[i:ipbs, :], np.float64) if X is Y and i == j: Yc = Xc else: From 17213493db54255b5305f97d48e774af93dc68f6 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Tue, 12 Jun 2018 17:49:24 +0200 Subject: [PATCH 09/13] TST: euclidean_distances usage of float32 norm squared Signed-off-by: Celelibi --- sklearn/metrics/tests/test_pairwise.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 587f332337000..4aa3d3b7c8b25 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -570,6 +570,17 @@ def test_euclidean_distances(): assert_array_almost_equal(D32_64, D64) assert_array_almost_equal(D32_32, D64) + # Check that {X,Y}_norm_squared are used with float32 arguments + X32_norm_sq = 0.5 * (X32 ** 2).sum(axis=1).reshape(1, -1) + Y32_norm_sq = 0.5 * (Y32 ** 2).sum(axis=1).reshape(1, -1) + DYN = euclidean_distances(X32, Y32, Y_norm_squared=Y32_norm_sq) + DXN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq) + DXYN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq, + Y_norm_squared=Y32_norm_sq) + assert_greater(np.max(np.abs(DYN - D64)), .01) + assert_greater(np.max(np.abs(DXN - D64)), .01) + assert_greater(np.max(np.abs(DXYN - D64)), .01) + # Check that the accuracy with float32 is not too bad X = np.array([[0.9215765222065525, 0.9682577158572608], [0.9221151778782808, 0.9681831844652774]]) From 4e07e8be96fcc3e524fecee21f7c8db9d8d5a41f Mon Sep 17 00:00:00 2001 From: Celelibi Date: Wed, 13 Jun 2018 22:06:37 +0200 Subject: [PATCH 10/13] FIX: Use the provided squared norm in euclidean_distances Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 1a554849526a9..4dbad2b0db69a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -238,6 +238,11 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, ipbs = min(i + bs, X.shape[0]) Xc = _cast_if_needed(X[i:ipbs, :], np.float64) + if X_norm_squared is not None: + Xnc = X_norm_squared[i:ipbs, :] + else: + Xnc = None + for j in range(i, Y.shape[0], bs): jpbs = min(j + bs, Y.shape[0]) if X is Y and i == j: @@ -245,8 +250,13 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, else: Yc = _cast_if_needed(Y[j:jpbs, :], np.float64) - d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=None, - squared=True, X_norm_squared=None) + if Y_norm_squared is not None: + Ync = Y_norm_squared[:, j:jpbs] + else: + Ync = None + + d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=Ync, + squared=True, X_norm_squared=Xnc) distances[i:ipbs, j:jpbs] = d if X is Y and j > i: From 3b7ad89e62dc2e32370d1b62306aa25ee3d5b569 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Fri, 15 Jun 2018 10:04:58 +0200 Subject: [PATCH 11/13] ENH: euclidean_distances: cast the blocks of {X,Y}_norm_squared to float64 Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4dbad2b0db69a..2da82acac9feb 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -214,8 +214,8 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, # - a float64 copy of a block of the rows of X if needed; # - a float64 copy of a block of the rows of Y if needed; # - the float64 block result; - # - a block of X_norm_squared if needed; - # - a block of Y_norm_squared if needed. + # - a float64 copy of a block of X_norm_squared if needed; + # - a float64 copy of a block of Y_norm_squared if needed. # This is a quadratic equation that we solve to compute the block size that # would use maxmem bytes. XYmem = 0 @@ -223,9 +223,9 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, XYmem += X.shape[1] if Y.dtype != np.float64: XYmem += Y.shape[1] # Note that Y.shape[1] == X.shape[1] - if X_norm_squared is None: + if X_norm_squared is None or X_norm_squared.dtype != np.float64: XYmem += 1 - if Y_norm_squared is None: + if Y_norm_squared is None or Y_norm_squared.dtype != np.float64: XYmem += 1 delta = XYmem ** 2 + 4 * maxmem @@ -239,7 +239,7 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, Xc = _cast_if_needed(X[i:ipbs, :], np.float64) if X_norm_squared is not None: - Xnc = X_norm_squared[i:ipbs, :] + Xnc = _cast_if_needed(X_norm_squared[i:ipbs, :], np.float64) else: Xnc = None @@ -251,7 +251,7 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, Yc = _cast_if_needed(Y[j:jpbs, :], np.float64) if Y_norm_squared is not None: - Ync = Y_norm_squared[:, j:jpbs] + Ync = _cast_if_needed(Y_norm_squared[:, j:jpbs], np.float64) else: Ync = None From 8aecbde564b235319f3889de7dc4348bed9d049c Mon Sep 17 00:00:00 2001 From: Celelibi Date: Fri, 15 Jun 2018 10:08:13 +0200 Subject: [PATCH 12/13] ENH: euclidean_distances: precompute blocks of X_norm_squared Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2da82acac9feb..f2030c64aa2cc 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -241,7 +241,7 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, if X_norm_squared is not None: Xnc = _cast_if_needed(X_norm_squared[i:ipbs, :], np.float64) else: - Xnc = None + Xnc = row_norms(Xc, squared=True)[:, np.newaxis] for j in range(i, Y.shape[0], bs): jpbs = min(j + bs, Y.shape[0]) From 0fd279a6706c23a26058e3b415bf2c77746df9d4 Mon Sep 17 00:00:00 2001 From: Celelibi Date: Fri, 15 Jun 2018 10:12:19 +0200 Subject: [PATCH 13/13] ENH: euclidean_distances: swap X and Y sometimes with float32 Signed-off-by: Celelibi --- sklearn/metrics/pairwise.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f2030c64aa2cc..90645d4641b8f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -202,6 +202,15 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, The computation is done by blocks to limit additional memory usage. """ + # For performance reasons, swap X and Y if I got X_norm_squared but not + # Y_norm_squared + if X_norm_squared is not None and Y_norm_squared is None: + swap = True + X, Y = Y, X + X_norm_squared, Y_norm_squared = None, X_norm_squared.T + else: + swap = False + # No more than 10MB of additional memory will be used to cast X and Y to # float64 and to get the float64 result. maxmem = 10*1024*1024 @@ -262,6 +271,9 @@ def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None, if X is Y and j > i: distances[j:jpbs, i:ipbs] = d.T + if swap: + distances = distances.T + return distances if squared else np.sqrt(distances, out=distances)