From 60fc4ad2001b9070405879873295e4c6d2caf43c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 11 Jun 2021 18:47:30 +0200 Subject: [PATCH 001/290] Add private submodule for fast argkmin The implementation has been taken from: https://github.com/scikit-learn-inria-fondation/pdist_aggregation/commit/d051bf4da5fea0dd0d033664485665655a5c3922 --- sklearn/metrics/_argkmin_fast.pyx | 540 ++++++++++++++++++++++++++++++ sklearn/metrics/setup.py | 4 + 2 files changed, 544 insertions(+) create mode 100644 sklearn/metrics/_argkmin_fast.pyx diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx new file mode 100644 index 0000000000000..7aa88e6adad45 --- /dev/null +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -0,0 +1,540 @@ +# cython: language_level=3 +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: profile=False +# cython: linetrace=False +# cython: binding=False +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 +import os + +import numpy as np + +cimport numpy as np +cimport openmp + +from joblib import cpu_count + +from libc.math cimport floor, sqrt +from libc.stdlib cimport free, malloc + +from cython cimport floating, integral +from cython.parallel cimport parallel, prange + +DEF CHUNK_SIZE = 256 # number of vectors + +DEF MIN_CHUNK_SAMPLES = 20 + +DEF FLOAT_INF = 1e36 + +from sklearn.utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + ColMajor, + NoTrans, + RowMajor, + Trans, + _gemm, +) + + +cpdef int _openmp_effective_n_threads(n_threads=None): + # Taken and adapted from sklearn.utils._openmp_helpers + if os.getenv("OMP_NUM_THREADS"): + # Fall back to user provided number of threads making it possible + # to exceed the number of cpus. + return openmp.omp_get_max_threads() + else: + return min(openmp.omp_get_max_threads(), + cpu_count(only_physical_cores=True)) + +### Heaps utilities, minified from sklearn internals NeighborsHeap +# https://github.com/scikit-learn/scikit-learn/blob/e4bb9fa86b0df873ad750b6d59090843d9d23d50/sklearn/neighbors/_binary_tree.pxi#L513 + +cdef int _push( + floating* dist, + integral* idx, + integral size, + floating val, + integral i_val, +) nogil except -1: + """push (val, i_val) into the heap (dist, idx) of the given size""" + cdef: + integral current_idx, left_child_idx, right_child_idx, swap_idx + + # check if val should be in heap + if val > dist[0]: + return 0 + + # insert val at position zero + dist[0] = val + idx[0] = i_val + + # descend the heap, swapping values until the max heap criterion is met + current_idx = 0 + while True: + left_child_idx = 2 * current_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= size: + break + elif right_child_idx >= size: + if dist[left_child_idx] > val: + swap_idx = left_child_idx + else: + break + elif dist[left_child_idx] >= dist[right_child_idx]: + if val < dist[left_child_idx]: + swap_idx = left_child_idx + else: + break + else: + if val < dist[right_child_idx]: + swap_idx = right_child_idx + else: + break + + dist[current_idx] = dist[swap_idx] + idx[current_idx] = idx[swap_idx] + + current_idx = swap_idx + + dist[current_idx] = val + idx[current_idx] = i_val + + return 0 + + +cdef inline void dual_swap( + floating* dist, + integral* idx, + integral i1, + integral i2 +) nogil: + """swap the values at index i1 and i2 of both dist and idx""" + cdef: + floating dtmp = dist[i1] + integral itmp = idx[i1] + + dist[i1] = dist[i2] + dist[i2] = dtmp + + idx[i1] = idx[i2] + idx[i2] = itmp + + +cdef int _simultaneous_sort( + 
floating* dist, + integral* idx, + integral size +) nogil except -1: + """ + Perform a recursive quicksort on the dist array, simultaneously + performing the same swaps on the idx array. + + TODO: test if the following algorithms are better: + - introselect via std::nth_element + - heap-sort-like + """ + cdef: + integral pivot_idx, i, store_idx + floating pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + elif size == 3: + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + if dist[1] > dist[2]: + dual_swap(dist, idx, 1, 2) + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size / 2 + if dist[0] > dist[size - 1]: + dual_swap(dist, idx, 0, size - 1) + if dist[size - 1] > dist[pivot_idx]: + dual_swap(dist, idx, size - 1, pivot_idx) + if dist[0] > dist[size - 1]: + dual_swap(dist, idx, 0, size - 1) + pivot_val = dist[size - 1] + + # partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if dist[i] < pivot_val: + dual_swap(dist, idx, i, store_idx) + store_idx += 1 + dual_swap(dist, idx, store_idx, size - 1) + pivot_idx = store_idx + + # recursively sort each side of the pivot + if pivot_idx > 1: + _simultaneous_sort(dist, idx, pivot_idx) + if pivot_idx + 2 < size: + _simultaneous_sort(dist + pivot_idx + 1, + idx + pivot_idx + 1, + size - pivot_idx - 1) + return 0 + +### argkmin helpers + +cdef void _k_argkmin_on_chunk( + floating[:, ::1] X_c, # IN + floating[:, ::1] Y_c, # IN + floating[::1] Y_sq_norms, # IN + floating *dist_middle_terms, # IN + floating *heaps_red_distances, # IN/OUT + integral *heaps_indices, # IN/OUT + integral k, # IN + # ID of the first element of Y_c + integral Y_idx_offset, +) nogil: + cdef: + integral i, j + # Instead of computing the full pairwise squared distances matrix, + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² + # term since the argmin for a given sample X_c^{i} does not depend on + # ||X_c^{i}||² + + # Careful: LDA, LDB and LDC are given for F-ordered arrays. + # Here, we use their counterpart values as indicated in the documentation. 
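+    # For C-ordered (row-major) operands the leading dimension of a
+    # matrix is its number of columns, i.e. the stride between two
+    # consecutive rows, hence the X_c.shape[1] and Y_c.shape[0] below.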
+ # See the documentation of parameters here: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(RowMajor, NoTrans, Trans, + X_c.shape[0], Y_c.shape[0], X_c.shape[1], + -2.0, + &X_c[0, 0], X_c.shape[1], + &Y_c[0, 0], X_c.shape[1], 0.0, + dist_middle_terms, Y_c.shape[0]) + + # Computing argmins here + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + _push(heaps_red_distances + i * k, + heaps_indices + i * k, + k, + # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j], + j + Y_idx_offset) + + + +cdef int _argkmin_on_X( + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + floating[::1] Y_sq_norms, # IN + integral chunk_size, # IN + integral effective_n_threads, # IN + integral[:, ::1] knn_indices, # OUT + floating[:, ::1] knn_red_distances, # OUT +) nogil except -1: + cdef: + integral k = knn_indices.shape[1] + integral d = X.shape[1] + integral sf = sizeof(floating) + integral si = sizeof(integral) + integral n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + integral n_train = Y.shape[0] + integral Y_n_samples_chunk = min(n_train, n_samples_chunk) + integral Y_n_full_chunks = n_train / Y_n_samples_chunk + integral Y_n_samples_rem = n_train % Y_n_samples_chunk + + integral n_test = X.shape[0] + integral X_n_samples_chunk = min(n_test, n_samples_chunk) + integral X_n_full_chunks = n_test // X_n_samples_chunk + integral X_n_samples_rem = n_test % X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + integral Y_n_chunks = Y_n_full_chunks + ( + n_train != (Y_n_full_chunks * Y_n_samples_chunk) + ) + + integral X_n_chunks = X_n_full_chunks + ( + n_test != (X_n_full_chunks * X_n_samples_chunk) + ) + + integral num_threads = min(Y_n_chunks, effective_n_threads) + + integral Y_start, Y_end, X_start, X_end + integral X_chunk_idx, Y_chunk_idx, idx, jdx + + floating *dist_middle_terms_chunks + floating *heaps_red_distances_chunks + + + with nogil, parallel(num_threads=num_threads): + # Thread local buffers + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + dist_middle_terms_chunks = malloc(Y_n_samples_chunk * X_n_samples_chunk * sf) + heaps_red_distances_chunks = malloc(X_n_samples_chunk * k * sf) + + for X_chunk_idx in prange(X_n_chunks, schedule='static'): + # We reset the heap between X chunks (memset isn't suitable here) + for idx in range(X_n_samples_chunk * k): + heaps_red_distances_chunks[idx] = FLOAT_INF + + X_start = X_chunk_idx * X_n_samples_chunk + if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: + X_end = X_start + X_n_samples_rem + else: + X_end = X_start + X_n_samples_chunk + + for Y_chunk_idx in range(Y_n_chunks): + Y_start = Y_chunk_idx * Y_n_samples_chunk + if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0: + Y_end = Y_start + Y_n_samples_rem + else: + Y_end = Y_start + Y_n_samples_chunk + + _k_argkmin_on_chunk( + X[X_start:X_end, :], + Y[Y_start:Y_end, :], + Y_sq_norms[Y_start:Y_end], + dist_middle_terms_chunks, + heaps_red_distances_chunks, + &knn_indices[X_start, 0], + k, + Y_start + ) + + # Getting the indices of the k-argkmin points in + # the sorted order + for idx in range(X_end - X_start): + _simultaneous_sort( + heaps_red_distances_chunks + idx * k, + &knn_indices[X_start + idx, 0], + k + ) + + # end: for X_chunk_idx + free(dist_middle_terms_chunks) + free(heaps_red_distances_chunks) + + # end: with nogil, parallel + return 
X_n_chunks + + +cdef int _argkmin_on_Y( + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + floating[::1] Y_sq_norms, # IN + integral chunk_size, # IN + integral effective_n_threads, # IN + integral[:, ::1] knn_indices, # OUT + floating[:, ::1] knn_red_distances, # OUT +) nogil except -1: + cdef: + integral k = knn_indices.shape[1] + integral d = X.shape[1] + integral sf = sizeof(floating) + integral si = sizeof(integral) + integral n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + integral n_train = Y.shape[0] + integral Y_n_samples_chunk = min(n_train, n_samples_chunk) + integral Y_n_full_chunks = n_train / Y_n_samples_chunk + integral Y_n_samples_rem = n_train % Y_n_samples_chunk + + integral n_test = X.shape[0] + integral X_n_samples_chunk = min(n_test, n_samples_chunk) + integral X_n_full_chunks = n_test // X_n_samples_chunk + integral X_n_samples_rem = n_test % X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + integral Y_n_chunks = Y_n_full_chunks + ( + n_train != (Y_n_full_chunks * Y_n_samples_chunk) + ) + + integral X_n_chunks = X_n_full_chunks + ( + n_test != (X_n_full_chunks * X_n_samples_chunk) + ) + + integral num_threads = min(Y_n_chunks, effective_n_threads) + + integral Y_start, Y_end, X_start, X_end + integral X_chunk_idx, Y_chunk_idx, idx, jdx + + floating *dist_middle_terms_chunks + floating *heaps_red_distances_chunks + integral *heaps_indices_chunks + + for X_chunk_idx in range(X_n_chunks): + X_start = X_chunk_idx * X_n_samples_chunk + if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: + X_end = X_start + X_n_samples_rem + else: + X_end = X_start + X_n_samples_chunk + + with nogil, parallel(num_threads=num_threads): + # Thread local buffers + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + dist_middle_terms_chunks = malloc( + Y_n_samples_chunk * X_n_samples_chunk * sf) + heaps_red_distances_chunks = malloc( + X_n_samples_chunk * k * sf) + heaps_indices_chunks = malloc( + X_n_samples_chunk * k * sf) + + # Initialising heep (memset isn't suitable here) + for idx in range(X_n_samples_chunk * k): + heaps_red_distances_chunks[idx] = FLOAT_INF + heaps_indices_chunks[idx] = -1 + + for Y_chunk_idx in prange(Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * Y_n_samples_chunk + if Y_chunk_idx == Y_n_chunks - 1 \ + and Y_n_samples_rem > 0: + Y_end = Y_start + Y_n_samples_rem + else: + Y_end = Y_start + Y_n_samples_chunk + + _k_argkmin_on_chunk( + X[X_start:X_end, :], + Y[Y_start:Y_end, :], + Y_sq_norms[Y_start:Y_end], + dist_middle_terms_chunks, + heaps_red_distances_chunks, + heaps_indices_chunks, + k, + Y_start, + ) + + # end: for Y_chunk_idx + with gil: + # Synchronising with the main heaps + for idx in range(X_end - X_start): + for jdx in range(k): + _push( + &knn_red_distances[X_start + idx, 0], + &knn_indices[X_start + idx, 0], + k, + heaps_red_distances_chunks[idx * k + jdx], + heaps_indices_chunks[idx * k + jdx], + ) + + free(dist_middle_terms_chunks) + free(heaps_red_distances_chunks) + free(heaps_indices_chunks) + + # end: with nogil, parallel + # Sortting indices of the k-nn for each query vector of X + for idx in prange(n_test,schedule='static', + nogil=True, num_threads=num_threads): + _simultaneous_sort( + &knn_red_distances[idx, 0], + &knn_indices[idx, 0], + k, + ) + + # end: with nogil, parallel + # end: for X_chunk_idx + return Y_n_chunks + +cdef inline floating _euclidean_dist( + floating[:, ::1] X, + floating[:, ::1] Y, + integral i, + integral j, +) nogil: + cdef: + floating dist = 0 + integral k + 
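+        # Number of features rounded down to a multiple of 4: the
+        # unrolled loop below processes 4 features per iteration and
+        # a scalar tail loop handles the remainder.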
integral upper_unrolled_idx = (X.shape[1] // 4) * 4 + + # Unrolling loop to potentially help with vectorisation? + for k in range(0, upper_unrolled_idx, 4): + dist += (X[i, k] - Y[j, k]) * (X[i, k] - Y[j, k]) + dist += (X[i, k + 1] - Y[j, k + 1]) * (X[i, k + 1] - Y[j, k + 1]) + dist += (X[i, k + 2] - Y[j, k + 2]) * (X[i, k + 2] - Y[j, k + 2]) + dist += (X[i, k + 3] - Y[j, k + 3]) * (X[i, k + 3] - Y[j, k + 3]) + + for k in range(upper_unrolled_idx, X.shape[1]): + dist += (X[i, k] - Y[j, k]) * (X[i, k] - Y[j, k]) + + return sqrt(dist) + +cdef int _compute_exact_distances( + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + integral[:, ::1] knn_indices, # IN + integral effective_n_threads, # IN + floating[:, ::1] knn_distances, # OUT +) nogil except -1: + cdef: + integral i, k + + for i in prange(X.shape[0], schedule='static', + nogil=True, num_threads=effective_n_threads): + for k in range(knn_indices.shape[1]): + knn_distances[i, k] = _euclidean_dist(X, Y, + i, knn_indices[i, k]) + + +# Python interface + +def _argkmin( + floating[:, ::1] X, + floating[:, ::1] Y, + integral k, + integral chunk_size = CHUNK_SIZE, + str strategy = "auto", + bint return_distance = False, +): + int_dtype = np.intp + float_dtype = np.float32 if floating is float else np.float64 + cdef: + integral[:, ::1] knn_indices = np.full((X.shape[0], k), 0, + dtype=int_dtype) + floating[:, ::1] knn_distances = np.full((X.shape[0], k), + FLOAT_INF, + dtype=float_dtype) + floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y) + integral effective_n_threads = _openmp_effective_n_threads() + + if strategy == 'auto': + if 4 * chunk_size * effective_n_threads < X.shape[0]: + strategy = 'chunk_on_X' + else: + strategy = 'chunk_on_Y' + + if strategy == 'chunk_on_Y': + n_parallel_chunks = _argkmin_on_Y( + X, Y, Y_sq_norms, + chunk_size, effective_n_threads, + knn_indices, knn_distances + ) + elif strategy == 'chunk_on_X': + n_parallel_chunks = _argkmin_on_X( + X, Y, Y_sq_norms, + chunk_size, effective_n_threads, + knn_indices, knn_distances + ) + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + + if return_distance: + # We need to recompute distances because we relied on reduced distances + # using _gemm, which are missing a term for squarred norms and which are + # not the most precise (catastrophic cancellation might have happened). + _compute_exact_distances(X, Y, knn_indices, + effective_n_threads, knn_distances) + return (np.asarray(knn_distances), np.asarray(knn_indices)), n_parallel_chunks + + return np.asarray(knn_indices), n_parallel_chunks diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 07aa01da308b8..178263406c1fe 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -18,6 +18,10 @@ def configuration(parent_package="", top_path=None): sources=["_pairwise_fast.pyx"], libraries=libraries) + config.add_extension("_argkmin_fast", + sources=["_argkmin_fast.pyx"], + libraries=libraries) + config.add_subpackage('tests') return config From 13442b618886772116e61e780d8f0b1e6eabd92b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 14 Jun 2021 13:40:40 +0200 Subject: [PATCH 002/290] Add minimal documentation Also perform some renaming. 
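For reference, the whole submodule is built around the following
reduction: for a query vector x,
argmin_j ||x - y_j||^2 = argmin_j (-2 x.y_j + ||y_j||^2),
so the ||x||^2 term can be dropped and the cross terms computed for
whole chunks with a single GEMM call. A minimal, unoptimised NumPy
sketch of this "GEMM trick" (the helper name ``argkmin_reference`` is
illustrative, not part of the patch):

    import numpy as np

    def argkmin_reference(X, Y, k):
        # Reduced "distances": -2 X.Y^T + ||Y||^2. The missing ||X||^2
        # term is constant per row of X and cannot change the argkmin.
        Y_sq_norms = np.einsum("ij,ij->i", Y, Y)
        red = -2 * (X @ Y.T) + Y_sq_norms          # one GEMM call
        indices = np.argpartition(red, kth=k - 1, axis=1)[:, :k]
        # Sort the k retained candidates so the closest come first.
        rows = np.arange(X.shape[0])[:, None]
        order = np.argsort(red[rows, indices], axis=1)
        return indices[rows, order]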
--- sklearn/metrics/_argkmin_fast.pyx | 195 +++++++++++++++++++++--------- 1 file changed, 137 insertions(+), 58 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 7aa88e6adad45..1b9c2be67079e 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -50,6 +50,7 @@ cpdef int _openmp_effective_n_threads(n_threads=None): ### Heaps utilities, minified from sklearn internals NeighborsHeap # https://github.com/scikit-learn/scikit-learn/blob/e4bb9fa86b0df873ad750b6d59090843d9d23d50/sklearn/neighbors/_binary_tree.pxi#L513 +# TODO: factor those utilities cdef int _push( floating* dist, @@ -57,7 +58,7 @@ cdef int _push( integral size, floating val, integral i_val, -) nogil except -1: +) nogil: """push (val, i_val) into the heap (dist, idx) of the given size""" cdef: integral current_idx, left_child_idx, right_child_idx, swap_idx @@ -127,7 +128,7 @@ cdef int _simultaneous_sort( floating* dist, integral* idx, integral size -) nogil except -1: +) nogil: """ Perform a recursive quicksort on the dist array, simultaneously performing the same swaps on the idx array. @@ -187,9 +188,11 @@ cdef int _simultaneous_sort( size - pivot_idx - 1) return 0 +### End: Heaps utilities + ### argkmin helpers -cdef void _k_argkmin_on_chunk( +cdef void _argkmin_on_chunk( floating[:, ::1] X_c, # IN floating[:, ::1] Y_c, # IN floating[::1] Y_sq_norms, # IN @@ -200,6 +203,12 @@ cdef void _k_argkmin_on_chunk( # ID of the first element of Y_c integral Y_idx_offset, ) nogil: + """ + Critical part of the computation of pairwise distances. + + "Fast Squared Euclidean" distances strategy relying + on the gemm-trick. + """ cdef: integral i, j # Instead of computing the full pairwise squared distances matrix, @@ -234,16 +243,19 @@ cdef void _k_argkmin_on_chunk( cdef int _argkmin_on_X( - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN - floating[::1] Y_sq_norms, # IN - integral chunk_size, # IN - integral effective_n_threads, # IN - integral[:, ::1] knn_indices, # OUT - floating[:, ::1] knn_red_distances, # OUT -) nogil except -1: + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + floating[::1] Y_sq_norms, # IN + integral chunk_size, # IN + integral effective_n_threads, # IN + integral[:, ::1] argkmin_indices, # OUT + floating[:, ::1] argkmin_red_distances, # OUT +) nogil: + """Computes the argkmin of each vector (row) of X on Y + by parallelising computation on chunks of X. + """ cdef: - integral k = knn_indices.shape[1] + integral k = argkmin_indices.shape[1] integral d = X.shape[1] integral sf = sizeof(floating) integral si = sizeof(integral) @@ -302,23 +314,22 @@ cdef int _argkmin_on_X( else: Y_end = Y_start + Y_n_samples_chunk - _k_argkmin_on_chunk( + _argkmin_on_chunk( X[X_start:X_end, :], Y[Y_start:Y_end, :], Y_sq_norms[Y_start:Y_end], dist_middle_terms_chunks, heaps_red_distances_chunks, - &knn_indices[X_start, 0], + &argkmin_indices[X_start, 0], k, Y_start ) - # Getting the indices of the k-argkmin points in - # the sorted order + # Sorting indices so that the closests' come first. 
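+            # (Each heap holds the k best candidates seen so far in
+            # max-heap order; the in-place simultaneous sort below
+            # turns it into ascending order.)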
for idx in range(X_end - X_start): _simultaneous_sort( heaps_red_distances_chunks + idx * k, - &knn_indices[X_start + idx, 0], + &argkmin_indices[X_start + idx, 0], k ) @@ -331,16 +342,23 @@ cdef int _argkmin_on_X( cdef int _argkmin_on_Y( - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN - floating[::1] Y_sq_norms, # IN - integral chunk_size, # IN - integral effective_n_threads, # IN - integral[:, ::1] knn_indices, # OUT - floating[:, ::1] knn_red_distances, # OUT -) nogil except -1: + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + floating[::1] Y_sq_norms, # IN + integral chunk_size, # IN + integral effective_n_threads, # IN + integral[:, ::1] argkmin_indices, # OUT + floating[:, ::1] argkmin_red_distances, # OUT +) nogil: + """Computes the argkmin of each vector (row) of X on Y + by parallelising computation on chunks of Y. + + This parallelisation strategy is more costly (as we need + extra heaps and synchronisation), yet it is useful in + most contexts. + """ cdef: - integral k = knn_indices.shape[1] + integral k = argkmin_indices.shape[1] integral d = X.shape[1] integral sf = sizeof(floating) integral si = sizeof(integral) @@ -372,6 +390,10 @@ cdef int _argkmin_on_Y( floating *dist_middle_terms_chunks floating *heaps_red_distances_chunks + + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own locals + # heaps which are then synchronised back in the main ones. integral *heaps_indices_chunks for X_chunk_idx in range(X_n_chunks): @@ -392,7 +414,7 @@ cdef int _argkmin_on_Y( heaps_indices_chunks = malloc( X_n_samples_chunk * k * sf) - # Initialising heep (memset isn't suitable here) + # Initialising heaps (memset can't be used here) for idx in range(X_n_samples_chunk * k): heaps_red_distances_chunks[idx] = FLOAT_INF heaps_indices_chunks[idx] = -1 @@ -405,7 +427,7 @@ cdef int _argkmin_on_Y( else: Y_end = Y_start + Y_n_samples_chunk - _k_argkmin_on_chunk( + _argkmin_on_chunk( X[X_start:X_end, :], Y[Y_start:Y_end, :], Y_sq_norms[Y_start:Y_end], @@ -418,12 +440,13 @@ cdef int _argkmin_on_Y( # end: for Y_chunk_idx with gil: - # Synchronising with the main heaps + # Synchronising the thread local heaps + # with the main heaps for idx in range(X_end - X_start): for jdx in range(k): _push( - &knn_red_distances[X_start + idx, 0], - &knn_indices[X_start + idx, 0], + &argkmin_red_distances[X_start + idx, 0], + &argkmin_indices[X_start + idx, 0], k, heaps_red_distances_chunks[idx * k + jdx], heaps_indices_chunks[idx * k + jdx], @@ -434,16 +457,16 @@ cdef int _argkmin_on_Y( free(heaps_indices_chunks) # end: with nogil, parallel - # Sortting indices of the k-nn for each query vector of X + # Sorting indices of the argkmin for each query vector of X for idx in prange(n_test,schedule='static', nogil=True, num_threads=num_threads): _simultaneous_sort( - &knn_red_distances[idx, 0], - &knn_indices[idx, 0], + &argkmin_red_distances[idx, 0], + &argkmin_indices[idx, 0], k, ) - - # end: with nogil, parallel + # end: prange + # end: for X_chunk_idx return Y_n_chunks @@ -458,7 +481,7 @@ cdef inline floating _euclidean_dist( integral k integral upper_unrolled_idx = (X.shape[1] // 4) * 4 - # Unrolling loop to potentially help with vectorisation? 
+ # Unrolling loop to help with vectorisation for k in range(0, upper_unrolled_idx, 4): dist += (X[i, k] - Y[j, k]) * (X[i, k] - Y[j, k]) dist += (X[i, k + 1] - Y[j, k + 1]) * (X[i, k + 1] - Y[j, k + 1]) @@ -470,21 +493,31 @@ cdef inline floating _euclidean_dist( return sqrt(dist) -cdef int _compute_exact_distances( +cdef int _exact_euclidean_dist( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN - integral[:, ::1] knn_indices, # IN + integral[:, ::1] Y_indices, # IN integral effective_n_threads, # IN - floating[:, ::1] knn_distances, # OUT -) nogil except -1: + floating[:, ::1] distances, # OUT +) nogil: + """ + Compute exact pairwise euclidean distances in parallel. + + The pairwise distances considered are X vectors + and a subset of Y given for each row if X given in + Y_indices. + + Notes: the body of this function could have been inlined, + but we use a function to have a cdef nogil context. + """ cdef: integral i, k for i in prange(X.shape[0], schedule='static', nogil=True, num_threads=effective_n_threads): - for k in range(knn_indices.shape[1]): - knn_distances[i, k] = _euclidean_dist(X, Y, - i, knn_indices[i, k]) + for k in range(Y_indices.shape[1]): + distances[i, k] = _euclidean_dist(X, Y, i, + Y_indices[i, k]) # Python interface @@ -497,44 +530,90 @@ def _argkmin( str strategy = "auto", bint return_distance = False, ): + """Computes the argkmin of vectors (rows) of X on Y for + the euclidean distance. + + The implementation is parallelised on chunks whose size can + be set using ``chunk_size``. + + Parameters + ---------- + X: ndarray of shape (n, d) + Rows represent vectors + + Y: ndarray of shape (m, d) + Rows represent vectors + + chunk_size: int + The number of vectors per chunk. + + strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'} + The chunking strategy defining which dataset + parallelisation are made on. + + - 'chunk_on_X' is embarassingly parallel but + is less used in practice. + - 'chunk_on_Y' comes with synchronisation but + is more useful in practice. + -'auto' relies on a simple heuristic to choose + between 'chunk_on_X' and 'chunk_on_Y'. + + return_distance: boolean + Return distances between each X vectory and its + argkmin if set to True. + + Returns + ------- + distances: ndarray of shape (n, k) + Distances between each X vector and its argkmin + in Y. Only returned if ``return_distance=True``. + + indices: ndarray of shape (n, k) + Indices of each X vector argkmin in Y. + """ int_dtype = np.intp float_dtype = np.float32 if floating is float else np.float64 cdef: - integral[:, ::1] knn_indices = np.full((X.shape[0], k), 0, + integral[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0, dtype=int_dtype) - floating[:, ::1] knn_distances = np.full((X.shape[0], k), + floating[:, ::1] argkmin_distances = np.full((X.shape[0], k), FLOAT_INF, dtype=float_dtype) floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y) integral effective_n_threads = _openmp_effective_n_threads() if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. 
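+        # 'chunk_on_X' is embarrassingly parallel but only pays off
+        # when X yields enough chunks to keep every thread busy;
+        # otherwise we parallelise on chunks of Y and pay the
+        # synchronisation cost instead.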
if 4 * chunk_size * effective_n_threads < X.shape[0]: strategy = 'chunk_on_X' else: strategy = 'chunk_on_Y' if strategy == 'chunk_on_Y': - n_parallel_chunks = _argkmin_on_Y( + _argkmin_on_Y( X, Y, Y_sq_norms, chunk_size, effective_n_threads, - knn_indices, knn_distances + argkmin_indices, argkmin_distances ) elif strategy == 'chunk_on_X': - n_parallel_chunks = _argkmin_on_X( + _argkmin_on_X( X, Y, Y_sq_norms, chunk_size, effective_n_threads, - knn_indices, knn_distances + argkmin_indices, argkmin_distances ) else: raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: - # We need to recompute distances because we relied on reduced distances - # using _gemm, which are missing a term for squarred norms and which are - # not the most precise (catastrophic cancellation might have happened). - _compute_exact_distances(X, Y, knn_indices, - effective_n_threads, knn_distances) - return (np.asarray(knn_distances), np.asarray(knn_indices)), n_parallel_chunks - - return np.asarray(knn_indices), n_parallel_chunks + # We need to recompute distances because we relied on + # reduced distances using _gemm, which are missing a + # term for squarred norms and which are not the most + # precise (catastrophic cancellation might have happened). + _exact_euclidean_dist(X, Y, argkmin_indices, + effective_n_threads, + argkmin_distances) + return (np.asarray(argkmin_distances), + np.asarray(argkmin_indices)) + + return np.asarray(argkmin_indices) From 7aa2c3c9b32f81dab26d0a46d5bf6ce65f02de2d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 14 Jun 2021 15:08:22 +0200 Subject: [PATCH 003/290] Plug 'fast_sqeuclidean' strategy implementation and test for KNeighborsMixin --- sklearn/neighbors/_base.py | 12 ++++- sklearn/neighbors/tests/test_neighbors.py | 54 ++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index c6438165aba1a..ce0f5b3cb7154 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,6 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..metrics._argkmin_fast import _argkmin from ..utils import ( check_array, gen_even_slices, @@ -42,7 +43,8 @@ # sklearn.metrics.pairwise doc string brute=(list(PAIRWISE_DISTANCE_FUNCTIONS.keys()) + ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'cosine', 'dice', 'hamming', + 'correlation', 'cosine', 'dice', + 'fast_sqeuclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', @@ -685,6 +687,14 @@ class from an array representing our data set and ask who's X, n_neighbors=n_neighbors, return_distance=return_distance) + elif (self._fit_method == 'brute' and + self.effective_metric_ == 'fast_sqeuclidean'): + # TODO: generalise this simple plug here + results = _argkmin(X, Y=self._fit_X, + k=n_neighbors, + strategy='auto', + return_distance=return_distance) + elif self._fit_method == 'brute': reduce_func = partial(self._kneighbors_reduce_func, n_neighbors=n_neighbors, diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 555687b7ea74a..0d8d992f9e5ec 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -15,7 +15,11 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import train_test_split -from sklearn.neighbors import VALID_METRICS_SPARSE, VALID_METRICS +from sklearn.neighbors import ( + NearestNeighbors, + VALID_METRICS_SPARSE, + VALID_METRICS, +) from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_almost_equal @@ -1762,3 +1766,51 @@ def test_pairwise_deprecated(NearestNeighbors): msg = r"Attribute _pairwise was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): nn._pairwise + + +@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("d", [5, 10, 100, 500]) +@pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5]) +@pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) +@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 13)]) +@pytest.mark.parametrize("strategy", ["auto", "chunk_on_train", "chunk_on_test"]) +def test_fast_sqeuclidean_correctness( + n, + d, + ratio_train_test, + n_neighbors, + chunk_size, + strategy, + dtype=np.float64, +): + """ The Fast squared euclidean strategy ("fast-sqeuclidean") is a faster + alternative to the squared euclidean strategy ("sqeuclidean"). + It computed reduced squared euclidean distances of using the + the GEMM subroutine of BLAS, allowing high arithmetic intensity. + + Yet, it can be unstable for some range of data far the origin overflowing + the representation for float64. + """ + if n < n_neighbors: + pytest.skip( + f"Skipping as n (={n}) < n_neighbors (={n_neighbors})", + allow_module_level=True, + ) + + rng = np.random.RandomState(1) + X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) + X_test = rng.rand( + int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) + + neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", + metric="euclidean").fit(X_train) + eucl_dist, eucl_nn = neigh.kneighbors(X=X_test, n_neighbors=n_neighbors, + return_distance=True) + + fse_neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", + metric="fast_sqeuclidean").fit(X_train) + fse_dist, fse_nn = fse_neigh.kneighbors(X=X_test, n_neighbors=n_neighbors, + return_distance=True) + + np.testing.assert_almost_equal(eucl_dist, fse_dist) + np.testing.assert_array_equal(eucl_nn, fse_nn) From e5b33a18dc559cdebfe66230934848577eed1540 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 15 Jun 2021 10:43:06 +0200 Subject: [PATCH 004/290] Add test for translation invariance --- sklearn/neighbors/tests/test_neighbors.py | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0d8d992f9e5ec..0e7844c4a9a1c 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1814,3 +1814,36 @@ def test_fast_sqeuclidean_correctness( np.testing.assert_almost_equal(eucl_dist, fse_dist) np.testing.assert_array_equal(eucl_nn, fse_nn) + + +@pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) +@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"]) +@pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +def test_fast_sqeuclidean_translation_invariance( + n_neighbors, + strategy, + translation, + dtype=np.float64, +): + """ The Fast euclidean strategy should be translation invariant. 
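+
+    Squared euclidean distances computed with the expanded form
+    -2 X.Y^T + ||Y||^2 lose precision once the squared norms become
+    much larger than the pairwise gaps, so translating the data far
+    from the origin can trigger catastrophic cancellation.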
""" + n = 10_000 + d = 50 + + rng = np.random.RandomState(1) + X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) + X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) + + neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", + metric="fast_sqeuclidean").fit(X_train) + reference_dist, reference_nns = neigh.kneighbors(X=X_test, + n_neighbors=n_neighbors, + return_distance=True) + + neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", + metric="fast_sqeuclidean").fit(X_train + translation) + dist, nns = neigh.kneighbors(X=X_test + translation, + n_neighbors=n_neighbors, + return_distance=True) + + np.testing.assert_array_equal(reference_nns, nns) + np.testing.assert_almost_equal(reference_dist, dist) From 59c8a57a8f25cfd414650f3f279b698fca22bbe7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 16 Jun 2021 09:31:37 +0200 Subject: [PATCH 005/290] Complete test parametrisation for translation invariance As this test aims at identifying contexts of numerical instabilities, parametrising on more parameters make sense. Hypothesis: Numerical stability is influenced by: - the range of the data (given by ``translation``) - the number of dimensions (given by ``d``) - the ``chunk_size`` But not by: - the parallelisation ``strategy`` - the number of neighbors (given by ``n_neighbors``) --- sklearn/neighbors/tests/test_neighbors.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0e7844c4a9a1c..dd41fe4b8cef5 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1816,19 +1816,22 @@ def test_fast_sqeuclidean_correctness( np.testing.assert_array_equal(eucl_nn, fse_nn) +@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("d", [5, 10, 100, 500]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) @pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"]) +@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 13)]) @pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) def test_fast_sqeuclidean_translation_invariance( + n, + d, n_neighbors, strategy, + chunk_size, translation, dtype=np.float64, ): """ The Fast euclidean strategy should be translation invariant. 
""" - n = 10_000 - d = 50 - rng = np.random.RandomState(1) X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) From 0171827b1e778638496764e7abcb17b2f3c9310d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 21 Jun 2021 09:28:29 +0200 Subject: [PATCH 006/290] Lighten test parametrisation --- sklearn/neighbors/tests/test_neighbors.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 6ad5c9d1005b0..70bbcee5a1e48 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1848,15 +1848,11 @@ def test_fast_sqeuclidean_correctness( @pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) @pytest.mark.parametrize("d", [5, 10, 100, 500]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) -@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"]) -@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 13)]) @pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) def test_fast_sqeuclidean_translation_invariance( n, d, n_neighbors, - strategy, - chunk_size, translation, dtype=np.float64, ): From 36a52efe80d8932dbaef15a21a284d68d6909cfa Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 21 Jun 2021 17:50:04 +0200 Subject: [PATCH 007/290] Factorise NeighborsHeap code under a private Cython submodule --- sklearn/metrics/_argkmin_fast.pyx | 146 +----------------- sklearn/neighbors/_binary_tree.pxi | 195 +------------------------ sklearn/neighbors/_neighbors_heap.pxd | 78 ++++++++++ sklearn/neighbors/_neighbors_heap.pyx | 203 ++++++++++++++++++++++++++ sklearn/neighbors/setup.py | 7 + 5 files changed, 291 insertions(+), 338 deletions(-) create mode 100644 sklearn/neighbors/_neighbors_heap.pxd create mode 100644 sklearn/neighbors/_neighbors_heap.pyx diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 1b9c2be67079e..5cc47f19968ff 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -27,7 +27,8 @@ DEF MIN_CHUNK_SAMPLES = 20 DEF FLOAT_INF = 1e36 -from sklearn.utils._cython_blas cimport ( +from ..neighbors._neighbors_heap cimport NeighborsHeap, _simultaneous_sort, _push +from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, ColMajor, @@ -37,7 +38,6 @@ from sklearn.utils._cython_blas cimport ( _gemm, ) - cpdef int _openmp_effective_n_threads(n_threads=None): # Taken and adapted from sklearn.utils._openmp_helpers if os.getenv("OMP_NUM_THREADS"): @@ -48,148 +48,6 @@ cpdef int _openmp_effective_n_threads(n_threads=None): return min(openmp.omp_get_max_threads(), cpu_count(only_physical_cores=True)) -### Heaps utilities, minified from sklearn internals NeighborsHeap -# https://github.com/scikit-learn/scikit-learn/blob/e4bb9fa86b0df873ad750b6d59090843d9d23d50/sklearn/neighbors/_binary_tree.pxi#L513 -# TODO: factor those utilities - -cdef int _push( - floating* dist, - integral* idx, - integral size, - floating val, - integral i_val, -) nogil: - """push (val, i_val) into the heap (dist, idx) of the given size""" - cdef: - integral current_idx, left_child_idx, right_child_idx, swap_idx - - # check if val should be in heap - if val > dist[0]: - return 0 - - # insert val at position zero - dist[0] = val - idx[0] = i_val - - # descend the heap, swapping values until the max heap criterion is met - current_idx = 0 - while True: - left_child_idx = 2 * 
current_idx + 1 - right_child_idx = left_child_idx + 1 - - if left_child_idx >= size: - break - elif right_child_idx >= size: - if dist[left_child_idx] > val: - swap_idx = left_child_idx - else: - break - elif dist[left_child_idx] >= dist[right_child_idx]: - if val < dist[left_child_idx]: - swap_idx = left_child_idx - else: - break - else: - if val < dist[right_child_idx]: - swap_idx = right_child_idx - else: - break - - dist[current_idx] = dist[swap_idx] - idx[current_idx] = idx[swap_idx] - - current_idx = swap_idx - - dist[current_idx] = val - idx[current_idx] = i_val - - return 0 - - -cdef inline void dual_swap( - floating* dist, - integral* idx, - integral i1, - integral i2 -) nogil: - """swap the values at index i1 and i2 of both dist and idx""" - cdef: - floating dtmp = dist[i1] - integral itmp = idx[i1] - - dist[i1] = dist[i2] - dist[i2] = dtmp - - idx[i1] = idx[i2] - idx[i2] = itmp - - -cdef int _simultaneous_sort( - floating* dist, - integral* idx, - integral size -) nogil: - """ - Perform a recursive quicksort on the dist array, simultaneously - performing the same swaps on the idx array. - - TODO: test if the following algorithms are better: - - introselect via std::nth_element - - heap-sort-like - """ - cdef: - integral pivot_idx, i, store_idx - floating pivot_val - - # in the small-array case, do things efficiently - if size <= 1: - pass - elif size == 2: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - elif size == 3: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - if dist[1] > dist[2]: - dual_swap(dist, idx, 1, 2) - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - else: - # Determine the pivot using the median-of-three rule. - # The smallest of the three is moved to the beginning of the array, - # the middle (the pivot value) is moved to the end, and the largest - # is moved to the pivot index. - pivot_idx = size / 2 - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - if dist[size - 1] > dist[pivot_idx]: - dual_swap(dist, idx, size - 1, pivot_idx) - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - pivot_val = dist[size - 1] - - # partition indices about pivot. At the end of this operation, - # pivot_idx will contain the pivot value, everything to the left - # will be smaller, and everything to the right will be larger. 
- store_idx = 0 - for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) - store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) - pivot_idx = store_idx - - # recursively sort each side of the pivot - if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) - if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, - size - pivot_idx - 1) - return 0 - -### End: Heaps utilities - ### argkmin helpers cdef void _argkmin_on_chunk( diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 3adfa1b31006a..05b47750707bd 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -160,6 +160,7 @@ from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, euclidean_rdist_to_dist) from ._partition_nodes cimport partition_node_indices +from ._neighbors_heap cimport NeighborsHeap, _simultaneous_sort cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -489,200 +490,6 @@ def kernel_norm(h, d, kernel, return_log=False): return np.exp(result) -###################################################################### -# Tree Utility Routines -cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2): - """swap the values at index i1 and i2 of arr""" - cdef DITYPE_t tmp = arr[i1] - arr[i1] = arr[i2] - arr[i2] = tmp - - -cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """swap the values at inex i1 and i2 of both darr and iarr""" - cdef DTYPE_t dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp - - -cdef class NeighborsHeap: - """A max-heap structure to keep track of distances/indices of neighbors - - This implements an efficient pre-allocated set of fixed-size heaps - for chasing neighbors, holding both an index and a distance. - When any row of the heap is full, adding an additional point will push - the furthest point off the heap. - - Parameters - ---------- - n_pts : int - the number of heaps to use - n_nbrs : int - the size of each heap. - """ - cdef np.ndarray distances_arr - cdef np.ndarray indices_arr - - cdef DTYPE_t[:, ::1] distances - cdef ITYPE_t[:, ::1] indices - - def __cinit__(self): - self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C') - self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C') - self.distances = self.distances_arr - self.indices = self.indices_arr - - def __init__(self, n_pts, n_nbrs): - self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE, - order='C') - self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C') - self.distances = self.distances_arr - self.indices = self.indices_arr - - def get_arrays(self, sort=True): - """Get the arrays of distances and indices within the heap. - - If sort=True, then simultaneously sort the indices and distances, - so the closer points are listed first. 
- """ - if sort: - self._sort() - return self.distances_arr, self.indices_arr - - cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: - """Return the largest distance in the given row""" - return self.distances[row, 0] - - def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): - return self._push(row, val, i_val) - - cdef int _push(self, ITYPE_t row, DTYPE_t val, - ITYPE_t i_val) nogil except -1: - """push (val, i_val) into the given row""" - cdef ITYPE_t i, ic1, ic2, i_swap - cdef ITYPE_t size = self.distances.shape[1] - cdef DTYPE_t* dist_arr = &self.distances[row, 0] - cdef ITYPE_t* ind_arr = &self.indices[row, 0] - - # check if val should be in heap - if val > dist_arr[0]: - return 0 - - # insert val at position zero - dist_arr[0] = val - ind_arr[0] = i_val - - # descend the heap, swapping values until the max heap criterion is met - i = 0 - while True: - ic1 = 2 * i + 1 - ic2 = ic1 + 1 - - if ic1 >= size: - break - elif ic2 >= size: - if dist_arr[ic1] > val: - i_swap = ic1 - else: - break - elif dist_arr[ic1] >= dist_arr[ic2]: - if val < dist_arr[ic1]: - i_swap = ic1 - else: - break - else: - if val < dist_arr[ic2]: - i_swap = ic2 - else: - break - - dist_arr[i] = dist_arr[i_swap] - ind_arr[i] = ind_arr[i_swap] - - i = i_swap - - dist_arr[i] = val - ind_arr[i] = i_val - - return 0 - - cdef int _sort(self) except -1: - """simultaneously sort the distances and indices""" - cdef DTYPE_t[:, ::1] distances = self.distances - cdef ITYPE_t[:, ::1] indices = self.indices - cdef ITYPE_t row - for row in range(distances.shape[0]): - _simultaneous_sort(&distances[row, 0], - &indices[row, 0], - distances.shape[1]) - return 0 - - -cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, - ITYPE_t size) nogil except -1: - """ - Perform a recursive quicksort on the dist array, simultaneously - performing the same swaps on the idx array. The equivalent in - numpy (though quite a bit slower) is - - def simultaneous_sort(dist, idx): - i = np.argsort(dist) - return dist[i], idx[i] - """ - cdef ITYPE_t pivot_idx, i, store_idx - cdef DTYPE_t pivot_val - - # in the small-array case, do things efficiently - if size <= 1: - pass - elif size == 2: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - elif size == 3: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - if dist[1] > dist[2]: - dual_swap(dist, idx, 1, 2) - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - else: - # Determine the pivot using the median-of-three rule. - # The smallest of the three is moved to the beginning of the array, - # the middle (the pivot value) is moved to the end, and the largest - # is moved to the pivot index. - pivot_idx = size / 2 - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - if dist[size - 1] > dist[pivot_idx]: - dual_swap(dist, idx, size - 1, pivot_idx) - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - pivot_val = dist[size - 1] - - # partition indices about pivot. At the end of this operation, - # pivot_idx will contain the pivot value, everything to the left - # will be smaller, and everything to the right will be larger. 
- store_idx = 0 - for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) - store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) - pivot_idx = store_idx - - # recursively sort each side of the pivot - if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) - if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, - size - pivot_idx - 1) - return 0 #------------------------------------------------------------ # find_node_split_dim: diff --git a/sklearn/neighbors/_neighbors_heap.pxd b/sklearn/neighbors/_neighbors_heap.pxd new file mode 100644 index 0000000000000..6027bdfdac6a1 --- /dev/null +++ b/sklearn/neighbors/_neighbors_heap.pxd @@ -0,0 +1,78 @@ +#!python +#cython: boundscheck=False +#cython: wraparound=False +#cython: cdivision=True + +cimport cython +cimport numpy as np +from libc.math cimport fabs, sqrt, exp, cos, pow +from cython cimport floating, integral, numeric + +from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t +from ._typedefs import DTYPE, ITYPE + +cdef inline void swap(numeric* arr, integral i1, integral i2): + """swap the values at index i1 and i2 of arr""" + cdef numeric tmp = arr[i1] + arr[i1] = arr[i2] + arr[i2] = tmp + +cdef inline void dual_swap( + floating* dist, + integral* idx, + integral i1, + integral i2 +) nogil: + """swap the values at index i1 and i2 of both dist and idx""" + cdef: + floating dtmp = dist[i1] + integral itmp = idx[i1] + + dist[i1] = dist[i2] + dist[i2] = dtmp + + idx[i1] = idx[i2] + idx[i2] = itmp + +cdef int _simultaneous_sort( + floating* dist, + integral* idx, + integral size +) nogil except -1 + +cdef int _push( + floating* dist, + integral* idx, + integral size, + floating val, + integral i_val, +) nogil except -1 + + +cdef class NeighborsHeap: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef np.ndarray distances_arr + cdef np.ndarray indices_arr + + cdef DTYPE_t[:, ::1] distances + cdef ITYPE_t[:, ::1] indices + + cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1 + + cdef int _push(self, ITYPE_t row, DTYPE_t val, + ITYPE_t i_val) nogil except -1 + + cdef int _sort(self) except -1 diff --git a/sklearn/neighbors/_neighbors_heap.pyx b/sklearn/neighbors/_neighbors_heap.pyx new file mode 100644 index 0000000000000..05b7831f496c8 --- /dev/null +++ b/sklearn/neighbors/_neighbors_heap.pyx @@ -0,0 +1,203 @@ +#!python +#cython: boundscheck=False +#cython: wraparound=False +#cython: cdivision=True + +import numpy as np +cimport numpy as np +np.import_array() # required in order to use C-API + +cimport cython +cimport numpy as np +from libc.math cimport fabs, sqrt, exp, cos, pow +from cython cimport floating, integral, numeric + +from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t +from ._typedefs import DTYPE, ITYPE + + +cdef int _simultaneous_sort( + floating* dist, + integral* idx, + integral size +) nogil except -1: + """ + Perform a recursive quicksort on the dist array, simultaneously + performing the same swaps on the idx array. 
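+    The equivalent in numpy (though quite a bit slower) is
+
+        def simultaneous_sort(dist, idx):
+            i = np.argsort(dist)
+            return dist[i], idx[i]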
+ + TODO: test if the following algorithms are better: + - introselect via std::nth_element + - heap-sort-like + """ + cdef: + integral pivot_idx, i, store_idx + floating pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + elif size == 3: + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + if dist[1] > dist[2]: + dual_swap(dist, idx, 1, 2) + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size // 2 + if dist[0] > dist[size - 1]: + dual_swap(dist, idx, 0, size - 1) + if dist[size - 1] > dist[pivot_idx]: + dual_swap(dist, idx, size - 1, pivot_idx) + if dist[0] > dist[size - 1]: + dual_swap(dist, idx, 0, size - 1) + pivot_val = dist[size - 1] + + # partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if dist[i] < pivot_val: + dual_swap(dist, idx, i, store_idx) + store_idx += 1 + dual_swap(dist, idx, store_idx, size - 1) + pivot_idx = store_idx + + # recursively sort each side of the pivot + if pivot_idx > 1: + _simultaneous_sort(dist, idx, pivot_idx) + if pivot_idx + 2 < size: + _simultaneous_sort(dist + pivot_idx + 1, + idx + pivot_idx + 1, + size - pivot_idx - 1) + return 0 + + +cdef int _push( + floating* dist, + integral* idx, + integral size, + floating val, + integral i_val, +) nogil except -1: + """push (val, i_val) into the heap (dist, idx) of the given size""" + cdef: + integral current_idx, left_child_idx, right_child_idx, swap_idx + + # check if val should be in heap + if val > dist[0]: + return 0 + + # insert val at position zero + dist[0] = val + idx[0] = i_val + + # descend the heap, swapping values until the max heap criterion is met + current_idx = 0 + while True: + left_child_idx = 2 * current_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= size: + break + elif right_child_idx >= size: + if dist[left_child_idx] > val: + swap_idx = left_child_idx + else: + break + elif dist[left_child_idx] >= dist[right_child_idx]: + if val < dist[left_child_idx]: + swap_idx = left_child_idx + else: + break + else: + if val < dist[right_child_idx]: + swap_idx = right_child_idx + else: + break + + dist[current_idx] = dist[swap_idx] + idx[current_idx] = idx[swap_idx] + + current_idx = swap_idx + + dist[current_idx] = val + idx[current_idx] = i_val + + return 0 + + +cdef class NeighborsHeap: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. 
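+
+    Examples
+    --------
+    A minimal usage sketch: push three (distance, index) pairs into a
+    single heap of size two and read them back sorted.
+
+    >>> heap = NeighborsHeap(1, 2)
+    >>> _ = heap.push(0, 3.0, 10)
+    >>> _ = heap.push(0, 1.0, 20)
+    >>> _ = heap.push(0, 2.0, 30)
+    >>> heap.get_arrays(sort=True)
+    (array([[1., 2.]]), array([[20, 30]]))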
+ """ + + def __cinit__(self): + self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C') + self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C') + self.distances = self.distances_arr + self.indices = self.indices_arr + + def __init__(self, n_pts, n_nbrs): + self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE, + order='C') + self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C') + self.distances = self.distances_arr + self.indices = self.indices_arr + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. + """ + if sort: + self._sort() + return self.distances_arr, self.indices_arr + + cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, ITYPE_t row, DTYPE_t val, + ITYPE_t i_val) nogil except -1: + """push (val, i_val) into the given row""" + cdef ITYPE_t size = self.distances.shape[1] + cdef DTYPE_t* dist_arr = &self.distances[row, 0] + cdef ITYPE_t* ind_arr = &self.indices[row, 0] + + return _push(dist_arr, ind_arr, size, val, i_val) + + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef DTYPE_t[:, ::1] distances = self.distances + cdef ITYPE_t[:, ::1] indices = self.indices + cdef ITYPE_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + return 0 diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 85305efc29c78..bf330533102e1 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -52,6 +52,13 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_neighbors_heap", + sources=["_neighbors_heap.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_subpackage("tests") return config From 9a6595834a6eed9824f0cf9aaf22b0ce13ee477e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 21 Jun 2021 18:01:47 +0200 Subject: [PATCH 008/290] Use relative imports Also remove unused import and apply style with black --- sklearn/metrics/_argkmin_fast.pyx | 27 +++++++++++---------------- sklearn/metrics/pairwise.py | 1 + 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 5cc47f19968ff..a206871096412 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -6,14 +6,9 @@ # cython: linetrace=False # cython: binding=False # distutils: define_macros=CYTHON_TRACE_NOGIL=0 -import os import numpy as np - cimport numpy as np -cimport openmp - -from joblib import cpu_count from libc.math cimport floor, sqrt from libc.stdlib cimport free, malloc @@ -27,7 +22,7 @@ DEF MIN_CHUNK_SAMPLES = 20 DEF FLOAT_INF = 1e36 -from ..neighbors._neighbors_heap cimport NeighborsHeap, _simultaneous_sort, _push +from ..neighbors._neighbors_heap cimport _simultaneous_sort, _push from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, @@ -251,7 +246,7 @@ cdef int _argkmin_on_Y( # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own locals - # heaps which are then synchronised back in the main ones. 
+ # heaps which are then synchronised back in the main ones. integral *heaps_indices_chunks for X_chunk_idx in range(X_n_chunks): @@ -324,7 +319,7 @@ cdef int _argkmin_on_Y( k, ) # end: prange - + # end: for X_chunk_idx return Y_n_chunks @@ -388,9 +383,9 @@ def _argkmin( str strategy = "auto", bint return_distance = False, ): - """Computes the argkmin of vectors (rows) of X on Y for + """Computes the argkmin of vectors (rows) of X on Y for the euclidean distance. - + The implementation is parallelised on chunks whose size can be set using ``chunk_size``. @@ -399,7 +394,7 @@ def _argkmin( X: ndarray of shape (n, d) Rows represent vectors - Y: ndarray of shape (m, d) + Y: ndarray of shape (m, d) Rows represent vectors chunk_size: int @@ -409,7 +404,7 @@ def _argkmin( The chunking strategy defining which dataset parallelisation are made on. - - 'chunk_on_X' is embarassingly parallel but + - 'chunk_on_X' is embarassingly parallel but is less used in practice. - 'chunk_on_Y' comes with synchronisation but is more useful in practice. @@ -464,14 +459,14 @@ def _argkmin( raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: - # We need to recompute distances because we relied on + # We need to recompute distances because we relied on # reduced distances using _gemm, which are missing a - # term for squarred norms and which are not the most + # term for squarred norms and which are not the most # precise (catastrophic cancellation might have happened). - _exact_euclidean_dist(X, Y, argkmin_indices, + _exact_euclidean_dist(X, Y, argkmin_indices, effective_n_threads, argkmin_distances) - return (np.asarray(argkmin_distances), + return (np.asarray(argkmin_distances), np.asarray(argkmin_indices)) return np.asarray(argkmin_indices) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 14a0d5e34734a..45920f764b776 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,6 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version +from ._argkmin_fast import _argkmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning From e1bb0a11ce31de07f6ad887967773ab5b8b33863 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 22 Jun 2021 10:01:32 +0200 Subject: [PATCH 009/290] Use utils._openmp_helpers._openmp_effective_n_threads directly TODO: Think about the best way of supporting ``only_physical_cores``. --- sklearn/metrics/_argkmin_fast.pyx | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index a206871096412..1a8ae9d9217df 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -32,16 +32,8 @@ from ..utils._cython_blas cimport ( Trans, _gemm, ) +from ..utils._openmp_helpers import _openmp_effective_n_threads -cpdef int _openmp_effective_n_threads(n_threads=None): - # Taken and adapted from sklearn.utils._openmp_helpers - if os.getenv("OMP_NUM_THREADS"): - # Fall back to user provided number of threads making it possible - # to exceed the number of cpus. 
- return openmp.omp_get_max_threads() - else: - return min(openmp.omp_get_max_threads(), - cpu_count(only_physical_cores=True)) ### argkmin helpers From 03e516f1fdb5f81f4d33c1ca0881f407696437c5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 16 Jun 2021 09:43:48 +0200 Subject: [PATCH 010/290] Plug 'fast_sqeuclidean' strategy implementation and test for pairwise_distances_argmin_min --- sklearn/metrics/pairwise.py | 26 ++++++++++++++++---------- sklearn/metrics/tests/test_pairwise.py | 22 ++++++++++++++++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 45920f764b776..a20c49a20346c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -646,19 +646,25 @@ def pairwise_distances_argmin_min( """ X, Y = check_pairwise_arrays(X, Y) - if metric_kwargs is None: - metric_kwargs = {} + if metric == "fast_sqeuclidean": + # TODO: generalise this simple plug here + values, indices = _argkmin(X, Y, k=1, strategy="auto", return_distance=True) + values = np.ndarray.flatten(values) + indices = np.ndarray.flatten(indices) + else: + if metric_kwargs is None: + metric_kwargs = {} - if axis == 0: - X, Y = Y, X + if axis == 0: + X, Y = Y, X - indices, values = zip( - *pairwise_distances_chunked( - X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + indices, values = zip( + *pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + ) ) - ) - indices = np.concatenate(indices) - values = np.concatenate(values) + indices = np.concatenate(indices) + values = np.concatenate(values) return indices, values diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index fdc47ee886b58..032e55d155a71 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1477,3 +1477,25 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): # and fails due to rounding errors rtol = 1e-5 if dtype is np.float32 else 1e-7 assert_allclose(dist, expected_dist, rtol=rtol) + + +@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("d", [5, 10, 100]) +@pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.parametrize("sign", [1, -1]) +def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign): + + rng = np.random.RandomState(1) + + # Translating to test numerical stability + X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) + Y = sign * Y_translation + rng.rand(int(n * d)).reshape((-1, d)) + + argmins, distances = pairwise_distances_argmin_min(X, Y, + metric="euclidean") + fsq_argmins, fsq_distances = pairwise_distances_argmin_min(X, Y, + metric="fast_sqeuclidean") + + np.testing.assert_array_equal(argmins, fsq_argmins) + np.testing.assert_almost_equal(distances, fsq_distances) From cb85791ee63763d0058a5c9c0cf4b27f403e5672 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Jun 2021 14:29:48 +0200 Subject: [PATCH 011/290] Adapt cython submodule for heaps Move neighbors.NeighborsHeap's code and _typedefs under sklearn.utils as cyclic imports are currently happening between sklearn.neighbors and sklearn.metrics. Also, using integral in some cases gave unexpected results. 
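
A plausible minimal illustration of the pitfall (a hypothetical sketch,
not part of this patch): ``cython.integral`` only groups C ``short``,
``int`` and ``long``, so it cannot bind to ``np.intp``-typed index
buffers on platforms where ``intp`` is ``long long``, and each extra
fused parameter also multiplies the number of generated specialisations.

    # sketch.pyx -- two fused types yield one specialisation per
    # (floating, integral) combination, and an index buffer declared
    # as np.intp may match none of them on LLP64 platforms.
    from cython cimport floating, integral

    cdef void fill(floating* dist, integral* idx, integral size) nogil:
        cdef integral i
        for i in range(size):
            dist[i] = 0.0
            idx[i] = i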
Occurences were changed to use np.int_p, as exposed by utils._typedefs.ITYPE_t (we don't need signed integer) --- sklearn/metrics/_argkmin_fast.pyx | 135 +++++++++--------- sklearn/neighbors/_binary_tree.pxi | 122 +++++++++++++++- sklearn/neighbors/_dist_metrics.pxd | 6 +- sklearn/neighbors/_dist_metrics.pyx | 4 +- sklearn/neighbors/_neighbors_heap.pxd | 78 ---------- sklearn/neighbors/_partition_nodes.pxd | 2 +- sklearn/neighbors/setup.py | 13 -- .../neighbors/tests/test_neighbors_tree.py | 23 --- sklearn/utils/_heap.pxd | 37 +++++ .../_neighbors_heap.pyx => utils/_heap.pyx} | 91 ++---------- sklearn/{neighbors => utils}/_typedefs.pxd | 0 sklearn/{neighbors => utils}/_typedefs.pyx | 0 sklearn/utils/setup.py | 14 ++ 13 files changed, 250 insertions(+), 275 deletions(-) delete mode 100644 sklearn/neighbors/_neighbors_heap.pxd create mode 100644 sklearn/utils/_heap.pxd rename sklearn/{neighbors/_neighbors_heap.pyx => utils/_heap.pyx} (54%) rename sklearn/{neighbors => utils}/_typedefs.pxd (100%) rename sklearn/{neighbors => utils}/_typedefs.pyx (100%) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 1a8ae9d9217df..b133805eaafa9 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -13,7 +13,7 @@ cimport numpy as np from libc.math cimport floor, sqrt from libc.stdlib cimport free, malloc -from cython cimport floating, integral +from cython cimport floating from cython.parallel cimport parallel, prange DEF CHUNK_SIZE = 256 # number of vectors @@ -22,7 +22,6 @@ DEF MIN_CHUNK_SAMPLES = 20 DEF FLOAT_INF = 1e36 -from ..neighbors._neighbors_heap cimport _simultaneous_sort, _push from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, @@ -32,7 +31,11 @@ from ..utils._cython_blas cimport ( Trans, _gemm, ) + +from ..utils._heap cimport _simultaneous_sort, _push from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._typedefs cimport ITYPE_t +from ..utils._typedefs import ITYPE ### argkmin helpers @@ -43,10 +46,10 @@ cdef void _argkmin_on_chunk( floating[::1] Y_sq_norms, # IN floating *dist_middle_terms, # IN floating *heaps_red_distances, # IN/OUT - integral *heaps_indices, # IN/OUT - integral k, # IN + ITYPE_t *heaps_indices, # IN/OUT + ITYPE_t k, # IN # ID of the first element of Y_c - integral Y_idx_offset, + ITYPE_t Y_idx_offset, ) nogil: """ Critical part of the computation of pairwise distances. @@ -55,7 +58,7 @@ cdef void _argkmin_on_chunk( on the gemm-trick. """ cdef: - integral i, j + ITYPE_t i, j # Instead of computing the full pairwise squared distances matrix, # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² @@ -91,44 +94,44 @@ cdef int _argkmin_on_X( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN floating[::1] Y_sq_norms, # IN - integral chunk_size, # IN - integral effective_n_threads, # IN - integral[:, ::1] argkmin_indices, # OUT + ITYPE_t chunk_size, # IN + ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] argkmin_indices, # OUT floating[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y by parallelising computation on chunks of X. 
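
    A sketch of the chunk bookkeeping used below (plain Python, values
    illustrative): a trailing remainder chunk is counted whenever the
    number of samples is not a multiple of the chunk size.

        n_samples, chunk = 1000, 256
        n_full_chunks = n_samples // chunk                            # 3
        n_samples_rem = n_samples % chunk                             # 232
        n_chunks = n_full_chunks + (n_samples != n_full_chunks * chunk)  # 4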
""" cdef: - integral k = argkmin_indices.shape[1] - integral d = X.shape[1] - integral sf = sizeof(floating) - integral si = sizeof(integral) - integral n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - - integral n_train = Y.shape[0] - integral Y_n_samples_chunk = min(n_train, n_samples_chunk) - integral Y_n_full_chunks = n_train / Y_n_samples_chunk - integral Y_n_samples_rem = n_train % Y_n_samples_chunk - - integral n_test = X.shape[0] - integral X_n_samples_chunk = min(n_test, n_samples_chunk) - integral X_n_full_chunks = n_test // X_n_samples_chunk - integral X_n_samples_rem = n_test % X_n_samples_chunk + ITYPE_t k = argkmin_indices.shape[1] + ITYPE_t d = X.shape[1] + ITYPE_t sf = sizeof(floating) + ITYPE_t si = sizeof(ITYPE_t) + ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + ITYPE_t n_train = Y.shape[0] + ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) + ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk + ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk + + ITYPE_t n_test = X.shape[0] + ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) + ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk + ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk # Counting remainder chunk in total number of chunks - integral Y_n_chunks = Y_n_full_chunks + ( + ITYPE_t Y_n_chunks = Y_n_full_chunks + ( n_train != (Y_n_full_chunks * Y_n_samples_chunk) ) - integral X_n_chunks = X_n_full_chunks + ( + ITYPE_t X_n_chunks = X_n_full_chunks + ( n_test != (X_n_full_chunks * X_n_samples_chunk) ) - integral num_threads = min(Y_n_chunks, effective_n_threads) + ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) - integral Y_start, Y_end, X_start, X_end - integral X_chunk_idx, Y_chunk_idx, idx, jdx + ITYPE_t Y_start, Y_end, X_start, X_end + ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx floating *dist_middle_terms_chunks floating *heaps_red_distances_chunks @@ -190,9 +193,9 @@ cdef int _argkmin_on_Y( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN floating[::1] Y_sq_norms, # IN - integral chunk_size, # IN - integral effective_n_threads, # IN - integral[:, ::1] argkmin_indices, # OUT + ITYPE_t chunk_size, # IN + ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] argkmin_indices, # OUT floating[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y @@ -203,35 +206,35 @@ cdef int _argkmin_on_Y( most contexts. 
""" cdef: - integral k = argkmin_indices.shape[1] - integral d = X.shape[1] - integral sf = sizeof(floating) - integral si = sizeof(integral) - integral n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - - integral n_train = Y.shape[0] - integral Y_n_samples_chunk = min(n_train, n_samples_chunk) - integral Y_n_full_chunks = n_train / Y_n_samples_chunk - integral Y_n_samples_rem = n_train % Y_n_samples_chunk - - integral n_test = X.shape[0] - integral X_n_samples_chunk = min(n_test, n_samples_chunk) - integral X_n_full_chunks = n_test // X_n_samples_chunk - integral X_n_samples_rem = n_test % X_n_samples_chunk + ITYPE_t k = argkmin_indices.shape[1] + ITYPE_t d = X.shape[1] + ITYPE_t sf = sizeof(floating) + ITYPE_t si = sizeof(ITYPE_t) + ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + ITYPE_t n_train = Y.shape[0] + ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) + ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk + ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk + + ITYPE_t n_test = X.shape[0] + ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) + ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk + ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk # Counting remainder chunk in total number of chunks - integral Y_n_chunks = Y_n_full_chunks + ( + ITYPE_t Y_n_chunks = Y_n_full_chunks + ( n_train != (Y_n_full_chunks * Y_n_samples_chunk) ) - integral X_n_chunks = X_n_full_chunks + ( + ITYPE_t X_n_chunks = X_n_full_chunks + ( n_test != (X_n_full_chunks * X_n_samples_chunk) ) - integral num_threads = min(Y_n_chunks, effective_n_threads) + ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) - integral Y_start, Y_end, X_start, X_end - integral X_chunk_idx, Y_chunk_idx, idx, jdx + ITYPE_t Y_start, Y_end, X_start, X_end + ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx floating *dist_middle_terms_chunks floating *heaps_red_distances_chunks @@ -239,7 +242,7 @@ cdef int _argkmin_on_Y( # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own locals # heaps which are then synchronised back in the main ones. - integral *heaps_indices_chunks + ITYPE_t *heaps_indices_chunks for X_chunk_idx in range(X_n_chunks): X_start = X_chunk_idx * X_n_samples_chunk @@ -256,7 +259,7 @@ cdef int _argkmin_on_Y( Y_n_samples_chunk * X_n_samples_chunk * sf) heaps_red_distances_chunks = malloc( X_n_samples_chunk * k * sf) - heaps_indices_chunks = malloc( + heaps_indices_chunks = malloc( X_n_samples_chunk * k * sf) # Initialising heaps (memset can't be used here) @@ -318,13 +321,13 @@ cdef int _argkmin_on_Y( cdef inline floating _euclidean_dist( floating[:, ::1] X, floating[:, ::1] Y, - integral i, - integral j, + ITYPE_t i, + ITYPE_t j, ) nogil: cdef: floating dist = 0 - integral k - integral upper_unrolled_idx = (X.shape[1] // 4) * 4 + ITYPE_t k + ITYPE_t upper_unrolled_idx = (X.shape[1] // 4) * 4 # Unrolling loop to help with vectorisation for k in range(0, upper_unrolled_idx, 4): @@ -341,8 +344,8 @@ cdef inline floating _euclidean_dist( cdef int _exact_euclidean_dist( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN - integral[:, ::1] Y_indices, # IN - integral effective_n_threads, # IN + ITYPE_t[:, ::1] Y_indices, # IN + ITYPE_t effective_n_threads, # IN floating[:, ::1] distances, # OUT ) nogil: """ @@ -356,7 +359,7 @@ cdef int _exact_euclidean_dist( but we use a function to have a cdef nogil context. 
""" cdef: - integral i, k + ITYPE_t i, k for i in prange(X.shape[0], schedule='static', nogil=True, num_threads=effective_n_threads): @@ -370,8 +373,8 @@ cdef int _exact_euclidean_dist( def _argkmin( floating[:, ::1] X, floating[:, ::1] Y, - integral k, - integral chunk_size = CHUNK_SIZE, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, str strategy = "auto", bint return_distance = False, ): @@ -419,13 +422,13 @@ def _argkmin( int_dtype = np.intp float_dtype = np.float32 if floating is float else np.float64 cdef: - integral[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0, - dtype=int_dtype) + ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0, + dtype=ITYPE) floating[:, ::1] argkmin_distances = np.full((X.shape[0], k), FLOAT_INF, dtype=float_dtype) floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y) - integral effective_n_threads = _openmp_effective_n_threads() + ITYPE_t effective_n_threads = _openmp_effective_n_threads() if strategy == 'auto': # This is a simple heuristic whose constant for the diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 05b47750707bd..b48ea40079717 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -151,16 +151,16 @@ from libc.string cimport memcpy import numpy as np import warnings -from ..utils import check_array - -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, euclidean_rdist_to_dist) from ._partition_nodes cimport partition_node_indices -from ._neighbors_heap cimport NeighborsHeap, _simultaneous_sort + +from ..utils import check_array +from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs import DTYPE, ITYPE +from ..utils._heap cimport _simultaneous_sort, _push cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -490,6 +490,118 @@ def kernel_norm(h, d, kernel, return_log=False): return np.exp(result) +cdef class NeighborsHeap: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef np.ndarray distances_arr + cdef np.ndarray indices_arr + + cdef DTYPE_t[:, ::1] distances + cdef ITYPE_t[:, ::1] indices + + def __cinit__(self): + self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C') + self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C') + self.distances = self.distances_arr + self.indices = self.indices_arr + + def __init__(self, n_pts, n_nbrs): + self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE, + order='C') + self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C') + self.distances = self.distances_arr + self.indices = self.indices_arr + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. 
+ """ + if sort: + self._sort() + return self.distances_arr, self.indices_arr + + cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, ITYPE_t row, DTYPE_t val, + ITYPE_t i_val) nogil except -1: + """push (val, i_val) into the given row""" + cdef ITYPE_t i, ic1, ic2, i_swap + cdef ITYPE_t size = self.distances.shape[1] + cdef DTYPE_t* dist_arr = &self.distances[row, 0] + cdef ITYPE_t* ind_arr = &self.indices[row, 0] + + # check if val should be in heap + if val > dist_arr[0]: + return 0 + + # insert val at position zero + dist_arr[0] = val + ind_arr[0] = i_val + + # descend the heap, swapping values until the max heap criterion is met + i = 0 + while True: + ic1 = 2 * i + 1 + ic2 = ic1 + 1 + + if ic1 >= size: + break + elif ic2 >= size: + if dist_arr[ic1] > val: + i_swap = ic1 + else: + break + elif dist_arr[ic1] >= dist_arr[ic2]: + if val < dist_arr[ic1]: + i_swap = ic1 + else: + break + else: + if val < dist_arr[ic2]: + i_swap = ic2 + else: + break + + dist_arr[i] = dist_arr[i_swap] + ind_arr[i] = ind_arr[i_swap] + + i = i_swap + + dist_arr[i] = val + ind_arr[i] = i_val + + return 0 + + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef DTYPE_t[:, ::1] distances = self.distances + cdef ITYPE_t[:, ::1] indices = self.indices + cdef ITYPE_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + return 0 #------------------------------------------------------------ # find_node_split_dim: diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/neighbors/_dist_metrics.pxd index 30124c309bc49..fe0d7322268dd 100644 --- a/sklearn/neighbors/_dist_metrics.pxd +++ b/sklearn/neighbors/_dist_metrics.pxd @@ -3,12 +3,10 @@ #cython: wraparound=False #cython: cdivision=True -cimport cython cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow +from libc.math cimport sqrt, exp -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE +from ..utils._typedefs cimport DTYPE_t, ITYPE_t ###################################################################### # Inline distance functions diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index cf0c703a5d491..c9941cab0fc60 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -43,8 +43,8 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE -from ._typedefs import DTYPE, ITYPE +from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE +from ..utils._typedefs import DTYPE, ITYPE ###################################################################### diff --git a/sklearn/neighbors/_neighbors_heap.pxd b/sklearn/neighbors/_neighbors_heap.pxd deleted file mode 100644 index 6027bdfdac6a1..0000000000000 --- a/sklearn/neighbors/_neighbors_heap.pxd +++ /dev/null @@ -1,78 +0,0 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True - -cimport cython -cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow -from cython cimport floating, integral, numeric - -from ._typedefs cimport DTYPE_t, 
ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE - -cdef inline void swap(numeric* arr, integral i1, integral i2): - """swap the values at index i1 and i2 of arr""" - cdef numeric tmp = arr[i1] - arr[i1] = arr[i2] - arr[i2] = tmp - -cdef inline void dual_swap( - floating* dist, - integral* idx, - integral i1, - integral i2 -) nogil: - """swap the values at index i1 and i2 of both dist and idx""" - cdef: - floating dtmp = dist[i1] - integral itmp = idx[i1] - - dist[i1] = dist[i2] - dist[i2] = dtmp - - idx[i1] = idx[i2] - idx[i2] = itmp - -cdef int _simultaneous_sort( - floating* dist, - integral* idx, - integral size -) nogil except -1 - -cdef int _push( - floating* dist, - integral* idx, - integral size, - floating val, - integral i_val, -) nogil except -1 - - -cdef class NeighborsHeap: - """A max-heap structure to keep track of distances/indices of neighbors - - This implements an efficient pre-allocated set of fixed-size heaps - for chasing neighbors, holding both an index and a distance. - When any row of the heap is full, adding an additional point will push - the furthest point off the heap. - - Parameters - ---------- - n_pts : int - the number of heaps to use - n_nbrs : int - the size of each heap. - """ - cdef np.ndarray distances_arr - cdef np.ndarray indices_arr - - cdef DTYPE_t[:, ::1] distances - cdef ITYPE_t[:, ::1] indices - - cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1 - - cdef int _push(self, ITYPE_t row, DTYPE_t val, - ITYPE_t i_val) nogil except -1 - - cdef int _sort(self) except -1 diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 522e826632824..1659801db469d 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,4 +1,4 @@ -from ._typedefs cimport DTYPE_t, ITYPE_t +from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t cdef int partition_node_indices( DTYPE_t *data, diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index bf330533102e1..34921de75041a 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -39,12 +39,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_typedefs", - sources=["_typedefs.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) config.add_extension( "_quad_tree", sources=["_quad_tree.pyx"], @@ -52,13 +46,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_neighbors_heap", - sources=["_neighbors_heap.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - config.add_subpackage("tests") return config diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index de34b4d230171..bcd1a54729059 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -12,13 +12,11 @@ kernel_norm, DTYPE, ITYPE, - NeighborsHeap as NeighborsHeapBT, simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( KDTree, - NeighborsHeap as NeighborsHeapKDT, simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, ) @@ -157,27 +155,6 @@ def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3): assert_array_almost_equal(counts, counts_true) -@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT]) -def test_neighbors_heap(NeighborsHeap, 
n_pts=5, n_nbrs=10): - heap = NeighborsHeap(n_pts, n_nbrs) - rng = check_random_state(0) - - for row in range(n_pts): - d_in = rng.random_sample(2 * n_nbrs).astype(DTYPE, copy=False) - i_in = np.arange(2 * n_nbrs, dtype=ITYPE) - for d, i in zip(d_in, i_in): - heap.push(row, d, i) - - ind = np.argsort(d_in) - d_in = d_in[ind] - i_in = i_in[ind] - - d_heap, i_heap = heap.get_arrays(sort=True) - - assert_array_almost_equal(d_in[:n_nbrs], d_heap[row]) - assert_array_almost_equal(i_in[:n_nbrs], i_heap[row]) - - @pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt]) def test_node_heap(nodeheap_sort, n_nodes=50): rng = check_random_state(0) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd new file mode 100644 index 0000000000000..44b282662b559 --- /dev/null +++ b/sklearn/utils/_heap.pxd @@ -0,0 +1,37 @@ +#!python +#cython: boundscheck=False +#cython: wraparound=False +#cython: cdivision=True + +cimport cython +cimport numpy as np +from libc.math cimport fabs, sqrt, exp, cos, pow +from cython cimport floating, integral + +from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t +from ._typedefs import DTYPE, ITYPE + +cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, + ITYPE_t i1, ITYPE_t i2) nogil: + """swap the values at inex i1 and i2 of both darr and iarr""" + cdef floating dtmp = darr[i1] + darr[i1] = darr[i2] + darr[i2] = dtmp + + cdef ITYPE_t itmp = iarr[i1] + iarr[i1] = iarr[i2] + iarr[i2] = itmp + +cdef int _simultaneous_sort( + floating* dist, + ITYPE_t* idx, + ITYPE_t size +) nogil except -1 + +cdef int _push( + floating* dist, + ITYPE_t* idx, + ITYPE_t size, + floating val, + ITYPE_t i_val, +) nogil except -1 diff --git a/sklearn/neighbors/_neighbors_heap.pyx b/sklearn/utils/_heap.pyx similarity index 54% rename from sklearn/neighbors/_neighbors_heap.pyx rename to sklearn/utils/_heap.pyx index 05b7831f496c8..72c4e7fcf6524 100644 --- a/sklearn/neighbors/_neighbors_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -3,23 +3,16 @@ #cython: wraparound=False #cython: cdivision=True -import numpy as np -cimport numpy as np -np.import_array() # required in order to use C-API -cimport cython -cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow from cython cimport floating, integral, numeric -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE +from ._typedefs cimport ITYPE_t cdef int _simultaneous_sort( floating* dist, - integral* idx, - integral size + ITYPE_t* idx, + ITYPE_t size ) nogil except -1: """ Perform a recursive quicksort on the dist array, simultaneously @@ -30,7 +23,7 @@ cdef int _simultaneous_sort( - heap-sort-like """ cdef: - integral pivot_idx, i, store_idx + ITYPE_t pivot_idx, i, store_idx floating pivot_val # in the small-array case, do things efficiently @@ -83,14 +76,14 @@ cdef int _simultaneous_sort( cdef int _push( floating* dist, - integral* idx, - integral size, + ITYPE_t* idx, + ITYPE_t size, floating val, - integral i_val, + ITYPE_t i_val, ) nogil except -1: """push (val, i_val) into the heap (dist, idx) of the given size""" cdef: - integral current_idx, left_child_idx, right_child_idx, swap_idx + ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx # check if val should be in heap if val > dist[0]: @@ -133,71 +126,3 @@ cdef int _push( idx[current_idx] = i_val return 0 - - -cdef class NeighborsHeap: - """A max-heap structure to keep track of distances/indices of neighbors - - This implements an efficient pre-allocated set of fixed-size heaps - for chasing neighbors, 
holding both an index and a distance. - When any row of the heap is full, adding an additional point will push - the furthest point off the heap. - - Parameters - ---------- - n_pts : int - the number of heaps to use - n_nbrs : int - the size of each heap. - """ - - def __cinit__(self): - self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C') - self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C') - self.distances = self.distances_arr - self.indices = self.indices_arr - - def __init__(self, n_pts, n_nbrs): - self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE, - order='C') - self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C') - self.distances = self.distances_arr - self.indices = self.indices_arr - - def get_arrays(self, sort=True): - """Get the arrays of distances and indices within the heap. - - If sort=True, then simultaneously sort the indices and distances, - so the closer points are listed first. - """ - if sort: - self._sort() - return self.distances_arr, self.indices_arr - - cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: - """Return the largest distance in the given row""" - return self.distances[row, 0] - - def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): - return self._push(row, val, i_val) - - cdef int _push(self, ITYPE_t row, DTYPE_t val, - ITYPE_t i_val) nogil except -1: - """push (val, i_val) into the given row""" - cdef ITYPE_t size = self.distances.shape[1] - cdef DTYPE_t* dist_arr = &self.distances[row, 0] - cdef ITYPE_t* ind_arr = &self.indices[row, 0] - - return _push(dist_arr, ind_arr, size, val, i_val) - - - cdef int _sort(self) except -1: - """simultaneously sort the distances and indices""" - cdef DTYPE_t[:, ::1] distances = self.distances - cdef ITYPE_t[:, ::1] indices = self.indices - cdef ITYPE_t row - for row in range(distances.shape[0]): - _simultaneous_sort(&distances[row, 0], - &indices[row, 0], - distances.shape[1]) - return 0 diff --git a/sklearn/neighbors/_typedefs.pxd b/sklearn/utils/_typedefs.pxd similarity index 100% rename from sklearn/neighbors/_typedefs.pxd rename to sklearn/utils/_typedefs.pxd diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/utils/_typedefs.pyx similarity index 100% rename from sklearn/neighbors/_typedefs.pyx rename to sklearn/utils/_typedefs.pyx diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index fb995fb74752e..6f5d907cf046a 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -85,6 +85,20 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_typedefs", + sources=["_typedefs.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + + config.add_extension( + "_heap", + sources=["_heap.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_subpackage("tests") return config From 5abda9461dc366afe67efc08f5d93ddfbe5b54ae Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Jun 2021 18:24:26 +0200 Subject: [PATCH 012/290] Reintroduce deleted test_neighbors_heap --- sklearn/neighbors/_binary_tree.pxi | 1 - .../neighbors/tests/test_neighbors_tree.py | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index b48ea40079717..37aa13b0a4f30 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -591,7 +591,6 @@ cdef class NeighborsHeap: return 0 - cdef int _sort(self) except -1: 
"""simultaneously sort the distances and indices""" cdef DTYPE_t[:, ::1] distances = self.distances diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index bcd1a54729059..de34b4d230171 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -12,11 +12,13 @@ kernel_norm, DTYPE, ITYPE, + NeighborsHeap as NeighborsHeapBT, simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( KDTree, + NeighborsHeap as NeighborsHeapKDT, simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, ) @@ -155,6 +157,27 @@ def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3): assert_array_almost_equal(counts, counts_true) +@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT]) +def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10): + heap = NeighborsHeap(n_pts, n_nbrs) + rng = check_random_state(0) + + for row in range(n_pts): + d_in = rng.random_sample(2 * n_nbrs).astype(DTYPE, copy=False) + i_in = np.arange(2 * n_nbrs, dtype=ITYPE) + for d, i in zip(d_in, i_in): + heap.push(row, d, i) + + ind = np.argsort(d_in) + d_in = d_in[ind] + i_in = i_in[ind] + + d_heap, i_heap = heap.get_arrays(sort=True) + + assert_array_almost_equal(d_in[:n_nbrs], d_heap[row]) + assert_array_almost_equal(i_in[:n_nbrs], i_heap[row]) + + @pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt]) def test_node_heap(nodeheap_sort, n_nodes=50): rng = check_random_state(0) From bc8925ef71b7351425399a94f3912da882a04f30 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Jun 2021 18:27:01 +0200 Subject: [PATCH 013/290] Lint --- sklearn/metrics/_argkmin_fast.pyx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index b133805eaafa9..46549816d3b1b 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -46,8 +46,8 @@ cdef void _argkmin_on_chunk( floating[::1] Y_sq_norms, # IN floating *dist_middle_terms, # IN floating *heaps_red_distances, # IN/OUT - ITYPE_t *heaps_indices, # IN/OUT - ITYPE_t k, # IN + ITYPE_t *heaps_indices, # IN/OUT + ITYPE_t k, # IN # ID of the first element of Y_c ITYPE_t Y_idx_offset, ) nogil: @@ -94,9 +94,9 @@ cdef int _argkmin_on_X( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN floating[::1] Y_sq_norms, # IN - ITYPE_t chunk_size, # IN - ITYPE_t effective_n_threads, # IN - ITYPE_t[:, ::1] argkmin_indices, # OUT + ITYPE_t chunk_size, # IN + ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] argkmin_indices, # OUT floating[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y @@ -193,9 +193,9 @@ cdef int _argkmin_on_Y( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN floating[::1] Y_sq_norms, # IN - ITYPE_t chunk_size, # IN - ITYPE_t effective_n_threads, # IN - ITYPE_t[:, ::1] argkmin_indices, # OUT + ITYPE_t chunk_size, # IN + ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] argkmin_indices, # OUT floating[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y @@ -344,8 +344,8 @@ cdef inline floating _euclidean_dist( cdef int _exact_euclidean_dist( floating[:, ::1] X, # IN floating[:, ::1] Y, # IN - ITYPE_t[:, ::1] Y_indices, # IN - ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] Y_indices, # IN 
+ ITYPE_t effective_n_threads, # IN floating[:, ::1] distances, # OUT ) nogil: """ From e2bb56230d598822454efc6b8d563e54bfdcca74 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Jun 2021 18:31:42 +0200 Subject: [PATCH 014/290] Minify utils._heap definition file --- sklearn/utils/_heap.pxd | 24 +++--------------------- sklearn/utils/_heap.pyx | 10 ++++++++++ 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index 44b282662b559..05e4760994e33 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -1,26 +1,8 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True +# cython: language_level=3 -cimport cython -cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow -from cython cimport floating, integral +from cython cimport floating -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE - -cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """swap the values at inex i1 and i2 of both darr and iarr""" - cdef floating dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp +from ._typedefs cimport ITYPE_t cdef int _simultaneous_sort( floating* dist, diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 72c4e7fcf6524..ef2d393cc1a55 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -8,6 +8,16 @@ from cython cimport floating, integral, numeric from ._typedefs cimport ITYPE_t +cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, + ITYPE_t i1, ITYPE_t i2) nogil: + """swap the values at inex i1 and i2 of both darr and iarr""" + cdef floating dtmp = darr[i1] + darr[i1] = darr[i2] + darr[i2] = dtmp + + cdef ITYPE_t itmp = iarr[i1] + iarr[i1] = iarr[i2] + iarr[i2] = itmp cdef int _simultaneous_sort( floating* dist, From ac768526dc7486dc5bd93340bb025732aa72ff6a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 24 Jun 2021 11:01:24 +0200 Subject: [PATCH 015/290] Post-merge black code formatting --- sklearn/metrics/tests/test_pairwise.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 5b513a5501b43..2ea1dafb8f6cd 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1481,10 +1481,10 @@ def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign): X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) Y = sign * Y_translation + rng.rand(int(n * d)).reshape((-1, d)) - argmins, distances = pairwise_distances_argmin_min(X, Y, - metric="euclidean") - fsq_argmins, fsq_distances = pairwise_distances_argmin_min(X, Y, - metric="fast_sqeuclidean") + argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean") + fsq_argmins, fsq_distances = pairwise_distances_argmin_min( + X, Y, metric="fast_sqeuclidean" + ) np.testing.assert_array_equal(argmins, fsq_argmins) np.testing.assert_almost_equal(distances, fsq_distances) From cac731395ef03f85ec4e927d420610af5f5fd9c1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 24 Jun 2021 14:55:26 +0200 Subject: [PATCH 016/290] Spread datasets for the tests of the fast_sqeuclidean strategy So that the range correspond to actual datasets and not to datasets whose marginal spreads are in [0, 1]. 
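
A sketch of the generation pattern the tests now use (names as in the
diffs below): uniform noise scaled by a spread and shifted by a large
translation, instead of raw draws in [0, 1).

    import numpy as np

    rng = np.random.RandomState(1)
    n, d, spread, translation = 100, 5, 100, 1e6
    X = translation + rng.rand(int(n * d)).reshape((-1, d)) * spread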
--- sklearn/metrics/tests/test_pairwise.py | 11 ++--- sklearn/neighbors/tests/test_neighbors.py | 52 +++++++++++++---------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 2ea1dafb8f6cd..35a6d3dd0f9e9 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1474,12 +1474,13 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): @pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.parametrize("sign", [1, -1]) def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign): - + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance rng = np.random.RandomState(1) - # Translating to test numerical stability - X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) - Y = sign * Y_translation + rng.rand(int(n * d)).reshape((-1, d)) + spread = 100 + X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread + Y = Y_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread * sign argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean") fsq_argmins, fsq_distances = pairwise_distances_argmin_min( @@ -1487,4 +1488,4 @@ def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign): ) np.testing.assert_array_equal(argmins, fsq_argmins) - np.testing.assert_almost_equal(distances, fsq_distances) + np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 70bbcee5a1e48..ee9e92b0347ee 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -29,8 +29,11 @@ ) from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed from sklearn.pipeline import make_pipeline -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version @@ -1795,11 +1798,11 @@ def test_pairwise_deprecated(NearestNeighbors): @pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("d", [5, 10, 100, 500]) +@pytest.mark.parametrize("d", [5, 10, 100]) @pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) -@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 13)]) -@pytest.mark.parametrize("strategy", ["auto", "chunk_on_train", "chunk_on_test"]) +@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 11)]) +@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"]) def test_fast_sqeuclidean_correctness( n, d, @@ -1809,14 +1812,8 @@ def test_fast_sqeuclidean_correctness( strategy, dtype=np.float64, ): - """The Fast squared euclidean strategy ("fast-sqeuclidean") is a faster - alternative to the squared euclidean strategy ("sqeuclidean"). - It computed reduced squared euclidean distances of using the - the GEMM subroutine of BLAS, allowing high arithmetic intensity. - - Yet, it can be unstable for some range of data far the origin overflowing - the representation for float64. 
- """ + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance if n < n_neighbors: pytest.skip( f"Skipping as n (={n}) < n_neighbors (={n_neighbors})", @@ -1824,8 +1821,12 @@ def test_fast_sqeuclidean_correctness( ) rng = np.random.RandomState(1) - X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) - X_test = rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) + + spread = 100 + X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread + X_test = ( + rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) * spread + ) neigh = NearestNeighbors( n_neighbors=n_neighbors, algorithm="brute", metric="euclidean" @@ -1841,8 +1842,8 @@ def test_fast_sqeuclidean_correctness( X=X_test, n_neighbors=n_neighbors, return_distance=True ) - np.testing.assert_almost_equal(eucl_dist, fse_dist) - np.testing.assert_array_equal(eucl_nn, fse_nn) + assert_allclose(eucl_dist, fse_dist) + assert_array_equal(eucl_nn, fse_nn) @pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) @@ -1856,10 +1857,17 @@ def test_fast_sqeuclidean_translation_invariance( translation, dtype=np.float64, ): - """The Fast euclidean strategy should be translation invariant.""" + # The fast squared euclidean strategy should be translation invariant. + if n < n_neighbors: + pytest.skip( + f"Skipping as n (={n}) < n_neighbors (={n_neighbors})", + allow_module_level=True, + ) + rng = np.random.RandomState(1) - X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) - X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) + spread = 100 + X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread + X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread neigh = NearestNeighbors( n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean" @@ -1875,5 +1883,5 @@ def test_fast_sqeuclidean_translation_invariance( X=X_test + translation, n_neighbors=n_neighbors, return_distance=True ) - np.testing.assert_array_equal(reference_nns, nns) - np.testing.assert_almost_equal(reference_dist, dist) + assert_allclose(reference_dist, dist) + assert_array_equal(reference_nns, nns) From 8a06c3f515c51ef351f43222a1050d83a066727d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 09:34:57 +0200 Subject: [PATCH 017/290] Rectify test --- sklearn/neighbors/tests/test_neighbors.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ee9e92b0347ee..ca16f373212bc 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1801,15 +1801,11 @@ def test_pairwise_deprecated(NearestNeighbors): @pytest.mark.parametrize("d", [5, 10, 100]) @pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) -@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 11)]) -@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"]) def test_fast_sqeuclidean_correctness( n, d, ratio_train_test, n_neighbors, - chunk_size, - strategy, dtype=np.float64, ): # The fast squared euclidean strategy must return results From 41bd644d2364246f5220d830e72e07fbe48e759b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 09:40:24 +0200 Subject: [PATCH 018/290] [WIP] Adapting to use class hierarchy --- sklearn/metrics/_argkmin_fast.pyx | 750 
+++++++++++++++--------------- sklearn/metrics/pairwise.py | 6 +- sklearn/neighbors/_base.py | 4 +- 3 files changed, 381 insertions(+), 379 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 46549816d3b1b..e1367012bdffa 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -38,286 +38,6 @@ from ..utils._typedefs cimport ITYPE_t from ..utils._typedefs import ITYPE -### argkmin helpers - -cdef void _argkmin_on_chunk( - floating[:, ::1] X_c, # IN - floating[:, ::1] Y_c, # IN - floating[::1] Y_sq_norms, # IN - floating *dist_middle_terms, # IN - floating *heaps_red_distances, # IN/OUT - ITYPE_t *heaps_indices, # IN/OUT - ITYPE_t k, # IN - # ID of the first element of Y_c - ITYPE_t Y_idx_offset, -) nogil: - """ - Critical part of the computation of pairwise distances. - - "Fast Squared Euclidean" distances strategy relying - on the gemm-trick. - """ - cdef: - ITYPE_t i, j - # Instead of computing the full pairwise squared distances matrix, - # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², - # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² - # term since the argmin for a given sample X_c^{i} does not depend on - # ||X_c^{i}||² - - # Careful: LDA, LDB and LDC are given for F-ordered arrays. - # Here, we use their counterpart values as indicated in the documentation. - # See the documentation of parameters here: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html - # - # dist_middle_terms = -2 * X_c.dot(Y_c.T) - _gemm(RowMajor, NoTrans, Trans, - X_c.shape[0], Y_c.shape[0], X_c.shape[1], - -2.0, - &X_c[0, 0], X_c.shape[1], - &Y_c[0, 0], X_c.shape[1], 0.0, - dist_middle_terms, Y_c.shape[0]) - - # Computing argmins here - for i in range(X_c.shape[0]): - for j in range(Y_c.shape[0]): - _push(heaps_red_distances + i * k, - heaps_indices + i * k, - k, - # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j], - j + Y_idx_offset) - - - -cdef int _argkmin_on_X( - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN - floating[::1] Y_sq_norms, # IN - ITYPE_t chunk_size, # IN - ITYPE_t effective_n_threads, # IN - ITYPE_t[:, ::1] argkmin_indices, # OUT - floating[:, ::1] argkmin_red_distances, # OUT -) nogil: - """Computes the argkmin of each vector (row) of X on Y - by parallelising computation on chunks of X. 
- """ - cdef: - ITYPE_t k = argkmin_indices.shape[1] - ITYPE_t d = X.shape[1] - ITYPE_t sf = sizeof(floating) - ITYPE_t si = sizeof(ITYPE_t) - ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - - ITYPE_t n_train = Y.shape[0] - ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) - ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk - ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk - - ITYPE_t n_test = X.shape[0] - ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) - ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk - ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk - - # Counting remainder chunk in total number of chunks - ITYPE_t Y_n_chunks = Y_n_full_chunks + ( - n_train != (Y_n_full_chunks * Y_n_samples_chunk) - ) - - ITYPE_t X_n_chunks = X_n_full_chunks + ( - n_test != (X_n_full_chunks * X_n_samples_chunk) - ) - - ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) - - ITYPE_t Y_start, Y_end, X_start, X_end - ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx - - floating *dist_middle_terms_chunks - floating *heaps_red_distances_chunks - - - with nogil, parallel(num_threads=num_threads): - # Thread local buffers - - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc(Y_n_samples_chunk * X_n_samples_chunk * sf) - heaps_red_distances_chunks = malloc(X_n_samples_chunk * k * sf) - - for X_chunk_idx in prange(X_n_chunks, schedule='static'): - # We reset the heap between X chunks (memset isn't suitable here) - for idx in range(X_n_samples_chunk * k): - heaps_red_distances_chunks[idx] = FLOAT_INF - - X_start = X_chunk_idx * X_n_samples_chunk - if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: - X_end = X_start + X_n_samples_rem - else: - X_end = X_start + X_n_samples_chunk - - for Y_chunk_idx in range(Y_n_chunks): - Y_start = Y_chunk_idx * Y_n_samples_chunk - if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0: - Y_end = Y_start + Y_n_samples_rem - else: - Y_end = Y_start + Y_n_samples_chunk - - _argkmin_on_chunk( - X[X_start:X_end, :], - Y[Y_start:Y_end, :], - Y_sq_norms[Y_start:Y_end], - dist_middle_terms_chunks, - heaps_red_distances_chunks, - &argkmin_indices[X_start, 0], - k, - Y_start - ) - - # Sorting indices so that the closests' come first. - for idx in range(X_end - X_start): - _simultaneous_sort( - heaps_red_distances_chunks + idx * k, - &argkmin_indices[X_start + idx, 0], - k - ) - - # end: for X_chunk_idx - free(dist_middle_terms_chunks) - free(heaps_red_distances_chunks) - - # end: with nogil, parallel - return X_n_chunks - - -cdef int _argkmin_on_Y( - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN - floating[::1] Y_sq_norms, # IN - ITYPE_t chunk_size, # IN - ITYPE_t effective_n_threads, # IN - ITYPE_t[:, ::1] argkmin_indices, # OUT - floating[:, ::1] argkmin_red_distances, # OUT -) nogil: - """Computes the argkmin of each vector (row) of X on Y - by parallelising computation on chunks of Y. - - This parallelisation strategy is more costly (as we need - extra heaps and synchronisation), yet it is useful in - most contexts. 
- """ - cdef: - ITYPE_t k = argkmin_indices.shape[1] - ITYPE_t d = X.shape[1] - ITYPE_t sf = sizeof(floating) - ITYPE_t si = sizeof(ITYPE_t) - ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - - ITYPE_t n_train = Y.shape[0] - ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) - ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk - ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk - - ITYPE_t n_test = X.shape[0] - ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) - ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk - ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk - - # Counting remainder chunk in total number of chunks - ITYPE_t Y_n_chunks = Y_n_full_chunks + ( - n_train != (Y_n_full_chunks * Y_n_samples_chunk) - ) - - ITYPE_t X_n_chunks = X_n_full_chunks + ( - n_test != (X_n_full_chunks * X_n_samples_chunk) - ) - - ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) - - ITYPE_t Y_start, Y_end, X_start, X_end - ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx - - floating *dist_middle_terms_chunks - floating *heaps_red_distances_chunks - - # As chunks of X are shared across threads, so must their - # heaps. To solve this, each thread has its own locals - # heaps which are then synchronised back in the main ones. - ITYPE_t *heaps_indices_chunks - - for X_chunk_idx in range(X_n_chunks): - X_start = X_chunk_idx * X_n_samples_chunk - if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: - X_end = X_start + X_n_samples_rem - else: - X_end = X_start + X_n_samples_chunk - - with nogil, parallel(num_threads=num_threads): - # Thread local buffers - - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc( - Y_n_samples_chunk * X_n_samples_chunk * sf) - heaps_red_distances_chunks = malloc( - X_n_samples_chunk * k * sf) - heaps_indices_chunks = malloc( - X_n_samples_chunk * k * sf) - - # Initialising heaps (memset can't be used here) - for idx in range(X_n_samples_chunk * k): - heaps_red_distances_chunks[idx] = FLOAT_INF - heaps_indices_chunks[idx] = -1 - - for Y_chunk_idx in prange(Y_n_chunks, schedule='static'): - Y_start = Y_chunk_idx * Y_n_samples_chunk - if Y_chunk_idx == Y_n_chunks - 1 \ - and Y_n_samples_rem > 0: - Y_end = Y_start + Y_n_samples_rem - else: - Y_end = Y_start + Y_n_samples_chunk - - _argkmin_on_chunk( - X[X_start:X_end, :], - Y[Y_start:Y_end, :], - Y_sq_norms[Y_start:Y_end], - dist_middle_terms_chunks, - heaps_red_distances_chunks, - heaps_indices_chunks, - k, - Y_start, - ) - - # end: for Y_chunk_idx - with gil: - # Synchronising the thread local heaps - # with the main heaps - for idx in range(X_end - X_start): - for jdx in range(k): - _push( - &argkmin_red_distances[X_start + idx, 0], - &argkmin_indices[X_start + idx, 0], - k, - heaps_red_distances_chunks[idx * k + jdx], - heaps_indices_chunks[idx * k + jdx], - ) - - free(dist_middle_terms_chunks) - free(heaps_red_distances_chunks) - free(heaps_indices_chunks) - - # end: with nogil, parallel - # Sorting indices of the argkmin for each query vector of X - for idx in prange(n_test,schedule='static', - nogil=True, num_threads=num_threads): - _simultaneous_sort( - &argkmin_red_distances[idx, 0], - &argkmin_indices[idx, 0], - k, - ) - # end: prange - - # end: for X_chunk_idx - return Y_n_chunks - cdef inline floating _euclidean_dist( floating[:, ::1] X, floating[:, ::1] Y, @@ -368,100 +88,380 @@ cdef int _exact_euclidean_dist( Y_indices[i, k]) -# Python interface +cdef class ArgKmin: + + cdef void _argkmin_on_chunk(self, + floating[:, ::1] X_c, # IN + 
floating[:, ::1] Y_c, # IN + floating[::1] Y_sq_norms, # IN + floating *dist_middle_terms, # IN + floating *heaps_red_distances, # IN/OUT + ITYPE_t *heaps_indices, # IN/OUT + ITYPE_t k, # IN + # ID of the first element of Y_c + ITYPE_t Y_idx_offset, + ) nogil: + """ + Critical part of the computation of pairwise distances. + + "Fast Squared Euclidean" distances strategy relying + on the gemm-trick. + """ + cdef: + ITYPE_t i, j + # Instead of computing the full pairwise squared distances matrix, + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² + # term since the argmin for a given sample X_c^{i} does not depend on + # ||X_c^{i}||² + + # Careful: LDA, LDB and LDC are given for F-ordered arrays. + # Here, we use their counterpart values as indicated in the documentation. + # See the documentation of parameters here: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(RowMajor, NoTrans, Trans, + X_c.shape[0], Y_c.shape[0], X_c.shape[1], + -2.0, + &X_c[0, 0], X_c.shape[1], + &Y_c[0, 0], X_c.shape[1], 0.0, + dist_middle_terms, Y_c.shape[0]) + + # Computing argmins here + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + _push(heaps_red_distances + i * k, + heaps_indices + i * k, + k, + # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j], + j + Y_idx_offset) + + + + cdef int _argkmin_on_X(self, + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + floating[::1] Y_sq_norms, # IN + ITYPE_t chunk_size, # IN + ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] argkmin_indices, # OUT + floating[:, ::1] argkmin_red_distances, # OUT + ) nogil: + """Computes the argkmin of each vector (row) of X on Y + by parallelising computation on chunks of X. + """ + cdef: + ITYPE_t k = argkmin_indices.shape[1] + ITYPE_t d = X.shape[1] + ITYPE_t sf = sizeof(floating) + ITYPE_t si = sizeof(ITYPE_t) + ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + ITYPE_t n_train = Y.shape[0] + ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) + ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk + ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk + + ITYPE_t n_test = X.shape[0] + ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) + ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk + ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + ITYPE_t Y_n_chunks = Y_n_full_chunks + ( + n_train != (Y_n_full_chunks * Y_n_samples_chunk) + ) + + ITYPE_t X_n_chunks = X_n_full_chunks + ( + n_test != (X_n_full_chunks * X_n_samples_chunk) + ) -def _argkmin( - floating[:, ::1] X, - floating[:, ::1] Y, - ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, - str strategy = "auto", - bint return_distance = False, -): - """Computes the argkmin of vectors (rows) of X on Y for - the euclidean distance. - - The implementation is parallelised on chunks whose size can - be set using ``chunk_size``. - - Parameters - ---------- - X: ndarray of shape (n, d) - Rows represent vectors - - Y: ndarray of shape (m, d) - Rows represent vectors - - chunk_size: int - The number of vectors per chunk. - - strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'} - The chunking strategy defining which dataset - parallelisation are made on. - - - 'chunk_on_X' is embarassingly parallel but - is less used in practice. 
- - 'chunk_on_Y' comes with synchronisation but - is more useful in practice. - -'auto' relies on a simple heuristic to choose - between 'chunk_on_X' and 'chunk_on_Y'. - - return_distance: boolean - Return distances between each X vectory and its - argkmin if set to True. - - Returns - ------- - distances: ndarray of shape (n, k) - Distances between each X vector and its argkmin - in Y. Only returned if ``return_distance=True``. - - indices: ndarray of shape (n, k) - Indices of each X vector argkmin in Y. - """ - int_dtype = np.intp - float_dtype = np.float32 if floating is float else np.float64 - cdef: - ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0, - dtype=ITYPE) - floating[:, ::1] argkmin_distances = np.full((X.shape[0], k), - FLOAT_INF, - dtype=float_dtype) - floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y) - ITYPE_t effective_n_threads = _openmp_effective_n_threads() - - if strategy == 'auto': - # This is a simple heuristic whose constant for the - # comparison has been chosen based on experiments. - if 4 * chunk_size * effective_n_threads < X.shape[0]: - strategy = 'chunk_on_X' + ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) + + ITYPE_t Y_start, Y_end, X_start, X_end + ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx + + floating *dist_middle_terms_chunks + floating *heaps_red_distances_chunks + + + with nogil, parallel(num_threads=num_threads): + # Thread local buffers + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + dist_middle_terms_chunks = malloc(Y_n_samples_chunk * X_n_samples_chunk * sf) + heaps_red_distances_chunks = malloc(X_n_samples_chunk * k * sf) + + for X_chunk_idx in prange(X_n_chunks, schedule='static'): + # We reset the heap between X chunks (memset isn't suitable here) + for idx in range(X_n_samples_chunk * k): + heaps_red_distances_chunks[idx] = FLOAT_INF + + X_start = X_chunk_idx * X_n_samples_chunk + if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: + X_end = X_start + X_n_samples_rem + else: + X_end = X_start + X_n_samples_chunk + + for Y_chunk_idx in range(Y_n_chunks): + Y_start = Y_chunk_idx * Y_n_samples_chunk + if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0: + Y_end = Y_start + Y_n_samples_rem + else: + Y_end = Y_start + Y_n_samples_chunk + + self._argkmin_on_chunk( + X[X_start:X_end, :], + Y[Y_start:Y_end, :], + Y_sq_norms[Y_start:Y_end], + dist_middle_terms_chunks, + heaps_red_distances_chunks, + &argkmin_indices[X_start, 0], + k, + Y_start + ) + + # Sorting indices so that the closests' come first. + for idx in range(X_end - X_start): + _simultaneous_sort( + heaps_red_distances_chunks + idx * k, + &argkmin_indices[X_start + idx, 0], + k + ) + + # end: for X_chunk_idx + free(dist_middle_terms_chunks) + free(heaps_red_distances_chunks) + + # end: with nogil, parallel + return X_n_chunks + + + cdef int _argkmin_on_Y(self, + floating[:, ::1] X, # IN + floating[:, ::1] Y, # IN + floating[::1] Y_sq_norms, # IN + ITYPE_t chunk_size, # IN + ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] argkmin_indices, # OUT + floating[:, ::1] argkmin_red_distances, # OUT + ) nogil: + """Computes the argkmin of each vector (row) of X on Y + by parallelising computation on chunks of Y. + + This parallelisation strategy is more costly (as we need + extra heaps and synchronisation), yet it is useful in + most contexts. 
+ """ + cdef: + ITYPE_t k = argkmin_indices.shape[1] + ITYPE_t d = X.shape[1] + ITYPE_t sf = sizeof(floating) + ITYPE_t si = sizeof(ITYPE_t) + ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + ITYPE_t n_train = Y.shape[0] + ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) + ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk + ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk + + ITYPE_t n_test = X.shape[0] + ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) + ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk + ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + ITYPE_t Y_n_chunks = Y_n_full_chunks + ( + n_train != (Y_n_full_chunks * Y_n_samples_chunk) + ) + + ITYPE_t X_n_chunks = X_n_full_chunks + ( + n_test != (X_n_full_chunks * X_n_samples_chunk) + ) + + ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) + + ITYPE_t Y_start, Y_end, X_start, X_end + ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx + + floating *dist_middle_terms_chunks + floating *heaps_red_distances_chunks + + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own locals + # heaps which are then synchronised back in the main ones. + ITYPE_t *heaps_indices_chunks + + for X_chunk_idx in range(X_n_chunks): + X_start = X_chunk_idx * X_n_samples_chunk + if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: + X_end = X_start + X_n_samples_rem + else: + X_end = X_start + X_n_samples_chunk + + with nogil, parallel(num_threads=num_threads): + # Thread local buffers + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + dist_middle_terms_chunks = malloc( + Y_n_samples_chunk * X_n_samples_chunk * sf) + heaps_red_distances_chunks = malloc( + X_n_samples_chunk * k * sf) + heaps_indices_chunks = malloc( + X_n_samples_chunk * k * sf) + + # Initialising heaps (memset can't be used here) + for idx in range(X_n_samples_chunk * k): + heaps_red_distances_chunks[idx] = FLOAT_INF + heaps_indices_chunks[idx] = -1 + + for Y_chunk_idx in prange(Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * Y_n_samples_chunk + if Y_chunk_idx == Y_n_chunks - 1 \ + and Y_n_samples_rem > 0: + Y_end = Y_start + Y_n_samples_rem + else: + Y_end = Y_start + Y_n_samples_chunk + + self._argkmin_on_chunk( + X[X_start:X_end, :], + Y[Y_start:Y_end, :], + Y_sq_norms[Y_start:Y_end], + dist_middle_terms_chunks, + heaps_red_distances_chunks, + heaps_indices_chunks, + k, + Y_start, + ) + + # end: for Y_chunk_idx + with gil: + # Synchronising the thread local heaps + # with the main heaps + for idx in range(X_end - X_start): + for jdx in range(k): + _push( + &argkmin_red_distances[X_start + idx, 0], + &argkmin_indices[X_start + idx, 0], + k, + heaps_red_distances_chunks[idx * k + jdx], + heaps_indices_chunks[idx * k + jdx], + ) + + free(dist_middle_terms_chunks) + free(heaps_red_distances_chunks) + free(heaps_indices_chunks) + + # end: with nogil, parallel + # Sorting indices of the argkmin for each query vector of X + for idx in prange(n_test,schedule='static', + nogil=True, num_threads=num_threads): + _simultaneous_sort( + &argkmin_red_distances[idx, 0], + &argkmin_indices[idx, 0], + k, + ) + # end: prange + + # end: for X_chunk_idx + return Y_n_chunks + + # Python interface + + def _argkmin(self, + floating[:, ::1] X, + floating[:, ::1] Y, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, + str strategy = "auto", + bint return_distance = False, + ): + """Computes the argkmin of vectors (rows) of X on Y for 
+ the euclidean distance. + + The implementation is parallelised on chunks whose size can + be set using ``chunk_size``. + + Parameters + ---------- + X: ndarray of shape (n, d) + Rows represent vectors + + Y: ndarray of shape (m, d) + Rows represent vectors + + chunk_size: int + The number of vectors per chunk. + + strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'} + The chunking strategy defining which dataset + parallelisation are made on. + + - 'chunk_on_X' is embarassingly parallel but + is less used in practice. + - 'chunk_on_Y' comes with synchronisation but + is more useful in practice. + -'auto' relies on a simple heuristic to choose + between 'chunk_on_X' and 'chunk_on_Y'. + + return_distance: boolean + Return distances between each X vectory and its + argkmin if set to True. + + Returns + ------- + distances: ndarray of shape (n, k) + Distances between each X vector and its argkmin + in Y. Only returned if ``return_distance=True``. + + indices: ndarray of shape (n, k) + Indices of each X vector argkmin in Y. + """ + int_dtype = np.intp + float_dtype = np.float32 if floating is float else np.float64 + cdef: + ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0, + dtype=ITYPE) + floating[:, ::1] argkmin_distances = np.full((X.shape[0], k), + FLOAT_INF, + dtype=float_dtype) + floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y) + ITYPE_t effective_n_threads = _openmp_effective_n_threads() + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * chunk_size * effective_n_threads < X.shape[0]: + strategy = 'chunk_on_X' + else: + strategy = 'chunk_on_Y' + + if strategy == 'chunk_on_Y': + self._argkmin_on_Y( + X, Y, Y_sq_norms, + chunk_size, effective_n_threads, + argkmin_indices, argkmin_distances + ) + elif strategy == 'chunk_on_X': + self._argkmin_on_X( + X, Y, Y_sq_norms, + chunk_size, effective_n_threads, + argkmin_indices, argkmin_distances + ) else: - strategy = 'chunk_on_Y' - - if strategy == 'chunk_on_Y': - _argkmin_on_Y( - X, Y, Y_sq_norms, - chunk_size, effective_n_threads, - argkmin_indices, argkmin_distances - ) - elif strategy == 'chunk_on_X': - _argkmin_on_X( - X, Y, Y_sq_norms, - chunk_size, effective_n_threads, - argkmin_indices, argkmin_distances - ) - else: - raise RuntimeError(f"strategy '{strategy}' not supported.") - - if return_distance: - # We need to recompute distances because we relied on - # reduced distances using _gemm, which are missing a - # term for squarred norms and which are not the most - # precise (catastrophic cancellation might have happened). - _exact_euclidean_dist(X, Y, argkmin_indices, - effective_n_threads, - argkmin_distances) - return (np.asarray(argkmin_distances), - np.asarray(argkmin_indices)) - - return np.asarray(argkmin_indices) + raise RuntimeError(f"strategy '{strategy}' not supported.") + + if return_distance: + # We need to recompute distances because we relied on + # reduced distances using _gemm, which are missing a + # term for squarred norms and which are not the most + # precise (catastrophic cancellation might have happened). 
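# [Illustration only, not part of the patch] A minimal NumPy sketch of the
# reduced-distance trick the comment above refers to, and of the float32
# cancellation that motivates recomputing exact distances afterwards; all
# names below are made up for the example.
import numpy as np

X = np.array([[0.0, 0.0], [10.0, 0.0]])
Y = np.array([[1.0, 0.0], [9.0, 0.0], [5.0, 5.0]])

# Reduced distance: -2 X.Y^T + ||Y||^2. The ||X||^2 term is dropped since
# it does not change which row of Y is closest to a given row of X.
Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
reduced = -2 * X @ Y.T + Y_sq_norms
print(reduced.argmin(axis=1))        # [0 1], same argmin as the exact form

# Catastrophic cancellation: expanding ||x - y||^2 subtracts large, nearly
# equal terms, so float32 can lose the tiny true distance entirely.
x = np.array([1000.0, 0.0], dtype=np.float32)
y = np.array([1000.0, 1e-3], dtype=np.float32)
print(x @ x - 2 * x @ y + y @ y)     # 0.0  -- every digit cancelled
print(((x - y) ** 2).sum())          # ~1e-6 -- the exact form keeps it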
+ _exact_euclidean_dist(X, Y, argkmin_indices, + effective_n_threads, + argkmin_distances) + return (np.asarray(argkmin_distances), + np.asarray(argkmin_indices)) + + return np.asarray(argkmin_indices) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a20c49a20346c..ba04685c07ab9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,7 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version -from ._argkmin_fast import _argkmin +from ._argkmin_fast import ArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -648,7 +648,9 @@ def pairwise_distances_argmin_min( if metric == "fast_sqeuclidean": # TODO: generalise this simple plug here - values, indices = _argkmin(X, Y, k=1, strategy="auto", return_distance=True) + values, indices = ArgKmin()._argkmin( + X, Y, k=1, strategy="auto", return_distance=True + ) values = np.ndarray.flatten(values) indices = np.ndarray.flatten(indices) else: diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index be561e0bd3f64..9f79be031b0a9 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._argkmin_fast import _argkmin +from ..metrics._argkmin_fast import ArgKmin from ..utils import ( check_array, gen_even_slices, @@ -740,7 +740,7 @@ class from an array representing our data set and ask who's self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean" ): # TODO: generalise this simple plug here - results = _argkmin( + results = ArgKmin()._argkmin( X, Y=self._fit_X, k=n_neighbors, From 568ed2a40b78b55a853c5dbd6a9a4bc182997470 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 10:04:41 +0200 Subject: [PATCH 019/290] [WIP] Adapting to use class hierarchy --- sklearn/metrics/_argkmin_fast.pyx | 191 ++++++++++++++---------------- sklearn/metrics/pairwise.py | 4 +- sklearn/neighbors/_base.py | 5 +- 3 files changed, 94 insertions(+), 106 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index e1367012bdffa..b85aa1b94c388 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -4,16 +4,16 @@ # cython: wraparound=False # cython: profile=False # cython: linetrace=False +# cython: initializedcheck=False # cython: binding=False # distutils: define_macros=CYTHON_TRACE_NOGIL=0 import numpy as np cimport numpy as np -from libc.math cimport floor, sqrt +from libc.math cimport sqrt from libc.stdlib cimport free, malloc -from cython cimport floating from cython.parallel cimport parallel, prange DEF CHUNK_SIZE = 256 # number of vectors @@ -34,18 +34,18 @@ from ..utils._cython_blas cimport ( from ..utils._heap cimport _simultaneous_sort, _push from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils._typedefs cimport ITYPE_t -from ..utils._typedefs import ITYPE +from ..utils._typedefs cimport ITYPE_t, DTYPE_t +from ..utils._typedefs import ITYPE, DTYPE -cdef inline floating _euclidean_dist( - floating[:, ::1] X, - floating[:, ::1] Y, +cdef inline DTYPE_t _euclidean_dist( + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, ITYPE_t i, ITYPE_t j, ) nogil: cdef: - floating dist = 0 + DTYPE_t dist = 0 ITYPE_t k ITYPE_t upper_unrolled_idx = (X.shape[1] // 4) * 4 @@ -62,11 +62,11 @@ cdef inline 
floating _euclidean_dist( return sqrt(dist) cdef int _exact_euclidean_dist( - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN + DTYPE_t[:, ::1] X, # IN + DTYPE_t[:, ::1] Y, # IN ITYPE_t[:, ::1] Y_indices, # IN ITYPE_t effective_n_threads, # IN - floating[:, ::1] distances, # OUT + DTYPE_t[:, ::1] distances, # OUT ) nogil: """ Compute exact pairwise euclidean distances in parallel. @@ -90,12 +90,33 @@ cdef int _exact_euclidean_dist( cdef class ArgKmin: - cdef void _argkmin_on_chunk(self, - floating[:, ::1] X_c, # IN - floating[:, ::1] Y_c, # IN - floating[::1] Y_sq_norms, # IN - floating *dist_middle_terms, # IN - floating *heaps_red_distances, # IN/OUT + cdef: + ITYPE_t k + ITYPE_t chunk_size + + DTYPE_t[:, ::1] X + DTYPE_t[:, ::1] Y + + DTYPE_t[::1] Y_sq_norms + + def __init__(self, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, + ): + self.X = X + self.Y = Y + self.k = k + self.chunk_size = chunk_size + self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y) + + cdef void _reduce_on_chunks(self, + DTYPE_t[:, ::1] X_c, # IN + DTYPE_t[:, ::1] Y_c, # IN + DTYPE_t[::1] Y_sq_norms, # IN + DTYPE_t *dist_middle_terms, # IN + DTYPE_t *heaps_red_distances, # IN/OUT ITYPE_t *heaps_indices, # IN/OUT ITYPE_t k, # IN # ID of the first element of Y_c @@ -139,32 +160,28 @@ cdef class ArgKmin: j + Y_idx_offset) - - cdef int _argkmin_on_X(self, - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN - floating[::1] Y_sq_norms, # IN + cdef int _parallel_on_X(self, ITYPE_t chunk_size, # IN ITYPE_t effective_n_threads, # IN ITYPE_t[:, ::1] argkmin_indices, # OUT - floating[:, ::1] argkmin_red_distances, # OUT + DTYPE_t[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y by parallelising computation on chunks of X. 
""" cdef: ITYPE_t k = argkmin_indices.shape[1] - ITYPE_t d = X.shape[1] - ITYPE_t sf = sizeof(floating) + ITYPE_t d = self.X.shape[1] + ITYPE_t sf = sizeof(DTYPE_t) ITYPE_t si = sizeof(ITYPE_t) ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - ITYPE_t n_train = Y.shape[0] + ITYPE_t n_train = self.Y.shape[0] ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk - ITYPE_t n_test = X.shape[0] + ITYPE_t n_test = self.X.shape[0] ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk @@ -183,16 +200,16 @@ cdef class ArgKmin: ITYPE_t Y_start, Y_end, X_start, X_end ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx - floating *dist_middle_terms_chunks - floating *heaps_red_distances_chunks + DTYPE_t *dist_middle_terms_chunks + DTYPE_t *heaps_red_distances_chunks with nogil, parallel(num_threads=num_threads): # Thread local buffers # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc(Y_n_samples_chunk * X_n_samples_chunk * sf) - heaps_red_distances_chunks = malloc(X_n_samples_chunk * k * sf) + dist_middle_terms_chunks = malloc(Y_n_samples_chunk * X_n_samples_chunk * sf) + heaps_red_distances_chunks = malloc(X_n_samples_chunk * k * sf) for X_chunk_idx in prange(X_n_chunks, schedule='static'): # We reset the heap between X chunks (memset isn't suitable here) @@ -212,10 +229,10 @@ cdef class ArgKmin: else: Y_end = Y_start + Y_n_samples_chunk - self._argkmin_on_chunk( - X[X_start:X_end, :], - Y[Y_start:Y_end, :], - Y_sq_norms[Y_start:Y_end], + self._reduce_on_chunks( + self.X[X_start:X_end, :], + self.Y[Y_start:Y_end, :], + self.Y_sq_norms[Y_start:Y_end], dist_middle_terms_chunks, heaps_red_distances_chunks, &argkmin_indices[X_start, 0], @@ -239,14 +256,11 @@ cdef class ArgKmin: return X_n_chunks - cdef int _argkmin_on_Y(self, - floating[:, ::1] X, # IN - floating[:, ::1] Y, # IN - floating[::1] Y_sq_norms, # IN + cdef int _parallel_on_Y(self, ITYPE_t chunk_size, # IN ITYPE_t effective_n_threads, # IN ITYPE_t[:, ::1] argkmin_indices, # OUT - floating[:, ::1] argkmin_red_distances, # OUT + DTYPE_t[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y by parallelising computation on chunks of Y. @@ -257,17 +271,17 @@ cdef class ArgKmin: """ cdef: ITYPE_t k = argkmin_indices.shape[1] - ITYPE_t d = X.shape[1] - ITYPE_t sf = sizeof(floating) + ITYPE_t d = self.X.shape[1] + ITYPE_t sf = sizeof(DTYPE_t) ITYPE_t si = sizeof(ITYPE_t) ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - ITYPE_t n_train = Y.shape[0] + ITYPE_t n_train = self.Y.shape[0] ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk - ITYPE_t n_test = X.shape[0] + ITYPE_t n_test = self.X.shape[0] ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk @@ -286,8 +300,8 @@ cdef class ArgKmin: ITYPE_t Y_start, Y_end, X_start, X_end ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx - floating *dist_middle_terms_chunks - floating *heaps_red_distances_chunks + DTYPE_t *dist_middle_terms_chunks + DTYPE_t *heaps_red_distances_chunks # As chunks of X are shared across threads, so must their # heaps. 
To solve this, each thread has its own locals @@ -305,9 +319,9 @@ cdef class ArgKmin: # Thread local buffers # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc( + dist_middle_terms_chunks = malloc( Y_n_samples_chunk * X_n_samples_chunk * sf) - heaps_red_distances_chunks = malloc( + heaps_red_distances_chunks = malloc( X_n_samples_chunk * k * sf) heaps_indices_chunks = malloc( X_n_samples_chunk * k * sf) @@ -325,10 +339,10 @@ cdef class ArgKmin: else: Y_end = Y_start + Y_n_samples_chunk - self._argkmin_on_chunk( - X[X_start:X_end, :], - Y[Y_start:Y_end, :], - Y_sq_norms[Y_start:Y_end], + self._reduce_on_chunks( + self.X[X_start:X_end, :], + self.Y[Y_start:Y_end, :], + self.Y_sq_norms[Y_start:Y_end], dist_middle_terms_chunks, heaps_red_distances_chunks, heaps_indices_chunks, @@ -369,47 +383,28 @@ cdef class ArgKmin: return Y_n_chunks # Python interface + def compute(self, + str strategy = "auto", + bint return_distance = False + ): + """Computes the argkmin of vectors (rows) of X on Y. - def _argkmin(self, - floating[:, ::1] X, - floating[:, ::1] Y, - ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, - str strategy = "auto", - bint return_distance = False, - ): - """Computes the argkmin of vectors (rows) of X on Y for - the euclidean distance. - - The implementation is parallelised on chunks whose size can - be set using ``chunk_size``. - - Parameters - ---------- - X: ndarray of shape (n, d) - Rows represent vectors - - Y: ndarray of shape (m, d) - Rows represent vectors - - chunk_size: int - The number of vectors per chunk. - - strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'} + strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'} The chunking strategy defining which dataset parallelisation are made on. - - 'chunk_on_X' is embarassingly parallel but + - 'parallel_on_X' is embarassingly parallel but is less used in practice. - - 'chunk_on_Y' comes with synchronisation but + - 'parallel_on_Y' comes with synchronisation but is more useful in practice. -'auto' relies on a simple heuristic to choose - between 'chunk_on_X' and 'chunk_on_Y'. + between 'parallel_on_X' and 'parallel_on_Y'. return_distance: boolean Return distances between each X vectory and its argkmin if set to True. + Returns ------- distances: ndarray of shape (n, k) @@ -419,35 +414,31 @@ cdef class ArgKmin: indices: ndarray of shape (n, k) Indices of each X vector argkmin in Y. """ - int_dtype = np.intp - float_dtype = np.float32 if floating is float else np.float64 cdef: - ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0, + ITYPE_t n_X = self.X.shape[0] + ITYPE_t[:, ::1] argkmin_indices = np.full((n_X, self.k), 0, dtype=ITYPE) - floating[:, ::1] argkmin_distances = np.full((X.shape[0], k), + DTYPE_t[:, ::1] argkmin_distances = np.full((n_X, self.k), FLOAT_INF, - dtype=float_dtype) - floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y) + dtype=DTYPE) ITYPE_t effective_n_threads = _openmp_effective_n_threads() if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. 
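# [Illustration only, not from the patch] The 'auto' heuristic above,
# written out in plain Python; the factor 4 is the experimentally chosen
# constant the comment mentions, while the function name and the default
# values here are ours:
def choose_strategy(n_X, chunk_size=256, n_threads=8):
    # Many query chunks per thread: parallelising over X needs no
    # synchronisation between threads. Few queries: parallelise over Y
    # instead, so all threads stay busy on a single chunk of queries.
    if 4 * chunk_size * n_threads < n_X:
        return 'parallel_on_X'
    return 'parallel_on_Y'

print(choose_strategy(100_000))   # parallel_on_X: plenty of query chunks
print(choose_strategy(1_000))     # parallel_on_Y: too few queries to split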
- if 4 * chunk_size * effective_n_threads < X.shape[0]: - strategy = 'chunk_on_X' + if 4 * self.chunk_size * effective_n_threads < n_X: + strategy = 'parallel_on_X' else: - strategy = 'chunk_on_Y' + strategy = 'parallel_on_Y' - if strategy == 'chunk_on_Y': - self._argkmin_on_Y( - X, Y, Y_sq_norms, - chunk_size, effective_n_threads, + if strategy == 'parallel_on_Y': + self._parallel_on_Y( + self.chunk_size, effective_n_threads, argkmin_indices, argkmin_distances ) - elif strategy == 'chunk_on_X': - self._argkmin_on_X( - X, Y, Y_sq_norms, - chunk_size, effective_n_threads, + elif strategy == 'parallel_on_X': + self._parallel_on_X( + self.chunk_size, effective_n_threads, argkmin_indices, argkmin_distances ) else: @@ -456,9 +447,9 @@ cdef class ArgKmin: if return_distance: # We need to recompute distances because we relied on # reduced distances using _gemm, which are missing a - # term for squarred norms and which are not the most + # term for squared norms and which are not the most # precise (catastrophic cancellation might have happened). - _exact_euclidean_dist(X, Y, argkmin_indices, + _exact_euclidean_dist(self.X, self.Y, argkmin_indices, effective_n_threads, argkmin_distances) return (np.asarray(argkmin_distances), diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index ba04685c07ab9..c78ed7bc23e17 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -648,8 +648,8 @@ def pairwise_distances_argmin_min( if metric == "fast_sqeuclidean": # TODO: generalise this simple plug here - values, indices = ArgKmin()._argkmin( - X, Y, k=1, strategy="auto", return_distance=True + values, indices = ArgKmin(X, Y, k=1).compute( + k=1, strategy="auto", return_distance=True ) values = np.ndarray.flatten(values) indices = np.ndarray.flatten(indices) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 9f79be031b0a9..53774a3b78510 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -740,10 +740,7 @@ class from an array representing our data set and ask who's self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean" ): # TODO: generalise this simple plug here - results = ArgKmin()._argkmin( - X, - Y=self._fit_X, - k=n_neighbors, + results = ArgKmin(X=X, Y=self._fit_X, k=n_neighbors).compute( strategy="auto", return_distance=return_distance, ) From 2bf34aae7b2cdf4296fdcaccc8de7fcbe72e5dc7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 10:22:35 +0200 Subject: [PATCH 020/290] [WIP] Adapting to use class hierarchy --- sklearn/metrics/_argkmin_fast.pyx | 184 ++++++++++++++---------------- 1 file changed, 85 insertions(+), 99 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index b85aa1b94c388..192b8782d2f6a 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -91,24 +91,63 @@ cdef int _exact_euclidean_dist( cdef class ArgKmin: cdef: - ITYPE_t k - ITYPE_t chunk_size + ITYPE_t k, d, sf, si + ITYPE_t n_samples_chunk, chunk_size + + ITYPE_t n_Y, Y_n_samples_chunk, Y_n_samples_rem + ITYPE_t n_X, X_n_samples_chunk, X_n_samples_rem + + # Counting remainder chunk in total number of chunks + ITYPE_t Y_n_chunks, X_n_chunks, num_threads DTYPE_t[:, ::1] X DTYPE_t[:, ::1] Y DTYPE_t[::1] Y_sq_norms + def __cinit__(self): + # Initializing memory view to prevent memory errors and seg-faults + # in rare cases where __init__ is not called + self.X = np.empty((1, 1), dtype=DTYPE, order='c') + self.Y = np.empty((1, 
1), dtype=DTYPE, order='c') + def __init__(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, ): + cdef: + ITYPE_t X_n_full_chunks, Y_n_full_chunks self.X = X self.Y = Y + self.k = k + self.d = X.shape[1] + self.sf = sizeof(DTYPE_t) + self.si = sizeof(ITYPE_t) self.chunk_size = chunk_size + self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + self.n_Y = Y.shape[0] + self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) + Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk + self.Y_n_samples_rem = self.n_Y % self.Y_n_samples_chunk + + self.n_X = X.shape[0] + self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) + X_n_full_chunks = self.n_X // self.X_n_samples_chunk + self.X_n_samples_rem = self.n_X % self.X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + self.Y_n_chunks = Y_n_full_chunks + ( + self.n_Y != (Y_n_full_chunks * self.Y_n_samples_chunk) + ) + + self.X_n_chunks = X_n_full_chunks + ( + self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) + ) + self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y) cdef void _reduce_on_chunks(self, @@ -170,35 +209,9 @@ cdef class ArgKmin: by parallelising computation on chunks of X. """ cdef: - ITYPE_t k = argkmin_indices.shape[1] - ITYPE_t d = self.X.shape[1] - ITYPE_t sf = sizeof(DTYPE_t) - ITYPE_t si = sizeof(ITYPE_t) - ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - - ITYPE_t n_train = self.Y.shape[0] - ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) - ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk - ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk - - ITYPE_t n_test = self.X.shape[0] - ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) - ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk - ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk - - # Counting remainder chunk in total number of chunks - ITYPE_t Y_n_chunks = Y_n_full_chunks + ( - n_train != (Y_n_full_chunks * Y_n_samples_chunk) - ) - - ITYPE_t X_n_chunks = X_n_full_chunks + ( - n_test != (X_n_full_chunks * X_n_samples_chunk) - ) - - ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) + ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads) - ITYPE_t Y_start, Y_end, X_start, X_end - ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx DTYPE_t *dist_middle_terms_chunks DTYPE_t *heaps_red_distances_chunks @@ -208,26 +221,26 @@ cdef class ArgKmin: # Thread local buffers # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc(Y_n_samples_chunk * X_n_samples_chunk * sf) - heaps_red_distances_chunks = malloc(X_n_samples_chunk * k * sf) + dist_middle_terms_chunks = malloc(self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) + heaps_red_distances_chunks = malloc(self.X_n_samples_chunk * self.k * self.sf) - for X_chunk_idx in prange(X_n_chunks, schedule='static'): + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): # We reset the heap between X chunks (memset isn't suitable here) - for idx in range(X_n_samples_chunk * k): + for idx in range(self.X_n_samples_chunk * self.k): heaps_red_distances_chunks[idx] = FLOAT_INF - X_start = X_chunk_idx * X_n_samples_chunk - if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: - X_end = X_start + X_n_samples_rem + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: + X_end = X_start + self.X_n_samples_rem else: - X_end = X_start + 
X_n_samples_chunk + X_end = X_start + self.X_n_samples_chunk - for Y_chunk_idx in range(Y_n_chunks): - Y_start = Y_chunk_idx * Y_n_samples_chunk - if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0: - Y_end = Y_start + Y_n_samples_rem + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0: + Y_end = Y_start + self.Y_n_samples_rem else: - Y_end = Y_start + Y_n_samples_chunk + Y_end = Y_start + self.Y_n_samples_chunk self._reduce_on_chunks( self.X[X_start:X_end, :], @@ -236,16 +249,16 @@ cdef class ArgKmin: dist_middle_terms_chunks, heaps_red_distances_chunks, &argkmin_indices[X_start, 0], - k, + self.k, Y_start ) # Sorting indices so that the closests' come first. for idx in range(X_end - X_start): _simultaneous_sort( - heaps_red_distances_chunks + idx * k, + heaps_red_distances_chunks + idx * self.k, &argkmin_indices[X_start + idx, 0], - k + self.k ) # end: for X_chunk_idx @@ -253,8 +266,7 @@ cdef class ArgKmin: free(heaps_red_distances_chunks) # end: with nogil, parallel - return X_n_chunks - + return self.X_n_chunks cdef int _parallel_on_Y(self, ITYPE_t chunk_size, # IN @@ -270,35 +282,9 @@ cdef class ArgKmin: most contexts. """ cdef: - ITYPE_t k = argkmin_indices.shape[1] - ITYPE_t d = self.X.shape[1] - ITYPE_t sf = sizeof(DTYPE_t) - ITYPE_t si = sizeof(ITYPE_t) - ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - - ITYPE_t n_train = self.Y.shape[0] - ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk) - ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk - ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk - - ITYPE_t n_test = self.X.shape[0] - ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk) - ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk - ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk - - # Counting remainder chunk in total number of chunks - ITYPE_t Y_n_chunks = Y_n_full_chunks + ( - n_train != (Y_n_full_chunks * Y_n_samples_chunk) - ) - - ITYPE_t X_n_chunks = X_n_full_chunks + ( - n_test != (X_n_full_chunks * X_n_samples_chunk) - ) - - ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads) + ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads) - ITYPE_t Y_start, Y_end, X_start, X_end - ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx DTYPE_t *dist_middle_terms_chunks DTYPE_t *heaps_red_distances_chunks @@ -308,36 +294,36 @@ cdef class ArgKmin: # heaps which are then synchronised back in the main ones. 
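# [Illustration only, not from the patch] The synchronisation described in
# the comment above, reduced to pure Python: each thread gathers its own k
# best candidates, and the main heaps keep the k best over all threads.
# heapq.nsmallest stands in for the fixed-size max-heap that _push
# maintains in the real code.
import heapq

def merge_thread_heaps(per_thread_candidates, k):
    # per_thread_candidates: one list of (distance, index) pairs per
    # thread, all referring to the same query vector of X.
    merged = [c for thread in per_thread_candidates for c in thread]
    return heapq.nsmallest(k, merged)

thread_0 = [(0.3, 7), (0.9, 2)]
thread_1 = [(0.1, 5), (0.7, 4)]
print(merge_thread_heaps([thread_0, thread_1], k=2))  # [(0.1, 5), (0.3, 7)]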
ITYPE_t *heaps_indices_chunks - for X_chunk_idx in range(X_n_chunks): - X_start = X_chunk_idx * X_n_samples_chunk - if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0: - X_end = X_start + X_n_samples_rem + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: + X_end = X_start + self.X_n_samples_rem else: - X_end = X_start + X_n_samples_chunk + X_end = X_start + self.X_n_samples_chunk with nogil, parallel(num_threads=num_threads): # Thread local buffers # Temporary buffer for the -2 * X_c.dot(Y_c.T) term dist_middle_terms_chunks = malloc( - Y_n_samples_chunk * X_n_samples_chunk * sf) + self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) heaps_red_distances_chunks = malloc( - X_n_samples_chunk * k * sf) + self.X_n_samples_chunk * self.k * self.sf) heaps_indices_chunks = malloc( - X_n_samples_chunk * k * sf) + self.X_n_samples_chunk * self.k * self.sf) # Initialising heaps (memset can't be used here) - for idx in range(X_n_samples_chunk * k): + for idx in range(self.X_n_samples_chunk * self.k): heaps_red_distances_chunks[idx] = FLOAT_INF heaps_indices_chunks[idx] = -1 - for Y_chunk_idx in prange(Y_n_chunks, schedule='static'): - Y_start = Y_chunk_idx * Y_n_samples_chunk - if Y_chunk_idx == Y_n_chunks - 1 \ - and Y_n_samples_rem > 0: - Y_end = Y_start + Y_n_samples_rem + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 \ + and self.Y_n_samples_rem > 0: + Y_end = Y_start + self.Y_n_samples_rem else: - Y_end = Y_start + Y_n_samples_chunk + Y_end = Y_start + self.Y_n_samples_chunk self._reduce_on_chunks( self.X[X_start:X_end, :], @@ -346,7 +332,7 @@ cdef class ArgKmin: dist_middle_terms_chunks, heaps_red_distances_chunks, heaps_indices_chunks, - k, + self.k, Y_start, ) @@ -355,13 +341,13 @@ cdef class ArgKmin: # Synchronising the thread local heaps # with the main heaps for idx in range(X_end - X_start): - for jdx in range(k): + for jdx in range(self.k): _push( &argkmin_red_distances[X_start + idx, 0], &argkmin_indices[X_start + idx, 0], - k, - heaps_red_distances_chunks[idx * k + jdx], - heaps_indices_chunks[idx * k + jdx], + self.k, + heaps_red_distances_chunks[idx * self.k + jdx], + heaps_indices_chunks[idx * self.k + jdx], ) free(dist_middle_terms_chunks) @@ -370,17 +356,17 @@ cdef class ArgKmin: # end: with nogil, parallel # Sorting indices of the argkmin for each query vector of X - for idx in prange(n_test,schedule='static', + for idx in prange(self.n_X, schedule='static', nogil=True, num_threads=num_threads): _simultaneous_sort( &argkmin_red_distances[idx, 0], &argkmin_indices[idx, 0], - k, + self.k, ) # end: prange # end: for X_chunk_idx - return Y_n_chunks + return self.Y_n_chunks # Python interface def compute(self, From c1415d693beaa67f9657d0f27706070920c3886e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 11:05:43 +0200 Subject: [PATCH 021/290] [WIP] Adapting to use class hierarchy This test segfaults: test_neighbors.py::test_fast_sqeuclidean_correctness[1-10-5-1000] --- sklearn/metrics/_argkmin_fast.pyx | 74 ++++++++++++++++--------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 192b8782d2f6a..dcdfca80c9c22 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -105,6 +105,17 @@ cdef class ArgKmin: 
DTYPE_t[::1] Y_sq_norms + # ArgKmin + DTYPE_t * dist_middle_terms_chunks + DTYPE_t * heaps_red_distances_chunks + + # Used for parallel_on_Y: + + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own locals + # heaps which are then synchronised back in the main ones. + ITYPE_t * heaps_indices_chunks + def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults # in rare cases where __init__ is not called @@ -210,24 +221,23 @@ cdef class ArgKmin: """ cdef: ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads) - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx - DTYPE_t *dist_middle_terms_chunks - DTYPE_t *heaps_red_distances_chunks - + # in bytes + ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf + ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf with nogil, parallel(num_threads=num_threads): # Thread local buffers # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc(self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) - heaps_red_distances_chunks = malloc(self.X_n_samples_chunk * self.k * self.sf) + self.dist_middle_terms_chunks = malloc(size_dist_middle_terms) + self.heaps_red_distances_chunks = malloc(heap_size) for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): # We reset the heap between X chunks (memset isn't suitable here) for idx in range(self.X_n_samples_chunk * self.k): - heaps_red_distances_chunks[idx] = FLOAT_INF + self.heaps_red_distances_chunks[idx] = FLOAT_INF X_start = X_chunk_idx * self.X_n_samples_chunk if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: @@ -246,8 +256,8 @@ cdef class ArgKmin: self.X[X_start:X_end, :], self.Y[Y_start:Y_end, :], self.Y_sq_norms[Y_start:Y_end], - dist_middle_terms_chunks, - heaps_red_distances_chunks, + self.dist_middle_terms_chunks, + self.heaps_red_distances_chunks, &argkmin_indices[X_start, 0], self.k, Y_start @@ -256,14 +266,14 @@ cdef class ArgKmin: # Sorting indices so that the closests' come first. for idx in range(X_end - X_start): _simultaneous_sort( - heaps_red_distances_chunks + idx * self.k, + self.heaps_red_distances_chunks + idx * self.k, &argkmin_indices[X_start + idx, 0], self.k ) # end: for X_chunk_idx - free(dist_middle_terms_chunks) - free(heaps_red_distances_chunks) + free(self.dist_middle_terms_chunks) + free(self.heaps_red_distances_chunks) # end: with nogil, parallel return self.X_n_chunks @@ -286,13 +296,10 @@ cdef class ArgKmin: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx - DTYPE_t *dist_middle_terms_chunks - DTYPE_t *heaps_red_distances_chunks - - # As chunks of X are shared across threads, so must their - # heaps. To solve this, each thread has its own locals - # heaps which are then synchronised back in the main ones. 
- ITYPE_t *heaps_indices_chunks + # in bytes + ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf + ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si + ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk @@ -305,17 +312,14 @@ cdef class ArgKmin: # Thread local buffers # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - dist_middle_terms_chunks = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) - heaps_red_distances_chunks = malloc( - self.X_n_samples_chunk * self.k * self.sf) - heaps_indices_chunks = malloc( - self.X_n_samples_chunk * self.k * self.sf) + self.dist_middle_terms_chunks = malloc(size_dist_middle_terms) + self.heaps_red_distances_chunks = malloc(float_heap_size) + self.heaps_indices_chunks = malloc(int_heap_size) # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - heaps_red_distances_chunks[idx] = FLOAT_INF - heaps_indices_chunks[idx] = -1 + self.heaps_red_distances_chunks[idx] = FLOAT_INF + self.heaps_indices_chunks[idx] = -1 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -329,9 +333,9 @@ cdef class ArgKmin: self.X[X_start:X_end, :], self.Y[Y_start:Y_end, :], self.Y_sq_norms[Y_start:Y_end], - dist_middle_terms_chunks, - heaps_red_distances_chunks, - heaps_indices_chunks, + self.dist_middle_terms_chunks, + self.heaps_red_distances_chunks, + self.heaps_indices_chunks, self.k, Y_start, ) @@ -346,13 +350,13 @@ cdef class ArgKmin: &argkmin_red_distances[X_start + idx, 0], &argkmin_indices[X_start + idx, 0], self.k, - heaps_red_distances_chunks[idx * self.k + jdx], - heaps_indices_chunks[idx * self.k + jdx], + self.heaps_red_distances_chunks[idx * self.k + jdx], + self.heaps_indices_chunks[idx * self.k + jdx], ) - free(dist_middle_terms_chunks) - free(heaps_red_distances_chunks) - free(heaps_indices_chunks) + free(self.dist_middle_terms_chunks) + free(self.heaps_red_distances_chunks) + free(self.heaps_indices_chunks) # end: with nogil, parallel # Sorting indices of the argkmin for each query vector of X From 80aaf0bcb13cf31f5b2a251d615410c6b680db8f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 12:05:57 +0200 Subject: [PATCH 022/290] [WIP] Adapting to use class hierarchy The segfaults was due to reallocation of on the same pointers, causing multiple freeing on the same reference and memory leaks. To resolve this, arrays of pointers for local datastructures are allocated at the initialisation of the interface so that they can be handled separately in threads with proper allocation and deallocation. The memory management will be wrapped in subsequent private template method for each types of reduction and parallelisation strategy. This is one of the next iteration. 
--- sklearn/metrics/_argkmin_fast.pyx | 97 +++++++++++++++++-------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index dcdfca80c9c22..d1b374dcae299 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -10,6 +10,7 @@ import numpy as np cimport numpy as np +cimport openmp from libc.math cimport sqrt from libc.stdlib cimport free, malloc @@ -64,8 +65,8 @@ cdef inline DTYPE_t _euclidean_dist( cdef int _exact_euclidean_dist( DTYPE_t[:, ::1] X, # IN DTYPE_t[:, ::1] Y, # IN - ITYPE_t[:, ::1] Y_indices, # IN - ITYPE_t effective_n_threads, # IN + ITYPE_t[:, ::1] Y_indices, # IN + ITYPE_t n_threads, # IN DTYPE_t[:, ::1] distances, # OUT ) nogil: """ @@ -82,7 +83,7 @@ cdef int _exact_euclidean_dist( ITYPE_t i, k for i in prange(X.shape[0], schedule='static', - nogil=True, num_threads=effective_n_threads): + nogil=True, num_threads=n_threads): for k in range(Y_indices.shape[1]): distances[i, k] = _euclidean_dist(X, Y, i, Y_indices[i, k]) @@ -91,6 +92,8 @@ cdef int _exact_euclidean_dist( cdef class ArgKmin: cdef: + ITYPE_t effective_omp_n_thread + ITYPE_t k, d, sf, si ITYPE_t n_samples_chunk, chunk_size @@ -103,18 +106,16 @@ cdef class ArgKmin: DTYPE_t[:, ::1] X DTYPE_t[:, ::1] Y - DTYPE_t[::1] Y_sq_norms - # ArgKmin - DTYPE_t * dist_middle_terms_chunks - DTYPE_t * heaps_red_distances_chunks + DTYPE_t[::1] Y_sq_norms + DTYPE_t ** dist_middle_terms_chunks + DTYPE_t ** heaps_red_distances_chunks # Used for parallel_on_Y: - # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own locals # heaps which are then synchronised back in the main ones. - ITYPE_t * heaps_indices_chunks + ITYPE_t ** heaps_indices_chunks def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults @@ -159,8 +160,23 @@ cdef class ArgKmin: self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) ) + self.effective_omp_n_thread = _openmp_effective_n_threads() + + # ArgKmin self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y) + self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) + self.heaps_red_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) + self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + if self.heaps_red_distances_chunks is not NULL: + free(self.heaps_red_distances_chunks) + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + cdef void _reduce_on_chunks(self, DTYPE_t[:, ::1] X_c, # IN DTYPE_t[:, ::1] Y_c, # IN @@ -211,8 +227,6 @@ cdef class ArgKmin: cdef int _parallel_on_X(self, - ITYPE_t chunk_size, # IN - ITYPE_t effective_n_threads, # IN ITYPE_t[:, ::1] argkmin_indices, # OUT DTYPE_t[:, ::1] argkmin_red_distances, # OUT ) nogil: @@ -220,8 +234,9 @@ cdef class ArgKmin: by parallelising computation on chunks of X. 
""" cdef: - ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads) ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num # in bytes ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf @@ -229,15 +244,15 @@ cdef class ArgKmin: with nogil, parallel(num_threads=num_threads): # Thread local buffers - + thread_num = openmp.omp_get_thread_num() # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks = malloc(size_dist_middle_terms) - self.heaps_red_distances_chunks = malloc(heap_size) + self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) + self.heaps_red_distances_chunks[thread_num] = malloc(heap_size) for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): # We reset the heap between X chunks (memset isn't suitable here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_red_distances_chunks[idx] = FLOAT_INF + self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF X_start = X_chunk_idx * self.X_n_samples_chunk if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: @@ -256,8 +271,8 @@ cdef class ArgKmin: self.X[X_start:X_end, :], self.Y[Y_start:Y_end, :], self.Y_sq_norms[Y_start:Y_end], - self.dist_middle_terms_chunks, - self.heaps_red_distances_chunks, + self.dist_middle_terms_chunks[thread_num], + self.heaps_red_distances_chunks[thread_num], &argkmin_indices[X_start, 0], self.k, Y_start @@ -266,21 +281,19 @@ cdef class ArgKmin: # Sorting indices so that the closests' come first. for idx in range(X_end - X_start): _simultaneous_sort( - self.heaps_red_distances_chunks + idx * self.k, + self.heaps_red_distances_chunks[thread_num] + idx * self.k, &argkmin_indices[X_start + idx, 0], self.k ) # end: for X_chunk_idx - free(self.dist_middle_terms_chunks) - free(self.heaps_red_distances_chunks) + free(self.dist_middle_terms_chunks[thread_num]) + free(self.heaps_red_distances_chunks[thread_num]) # end: with nogil, parallel return self.X_n_chunks cdef int _parallel_on_Y(self, - ITYPE_t chunk_size, # IN - ITYPE_t effective_n_threads, # IN ITYPE_t[:, ::1] argkmin_indices, # OUT DTYPE_t[:, ::1] argkmin_red_distances, # OUT ) nogil: @@ -292,9 +305,9 @@ cdef class ArgKmin: most contexts. 
""" cdef: - ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads) - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num # in bytes ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf @@ -310,16 +323,17 @@ cdef class ArgKmin: with nogil, parallel(num_threads=num_threads): # Thread local buffers + thread_num = openmp.omp_get_thread_num() # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks = malloc(size_dist_middle_terms) - self.heaps_red_distances_chunks = malloc(float_heap_size) - self.heaps_indices_chunks = malloc(int_heap_size) + self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) + self.heaps_red_distances_chunks[thread_num] = malloc(float_heap_size) + self.heaps_indices_chunks[thread_num] = malloc(int_heap_size) # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_red_distances_chunks[idx] = FLOAT_INF - self.heaps_indices_chunks[idx] = -1 + self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF + self.heaps_indices_chunks[thread_num][idx] = -1 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -329,13 +343,14 @@ cdef class ArgKmin: else: Y_end = Y_start + self.Y_n_samples_chunk + self._reduce_on_chunks( self.X[X_start:X_end, :], self.Y[Y_start:Y_end, :], self.Y_sq_norms[Y_start:Y_end], - self.dist_middle_terms_chunks, - self.heaps_red_distances_chunks, - self.heaps_indices_chunks, + self.dist_middle_terms_chunks[thread_num], + self.heaps_red_distances_chunks[thread_num], + self.heaps_indices_chunks[thread_num], self.k, Y_start, ) @@ -350,15 +365,12 @@ cdef class ArgKmin: &argkmin_red_distances[X_start + idx, 0], &argkmin_indices[X_start + idx, 0], self.k, - self.heaps_red_distances_chunks[idx * self.k + jdx], - self.heaps_indices_chunks[idx * self.k + jdx], + self.heaps_red_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) - free(self.dist_middle_terms_chunks) - free(self.heaps_red_distances_chunks) - free(self.heaps_indices_chunks) - # end: with nogil, parallel + # Sorting indices of the argkmin for each query vector of X for idx in prange(self.n_X, schedule='static', nogil=True, num_threads=num_threads): @@ -411,24 +423,21 @@ cdef class ArgKmin: DTYPE_t[:, ::1] argkmin_distances = np.full((n_X, self.k), FLOAT_INF, dtype=DTYPE) - ITYPE_t effective_n_threads = _openmp_effective_n_threads() if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. - if 4 * self.chunk_size * effective_n_threads < n_X: + if 4 * self.chunk_size * self.effective_omp_n_thread < n_X: strategy = 'parallel_on_X' else: strategy = 'parallel_on_Y' if strategy == 'parallel_on_Y': self._parallel_on_Y( - self.chunk_size, effective_n_threads, argkmin_indices, argkmin_distances ) elif strategy == 'parallel_on_X': self._parallel_on_X( - self.chunk_size, effective_n_threads, argkmin_indices, argkmin_distances ) else: @@ -440,7 +449,7 @@ cdef class ArgKmin: # term for squared norms and which are not the most # precise (catastrophic cancellation might have happened). 
_exact_euclidean_dist(self.X, self.Y, argkmin_indices, - effective_n_threads, + self.effective_omp_n_thread, argkmin_distances) return (np.asarray(argkmin_distances), np.asarray(argkmin_indices)) From 49e247d80b1b1b23f4a619983afd478123921a5e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 12:05:57 +0200 Subject: [PATCH 023/290] [WIP] Adapting to use class hierarchy Refactor ArgKmin._reduce_on_chunks to pave the way to general interface for reductions. Private datastructures will have to be accessed via the implementation of this private method. --- sklearn/metrics/_argkmin_fast.pyx | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index d1b374dcae299..96042432a80a5 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -110,11 +110,6 @@ cdef class ArgKmin: DTYPE_t[::1] Y_sq_norms DTYPE_t ** dist_middle_terms_chunks DTYPE_t ** heaps_red_distances_chunks - - # Used for parallel_on_Y: - # As chunks of X are shared across threads, so must their - # heaps. To solve this, each thread has its own locals - # heaps which are then synchronised back in the main ones. ITYPE_t ** heaps_indices_chunks def __cinit__(self): @@ -178,15 +173,13 @@ cdef class ArgKmin: free(self.heaps_indices_chunks) cdef void _reduce_on_chunks(self, - DTYPE_t[:, ::1] X_c, # IN - DTYPE_t[:, ::1] Y_c, # IN - DTYPE_t[::1] Y_sq_norms, # IN - DTYPE_t *dist_middle_terms, # IN - DTYPE_t *heaps_red_distances, # IN/OUT - ITYPE_t *heaps_indices, # IN/OUT - ITYPE_t k, # IN - # ID of the first element of Y_c - ITYPE_t Y_idx_offset, + DTYPE_t[:, ::1] X, # IN + DTYPE_t[:, ::1] Y, # IN + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, ) nogil: """ Critical part of the computation of pairwise distances. 
@@ -196,6 +189,13 @@ cdef class ArgKmin: """ cdef: ITYPE_t i, j + DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + ITYPE_t k = self.k + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + # Instead of computing the full pairwise squared distances matrix, # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² @@ -222,8 +222,8 @@ cdef class ArgKmin: heaps_indices + i * k, k, # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j], - j + Y_idx_offset) + dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], + j + Y_start) cdef int _parallel_on_X(self, @@ -260,6 +260,8 @@ cdef class ArgKmin: else: X_end = X_start + self.X_n_samples_chunk + self.heaps_indices_chunks[thread_num] = &argkmin_indices[X_start, 0] + for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0: @@ -268,14 +270,11 @@ cdef class ArgKmin: Y_end = Y_start + self.Y_n_samples_chunk self._reduce_on_chunks( - self.X[X_start:X_end, :], - self.Y[Y_start:Y_end, :], - self.Y_sq_norms[Y_start:Y_end], - self.dist_middle_terms_chunks[thread_num], - self.heaps_red_distances_chunks[thread_num], - &argkmin_indices[X_start, 0], - self.k, - Y_start + self.X, + self.Y, + X_start, X_end, + Y_start, Y_end, + thread_num, ) # Sorting indices so that the closests' come first. @@ -328,6 +327,10 @@ cdef class ArgKmin: # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) self.heaps_red_distances_chunks[thread_num] = malloc(float_heap_size) + + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own locals + # heaps which are then synchronised back in the main ones. self.heaps_indices_chunks[thread_num] = malloc(int_heap_size) # Initialising heaps (memset can't be used here) @@ -345,14 +348,11 @@ cdef class ArgKmin: self._reduce_on_chunks( - self.X[X_start:X_end, :], - self.Y[Y_start:Y_end, :], - self.Y_sq_norms[Y_start:Y_end], - self.dist_middle_terms_chunks[thread_num], - self.heaps_red_distances_chunks[thread_num], - self.heaps_indices_chunks[thread_num], - self.k, - Y_start, + self.X, + self.Y, + X_start, X_end, + Y_start, Y_end, + thread_num, ) # end: for Y_chunk_idx From 25c9a2c90b280e15c7c7d7943bb93345ff7ad8af Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 18:25:29 +0200 Subject: [PATCH 024/290] fixup! 
[WIP] Adapting to use class hierarchy --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c78ed7bc23e17..f657ad5ccb49b 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -649,7 +649,7 @@ def pairwise_distances_argmin_min( if metric == "fast_sqeuclidean": # TODO: generalise this simple plug here values, indices = ArgKmin(X, Y, k=1).compute( - k=1, strategy="auto", return_distance=True + strategy="auto", return_distance=True ) values = np.ndarray.flatten(values) indices = np.ndarray.flatten(indices) From eb8b9313ac6ace0db051f727931aa8cd54d2c64b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 12:05:57 +0200 Subject: [PATCH 025/290] [WIP] Adapting to use class hierarchy Introduce ParallelReduction as a abstract class, and extend it using ArgKmin. FastSquaredEuclideanArgKmin extends ArgKmin for the "fast_sqeuclidean" strategy. --- sklearn/metrics/_argkmin_fast.pyx | 217 ++++++++++++++++++++++-------- sklearn/metrics/pairwise.py | 4 +- sklearn/neighbors/_base.py | 6 +- 3 files changed, 170 insertions(+), 57 deletions(-) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx index 96042432a80a5..62dce34bceb1c 100644 --- a/sklearn/metrics/_argkmin_fast.pyx +++ b/sklearn/metrics/_argkmin_fast.pyx @@ -17,6 +17,8 @@ from libc.stdlib cimport free, malloc from cython.parallel cimport parallel, prange +# from ..neighbors._dist_metrics cimport DistanceMetric + DEF CHUNK_SIZE = 256 # number of vectors DEF MIN_CHUNK_SAMPLES = 20 @@ -89,7 +91,21 @@ cdef int _exact_euclidean_dist( Y_indices[i, k]) -cdef class ArgKmin: +cdef class ParallelReduction: + """Abstract class to computes a reduction of a set of + vectors (rows) of X on another set of vectors (rows) of Y + + The implementation of the reduction is done parallelised + on chunks whose size can be set using ``chunk_size``. + Parameters + ---------- + X: ndarray of shape (n, d) + Rows represent vectors + Y: ndarray of shape (m, d) + Rows represent vectors + chunk_size: int + The number of vectors per chunk. + """ cdef: ITYPE_t effective_omp_n_thread @@ -106,11 +122,10 @@ cdef class ArgKmin: DTYPE_t[:, ::1] X DTYPE_t[:, ::1] Y - # ArgKmin - DTYPE_t[::1] Y_sq_norms - DTYPE_t ** dist_middle_terms_chunks - DTYPE_t ** heaps_red_distances_chunks - ITYPE_t ** heaps_indices_chunks + # TODO: needs to move DistanceMetric + # from neighbors to be able to use them + # some adaptation + # DistanceMetric distance_metric def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults @@ -129,6 +144,13 @@ cdef class ArgKmin: self.X = X self.Y = Y + # TODO: use proper internals checks of scikit-learn + assert X.shape[1] == Y.shape[1], ( + f"Vectors of X and Y must have the same " + f"number of dimensions but are respectively " + f"{X.shape[1]}-dimensional and {Y.shape[1]}-dimensional." 
+ ) + self.k = k self.d = X.shape[1] self.sf = sizeof(DTYPE_t) @@ -157,22 +179,56 @@ cdef class ArgKmin: self.effective_omp_n_thread = _openmp_effective_n_threads() - # ArgKmin - self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y) + + cdef int _reduce_on_chunks(self, + DTYPE_t[:, ::1] X, # IN + DTYPE_t[:, ::1] Y, # IN + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil except -1: + """ Abstract method: Sub-classes implemented the reduction + on a pair of chunks""" + return -1 + +cdef class ArgKmin(ParallelReduction): + + cdef: + DTYPE_t ** dist_middle_terms_chunks + DTYPE_t ** heaps_red_distances_chunks + ITYPE_t ** heaps_indices_chunks + + def __init__(self, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, + ): + ParallelReduction.__init__(self, X, Y, k) self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_red_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) def __dealloc__(self): - if self.dist_middle_terms_chunks is not NULL: - free(self.dist_middle_terms_chunks) - if self.heaps_red_distances_chunks is not NULL: - free(self.heaps_red_distances_chunks) if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) + else: + raise RuntimeError("Trying to free heaps_indices_chunks which is NULL") + + if self.heaps_red_distances_chunks is not NULL: + free(self.heaps_red_distances_chunks) + else: + raise RuntimeError("Trying to free heaps_red_distances_chunks which is NULL") - cdef void _reduce_on_chunks(self, + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + else: + raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") + + cdef int _reduce_on_chunks(self, DTYPE_t[:, ::1] X, # IN DTYPE_t[:, ::1] Y, # IN ITYPE_t X_start, @@ -180,55 +236,38 @@ cdef class ArgKmin: ITYPE_t Y_start, ITYPE_t Y_end, ITYPE_t thread_num, - ) nogil: - """ - Critical part of the computation of pairwise distances. - - "Fast Squared Euclidean" distances strategy relying - on the gemm-trick. - """ + ) nogil except -1: cdef: ITYPE_t i, j - DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] - DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] + DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] ITYPE_t k = self.k DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - # Instead of computing the full pairwise squared distances matrix, - # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², - # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² - # term since the argmin for a given sample X_c^{i} does not depend on - # ||X_c^{i}||² + ITYPE_t n_x = X_end - X_start + ITYPE_t n_y = Y_end - Y_start - # Careful: LDA, LDB and LDC are given for F-ordered arrays. - # Here, we use their counterpart values as indicated in the documentation. 
- # See the documentation of parameters here: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html - # - # dist_middle_terms = -2 * X_c.dot(Y_c.T) - _gemm(RowMajor, NoTrans, Trans, - X_c.shape[0], Y_c.shape[0], X_c.shape[1], - -2.0, - &X_c[0, 0], X_c.shape[1], - &Y_c[0, 0], X_c.shape[1], 0.0, - dist_middle_terms, Y_c.shape[0]) - - # Computing argmins here for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - _push(heaps_red_distances + i * k, - heaps_indices + i * k, + _push(heaps_red_distances + i * self.k, + heaps_indices + i * self.k, k, - # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], - j + Y_start) + 0, + # TODO: needs to move DistanceMetric + # from neighbors to be able to use them + # some adaptation + # self.distance_metric.rdist(&X_c[i, 0], + # &Y_c[j, 0], + # self.d), + Y_start + j) + return 0 cdef int _parallel_on_X(self, - ITYPE_t[:, ::1] argkmin_indices, # OUT - DTYPE_t[:, ::1] argkmin_red_distances, # OUT + ITYPE_t[:, ::1] argkmin_indices, + DTYPE_t[:, ::1] argkmin_red_distances, ) nogil: """Computes the argkmin of each vector (row) of X on Y by parallelising computation on chunks of X. @@ -250,7 +289,7 @@ cdef class ArgKmin: self.heaps_red_distances_chunks[thread_num] = malloc(heap_size) for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): - # We reset the heap between X chunks (memset isn't suitable here) + # We reset the heap between X chunks (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF @@ -260,6 +299,8 @@ cdef class ArgKmin: else: X_end = X_start + self.X_n_samples_chunk + # Referencing the thread-local heaps via the thread-scope pointer + # of pointers attached to the instance self.heaps_indices_chunks[thread_num] = &argkmin_indices[X_start, 0] for Y_chunk_idx in range(self.Y_n_chunks): @@ -292,6 +333,7 @@ cdef class ArgKmin: # end: with nogil, parallel return self.X_n_chunks + cdef int _parallel_on_Y(self, ITYPE_t[:, ::1] argkmin_indices, # OUT DTYPE_t[:, ::1] argkmin_red_distances, # OUT @@ -384,12 +426,13 @@ cdef class ArgKmin: # end: for X_chunk_idx return self.Y_n_chunks + # Python interface def compute(self, str strategy = "auto", bint return_distance = False - ): - """Computes the argkmin of vectors (rows) of X on Y. + ): + """Computes the reduction of vectors (rows) of X on Y. strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'} The chunking strategy defining which dataset @@ -403,10 +446,9 @@ cdef class ArgKmin: between 'parallel_on_X' and 'parallel_on_Y'. return_distance: boolean - Return distances between each X vectory and its + Return distances between each X vector and its argkmin if set to True. 
- Returns ------- distances: ndarray of shape (n, k) @@ -455,3 +497,72 @@ cdef class ArgKmin: np.asarray(argkmin_indices)) return np.asarray(argkmin_indices) + +cdef class FastSquaredEuclideanArgKmin(ArgKmin): + + cdef: + DTYPE_t[::1] Y_sq_norms + + def __init__(self, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, + ): + ArgKmin.__init__(self, X, Y, k) + self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + + + cdef int _reduce_on_chunks(self, + DTYPE_t[:, ::1] X, # IN + DTYPE_t[:, ::1] Y, # IN + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil except -1: + """ + Critical part of the computation of pairwise distances. + + "Fast Squared Euclidean" distances strategy relying + on the gemm-trick. + """ + cdef: + ITYPE_t i, j + DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] + DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] + ITYPE_t k = self.k + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Instead of computing the full pairwise squared distances matrix, + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² + # term since the argmin for a given sample X_c^{i} does not depend on + # ||X_c^{i}||² + + # Careful: LDA, LDB and LDC are given for F-ordered arrays. + # Here, we use their counterpart values as indicated in the documentation. + # See the documentation of parameters here: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(RowMajor, NoTrans, Trans, + X_c.shape[0], Y_c.shape[0], X_c.shape[1], + -2.0, + &X_c[0, 0], X_c.shape[1], + &Y_c[0, 0], X_c.shape[1], 0.0, + dist_middle_terms, Y_c.shape[0]) + + # Computing argmins here + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + _push(heaps_red_distances + i * k, + heaps_indices + i * k, + k, + # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], + j + Y_start) + return 0 diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f657ad5ccb49b..c800412677b09 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,7 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version -from ._argkmin_fast import ArgKmin +from ._argkmin_fast import FastSquaredEuclideanArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -648,7 +648,7 @@ def pairwise_distances_argmin_min( if metric == "fast_sqeuclidean": # TODO: generalise this simple plug here - values, indices = ArgKmin(X, Y, k=1).compute( + values, indices = FastSquaredEuclideanArgKmin(X, Y, k=1).compute( strategy="auto", return_distance=True ) values = np.ndarray.flatten(values) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 53774a3b78510..d3c886fc17e40 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._argkmin_fast import ArgKmin +from ..metrics._argkmin_fast import FastSquaredEuclideanArgKmin from ..utils import 
( check_array, gen_even_slices, @@ -740,7 +740,9 @@ class from an array representing our data set and ask who's self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean" ): # TODO: generalise this simple plug here - results = ArgKmin(X=X, Y=self._fit_X, k=n_neighbors).compute( + results = FastSquaredEuclideanArgKmin( + X=X, Y=self._fit_X, k=n_neighbors + ).compute( strategy="auto", return_distance=return_distance, ) From e0d1c99f69fb0061574c1db0c20fc3054c410c8a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 25 Jun 2021 12:08:37 +0200 Subject: [PATCH 026/290] Move neighbors.DistanceMetric to metrics The associated _typedefs.pyx file has been moved to utils to avoid circular dependencies has it is being used in neighbors. --- sklearn/cluster/_agglomerative.py | 4 ++-- sklearn/cluster/_hierarchical_fast.pyx | 15 +++++++-------- sklearn/cluster/tests/test_hierarchical.py | 5 +++-- sklearn/metrics/__init__.py | 3 +++ sklearn/{neighbors => metrics}/_dist_metrics.pxd | 0 sklearn/{neighbors => metrics}/_dist_metrics.pyx | 4 ++-- sklearn/metrics/pairwise.py | 2 +- sklearn/metrics/setup.py | 8 ++++++++ .../tests/test_dist_metrics.py | 13 +------------ sklearn/neighbors/__init__.py | 2 -- sklearn/neighbors/_binary_tree.pxi | 6 ++---- sklearn/neighbors/_classification.py | 8 ++++---- sklearn/neighbors/_graph.py | 14 ++++++++------ sklearn/neighbors/_partition_nodes.pxd | 2 +- sklearn/neighbors/_regression.py | 8 ++++---- sklearn/neighbors/_unsupervised.py | 4 ++-- sklearn/neighbors/setup.py | 7 ------- sklearn/neighbors/tests/test_ball_tree.py | 13 ++++++++++++- sklearn/neighbors/tests/test_neighbors_tree.py | 2 +- 19 files changed, 61 insertions(+), 59 deletions(-) rename sklearn/{neighbors => metrics}/_dist_metrics.pxd (100%) rename sklearn/{neighbors => metrics}/_dist_metrics.pyx (99%) rename sklearn/{neighbors => metrics}/tests/test_dist_metrics.py (95%) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 48e2d38ebf32b..2c259e0287065 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,8 +16,8 @@ from ..base import BaseEstimator, ClusterMixin from ..metrics.pairwise import paired_distances, pairwise_distances -from ..neighbors import DistanceMetric -from ..neighbors._dist_metrics import METRIC_MAPPING +from ..metrics import DistanceMetric +from ..metrics._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 2a58757ce327d..11ea3294c086a 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -13,7 +13,7 @@ ctypedef np.int8_t INT8 np.import_array() -from ..neighbors._dist_metrics cimport DistanceMetric +from ..metrics._dist_metrics cimport DistanceMetric from ..utils._fast_dict cimport IntFloatDict # C++ @@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b, def average_merge(IntFloatDict a, IntFloatDict b, np.ndarray[ITYPE_t, ndim=1] mask, ITYPE_t n_a, ITYPE_t n_b): - """Merge two IntFloatDicts with the average strategy: when the - same key is present in the two dicts, the weighted average of the two + """Merge two IntFloatDicts with the average strategy: when the + same key is present in the two dicts, the weighted average of the two values is used. 
Parameters @@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b, ############################################################################### -# An edge object for fast comparisons +# An edge object for fast comparisons cdef class WeightedEdge: cdef public ITYPE_t a cdef public ITYPE_t b cdef public DTYPE_t weight - + def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b): self.weight = weight self.a = a @@ -326,7 +326,7 @@ cdef class WeightedEdge: return self.weight > other.weight elif op == 5: return self.weight >= other.weight - + def __repr__(self): return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, self.weight, @@ -475,7 +475,7 @@ def mst_linkage_core( dist_metric: DistanceMetric A DistanceMetric object conforming to the API from - ``sklearn.neighbors._dist_metrics.pxd`` that will be + ``sklearn.metrics._dist_metrics.pxd`` that will be used to compute distances. Returns @@ -534,4 +534,3 @@ def mst_linkage_core( current_node = new_node return np.array(result) - diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 8aff7136c574f..73fee94b1b016 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -16,7 +16,7 @@ from scipy.cluster import hierarchy from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -30,6 +30,7 @@ _fix_connectivity, ) from sklearn.feature_extraction.image import grid_to_graph +from sklearn.metrics import DistanceMetric from sklearn.metrics.pairwise import ( PAIRED_DISTANCES, cosine_distances, @@ -37,7 +38,7 @@ pairwise_distances, ) from sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.neighbors import kneighbors_graph, DistanceMetric +from sklearn.neighbors import kneighbors_graph from sklearn.cluster._hierarchical_fast import ( average_merge, max_merge, diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index a0b06a02ad6d1..68409a7f85d35 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -36,6 +36,8 @@ from ._classification import brier_score_loss from ._classification import multilabel_confusion_matrix +from ._dist_metrics import DistanceMetric + from . 
import cluster from .cluster import adjusted_mutual_info_score from .cluster import adjusted_rand_score @@ -113,6 +115,7 @@ "davies_bouldin_score", "DetCurveDisplay", "det_curve", + "DistanceMetric", "euclidean_distances", "explained_variance_score", "f1_score", diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd similarity index 100% rename from sklearn/neighbors/_dist_metrics.pxd rename to sklearn/metrics/_dist_metrics.pxd diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx similarity index 99% rename from sklearn/neighbors/_dist_metrics.pyx rename to sklearn/metrics/_dist_metrics.pyx index c9941cab0fc60..8d28773821127 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -108,7 +108,7 @@ cdef class DistanceMetric: Examples -------- - >>> from sklearn.neighbors import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] @@ -513,7 +513,7 @@ cdef class ChebyshevDistance(DistanceMetric): Examples -------- - >>> from sklearn.neighbors.dist_metrics import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('chebyshev') >>> X = [[0, 1, 2], ... [3, 4, 5]] diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c800412677b09..c7fc93034f85c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -788,7 +788,7 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ - from ..neighbors import DistanceMetric + from ..metrics import DistanceMetric return DistanceMetric.get_metric("haversine").pairwise(X, Y) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 1edd6fe368d5e..346898eb60a94 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -1,4 +1,5 @@ import os +import numpy as np from numpy.distutils.misc_util import Configuration @@ -22,6 +23,13 @@ def configuration(parent_package="", top_path=None): "_argkmin_fast", sources=["_argkmin_fast.pyx"], libraries=libraries ) + config.add_extension( + "_dist_metrics", + sources=["_dist_metrics.pyx"], + include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py similarity index 95% rename from sklearn/neighbors/tests/test_dist_metrics.py rename to sklearn/metrics/tests/test_dist_metrics.py index 0703819536916..efa8031c53935 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -7,8 +7,7 @@ import pytest from scipy.spatial.distance import cdist -from sklearn.neighbors import DistanceMetric -from sklearn.neighbors import BallTree +from sklearn.metrics import DistanceMetric from sklearn.utils import check_random_state from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -230,16 +229,6 @@ def test_pyfunc_metric(): assert_array_almost_equal(D1_pkl, D2_pkl) -def test_bad_pyfunc_metric(): - def wrong_distance(x, y): - return "1" - - X = np.ones((5, 2)) - msg = "Custom distance function must accept two vectors" - with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_distance) - - def test_input_data_size(): # Regression test for #6288 # Previously, a metric requiring a 
particular input dimension would fail diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8a0934eecf142..3cd1d7925acf6 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -5,7 +5,6 @@ from ._ball_tree import BallTree from ._kd_tree import KDTree -from ._dist_metrics import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors @@ -19,7 +18,6 @@ __all__ = [ "BallTree", - "DistanceMetric", "KDTree", "KNeighborsClassifier", "KNeighborsRegressor", diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 37aa13b0a4f30..b64dbecac9e24 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -142,7 +142,6 @@ # BinaryTree tree2, ITYPE_t i_node2): # """Compute the maximum distance between two nodes""" -cimport cython cimport numpy as np from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax @@ -152,8 +151,7 @@ from libc.string cimport memcpy import numpy as np import warnings -from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, - euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ..metrics._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist) from ._partition_nodes cimport partition_node_indices @@ -796,7 +794,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from ._dist_metrics import get_valid_metric_ids +from sklearn.metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 1e47e1b8020f2..bf433fea30aea 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -67,8 +67,8 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -339,8 +339,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
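For reference, a minimal doctest-style sketch of the import path this patch establishes (it mirrors the doctests updated in the hunks above; the data and printed array are illustrative only):

    >>> from sklearn.metrics import DistanceMetric
    >>> dist = DistanceMetric.get_metric('euclidean')
    >>> X = [[0, 1, 2],
    ...      [3, 4, 5]]
    >>> dist.pairwise(X)
    array([[0.        , 5.19615242],
           [5.19615242, 0.        ]])

Since the `sklearn.neighbors` export is removed by this patch, downstream code importing `DistanceMetric` from `sklearn.neighbors` has to migrate to the `sklearn.metrics` location.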
diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index d5bcaf9408c72..1fcb568e5dff4 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -65,10 +65,11 @@ def kneighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the k-Neighbors for each sample - point. The DistanceMetric class gives a list of available metrics. - The default distance is 'euclidean' ('minkowski' metric with the p - param equal to 2.) + The distance metric used to calculate the neighbors within a + given radius for each sample point. The default distance is + 'euclidean' ('minkowski' metric with the param equal to 2.) + See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is @@ -158,9 +159,10 @@ def radius_neighbors_graph( metric : str, default='minkowski' The distance metric used to calculate the neighbors within a - given radius for each sample point. The DistanceMetric class - gives a list of available metrics. The default distance is + given radius for each sample point. The default distance is 'euclidean' ('minkowski' metric with the param equal to 2.) + See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 1659801db469d..94b02002d7a1e 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,4 +1,4 @@ -from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs cimport DTYPE_t, ITYPE_t cdef int partition_node_indices( DTYPE_t *data, diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index fe536f06c20a5..77179f3bb317f 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -75,8 +75,8 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -301,8 +301,8 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBa metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
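To make the documented default concrete, here is a small illustrative sketch (not taken from the patch itself) of a `kneighbors_graph` call with the default 'minkowski' metric and p=2, which the docstrings above describe as equivalent to the standard euclidean metric:

    >>> from sklearn.neighbors import kneighbors_graph
    >>> X = [[0], [3], [1]]
    >>> A = kneighbors_graph(X, n_neighbors=2, mode='connectivity',
    ...                      metric='minkowski', p=2, include_self=True)
    >>> A.toarray()
    array([[1., 0., 1.],
           [0., 1., 1.],
           [1., 0., 1.]])

Any metric listed by :class:`metrics.DistanceMetric` can be passed the same way, which is what the docstring rewordings in this patch point readers to.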
diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 06566b0807b7a..b11df8af8790f 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. See the documentation of :class:`metrics.DistanceMetric` + for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 34921de75041a..aa19ba501b18d 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -32,13 +32,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_dist_metrics", - sources=["_dist_metrics.pyx"], - include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), "numpy")], - libraries=libraries, - ) - config.add_extension( "_quad_tree", sources=["_quad_tree.pyx"], diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index c751539f2a1ae..a823a03251a1b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -4,7 +4,6 @@ import pytest from numpy.testing import assert_array_almost_equal from sklearn.neighbors._ball_tree import BallTree -from sklearn.neighbors import DistanceMetric from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -40,6 +39,8 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + X, Y = check_array(X), check_array(Y) D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) ind = np.argsort(D, axis=1)[:, :k] @@ -84,3 +85,13 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) + + +def test_bad_pyfunc_metric(): + def wrong_distance(x, y): + return "1" + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index de34b4d230171..e043ffb730708 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from sklearn.neighbors import DistanceMetric +from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( BallTree, kernel_norm, From ac5ddc1dcfbfb4599371d7099fadfa8547bc8424 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 19:33:35 +0200 Subject: [PATCH 027/290] Rename private submodule to _parallel_reductions --- .../metrics/{_argkmin_fast.pyx => _parallel_reductions.pyx} | 0 sklearn/metrics/pairwise.py | 2 +- sklearn/metrics/setup.py | 4 +++- sklearn/neighbors/_base.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) rename 
sklearn/metrics/{_argkmin_fast.pyx => _parallel_reductions.pyx} (100%) diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_parallel_reductions.pyx similarity index 100% rename from sklearn/metrics/_argkmin_fast.pyx rename to sklearn/metrics/_parallel_reductions.pyx diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c7fc93034f85c..4370b6d39d4b1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,7 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version -from ._argkmin_fast import FastSquaredEuclideanArgKmin +from ._parallel_reductions import FastSquaredEuclideanArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 346898eb60a94..6fd445d2c1a00 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -20,7 +20,9 @@ def configuration(parent_package="", top_path=None): ) config.add_extension( - "_argkmin_fast", sources=["_argkmin_fast.pyx"], libraries=libraries + "_parallel_reductions", + sources=["_parallel_reductions.pyx"], + libraries=libraries, ) config.add_extension( diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index d3c886fc17e40..ff601ec3bb59a 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._argkmin_fast import FastSquaredEuclideanArgKmin +from ..metrics._parallel_reductions import FastSquaredEuclideanArgKmin from ..utils import ( check_array, gen_even_slices, From 4e7b3cb7e67033b098d90110c8cb7e95d49e9936 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 19:57:33 +0200 Subject: [PATCH 028/290] Introduce DistanceMetric and ArgKmin factory method --- sklearn/metrics/_parallel_reductions.pyx | 86 +++++++++++++++++------- sklearn/metrics/pairwise.py | 4 +- sklearn/neighbors/_base.py | 6 +- 3 files changed, 65 insertions(+), 31 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 62dce34bceb1c..0a1d870d748fa 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -17,7 +17,7 @@ from libc.stdlib cimport free, malloc from cython.parallel cimport parallel, prange -# from ..neighbors._dist_metrics cimport DistanceMetric +from ._dist_metrics cimport DistanceMetric DEF CHUNK_SIZE = 256 # number of vectors @@ -93,9 +93,9 @@ cdef int _exact_euclidean_dist( cdef class ParallelReduction: """Abstract class to computes a reduction of a set of - vectors (rows) of X on another set of vectors (rows) of Y + vectors (rows) of X on another set of vectors (rows) of Y. - The implementation of the reduction is done parallelised + The implementation of the reduction is done parallelized on chunks whose size can be set using ``chunk_size``. Parameters ---------- @@ -103,8 +103,10 @@ cdef class ParallelReduction: Rows represent vectors Y: ndarray of shape (m, d) Rows represent vectors + distance_metric: DistanceMetric + The distance to use chunk_size: int - The number of vectors per chunk. 
+ The number of vectors per chunk """ cdef: @@ -122,10 +124,7 @@ cdef class ParallelReduction: DTYPE_t[:, ::1] X DTYPE_t[:, ::1] Y - # TODO: needs to move DistanceMetric - # from neighbors to be able to use them - # some adaptation - # DistanceMetric distance_metric + DistanceMetric distance_metric def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults @@ -134,10 +133,10 @@ cdef class ParallelReduction: self.Y = np.empty((1, 1), dtype=DTYPE, order='c') def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, - ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + DistanceMetric distance_metric, + ITYPE_t chunk_size = CHUNK_SIZE, ): cdef: ITYPE_t X_n_full_chunks, Y_n_full_chunks @@ -151,13 +150,14 @@ cdef class ParallelReduction: f"{X.shape[1]}-dimensional and {Y.shape[1]}-dimensional." ) - self.k = k self.d = X.shape[1] self.sf = sizeof(DTYPE_t) self.si = sizeof(ITYPE_t) self.chunk_size = chunk_size self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + self.distance_metric = distance_metric + self.n_Y = Y.shape[0] self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk @@ -194,20 +194,55 @@ cdef class ParallelReduction: return -1 cdef class ArgKmin(ParallelReduction): + """Computes the argkmin of vectors (rows) of a set of + vectors (rows) of X on another set of vectors (rows) of Y. + + The implementation is parallelized on chunks whose size can + be set using ``chunk_size``. + Parameters + ---------- + X: ndarray of shape (n, d) + Rows represent vectors + Y: ndarray of shape (m, d) + Rows represent vectors + distance_metric: DistanceMetric + The distance to use + k: int + The k for the argkmin reduction + chunk_size: int + The number of vectors per chunk + """ cdef: DTYPE_t ** dist_middle_terms_chunks DTYPE_t ** heaps_red_distances_chunks ITYPE_t ** heaps_indices_chunks + @classmethod + def get_for(cls, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + ITYPE_t k, + str metric="fast_sqeuclidean", + ITYPE_t chunk_size=CHUNK_SIZE, + ): + if metric == "fast_sqeuclidean": + return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) + return ArgKmin(X=X, Y=Y, + distance_metric=DistanceMetric.get_metric(metric), + k=k, + chunk_size=chunk_size) + def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, - ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + DistanceMetric distance_metric, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, ): - ParallelReduction.__init__(self, X, Y, k) + ParallelReduction.__init__(self, X, Y, distance_metric, chunk_size) + self.k = k self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_red_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) @@ -254,13 +289,9 @@ cdef class ArgKmin(ParallelReduction): _push(heaps_red_distances + i * self.k, heaps_indices + i * self.k, k, - 0, - # TODO: needs to move DistanceMetric - # from neighbors to be able to use them - # some adaptation - # self.distance_metric.rdist(&X_c[i, 0], - # &Y_c[j, 0], - # self.d), + self.distance_metric.rdist(&X_c[i, 0], + &Y_c[j, 0], + self.d), Y_start + j) return 0 @@ -509,7 +540,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, ): - ArgKmin.__init__(self, X, Y, k) + ArgKmin.__init__(self, X, Y, + 
distance_metric=DistanceMetric.get_metric("euclidean"), + k=k, + chunk_size=chunk_size) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4370b6d39d4b1..75660cddb1ab8 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,7 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version -from ._parallel_reductions import FastSquaredEuclideanArgKmin +from ._parallel_reductions import ArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -648,7 +648,7 @@ def pairwise_distances_argmin_min( if metric == "fast_sqeuclidean": # TODO: generalise this simple plug here - values, indices = FastSquaredEuclideanArgKmin(X, Y, k=1).compute( + values, indices = ArgKmin.get_for(X=X, Y=Y, k=1, metric=metric).compute( strategy="auto", return_distance=True ) values = np.ndarray.flatten(values) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ff601ec3bb59a..7e6578dbf22a5 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._parallel_reductions import FastSquaredEuclideanArgKmin +from ..metrics._parallel_reductions import ArgKmin from ..utils import ( check_array, gen_even_slices, @@ -740,8 +740,8 @@ class from an array representing our data set and ask who's self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean" ): # TODO: generalise this simple plug here - results = FastSquaredEuclideanArgKmin( - X=X, Y=self._fit_X, k=n_neighbors + results = ArgKmin.get_for( + X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_ ).compute( strategy="auto", return_distance=return_distance, From d386fe11a58ae910e988a17c02276964b917bce1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 30 Jun 2021 20:47:22 +0200 Subject: [PATCH 029/290] Support DistanceMetric's and branch on ArgKmin when possible --- sklearn/metrics/_parallel_reductions.pyx | 69 ++++++++++-------------- sklearn/metrics/pairwise.py | 21 ++++---- sklearn/neighbors/_base.py | 4 +- 3 files changed, 41 insertions(+), 53 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 0a1d870d748fa..9009d0866c0ba 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -18,6 +18,7 @@ from libc.stdlib cimport free, malloc from cython.parallel cimport parallel, prange from ._dist_metrics cimport DistanceMetric +from ._dist_metrics import METRIC_MAPPING DEF CHUNK_SIZE = 256 # number of vectors @@ -64,32 +65,6 @@ cdef inline DTYPE_t _euclidean_dist( return sqrt(dist) -cdef int _exact_euclidean_dist( - DTYPE_t[:, ::1] X, # IN - DTYPE_t[:, ::1] Y, # IN - ITYPE_t[:, ::1] Y_indices, # IN - ITYPE_t n_threads, # IN - DTYPE_t[:, ::1] distances, # OUT -) nogil: - """ - Compute exact pairwise euclidean distances in parallel. - - The pairwise distances considered are X vectors - and a subset of Y given for each row if X given in - Y_indices. - - Notes: the body of this function could have been inlined, - but we use a function to have a cdef nogil context. 
- """ - cdef: - ITYPE_t i, k - - for i in prange(X.shape[0], schedule='static', - nogil=True, num_threads=n_threads): - for k in range(Y_indices.shape[1]): - distances[i, k] = _euclidean_dist(X, Y, i, - Y_indices[i, k]) - cdef class ParallelReduction: """Abstract class to computes a reduction of a set of @@ -218,6 +193,10 @@ cdef class ArgKmin(ParallelReduction): DTYPE_t ** heaps_red_distances_chunks ITYPE_t ** heaps_indices_chunks + @classmethod + def valid_metrics(cls): + return {"fast_sqeuclidean", *METRIC_MAPPING.keys()} + @classmethod def get_for(cls, DTYPE_t[:, ::1] X, @@ -225,11 +204,12 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t k, str metric="fast_sqeuclidean", ITYPE_t chunk_size=CHUNK_SIZE, - ): + dict metric_kwargs=dict(), + ): if metric == "fast_sqeuclidean": return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) return ArgKmin(X=X, Y=Y, - distance_metric=DistanceMetric.get_metric(metric), + distance_metric=DistanceMetric.get_metric(metric, **metric_kwargs), k=k, chunk_size=chunk_size) @@ -301,7 +281,7 @@ cdef class ArgKmin(ParallelReduction): DTYPE_t[:, ::1] argkmin_red_distances, ) nogil: """Computes the argkmin of each vector (row) of X on Y - by parallelising computation on chunks of X. + by parallelizing computation on chunks of X. """ cdef: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx @@ -370,9 +350,9 @@ cdef class ArgKmin(ParallelReduction): DTYPE_t[:, ::1] argkmin_red_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y - by parallelising computation on chunks of Y. + by parallelizing computation on chunks of Y. - This parallelisation strategy is more costly (as we need + This parallelization strategy is more costly (as we need extra heaps and synchronisation), yet it is useful in most contexts. """ @@ -457,6 +437,20 @@ cdef class ArgKmin(ParallelReduction): # end: for X_chunk_idx return self.Y_n_chunks + cdef void _exact_distances(self, + ITYPE_t[:, ::1] Y_indices, # IN + DTYPE_t[:, ::1] distances, # IN/OUT + ) nogil: + """Convert reduced distances to pairwise distances in parallel.""" + cdef: + ITYPE_t i, j + + for i in prange(self.n_X, schedule='static', nogil=True, + num_threads=self.effective_omp_n_thread): + for j in range(self.k): + distances[i, j] = self.distance_metric.dist(&self.X[i, 0], + &self.Y[Y_indices[i, j], 0], + self.d) # Python interface def compute(self, @@ -467,7 +461,7 @@ cdef class ArgKmin(ParallelReduction): strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'} The chunking strategy defining which dataset - parallelisation are made on. + parallelization are made on. - 'parallel_on_X' is embarassingly parallel but is less used in practice. @@ -518,14 +512,9 @@ cdef class ArgKmin(ParallelReduction): if return_distance: # We need to recompute distances because we relied on - # reduced distances using _gemm, which are missing a - # term for squared norms and which are not the most - # precise (catastrophic cancellation might have happened). - _exact_euclidean_dist(self.X, self.Y, argkmin_indices, - self.effective_omp_n_thread, - argkmin_distances) - return (np.asarray(argkmin_distances), - np.asarray(argkmin_indices)) + # reduced distances. 
+ self._exact_distances(argkmin_indices, argkmin_distances) + return np.asarray(argkmin_distances), np.asarray(argkmin_indices) return np.asarray(argkmin_indices) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 75660cddb1ab8..c35b38ad1fde4 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -646,20 +646,19 @@ def pairwise_distances_argmin_min( """ X, Y = check_pairwise_arrays(X, Y) - if metric == "fast_sqeuclidean": - # TODO: generalise this simple plug here - values, indices = ArgKmin.get_for(X=X, Y=Y, k=1, metric=metric).compute( - strategy="auto", return_distance=True - ) + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if metric in ArgKmin.valid_metrics(): + values, indices = ArgKmin.get_for( + X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs + ).compute(strategy="auto", return_distance=True) values = np.ndarray.flatten(values) indices = np.ndarray.flatten(indices) else: - if metric_kwargs is None: - metric_kwargs = {} - - if axis == 0: - X, Y = Y, X - indices, values = zip( *pairwise_distances_chunked( X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 7e6578dbf22a5..e97894e8731c4 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -737,9 +737,9 @@ class from an array representing our data set and ask who's ) elif ( - self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean" + self._fit_method == "brute" + and self.effective_metric_ in ArgKmin.valid_metrics() ): - # TODO: generalise this simple plug here results = ArgKmin.get_for( X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_ ).compute( From a61e81f42f2aeea65153f83b16d7d2650a3d3f41 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 09:09:06 +0200 Subject: [PATCH 030/290] Document GEMM call and change wording to "approximated distance" --- sklearn/metrics/_parallel_reductions.pyx | 97 ++++++++++++++---------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 9009d0866c0ba..3eac36ae3980d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -190,7 +190,7 @@ cdef class ArgKmin(ParallelReduction): cdef: DTYPE_t ** dist_middle_terms_chunks - DTYPE_t ** heaps_red_distances_chunks + DTYPE_t ** heaps_approx_distances_chunks ITYPE_t ** heaps_indices_chunks @classmethod @@ -224,7 +224,7 @@ cdef class ArgKmin(ParallelReduction): self.k = k self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) - self.heaps_red_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) + self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) def __dealloc__(self): @@ -233,10 +233,10 @@ cdef class ArgKmin(ParallelReduction): else: raise RuntimeError("Trying to free heaps_indices_chunks which is NULL") - if self.heaps_red_distances_chunks is not NULL: - free(self.heaps_red_distances_chunks) + if self.heaps_approx_distances_chunks is not NULL: + free(self.heaps_approx_distances_chunks) else: - raise RuntimeError("Trying to free heaps_red_distances_chunks which is NULL") + raise RuntimeError("Trying to free heaps_approx_distances_chunks which is NULL") if 
self.dist_middle_terms_chunks is not NULL: free(self.dist_middle_terms_chunks) @@ -258,7 +258,7 @@ cdef class ArgKmin(ParallelReduction): DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] ITYPE_t k = self.k DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] - DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num] + DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] ITYPE_t n_x = X_end - X_start @@ -266,7 +266,7 @@ cdef class ArgKmin(ParallelReduction): for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - _push(heaps_red_distances + i * self.k, + _push(heaps_approx_distances + i * self.k, heaps_indices + i * self.k, k, self.distance_metric.rdist(&X_c[i, 0], @@ -278,7 +278,7 @@ cdef class ArgKmin(ParallelReduction): cdef int _parallel_on_X(self, ITYPE_t[:, ::1] argkmin_indices, - DTYPE_t[:, ::1] argkmin_red_distances, + DTYPE_t[:, ::1] argkmin_approx_distances, ) nogil: """Computes the argkmin of each vector (row) of X on Y by parallelizing computation on chunks of X. @@ -297,12 +297,12 @@ cdef class ArgKmin(ParallelReduction): thread_num = openmp.omp_get_thread_num() # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) - self.heaps_red_distances_chunks[thread_num] = malloc(heap_size) + self.heaps_approx_distances_chunks[thread_num] = malloc(heap_size) for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): # We reset the heap between X chunks (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF + self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF X_start = X_chunk_idx * self.X_n_samples_chunk if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: @@ -332,14 +332,14 @@ cdef class ArgKmin(ParallelReduction): # Sorting indices so that the closests' come first. for idx in range(X_end - X_start): _simultaneous_sort( - self.heaps_red_distances_chunks[thread_num] + idx * self.k, + self.heaps_approx_distances_chunks[thread_num] + idx * self.k, &argkmin_indices[X_start + idx, 0], self.k ) # end: for X_chunk_idx free(self.dist_middle_terms_chunks[thread_num]) - free(self.heaps_red_distances_chunks[thread_num]) + free(self.heaps_approx_distances_chunks[thread_num]) # end: with nogil, parallel return self.X_n_chunks @@ -347,7 +347,7 @@ cdef class ArgKmin(ParallelReduction): cdef int _parallel_on_Y(self, ITYPE_t[:, ::1] argkmin_indices, # OUT - DTYPE_t[:, ::1] argkmin_red_distances, # OUT + DTYPE_t[:, ::1] argkmin_approx_distances, # OUT ) nogil: """Computes the argkmin of each vector (row) of X on Y by parallelizing computation on chunks of Y. @@ -379,7 +379,7 @@ cdef class ArgKmin(ParallelReduction): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) - self.heaps_red_distances_chunks[thread_num] = malloc(float_heap_size) + self.heaps_approx_distances_chunks[thread_num] = malloc(float_heap_size) # As chunks of X are shared across threads, so must their # heaps. 
To solve this, each thread has its own locals @@ -388,7 +388,7 @@ cdef class ArgKmin(ParallelReduction): # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF + self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF self.heaps_indices_chunks[thread_num][idx] = -1 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): @@ -415,10 +415,10 @@ cdef class ArgKmin(ParallelReduction): for idx in range(X_end - X_start): for jdx in range(self.k): _push( - &argkmin_red_distances[X_start + idx, 0], + &argkmin_approx_distances[X_start + idx, 0], &argkmin_indices[X_start + idx, 0], self.k, - self.heaps_red_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx], self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) @@ -428,7 +428,7 @@ cdef class ArgKmin(ParallelReduction): for idx in prange(self.n_X, schedule='static', nogil=True, num_threads=num_threads): _simultaneous_sort( - &argkmin_red_distances[idx, 0], + &argkmin_approx_distances[idx, 0], &argkmin_indices[idx, 0], self.k, ) @@ -557,35 +557,52 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] ITYPE_t k = self.k DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] - DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num] + DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - # Instead of computing the full pairwise squared distances matrix, - # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², - # we only need to store the - 2 X_c.Y_c^T + ||Y_c||² - # term since the argmin for a given sample X_c^{i} does not depend on - # ||X_c^{i}||² - - # Careful: LDA, LDB and LDC are given for F-ordered arrays. - # Here, we use their counterpart values as indicated in the documentation. - # See the documentation of parameters here: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html - # + # Instead of computing the full pairwise squared distances matrix, + # + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # + # we only need to store the + # - 2 X_c.Y_c^T + ||Y_c||² + # + # term since the argkmin for a given sample X_c^{i} does not depend on + # ||X_c^{i}||² + # + # This term gets computed efficiently bellow using GEMM from BLAS Level 3. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays in BLAS documentations, + # for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. 
+ DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + # dist_middle_terms = -2 * X_c.dot(Y_c.T) - _gemm(RowMajor, NoTrans, Trans, - X_c.shape[0], Y_c.shape[0], X_c.shape[1], - -2.0, - &X_c[0, 0], X_c.shape[1], - &Y_c[0, 0], X_c.shape[1], 0.0, - dist_middle_terms, Y_c.shape[0]) - - # Computing argmins here + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - _push(heaps_red_distances + i * k, + _push(heaps_approx_distances + i * k, heaps_indices + i * k, k, - # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # approximated distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], j + Y_start) return 0 From e0d2881f7455b1c2ea925740017d9bd76c8053d9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 10:01:38 +0200 Subject: [PATCH 031/290] Support memmapped and integral arrays This also makes memoryviews const, clarifying their use. --- sklearn/metrics/_parallel_reductions.pyx | 89 +++++++++++++----------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 3eac36ae3980d..01d1a10371615 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -19,6 +19,7 @@ from cython.parallel cimport parallel, prange from ._dist_metrics cimport DistanceMetric from ._dist_metrics import METRIC_MAPPING +from ..utils import check_array DEF CHUNK_SIZE = 256 # number of vectors @@ -85,21 +86,22 @@ cdef class ParallelReduction: """ cdef: - ITYPE_t effective_omp_n_thread + const DTYPE_t[:, ::1] X # shape: (n_X, d) + const DTYPE_t[:, ::1] Y # shape: (n_Y, d) + + DistanceMetric distance_metric - ITYPE_t k, d, sf, si + ITYPE_t effective_omp_n_thread ITYPE_t n_samples_chunk, chunk_size - ITYPE_t n_Y, Y_n_samples_chunk, Y_n_samples_rem - ITYPE_t n_X, X_n_samples_chunk, X_n_samples_rem + ITYPE_t d - # Counting remainder chunk in total number of chunks - ITYPE_t Y_n_chunks, X_n_chunks, num_threads + # dtypes sizes + ITYPE_t sf, si - DTYPE_t[:, ::1] X - DTYPE_t[:, ::1] Y + ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem + ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem - DistanceMetric distance_metric def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults @@ -108,22 +110,23 @@ cdef class ParallelReduction: self.Y = np.empty((1, 1), dtype=DTYPE, order='c') def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, + X, + Y, DistanceMetric distance_metric, ITYPE_t chunk_size = CHUNK_SIZE, ): cdef: ITYPE_t X_n_full_chunks, Y_n_full_chunks - self.X = X - self.Y = Y - - # TODO: use proper internals checks of scikit-learn - assert X.shape[1] == Y.shape[1], ( - f"Vectors of X and Y must have the same " - f"number of dimensions but are respectively " - f"{X.shape[1]}-dimensional and {Y.shape[1]}-dimensional." - ) + + self.effective_omp_n_thread = _openmp_effective_n_threads() + + self.X = check_array(X, dtype=DTYPE) + self.Y = check_array(Y, dtype=DTYPE) + + assert X.shape[1] == Y.shape[1], "Vectors of X and Y must have the " \ + "same dimension but currently are " \ + f"respectively {X.shape[1]}-dimensional " \ + f"and {Y.shape[1]}-dimensional." 
self.d = X.shape[1] self.sf = sizeof(DTYPE_t) @@ -152,12 +155,9 @@ cdef class ParallelReduction: self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) ) - self.effective_omp_n_thread = _openmp_effective_n_threads() - - cdef int _reduce_on_chunks(self, - DTYPE_t[:, ::1] X, # IN - DTYPE_t[:, ::1] Y, # IN + const DTYPE_t[:, ::1] X, # IN + const DTYPE_t[:, ::1] Y, # IN ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -189,18 +189,23 @@ cdef class ArgKmin(ParallelReduction): """ cdef: + ITYPE_t k + DTYPE_t ** dist_middle_terms_chunks DTYPE_t ** heaps_approx_distances_chunks ITYPE_t ** heaps_indices_chunks + ITYPE_t[:, ::1] argkmin_indices + DTYPE_t[:, ::1] argkmin_distances + @classmethod def valid_metrics(cls): return {"fast_sqeuclidean", *METRIC_MAPPING.keys()} @classmethod def get_for(cls, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, + X, + Y, ITYPE_t k, str metric="fast_sqeuclidean", ITYPE_t chunk_size=CHUNK_SIZE, @@ -214,8 +219,8 @@ cdef class ArgKmin(ParallelReduction): chunk_size=chunk_size) def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, + X, + Y, DistanceMetric distance_metric, ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, @@ -223,6 +228,12 @@ cdef class ArgKmin(ParallelReduction): ParallelReduction.__init__(self, X, Y, distance_metric, chunk_size) self.k = k + + # Results returned by ArgKmin.compute + self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) + + # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) @@ -244,8 +255,8 @@ cdef class ArgKmin(ParallelReduction): raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") cdef int _reduce_on_chunks(self, - DTYPE_t[:, ::1] X, # IN - DTYPE_t[:, ::1] Y, # IN + const DTYPE_t[:, ::1] X, # IN + const DTYPE_t[:, ::1] Y, # IN ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -254,8 +265,8 @@ cdef class ArgKmin(ParallelReduction): ) nogil except -1: cdef: ITYPE_t i, j - DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] - DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] + const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] ITYPE_t k = self.k DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] @@ -524,8 +535,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): DTYPE_t[::1] Y_sq_norms def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, + X, + Y, ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, ): @@ -537,8 +548,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): cdef int _reduce_on_chunks(self, - DTYPE_t[:, ::1] X, # IN - DTYPE_t[:, ::1] Y, # IN + const DTYPE_t[:, ::1] X, # IN + const DTYPE_t[:, ::1] Y, # IN ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -553,8 +564,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): """ cdef: ITYPE_t i, j - DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] - DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] + const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] ITYPE_t k = self.k DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] From 
b23f97294daf62f0009c637190516e2ce85219b5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 10:24:21 +0200 Subject: [PATCH 032/290] Pull _parallel_on_{X,Y} up on ParallelReduction Also add private template methods, allowing each reduction to interact with its private datastructures at various stages. Reintroduce thread-local datastructures deallocation (fix-up for 80aaf0bcb13cf31f5b2a251d615410c6b680db8f) --- sklearn/metrics/_parallel_reductions.pyx | 425 ++++++++++++++--------- 1 file changed, 261 insertions(+), 164 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 01d1a10371615..db484002f67a5 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -155,9 +155,164 @@ cdef class ParallelReduction: self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) ) + cdef void _on_X_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _on_X_parallel_finalize(self, + ITYPE_t thread_num + ) nogil: + return + + cdef void _on_X_prange_iter_init(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _on_X_prange_iter_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_X(self) nogil: + """Computes the reduction of each vector (row) of X on Y + by parallelizing computation on chunks of X. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. + """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num + + with nogil, parallel(num_threads=num_threads): + thread_num = openmp.omp_get_thread_num() + + # Allocating thread local datastructures + self._on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: + X_end = X_start + self.X_n_samples_rem + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread local datastructures for the new X chunk + self._on_X_prange_iter_init(thread_num, X_chunk_idx, X_start, X_end) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0: + Y_end = Y_start + self.Y_n_samples_rem + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._reduce_on_chunks( + self.X, + self.Y, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread local datastructures on the full pass on Y + self._on_X_prange_iter_finalize(thread_num, X_chunk_idx, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread local datastructures + self._on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + cdef void _on_Y_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _on_Y_parallel_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _on_Y_finalize(self, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_Y(self) nogil: + """Computes the argkmin of each vector (row) of X on Y + by parallelizing computation on 
chunks of Y. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. + """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: + X_end = X_start + self.X_n_samples_rem + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=num_threads): + # Thread local buffers + thread_num = openmp.omp_get_thread_num() + + # Allocating thread local datastructures + self._on_Y_parallel_init(thread_num) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 \ + and self.Y_n_samples_rem > 0: + Y_end = Y_start + self.Y_n_samples_rem + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._reduce_on_chunks( + self.X, + self.Y, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + + # Synchronizing thread local datastructures with the main ones + # This can potentially block + self._on_Y_parallel_finalize(thread_num, X_chunk_idx, X_start, X_end) + # end: with nogil, parallel + + # end: for X_chunk_idx + # Adjusting main datastructures before returning + self._on_Y_finalize(num_threads) + return + cdef int _reduce_on_chunks(self, - const DTYPE_t[:, ::1] X, # IN - const DTYPE_t[:, ::1] Y, # IN + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -255,8 +410,8 @@ cdef class ArgKmin(ParallelReduction): raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") cdef int _reduce_on_chunks(self, - const DTYPE_t[:, ::1] X, # IN - const DTYPE_t[:, ::1] Y, # IN + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -287,170 +442,124 @@ cdef class ArgKmin(ParallelReduction): return 0 - cdef int _parallel_on_X(self, - ITYPE_t[:, ::1] argkmin_indices, - DTYPE_t[:, ::1] argkmin_approx_distances, + cdef void _on_X_parallel_init(self, + ITYPE_t thread_num, ) nogil: - """Computes the argkmin of each vector (row) of X on Y - by parallelizing computation on chunks of X. 
- """ cdef: - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx - ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) - ITYPE_t thread_num - # in bytes ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf - with nogil, parallel(num_threads=num_threads): - # Thread local buffers - thread_num = openmp.omp_get_thread_num() - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) - self.heaps_approx_distances_chunks[thread_num] = malloc(heap_size) - - for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): - # We reset the heap between X chunks (memset can't be used here) - for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF - - X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: - X_end = X_start + self.X_n_samples_rem - else: - X_end = X_start + self.X_n_samples_chunk - - # Referencing the thread-local heaps via the thread-scope pointer - # of pointers attached to the instance - self.heaps_indices_chunks[thread_num] = &argkmin_indices[X_start, 0] - - for Y_chunk_idx in range(self.Y_n_chunks): - Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0: - Y_end = Y_start + self.Y_n_samples_rem - else: - Y_end = Y_start + self.Y_n_samples_chunk - - self._reduce_on_chunks( - self.X, - self.Y, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - - # Sorting indices so that the closests' come first. - for idx in range(X_end - X_start): - _simultaneous_sort( - self.heaps_approx_distances_chunks[thread_num] + idx * self.k, - &argkmin_indices[X_start + idx, 0], - self.k - ) + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) + self.heaps_approx_distances_chunks[thread_num] = malloc(heap_size) - # end: for X_chunk_idx - free(self.dist_middle_terms_chunks[thread_num]) - free(self.heaps_approx_distances_chunks[thread_num]) + cdef void _on_X_prange_iter_init(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: - # end: with nogil, parallel - return self.X_n_chunks + # We reset the heap between X chunks (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF + # Referencing the thread-local heaps via the thread-scope pointer + # of pointers attached to the instance + self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] - cdef int _parallel_on_Y(self, - ITYPE_t[:, ::1] argkmin_indices, # OUT - DTYPE_t[:, ::1] argkmin_approx_distances, # OUT + cdef void _on_X_prange_iter_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: - """Computes the argkmin of each vector (row) of X on Y - by parallelizing computation on chunks of Y. - - This parallelization strategy is more costly (as we need - extra heaps and synchronisation), yet it is useful in - most contexts. 
- """ cdef: - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx - ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t idx, jdx + + # Sorting indices of the argkmin for each query vector of X + for idx in range(X_end - X_start): + _simultaneous_sort( + self.heaps_approx_distances_chunks[thread_num] + idx * self.k, + &self.argkmin_indices[X_start + idx, 0], + self.k + ) + + cdef void _on_X_parallel_finalize(self, ITYPE_t thread_num + ) nogil: + free(self.dist_middle_terms_chunks[thread_num]) + free(self.heaps_approx_distances_chunks[thread_num]) + cdef void _on_Y_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + cdef: # in bytes ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf - for X_chunk_idx in range(self.X_n_chunks): - X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: - X_end = X_start + self.X_n_samples_rem - else: - X_end = X_start + self.X_n_samples_chunk - - with nogil, parallel(num_threads=num_threads): - # Thread local buffers - thread_num = openmp.omp_get_thread_num() - - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) - self.heaps_approx_distances_chunks[thread_num] = malloc(float_heap_size) - - # As chunks of X are shared across threads, so must their - # heaps. To solve this, each thread has its own locals - # heaps which are then synchronised back in the main ones. - self.heaps_indices_chunks[thread_num] = malloc(int_heap_size) - - # Initialising heaps (memset can't be used here) - for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF - self.heaps_indices_chunks[thread_num][idx] = -1 - - for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): - Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1 \ - and self.Y_n_samples_rem > 0: - Y_end = Y_start + self.Y_n_samples_rem - else: - Y_end = Y_start + self.Y_n_samples_chunk - - - self._reduce_on_chunks( - self.X, - self.Y, - X_start, X_end, - Y_start, Y_end, - thread_num, + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) + self.heaps_approx_distances_chunks[thread_num] = malloc(float_heap_size) + + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own locals + # heaps which are then synchronised back in the main ones. 
+ self.heaps_indices_chunks[thread_num] = malloc(int_heap_size) + + # Initialising heaps (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF + self.heaps_indices_chunks[thread_num][idx] = -1 + + cdef void _on_Y_parallel_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + with gil: + # Synchronising the thread local heaps + # with the main heaps + for idx in range(X_end - X_start): + for jdx in range(self.k): + _push( + &self.argkmin_distances[X_start + idx, 0], + &self.argkmin_indices[X_start + idx, 0], + self.k, + self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) - # end: for Y_chunk_idx - with gil: - # Synchronising the thread local heaps - # with the main heaps - for idx in range(X_end - X_start): - for jdx in range(self.k): - _push( - &argkmin_approx_distances[X_start + idx, 0], - &argkmin_indices[X_start + idx, 0], - self.k, - self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx], - self.heaps_indices_chunks[thread_num][idx * self.k + jdx], - ) + free(self.dist_middle_terms_chunks[thread_num]) + free(self.heaps_approx_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) - # end: with nogil, parallel - - # Sorting indices of the argkmin for each query vector of X - for idx in prange(self.n_X, schedule='static', - nogil=True, num_threads=num_threads): - _simultaneous_sort( - &argkmin_approx_distances[idx, 0], - &argkmin_indices[idx, 0], - self.k, - ) - # end: prange - - # end: for X_chunk_idx - return self.Y_n_chunks + cdef void _on_Y_finalize(self, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t idx + + # Sorting indices of the argkmin for each query vector of X + for idx in prange(self.n_X, schedule='static', + nogil=True, num_threads=num_threads): + _simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return cdef void _exact_distances(self, - ITYPE_t[:, ::1] Y_indices, # IN - DTYPE_t[:, ::1] distances, # IN/OUT + ITYPE_t[:, ::1] Y_indices, # IN + DTYPE_t[:, ::1] distances, # IN/OUT ) nogil: """Convert reduced distances to pairwise distances in parallel.""" cdef: @@ -494,40 +603,28 @@ cdef class ArgKmin(ParallelReduction): indices: ndarray of shape (n, k) Indices of each X vector argkmin in Y. """ - cdef: - ITYPE_t n_X = self.X.shape[0] - ITYPE_t[:, ::1] argkmin_indices = np.full((n_X, self.k), 0, - dtype=ITYPE) - DTYPE_t[:, ::1] argkmin_distances = np.full((n_X, self.k), - FLOAT_INF, - dtype=DTYPE) - if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. - if 4 * self.chunk_size * self.effective_omp_n_thread < n_X: + if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: strategy = 'parallel_on_X' else: strategy = 'parallel_on_Y' if strategy == 'parallel_on_Y': - self._parallel_on_Y( - argkmin_indices, argkmin_distances - ) + self._parallel_on_Y() elif strategy == 'parallel_on_X': - self._parallel_on_X( - argkmin_indices, argkmin_distances - ) + self._parallel_on_X() else: raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: # We need to recompute distances because we relied on # reduced distances. 
- self._exact_distances(argkmin_indices, argkmin_distances) - return np.asarray(argkmin_distances), np.asarray(argkmin_indices) + self._exact_distances(self.argkmin_indices, self.argkmin_distances) + return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) - return np.asarray(argkmin_indices) + return np.asarray(self.argkmin_indices) cdef class FastSquaredEuclideanArgKmin(ArgKmin): @@ -548,8 +645,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): cdef int _reduce_on_chunks(self, - const DTYPE_t[:, ::1] X, # IN - const DTYPE_t[:, ::1] Y, # IN + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, From 67e02d48b174a14182ef60f6fdef2d4997140156 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 10:24:21 +0200 Subject: [PATCH 033/290] Pull GEMM buffers down to FastSquaredEuclideanArgKmin Also add some method call chain and documentation for FastSquaredEuclideanArgKmin. --- sklearn/metrics/_parallel_reductions.pyx | 71 +++++++++++++++++++----- 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index db484002f67a5..c3d01a7f9e016 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -329,6 +329,7 @@ cdef class ArgKmin(ParallelReduction): The implementation is parallelized on chunks whose size can be set using ``chunk_size``. + Parameters ---------- X: ndarray of shape (n, d) @@ -346,7 +347,6 @@ cdef class ArgKmin(ParallelReduction): cdef: ITYPE_t k - DTYPE_t ** dist_middle_terms_chunks DTYPE_t ** heaps_approx_distances_chunks ITYPE_t ** heaps_indices_chunks @@ -389,11 +389,11 @@ cdef class ArgKmin(ParallelReduction): self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) # Temporary datastructures used in threads - self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) def __dealloc__(self): + ParallelReduction.__dealloc__(self) if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) else: @@ -404,11 +404,6 @@ cdef class ArgKmin(ParallelReduction): else: raise RuntimeError("Trying to free heaps_approx_distances_chunks which is NULL") - if self.dist_middle_terms_chunks is not NULL: - free(self.dist_middle_terms_chunks) - else: - raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") - cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -423,7 +418,6 @@ cdef class ArgKmin(ParallelReduction): const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] ITYPE_t k = self.k - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] @@ -447,11 +441,9 @@ cdef class ArgKmin(ParallelReduction): ) nogil: cdef: # in bytes - ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) self.heaps_approx_distances_chunks[thread_num] = malloc(heap_size) cdef void 
_on_X_prange_iter_init(self, @@ -489,7 +481,6 @@ cdef class ArgKmin(ParallelReduction): cdef void _on_X_parallel_finalize(self, ITYPE_t thread_num ) nogil: - free(self.dist_middle_terms_chunks[thread_num]) free(self.heaps_approx_distances_chunks[thread_num]) cdef void _on_Y_parallel_init(self, @@ -497,12 +488,9 @@ cdef class ArgKmin(ParallelReduction): ) nogil: cdef: # in bytes - ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks[thread_num] = malloc(size_dist_middle_terms) self.heaps_approx_distances_chunks[thread_num] = malloc(float_heap_size) # As chunks of X are shared across threads, so must their @@ -536,7 +524,6 @@ cdef class ArgKmin(ParallelReduction): self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) - free(self.dist_middle_terms_chunks[thread_num]) free(self.heaps_approx_distances_chunks[thread_num]) free(self.heaps_indices_chunks[thread_num]) @@ -627,10 +614,25 @@ cdef class ArgKmin(ParallelReduction): return np.asarray(self.argkmin_indices) cdef class FastSquaredEuclideanArgKmin(ArgKmin): + """Fast specialized alternative for ArgKmin on + EuclideanDistance. + + Computes the argkmin of vectors (rows) of a set of + vectors (rows) of X on another set of vectors (rows) of Y + using the GEMM-trick. + + This implementation has an superior arithmetic intensity + and hence running time, but it can suffer from numerical + instability. We recommend using ArgKmin with + EuclideanDistance when exact precision is needed. + """ cdef: DTYPE_t[::1] Y_sq_norms + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + def __init__(self, X, Y, @@ -642,7 +644,46 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): k=k, chunk_size=chunk_size) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) + def __dealloc__(self): + ArgKmin.__dealloc__(self) + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + else: + raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") + + cdef void _on_X_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + ArgKmin._on_X_parallel_init(self, thread_num) + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) + + cdef void _on_X_parallel_finalize(self, + ITYPE_t thread_num + ) nogil: + ArgKmin._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + cdef void _on_Y_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + ArgKmin._on_Y_parallel_init(self, thread_num) + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) + + cdef void _on_Y_parallel_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + ArgKmin._on_Y_parallel_finalize(self, thread_num, X_chunk_idx, X_start, X_end) + free(self.dist_middle_terms_chunks[thread_num]) cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, From a8331da48ac45137143cc9e3466c3440ba254cc5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 
13:41:29 +0200
Subject: [PATCH 034/290] Skip tests for translation invariance

---
 sklearn/neighbors/tests/test_neighbors.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index ca16f373212bc..b0d231721d71f 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1846,6 +1846,10 @@ def test_fast_sqeuclidean_correctness(
 @pytest.mark.parametrize("d", [5, 10, 100, 500])
 @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000])
 @pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]])
+@pytest.mark.skip(
+    reason="Long test, translation invariance should "
+    "have its own study: skipping for now"
+)
 def test_fast_sqeuclidean_translation_invariance(
     n,
     d,

From 0617dbd2c8f9acd40d0cad8c4a3efac196f65025 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 1 Jul 2021 14:39:07 +0200
Subject: [PATCH 035/290] Define methods on the base class

---
 sklearn/metrics/_parallel_reductions.pyx | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index c3d01a7f9e016..9b664f14dd4e8 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -102,6 +102,9 @@ cdef class ParallelReduction:
     ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem
     ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem

+    @classmethod
+    def valid_metrics(cls):
+        return {*METRIC_MAPPING.keys()}

     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults

From 38b97c331b39ee02105a483c6bc644eed844f165 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 1 Jul 2021 14:38:19 +0200
Subject: [PATCH 036/290] Improve tests for the NearestNeighbors algorithms

Parametrise the tests.
Test all metrics common to the different methods.
Test for strict equality.
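
(A sketch of the property the parametrised test below checks -- sizes here
are small and hypothetical:)

    import numpy as np
    from numpy.testing import assert_array_equal
    from sklearn import neighbors

    rng = np.random.RandomState(0)
    X, queries = rng.rand(100, 5), rng.rand(10, 5)

    results = {}
    for algorithm in ("ball_tree", "brute", "kd_tree"):
        nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm=algorithm)
        # kneighbors returns (distances, indices)
        results[algorithm] = nn.fit(X).kneighbors(queries)

    # The algorithms must agree strictly on the returned indices.
    assert_array_equal(results["brute"][1], results["kd_tree"][1])
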
--- sklearn/neighbors/tests/test_neighbors.py | 71 +++++++++++++++++------ 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index b0d231721d71f..437facb18752a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -57,6 +57,7 @@ SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") +COMMON_VALID_METRICS = set.intersection(*map(set, neighbors.VALID_METRICS.values())) P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -77,31 +78,65 @@ def _weight_func(dist): return retval ** 2 +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("n_neighbors", [1, 10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) def test_unsupervised_kneighbors( - n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5 + n_samples, + n_features, + n_query_pts, + n_neighbors, + metric, ): - # Test unsupervised neighbors methods - X = rng.rand(n_samples, n_features) + # The different algorithms must return identical results + # on their common metrics, with and without returning + # distances - test = rng.rand(n_query_pts, n_features) + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) - for p in P: - results_nodist = [] - results = [] + test = local_rng.rand(n_query_pts, n_features) - for algorithm in ALGORITHMS: - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, algorithm=algorithm, p=p - ) - neigh.fit(X) + results_nodist = [] + results = [] + + for algorithm in ALGORITHMS: + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, algorithm=algorithm, metric=metric + ) + neigh.fit(X) - results_nodist.append(neigh.kneighbors(test, return_distance=False)) - results.append(neigh.kneighbors(test, return_distance=True)) + results_nodist.append(neigh.kneighbors(test, return_distance=False)) + results.append(neigh.kneighbors(test, return_distance=True)) - for i in range(len(results) - 1): - assert_array_almost_equal(results_nodist[i], results[i][1]) - assert_array_almost_equal(results[i][0], results[i + 1][0]) - assert_array_almost_equal(results[i][1], results[i + 1][1]) + for i in range(len(results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + indices_no_dist = results_nodist[i] + distances, next_distances = results[i][0], results[i + 1][0] + indices, next_indices = results[i][1], results[i + 1][1] + assert_array_equal( + indices_no_dist, + indices, + err_msg=f"The '{algorithm}' algorithm returns different" + f"indices depending on 'return_distances'.", + ) + assert_array_equal( + indices, + next_indices, + err_msg=f"The '{algorithm}' and '{next_algorithm}' " + f"algorithms return different indices.", + ) + assert_array_equal( + distances, + next_distances, + err_msg=f"The '{algorithm}' and '{next_algorithm}' " + f"algorithms return different distances.", + ) @pytest.mark.parametrize( From ef9c7f6b842ee2040b095c3799827cbdda1cc8be Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 14:45:56 +0200 Subject: [PATCH 037/290] Propagate metric kwargs in KNeighborsMixin.kneighbors --- sklearn/neighbors/_base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e97894e8731c4..ac8dc0bea0e03 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -741,7 +741,11 @@ class from an array representing our data set and ask who's and self.effective_metric_ in ArgKmin.valid_metrics() ): results = ArgKmin.get_for( - X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_ + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, ).compute( strategy="auto", return_distance=return_distance, From 882e6e767043863f4d7d9dd2e564ee1e727e982d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 15:20:28 +0200 Subject: [PATCH 038/290] Add DistanceMetric data validation at initialisation --- sklearn/metrics/_parallel_reductions.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 9b664f14dd4e8..f57d7bcd9fd5e 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -130,6 +130,8 @@ cdef class ParallelReduction: "same dimension but currently are " \ f"respectively {X.shape[1]}-dimensional " \ f"and {Y.shape[1]}-dimensional." + distance_metric._validate_data(X) + distance_metric._validate_data(Y) self.d = X.shape[1] self.sf = sizeof(DTYPE_t) From 15c110ae2432fdedcb39a0304bbc0e5b234a56bb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 15:34:30 +0200 Subject: [PATCH 039/290] Remove warning checks for 'wminkowski' now that Scipy is not used --- sklearn/neighbors/tests/test_neighbors.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 437facb18752a..bfc8a0c665bbb 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1334,19 +1334,9 @@ def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbor neigh.fit(X[:, feature_sl]) - # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 - ExceptionToAssert = None - if ( - metric == "wminkowski" - and algorithm == "brute" - and sp_version >= parse_version("1.6.0") - ): - ExceptionToAssert = DeprecationWarning - - with pytest.warns(ExceptionToAssert): - results[algorithm] = neigh.kneighbors( - test[:, feature_sl], return_distance=True - ) + results[algorithm] = neigh.kneighbors( + test[:, feature_sl], return_distance=True + ) assert_array_almost_equal(results["brute"][0], results["ball_tree"][0]) assert_array_almost_equal(results["brute"][1], results["ball_tree"][1]) From 7e3c4b7f66e62564f071b92d204f1950bb323938 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 15:38:18 +0200 Subject: [PATCH 040/290] Parametrise test_k_and_radius_neighbors_duplicates on algorithms --- sklearn/neighbors/tests/test_neighbors.py | 83 +++++++++++------------ 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index bfc8a0c665bbb..959ed6bfd7210 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1543,49 +1543,48 @@ def test_k_and_radius_neighbors_X_None(): ) -def test_k_and_radius_neighbors_duplicates(): +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_duplicates(algorithm): # Test behavior of kneighbors when duplicates 
are present in query - - for algorithm in ALGORITHMS: - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - nn.fit([[0], [1]]) - - # Do not do anything special to duplicates. - kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) - assert_array_equal(kng.data, [0.0, 0.0]) - assert_array_equal(kng.indices, [0, 1]) - - dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) - check_object_arrays(dist, [[0, 1], [1, 0]]) - check_object_arrays(ind, [[0, 1], [0, 1]]) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_array_equal(rng.A, np.ones((2, 2))) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") - rng.sort_indices() - assert_array_equal(rng.A, [[0, 1], [1, 0]]) - assert_array_equal(rng.indices, [0, 1, 0, 1]) - assert_array_equal(rng.data, [0, 1, 1, 0]) - - # Mask the first duplicates when n_duplicates > n_neighbors. - X = np.ones((3, 1)) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") - nn.fit(X) - dist, ind = nn.kneighbors() - assert_array_equal(dist, np.zeros((3, 1))) - assert_array_equal(ind, [[1], [0], [1]]) - - # Test that zeros are explicitly marked in kneighbors_graph. - kng = nn.kneighbors_graph(mode="distance") - assert_array_equal(kng.A, np.zeros((3, 3))) - assert_array_equal(kng.data, np.zeros(3)) - assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), - ) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + nn.fit([[0], [1]]) + + # Do not do anything special to duplicates. + kng = nn.kneighbors_graph([[0], [1]], mode="distance") + assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) + assert_array_equal(kng.data, [0.0, 0.0]) + assert_array_equal(kng.indices, [0, 1]) + + dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) + check_object_arrays(dist, [[0, 1], [1, 0]]) + check_object_arrays(ind, [[0, 1], [0, 1]]) + + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) + assert_array_equal(rng.A, np.ones((2, 2))) + + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") + rng.sort_indices() + assert_array_equal(rng.A, [[0, 1], [1, 0]]) + assert_array_equal(rng.indices, [0, 1, 0, 1]) + assert_array_equal(rng.data, [0, 1, 1, 0]) + + # Mask the first duplicates when n_duplicates > n_neighbors. + X = np.ones((3, 1)) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") + nn.fit(X) + dist, ind = nn.kneighbors() + assert_array_equal(dist, np.zeros((3, 1))) + assert_array_equal(ind, [[1], [0], [1]]) + + # Test that zeros are explicitly marked in kneighbors_graph. + kng = nn.kneighbors_graph(mode="distance") + assert_array_equal(kng.A, np.zeros((3, 3))) + assert_array_equal(kng.data, np.zeros(3)) + assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) + assert_array_equal( + nn.kneighbors_graph().A, + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + ) def test_include_self_neighbors_graph(): From 2ec36c1c4c7ffad2734b0f3f97ec4cc89ea48368 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 15:56:00 +0200 Subject: [PATCH 041/290] Remove uncalled snippet This can be simplified as the condition won't be verified anymore. 
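
(Context for the removal below: squaring is monotone on non-negative
values, so ranking neighbors by squared euclidean distances yields the
same indices as ranking by the exact ones -- a quick check:)

    import numpy as np

    d = np.random.RandomState(0).rand(10)   # non-negative distances
    assert np.array_equal(np.argsort(d), np.argsort(d ** 2))
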
--- sklearn/neighbors/_base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ac8dc0bea0e03..0d3f15974a7e6 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -758,12 +758,6 @@ class from an array representing our data set and ask who's return_distance=return_distance, ) - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ - chunked_results = list( pairwise_distances_chunked( X, @@ -771,7 +765,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, - **kwds, + **self.effective_metric_params_, ) ) From ad496f00dd7fefdc4c98f8befce0d6fedd30adff Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 16:00:00 +0200 Subject: [PATCH 042/290] Do not branch on sparse arrays, yet We need to implement a strategy for sparse arrays for ParallelReduction. --- sklearn/metrics/pairwise.py | 7 ++++++- sklearn/neighbors/_base.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c35b38ad1fde4..bcf42ac2ca9fa 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -652,7 +652,12 @@ def pairwise_distances_argmin_min( if metric_kwargs is None: metric_kwargs = {} - if metric in ArgKmin.valid_metrics(): + if ( + # TODO: support sparse arrays + not issparse(X) + and not issparse(X) + and metric in ArgKmin.valid_metrics() + ): values, indices = ArgKmin.get_for( X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs ).compute(strategy="auto", return_distance=True) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 0d3f15974a7e6..85c3cd743cb57 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -737,7 +737,10 @@ class from an array representing our data set and ask who's ) elif ( - self._fit_method == "brute" + # TODO: support sparse arrays + not issparse(X) + and not issparse(self._fit_X) + and self._fit_method == "brute" and self.effective_metric_ in ArgKmin.valid_metrics() ): results = ArgKmin.get_for( From 68af6a7fe1913f80d250adc85c9d25ad4cfcc034 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Jul 2021 13:53:15 +0200 Subject: [PATCH 043/290] Document --- sklearn/metrics/_parallel_reductions.pyx | 145 ++++++++++------------- 1 file changed, 61 insertions(+), 84 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index f57d7bcd9fd5e..ec6d9d4c3217d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -12,7 +12,6 @@ import numpy as np cimport numpy as np cimport openmp -from libc.math cimport sqrt from libc.stdlib cimport free, malloc from cython.parallel cimport parallel, prange @@ -22,9 +21,7 @@ from ._dist_metrics import METRIC_MAPPING from ..utils import check_array DEF CHUNK_SIZE = 256 # number of vectors - DEF MIN_CHUNK_SAMPLES = 20 - DEF FLOAT_INF = 1e36 from ..utils._cython_blas cimport ( @@ -43,30 +40,6 @@ from ..utils._typedefs cimport ITYPE_t, DTYPE_t from ..utils._typedefs import ITYPE, DTYPE -cdef inline DTYPE_t _euclidean_dist( - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, - ITYPE_t i, - ITYPE_t j, -) nogil: - cdef: - DTYPE_t dist = 0 - ITYPE_t k - ITYPE_t upper_unrolled_idx = (X.shape[1] // 4) * 4 - - # 
Unrolling loop to help with vectorisation - for k in range(0, upper_unrolled_idx, 4): - dist += (X[i, k] - Y[j, k]) * (X[i, k] - Y[j, k]) - dist += (X[i, k + 1] - Y[j, k + 1]) * (X[i, k + 1] - Y[j, k + 1]) - dist += (X[i, k + 2] - Y[j, k + 2]) * (X[i, k + 2] - Y[j, k + 2]) - dist += (X[i, k + 3] - Y[j, k + 3]) * (X[i, k + 3] - Y[j, k + 3]) - - for k in range(upper_unrolled_idx, X.shape[1]): - dist += (X[i, k] - Y[j, k]) * (X[i, k] - Y[j, k]) - - return sqrt(dist) - - cdef class ParallelReduction: """Abstract class to computes a reduction of a set of vectors (rows) of X on another set of vectors (rows) of Y. @@ -160,35 +133,6 @@ cdef class ParallelReduction: self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) ) - def __dealloc__(self): - pass - - cdef void _on_X_parallel_init(self, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _on_X_parallel_finalize(self, - ITYPE_t thread_num - ) nogil: - return - - cdef void _on_X_prange_iter_init(self, - ITYPE_t thread_num, - ITYPE_t X_chunk_idx, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _on_X_prange_iter_finalize(self, - ITYPE_t thread_num, - ITYPE_t X_chunk_idx, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - cdef void _parallel_on_X(self) nogil: """Computes the reduction of each vector (row) of X on Y by parallelizing computation on chunks of X. @@ -245,26 +189,8 @@ cdef class ParallelReduction: # end: with nogil, parallel return - cdef void _on_Y_parallel_init(self, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _on_Y_parallel_finalize(self, - ITYPE_t thread_num, - ITYPE_t X_chunk_idx, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _on_Y_finalize(self, - ITYPE_t thread_num, - ) nogil: - return - cdef void _parallel_on_Y(self) nogil: - """Computes the argkmin of each vector (row) of X on Y + """Computes the reduction of each vector (row) of X on Y by parallelizing computation on chunks of Y. Private datastructures are modified internally by threads. @@ -318,6 +244,7 @@ cdef class ParallelReduction: self._on_Y_finalize(num_threads) return + # Placeholder methods which have to be implemented cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -331,6 +258,55 @@ cdef class ParallelReduction: on a pair of chunks""" return -1 + def __dealloc__(self): + pass + + # Placeholder methods which can be implemented + + cdef void _on_X_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _on_X_parallel_finalize(self, + ITYPE_t thread_num + ) nogil: + return + + cdef void _on_X_prange_iter_init(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _on_X_prange_iter_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _on_Y_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _on_Y_parallel_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _on_Y_finalize(self, + ITYPE_t thread_num, + ) nogil: + return + cdef class ArgKmin(ParallelReduction): """Computes the argkmin of vectors (rows) of a set of vectors (rows) of X on another set of vectors (rows) of Y. @@ -374,6 +350,7 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ): + # This factory comes to handle specialisation on fast_sqeuclidean. 
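
    # Illustrative call of this factory (hypothetical data; the module is
    # private and still work-in-progress at this point of the series):
    #
    #     import numpy as np
    #     from sklearn.metrics._parallel_reductions import ArgKmin
    #
    #     X, Y = np.random.rand(1000, 20), np.random.rand(2000, 20)
    #     distances, indices = ArgKmin.get_for(X, Y, k=10).compute(
    #         strategy="auto", return_distance=True
    #     )
    #
    # "fast_sqeuclidean" (the default) dispatches to
    # FastSquaredEuclideanArgKmin; any other valid metric builds a generic
    # ArgKmin with the matching DistanceMetric.
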
if metric == "fast_sqeuclidean": return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) return ArgKmin(X=X, Y=Y, @@ -396,7 +373,7 @@ cdef class ArgKmin(ParallelReduction): self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) - # Temporary datastructures used in threads + # Pointers to thread local heaps used in threads for `parallel_on_Y` solely self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) @@ -432,6 +409,8 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t n_x = X_end - X_start ITYPE_t n_y = Y_end - Y_start + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): _push(heaps_approx_distances + i * self.k, @@ -447,12 +426,8 @@ cdef class ArgKmin(ParallelReduction): cdef void _on_X_parallel_init(self, ITYPE_t thread_num, ) nogil: - cdef: - # in bytes - ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf - - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.heaps_approx_distances_chunks[thread_num] = malloc(heap_size) + self.heaps_approx_distances_chunks[thread_num] = malloc( + self.X_n_samples_chunk * self.k * self.sf) cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, @@ -629,10 +604,12 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): vectors (rows) of X on another set of vectors (rows) of Y using the GEMM-trick. + Notes + ----- This implementation has an superior arithmetic intensity and hence running time, but it can suffer from numerical - instability. We recommend using ArgKmin with - EuclideanDistance when exact precision is needed. + instability. ArgKmin with EuclideanDistance must be + used when exact precision is needed. 
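
    The instability comes from catastrophic cancellation when the
    expanded form used by the GEMM trick is evaluated in floating
    point. A small self-contained illustration (plain NumPy, not part
    of this module):

        import numpy as np

        x = np.array([1.0, 1e8])
        y = np.array([1.0 + 1e-7, 1e8])

        exact = np.sum((x - y) ** 2)            # ~1e-14
        expanded = x @ x - 2 * (x @ y) + y @ y
        # `expanded` comes out non-positive (0.0 or even negative)
        # instead of ~1e-14: the large ||x||² and ||y||² terms cancel
        # and the true value is lost.
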
""" cdef: From f7403349fe65446241dc35db53ac7521e935735e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Jul 2021 14:09:52 +0200 Subject: [PATCH 044/290] Remove unnecessary parallel_on_X thread-local datastructures --- sklearn/metrics/_parallel_reductions.pyx | 44 ++++++++---------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index ec6d9d4c3217d..c67e09c3bf476 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -369,7 +369,7 @@ cdef class ArgKmin(ParallelReduction): self.k = k - # Results returned by ArgKmin.compute + # Results returned by ArgKmin.compute used as the main heaps self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) @@ -423,12 +423,6 @@ cdef class ArgKmin(ParallelReduction): return 0 - cdef void _on_X_parallel_init(self, - ITYPE_t thread_num, - ) nogil: - self.heaps_approx_distances_chunks[thread_num] = malloc( - self.X_n_samples_chunk * self.k * self.sf) - cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, ITYPE_t X_chunk_idx, @@ -436,12 +430,10 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t X_end, ) nogil: - # We reset the heap between X chunks (memset can't be used here) - for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF - - # Referencing the thread-local heaps via the thread-scope pointer - # of pointers attached to the instance + # As this strategy is embarassingly parallel, we can set the + # thread-local heaps pointers to the proper position + # on the main heaps + self.heaps_approx_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] cdef void _on_X_prange_iter_finalize(self, @@ -457,29 +449,22 @@ cdef class ArgKmin(ParallelReduction): for idx in range(X_end - X_start): _simultaneous_sort( self.heaps_approx_distances_chunks[thread_num] + idx * self.k, - &self.argkmin_indices[X_start + idx, 0], + self.heaps_indices_chunks[thread_num] + idx * self.k, self.k ) - cdef void _on_X_parallel_finalize(self, - ITYPE_t thread_num - ) nogil: - free(self.heaps_approx_distances_chunks[thread_num]) - cdef void _on_Y_parallel_init(self, ITYPE_t thread_num, ) nogil: cdef: - # in bytes - ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si - ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf - - self.heaps_approx_distances_chunks[thread_num] = malloc(float_heap_size) + # number of scalar elements + ITYPE_t heaps_size = self.X_n_samples_chunk * self.k # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own locals # heaps which are then synchronised back in the main ones. 
- self.heaps_indices_chunks[thread_num] = malloc(int_heap_size) + self.heaps_approx_distances_chunks[thread_num] = malloc(heaps_size * self.sf) + self.heaps_indices_chunks[thread_num] = malloc(heaps_size * self.si) # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): @@ -517,7 +502,8 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) ITYPE_t idx - # Sorting indices of the argkmin for each query vector of X + # Sort the main heaps into arrays in parallel + # in ascending order w.r.t the distances for idx in prange(self.n_X, schedule='static', nogil=True, num_threads=num_threads): _simultaneous_sort( @@ -542,7 +528,6 @@ cdef class ArgKmin(ParallelReduction): &self.Y[Y_indices[i, j], 0], self.d) - # Python interface def compute(self, str strategy = "auto", bint return_distance = False @@ -590,7 +575,7 @@ cdef class ArgKmin(ParallelReduction): if return_distance: # We need to recompute distances because we relied on - # reduced distances. + # approximate distances. self._exact_distances(self.argkmin_indices, self.argkmin_distances) return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) @@ -625,6 +610,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t chunk_size = CHUNK_SIZE, ): ArgKmin.__init__(self, X, Y, + # The distance metric here is used for exact distances computations distance_metric=DistanceMetric.get_metric("euclidean"), k=k, chunk_size=chunk_size) @@ -736,7 +722,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): _push(heaps_approx_distances + i * k, heaps_indices + i * k, k, - # approximated distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # approximate distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], j + Y_start) return 0 From 0b9732b20f8644632d8890ac58063bac00ed4b61 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Jul 2021 14:23:49 +0200 Subject: [PATCH 045/290] Remove attributes for dtypes' size --- sklearn/metrics/_parallel_reductions.pyx | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index c67e09c3bf476..7750ed3da1091 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -69,9 +69,6 @@ cdef class ParallelReduction: ITYPE_t d - # dtypes sizes - ITYPE_t sf, si - ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem @@ -107,8 +104,6 @@ cdef class ParallelReduction: distance_metric._validate_data(Y) self.d = X.shape[1] - self.sf = sizeof(DTYPE_t) - self.si = sizeof(ITYPE_t) self.chunk_size = chunk_size self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) @@ -463,8 +458,8 @@ cdef class ArgKmin(ParallelReduction): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own locals # heaps which are then synchronised back in the main ones. 
- self.heaps_approx_distances_chunks[thread_num] = malloc(heaps_size * self.sf) - self.heaps_indices_chunks[thread_num] = malloc(heaps_size * self.si) + self.heaps_approx_distances_chunks[thread_num] = malloc(heaps_size * sizeof(DTYPE_t)) + self.heaps_indices_chunks[thread_num] = malloc(heaps_size * sizeof(ITYPE_t)) # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): @@ -631,7 +626,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ArgKmin._on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) cdef void _on_X_parallel_finalize(self, ITYPE_t thread_num @@ -645,7 +640,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ArgKmin._on_Y_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf) + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) cdef void _on_Y_parallel_finalize(self, ITYPE_t thread_num, From 5c57860d9b370f22496c7521d9dd73a2a8b5a10f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Jul 2021 14:24:22 +0200 Subject: [PATCH 046/290] Use all threads when sorting --- sklearn/metrics/_parallel_reductions.pyx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 7750ed3da1091..be6d1b9668d0e 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -425,7 +425,7 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t X_end, ) nogil: - # As this strategy is embarassingly parallel, we can set the + # As this strategy is embarrassingly parallel, we can set the # thread-local heaps pointers to the proper position # on the main heaps self.heaps_approx_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] @@ -494,13 +494,12 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t thread_num, ) nogil: cdef: - ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) ITYPE_t idx # Sort the main heaps into arrays in parallel # in ascending order w.r.t the distances - for idx in prange(self.n_X, schedule='static', - nogil=True, num_threads=num_threads): + for idx in prange(self.n_X, schedule='static', nogil=True, + num_threads=self.effective_omp_n_thread): _simultaneous_sort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], @@ -512,7 +511,7 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t[:, ::1] Y_indices, # IN DTYPE_t[:, ::1] distances, # IN/OUT ) nogil: - """Convert reduced distances to pairwise distances in parallel.""" + """Convert approximate distances to pairwise distances in parallel.""" cdef: ITYPE_t i, j From 384b9a898f8ba2dfc2bf5aebf915f8241ce2c12a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Jul 2021 17:47:28 +0200 Subject: [PATCH 047/290] Rename ParallelReduction to PairwiseDistancesReduction --- sklearn/metrics/_parallel_reductions.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index be6d1b9668d0e..d97a37de33ede 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -40,9 +40,10 @@ 
from ..utils._typedefs cimport ITYPE_t, DTYPE_t from ..utils._typedefs import ITYPE, DTYPE -cdef class ParallelReduction: - """Abstract class to computes a reduction of a set of - vectors (rows) of X on another set of vectors (rows) of Y. +cdef class PairwiseDistancesReduction: + """Abstract class to computes a reduction on pairwise + distances between a set of vectors (rows) X and another + set of vectors (rows) of Y. The implementation of the reduction is done parallelized on chunks whose size can be set using ``chunk_size``. @@ -302,7 +303,7 @@ cdef class ParallelReduction: ) nogil: return -cdef class ArgKmin(ParallelReduction): +cdef class ArgKmin(PairwiseDistancesReduction): """Computes the argkmin of vectors (rows) of a set of vectors (rows) of X on another set of vectors (rows) of Y. @@ -334,7 +335,7 @@ cdef class ArgKmin(ParallelReduction): @classmethod def valid_metrics(cls): - return {"fast_sqeuclidean", *METRIC_MAPPING.keys()} + return {"fast_sqeuclidean", *PairwiseDistancesReduction.valid_metrics()} @classmethod def get_for(cls, @@ -360,7 +361,7 @@ cdef class ArgKmin(ParallelReduction): ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, ): - ParallelReduction.__init__(self, X, Y, distance_metric, chunk_size) + PairwiseDistancesReduction.__init__(self, X, Y, distance_metric, chunk_size) self.k = k @@ -373,7 +374,7 @@ cdef class ArgKmin(ParallelReduction): self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) def __dealloc__(self): - ParallelReduction.__dealloc__(self) + PairwiseDistancesReduction.__dealloc__(self) if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) else: From ed03b88267aab8451fadc402b545e034cc1aa6f8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 5 Jul 2021 10:38:31 +0200 Subject: [PATCH 048/290] Cast pointer to const value for gemm interface Necessarily casting because APIs exposed via scipy.linalg.cython_blas aren't reflecting the const-identifier for arguments --- sklearn/metrics/_parallel_reductions.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index d97a37de33ede..911bf2612d00d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -699,9 +699,12 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t n = Y_c.shape[0] ITYPE_t K = X_c.shape[1] DTYPE_t alpha = - 2. - DTYPE_t * A = & X_c[0, 0] + # TODO: necessarily casting because APIs exposed + # via scipy.linalg.cython_blas aren't reflecting + # the const-identifier for arguments + DTYPE_t * A = & X_c[0, 0] ITYPE_t lda = X_c.shape[1] - DTYPE_t * B = & Y_c[0, 0] + DTYPE_t * B = & Y_c[0, 0] ITYPE_t ldb = X_c.shape[1] DTYPE_t beta = 0. 
DTYPE_t * C = dist_middle_terms From 6c5e0b9be2b8c427fec4ddb4bea7fe5281cdfa37 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 1 Jul 2021 18:37:03 +0200 Subject: [PATCH 049/290] [WIP] Add RadiusNeighborhood --- sklearn/metrics/_dist_metrics.pxd | 2 +- sklearn/metrics/_parallel_reductions.pyx | 135 ++++++++++++++++++++++- sklearn/metrics/setup.py | 1 + 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index fe0d7322268dd..b5ad7969d27e3 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -62,7 +62,7 @@ cdef class DistanceMetric: cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 911bf2612d00d..a73ce9225efe4 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -12,7 +12,10 @@ import numpy as np cimport numpy as np cimport openmp +np.import_array() + from libc.stdlib cimport free, malloc +from libcpp.vector cimport vector from cython.parallel cimport parallel, prange @@ -402,9 +405,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - ITYPE_t n_x = X_end - X_start - ITYPE_t n_y = Y_end - Y_start - # Pushing the distance and their associated indices on heaps # which keep tracks of the argkmin. for i in range(X_c.shape[0]): @@ -568,6 +568,11 @@ cdef class ArgKmin(PairwiseDistancesReduction): else: raise RuntimeError(f"strategy '{strategy}' not supported.") + return self._finalise_compute(return_distance) + + def _finalise_compute(self, + bint return_distance + ): if return_distance: # We need to recompute distances because we relied on # approximate distances. @@ -724,3 +729,127 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], j + Y_start) return 0 + + +cdef class RadiusNeighborhood(PairwiseDistancesReduction): + + cdef: + DTYPE_t radius + + # Distances metrics compute approximated distance + # ("reduced distance" in the original wording), + # which are proxies necessitating less computations. + # We get the proxy for the radius to be able to compare + + # NOTE: not used for now. 
+ DTYPE_t radius_proxy + + vector[vector[ITYPE_t]] neigh_indices + vector[vector[DTYPE_t]] neigh_distances + + bint sort_results + + + @classmethod + def get_for(cls, + X, + Y, + DTYPE_t radius, + str metric="euclidean", + ITYPE_t chunk_size=CHUNK_SIZE, + dict metric_kwargs=dict(), + ): + return RadiusNeighborhood(X=X, Y=Y, + distance_metric=DistanceMetric.get_metric(metric, **metric_kwargs), + radius=radius, + chunk_size=chunk_size) + + def __init__(self, + X, + Y, + DistanceMetric distance_metric, + DTYPE_t radius, + ITYPE_t chunk_size = CHUNK_SIZE, + ): + PairwiseDistancesReduction.__init__(self, X, Y, distance_metric, chunk_size) + + self.radius = radius + self.sort_results = False + + self.neigh_indices.resize(self.n_X) + self.neigh_distances.resize(self.n_X) + + + cdef int _reduce_on_chunks(self, + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil except -1: + cdef: + ITYPE_t i, j + const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :] + DTYPE_t dist_i_j + + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + dist_i_j = self.distance_metric.dist( + &X_c[i, 0], &Y_c[j, 0], self.d) + if dist_i_j <= self.radius: + self.neigh_distances[X_start + i].push_back(dist_i_j) + self.neigh_indices[X_start + i].push_back(Y_start + j) + + return 0 + + cdef void _on_X_prange_iter_finalize(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + _simultaneous_sort( + self.neigh_distances[idx].data(), + self.neigh_indices[idx].data(), + self.neigh_indices[idx].size() + ) + + def compute(self, + str strategy = "auto", + bint return_distance = False, + bint sort_results = False + ): + # TODO: setup thread local datastructures for supporting _parallel_on_Y + self.sort_results = sort_results + self._parallel_on_X() + return self._finalise_compute(return_distance) + + def _finalise_compute(self, + bint return_distance + ): + # TODO: this is totally inefficient. + # Each vector content is copied into a Python list, which boxes + # integers. 
Those lists are then converted into numpy arrays.
+    def _vector_to_np_ndarray(vec_of_vecs):
+        # In the case of inner vectors of different shapes
+        # some boilerplate is needed to coerce
+        # a vector[vector[T]] into a np.ndarray[np.ndarray[T]]
+        vec_of_vecs = np.array(vec_of_vecs, dtype=np.ndarray)
+        for i, v in enumerate(vec_of_vecs):
+            vec_of_vecs[i] = np.array(v)
+
+        return vec_of_vecs
+
+    if return_distance:
+        return _vector_to_np_ndarray(self.neigh_distances), _vector_to_np_ndarray(self.neigh_indices)
+
+    return _vector_to_np_ndarray(self.neigh_indices)

diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index 6fd445d2c1a00..01f8056319408 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -22,6 +22,7 @@ def configuration(parent_package="", top_path=None):
     config.add_extension(
         "_parallel_reductions",
         sources=["_parallel_reductions.pyx"],
+        language="c++",
         libraries=libraries,
     )

From e37b147e1f8b6a0da026d1265aa22f9f85ee88f2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 1 Jul 2021 18:37:03 +0200
Subject: [PATCH 050/290] [WIP] Add RadiusNeighborhood

---
 sklearn/metrics/_parallel_reductions.pyx | 50 +++++++++++++++++-------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index a73ce9225efe4..f9c171edabbf7 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -41,6 +41,38 @@ from ..utils._heap cimport _simultaneous_sort, _push
 from ..utils._openmp_helpers import _openmp_effective_n_threads
 from ..utils._typedefs cimport ITYPE_t, DTYPE_t
 from ..utils._typedefs import ITYPE, DTYPE
+from ..utils._typedefs import DTYPECODE, ITYPECODE
+
+######################
+## std::vector to np.ndarray coercion
+# TODO: for now using this simple solution: https://stackoverflow.com/a/23873586
+# A better solution would ensure that the same allocator implementation is used.
+# The implementations depend on the runtime's allocator, which can differ
+# in some configurations and would then make the program crash.
+
+cdef extern from "numpy/arrayobject.h":
+    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)
+
+cdef np.ndarray[DITYPE_t, ndim=1] buffer_to_numpy_array(DITYPE_t * ptr, np.npy_intp size):
+    """ Create a numpy ndarray given a buffer and its size. """
+    typenum = DTYPECODE if DITYPE_t is DTYPE_t else ITYPECODE
+    cdef np.ndarray[DITYPE_t, ndim=1] arr = np.PyArray_SimpleNewFromData(1, &size, typenum, ptr)
+
+    # Makes the numpy array responsible for the life-cycle of its buffer.
+    PyArray_ENABLEFLAGS(arr, np.NPY_OWNDATA)
+    return arr
+
+cpdef np.ndarray _vector_to_np_ndarray(vector[vector[DITYPE_t]] vec_of_vecs):
+    # In the case of inner vectors of different shapes
+    # some boilerplate is needed to coerce
+    # a vector[vector[T]] into a np.ndarray[np.ndarray[T]]
+    np_arrays = []
+
+    for i in range(vec_of_vecs.size()):
+        np_arrays.append(buffer_to_numpy_array(vec_of_vecs[i].data(), vec_of_vecs[i].size()))
+
+    return np.array(np_arrays, np.ndarray)
+#####################

 cdef class PairwiseDistancesReduction:

@@ -836,20 +868,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
     def _finalise_compute(self,
         bint return_distance
     ):
-        # TODO: this is totally inefficient.
-        # Each vector content is copied into a Python list, which boxes
-        # integers.
Those lists are then converted into numpy arrays.
-        def _vector_to_np_ndarray(vec_of_vecs):
-            # In the case of inner vectors of different shapes
-            # some boilerplate is needed to coerce
-            # a vector[vector[T]] into a np.ndarray[np.ndarray[T]]
-            vec_of_vecs = np.array(vec_of_vecs, dtype=np.ndarray)
-            for i, v in enumerate(vec_of_vecs):
-                vec_of_vecs[i] = np.array(v)
-
-            return vec_of_vecs
-
         if return_distance:
-            return _vector_to_np_ndarray(self.neigh_distances), _vector_to_np_ndarray(self.neigh_indices)
+            return (_vector_to_np_ndarray[DTYPE_t](self.neigh_distances),
+                    _vector_to_np_ndarray[ITYPE_t](self.neigh_indices))

-        return _vector_to_np_ndarray(self.neigh_indices)
+        return _vector_to_np_ndarray[ITYPE_t](self.neigh_indices)

From 29d145ad4df7de89fc6208a5ce446ecfb2056661 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 1 Jul 2021 18:37:03 +0200
Subject: [PATCH 051/290] [WIP] Add RadiusNeighborhood

This creates a double free, as the buffers are tied to vectors attached
to the instance. When the RadiusNeighborhood instance gets deleted, so
do the buffers, and the content of the returned numpy arrays makes no
sense. It also crashes when one deallocates the numpy arrays.

One needs the final buffers' lifetime not to be tied to the instance's
vectors.
---
 sklearn/metrics/_parallel_reductions.pyx | 31 ++++++++++++------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index f9c171edabbf7..da9fb0dd70e65 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -39,9 +39,9 @@ from ..utils._cython_blas cimport (
 from ..utils._heap cimport _simultaneous_sort, _push
 from ..utils._openmp_helpers import _openmp_effective_n_threads
-from ..utils._typedefs cimport ITYPE_t, DTYPE_t
+from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t
+from ..utils._typedefs cimport ITYPECODE, DTYPECODE
 from ..utils._typedefs import ITYPE, DTYPE
-from ..utils._typedefs import DTYPECODE, ITYPECODE

@@ -62,16 +62,6 @@ cdef np.ndarray[DITYPE_t, ndim=1] buffer_to_numpy_array(DITYPE_t * ptr, np.npy_i
     PyArray_ENABLEFLAGS(arr, np.NPY_OWNDATA)
     return arr

-cpdef np.ndarray _vector_to_np_ndarray(vector[vector[DITYPE_t]] vec_of_vecs):
-    # In the case of inner vectors of different shapes
-    # some boilerplate is needed to coerce
-    # a vector[vector[T]] into a np.ndarray[np.ndarray[T]]
-    np_arrays = []
-
-    for i in range(vec_of_vecs.size()):
-        np_arrays.append(buffer_to_numpy_array(vec_of_vecs[i].data(), vec_of_vecs[i].size()))
-
-    return np.array(np_arrays, np.ndarray)
 #####################

@@ -869,7 +859,18 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         bint return_distance
     ):
         if return_distance:
-            return (_vector_to_np_ndarray[DTYPE_t](self.neigh_distances),
-                    _vector_to_np_ndarray[ITYPE_t](self.neigh_indices))
+            np_arrays_indices = []
+            np_arrays_distances = []
+
+            for i in range(self.n_X):
+                np_arrays_distances.append(buffer_to_numpy_array(self.neigh_distances[i].data(), self.neigh_distances[i].size()))
+                np_arrays_indices.append(buffer_to_numpy_array(self.neigh_indices[i].data(), self.neigh_indices[i].size()))
+
+            return np.array(np_arrays_distances, np.ndarray), np.array(np_arrays_indices, np.ndarray)
+
+        np_arrays_indices = []
+
+        for i in range(self.n_X):
+            np_arrays_indices.append(buffer_to_numpy_array(self.neigh_indices[i].data(), self.neigh_indices[i].size()))

-        return
_vector_to_np_ndarray[ITYPE_t](self.neigh_indices)
+        return np.array(np_arrays_indices, np.ndarray)

From dcca5033dedc3326b5285f789ce2fcebf68f9403 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 1 Jul 2021 18:37:03 +0200
Subject: [PATCH 052/290] [WIP] Add RadiusNeighborhood

To solve the problem given above, we dynamically allocate vectors which
won't be freed; their buffers eventually will be, as their ownership is
transferred to numpy arrays.
---
 sklearn/metrics/_parallel_reductions.pyx | 62 +++++++++++++++++-------
 1 file changed, 44 insertions(+), 18 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index da9fb0dd70e65..67c974e965b8f 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -16,6 +16,7 @@ np.import_array()

 from libc.stdlib cimport free, malloc
 from libcpp.vector cimport vector
+from cython.operator cimport dereference as deref

 from cython.parallel cimport parallel, prange

@@ -279,9 +280,6 @@ cdef class PairwiseDistancesReduction:
         on a pair of chunks"""
         return -1

-    def __dealloc__(self):
-        pass
-
     # Placeholder methods which can be implemented

     cdef void _on_X_parallel_init(self,
@@ -399,7 +397,6 @@ cdef class ArgKmin(PairwiseDistancesReduction):
         self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)

     def __dealloc__(self):
-        PairwiseDistancesReduction.__dealloc__(self)
         if self.heaps_indices_chunks is not NULL:
             free(self.heaps_indices_chunks)
         else:
@@ -766,8 +763,32 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         # NOTE: not used for now.
         DTYPE_t radius_proxy

-        vector[vector[ITYPE_t]] neigh_indices
-        vector[vector[DTYPE_t]] neigh_distances
+        # We want resizable buffers which we will
+        # wrap within numpy arrays at the end.
+        #
+        # std::vector comes as a handy interface for
+        # efficient resizable buffers.
+        #
+        # Though it is possible to access their buffer
+        # address with std::vector::data, their buffer
+        # can't be stolen: its life-time is tied to
+        # the vector's.
+        #
+        # To solve this, we dynamically
+        # allocate vectors which won't be
+        # freed, but their buffer eventually will
+        # be, as the ownership will be transferred to
+        # numpy arrays.
+        #
+        # TODO: Find a proper way to handle this
+        # It's "OK-ish" as numpy arrays are
+        # then responsible for their buffers'
+        # lifetime, which covers most of the
+        # vectors' actual data (residual metadata
+        # exists and won't be deleted, but it is small).
+        # Still, meh.
+        vector[vector[ITYPE_t]] * neigh_indices
+        vector[vector[DTYPE_t]] * neigh_distances

         bint sort_results

@@ -798,8 +819,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         self.radius = radius
         self.sort_results = False

-        self.neigh_indices.resize(self.n_X)
-        self.neigh_distances.resize(self.n_X)
+        # Won't be freed for reasons stated at their definition.
+        self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X)
+        self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X)

     cdef int _reduce_on_chunks(self,
         const DTYPE_t[:, ::1] X,
         const DTYPE_t[:, ::1] Y,
@@ -822,8 +844,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
                 dist_i_j = self.distance_metric.dist(
                     &X_c[i, 0], &Y_c[j, 0], self.d)
                 if dist_i_j <= self.radius:
-                    self.neigh_distances[X_start + i].push_back(dist_i_j)
-                    self.neigh_indices[X_start + i].push_back(Y_start + j)
+                    deref(self.neigh_distances)[X_start + i].push_back(dist_i_j)
+                    deref(self.neigh_indices)[X_start + i].push_back(Y_start + j)

         return 0

@@ -840,9 +862,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         if self.sort_results:
             for idx in range(X_start, X_end):
                 _simultaneous_sort(
-                    self.neigh_distances[idx].data(),
-                    self.neigh_indices[idx].data(),
-                    self.neigh_indices[idx].size()
+                    deref(self.neigh_distances)[idx].data(),
+                    deref(self.neigh_indices)[idx].data(),
+                    deref(self.neigh_indices)[idx].size()
                 )

     def compute(self,
@@ -855,6 +877,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         self._parallel_on_X()
         return self._finalise_compute(return_distance)

+
     def _finalise_compute(self,
         bint return_distance
     ):
@@ -863,14 +886,17 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
             for i in range(self.n_X):
-                np_arrays_distances.append(buffer_to_numpy_array(self.neigh_distances[i].data(), self.neigh_distances[i].size()))
-                np_arrays_indices.append(buffer_to_numpy_array(self.neigh_indices[i].data(), self.neigh_indices[i].size()))
+                np_arrays_distances.append(buffer_to_numpy_array(deref(self.neigh_distances)[i].data(),
+                                                                 deref(self.neigh_distances)[i].size()))
+                np_arrays_indices.append(buffer_to_numpy_array(deref(self.neigh_indices)[i].data(),
+                                                               deref(self.neigh_indices)[i].size()))

-            return np.array(np_arrays_distances, np.ndarray), np.array(np_arrays_indices, np.ndarray)
+            return np.array(np_arrays_distances, dtype=np.ndarray), np.array(np_arrays_indices, dtype=np.ndarray)

         np_arrays_indices = []

         for i in range(self.n_X):
-            np_arrays_indices.append(buffer_to_numpy_array(self.neigh_indices[i].data(), self.neigh_indices[i].size()))
+            np_arrays_indices.append(buffer_to_numpy_array(deref(self.neigh_indices)[i].data(),
+                                                           deref(self.neigh_indices)[i].size()))

-        return np.array(np_arrays_indices, np.ndarray)
+        return np.array(np_arrays_indices, dtype=np.ndarray)

From eaedf5382ada84f9fb64611e7f6c9c7f7e3c107c Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Mon, 5 Jul 2021 17:09:48 +0200
Subject: [PATCH 053/290] Exclude some distances from valid ones

---
 sklearn/metrics/_parallel_reductions.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 67c974e965b8f..365e2f841cf5c 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -101,7 +101,7 @@ cdef class PairwiseDistancesReduction:

     @classmethod
     def valid_metrics(cls):
-        return {*METRIC_MAPPING.keys()}
+        return {*METRIC_MAPPING.keys()}.difference({"pyfunc", "sokalmichener"})

From 70b13cd722f619c9d3b88bd8fbdd331111ebba82 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Mon, 5 Jul 2021 17:12:04 +0200
Subject: [PATCH 054/290] Add temporary consistency test

---
 sklearn/neighbors/tests/test_neighbors.py | 36 +++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git
a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 959ed6bfd7210..40c2dfd5bcaa1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -19,6 +19,7 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError +from sklearn.metrics._parallel_reductions import RadiusNeighborhood from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split @@ -1909,3 +1910,38 @@ def test_fast_sqeuclidean_translation_invariance( assert_allclose(reference_dist, dist) assert_array_equal(reference_nns, nns) + + +@pytest.mark.parametrize("n", [10 ** i for i in [3, 4]]) +@pytest.mark.parametrize("d", [2]) +@pytest.mark.parametrize("ratio_train_test", [10, 1, 0.5]) +@pytest.mark.parametrize("radius", [100, 500]) +@pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) +def test_fast_radius_neighborhood_reduction_consistency( + n, + d, + ratio_train_test, + radius, + metric, + spread=1000, + dtype=np.float64, +): + # Temporary consistency check + rng = np.random.RandomState(1) + + X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread + X_test = ( + rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) * spread + ) + + rn = RadiusNeighborhood.get_for(X=X_test, Y=X_train, radius=radius, metric=metric) + dists, indices = rn.compute(return_distance=True, sort_results=True) + nn = NearestNeighbors(radius=radius, metric=metric) + nn.fit(X_train) + reference_dists, references_indices = nn.radius_neighbors( + X_test, return_distance=True, sort_results=True + ) + + for i in range(X_test.shape[0]): + assert_allclose(reference_dists[i], dists[i]) + assert_array_equal(references_indices[i], indices[i]) From 0f87e6f60d841e1833f4a6f184b70f538b28a426 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 5 Jul 2021 17:24:55 +0200 Subject: [PATCH 055/290] Move results allocation from initialisation to compute So that buffers are recreated and not freed. --- sklearn/metrics/_parallel_reductions.pyx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 365e2f841cf5c..d413b344229f5 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -819,11 +819,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.radius = radius self.sort_results = False - # Won't be freed for reasons stated at their definition. - self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) - self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) - - cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -872,6 +867,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint return_distance = False, bint sort_results = False ): + # Won't be freed for reasons stated at their definition. 
+        self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X)
+        self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X)
+
         # TODO: setup thread local datastructures for supporting _parallel_on_Y
         self.sort_results = sort_results
         self._parallel_on_X()

From 633d3f4f7f7668acf515904003bca5ff1727a433 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 7 Jul 2021 17:58:42 +0200
Subject: [PATCH 056/290] Address review comments

Co-authored-by: Olivier Grisel
---
 sklearn/metrics/_parallel_reductions.pyx  | 38 +++++++++++------------
 sklearn/neighbors/tests/test_neighbors.py |  3 +-
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index d413b344229f5..f4843c06c3ca1 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -763,29 +763,25 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         # NOTE: not used for now.
         DTYPE_t radius_proxy

-        # We want resizable buffers which we will
-        # wrap within numpy arrays at the end.
+        # We want resizable buffers which we will wrap within numpy
+        # arrays at the end.
         #
-        # std::vector comes as a handy interface for
-        # efficient resizable buffers.
+        # std::vector comes as a handy interface for efficient resizable
+        # buffers.
         #
-        # Though it is possible to access their buffer
-        # address with std::vector::data, their buffer
-        # can't be stolen: its life-time is tied to
-        # the vector's.
+        # Though it is possible to access their buffer address with
+        # std::vector::data, their buffer can't be stolen: its
+        # life-time is tied to the vector's.
         #
-        # To solve this, we dynamically
-        # allocate vectors which won't be
-        # freed, but their buffer eventually will
-        # be, as the ownership will be transferred to
-        # numpy arrays.
+        # To solve this, we dynamically allocate vectors which won't be
+        # freed, but their buffer eventually will be, as the ownership
+        # will be transferred to numpy arrays.
+        #
+        # TODO: Find a proper way to handle the buffers' lifetime
+        # It's "OK-ish" as numpy arrays are then responsible for their
+        # buffers' lifetime, which covers most of the vectors' actual data
+        # (residual metadata exists and won't be deleted, but it is small).
         #
-        # TODO: Find a proper way to handle this
-        # It's "OK-ish" as numpy arrays are
-        # then responsible for their buffers'
-        # lifetime, which covers most of the
-        # vectors' actual data (residual metadata
-        # exists and won't be deleted, but it is small).
-        # Still, meh.
        vector[vector[ITYPE_t]] * neigh_indices
        vector[vector[DTYPE_t]] * neigh_distances
@@ -880,6 +876,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
     def _finalise_compute(self,
         bint return_distance
     ):
+        # TODO: factorise this (currently set like so to avoid having a missing symbol
+        # in the generated shared library)
         if return_distance:
             np_arrays_indices = []
             np_arrays_distances = []
@@ -892,6 +890,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):

             return np.array(np_arrays_distances, dtype=np.ndarray), np.array(np_arrays_indices, dtype=np.ndarray)

+        free(self.neigh_distances)
+
         np_arrays_indices = []

         for i in range(self.n_X):

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 40c2dfd5bcaa1..7e7ac3b985a3e 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1926,7 +1926,8 @@ def test_fast_radius_neighborhood_reduction_consistency(
     spread=1000,
     dtype=np.float64,
 ):
-    # Temporary consistency check
+    # Temporary transitional consistency check
+    # TODO: remove once the implementation is stabilized.
     rng = np.random.RandomState(1)

     X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread

From f5d291507763dcc4c5a47ec89d38224441e08b1e Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 7 Jul 2021 18:01:05 +0200
Subject: [PATCH 057/290] Exclude some DistanceMetrics for
 PairwiseDistancesReduction

---
 sklearn/metrics/_parallel_reductions.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index f4843c06c3ca1..840ec0aa10fb1 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -101,7 +101,7 @@ cdef class PairwiseDistancesReduction:

     @classmethod
     def valid_metrics(cls):
-        return {*METRIC_MAPPING.keys()}.difference({"pyfunc", "sokalmichener"})
+        excluded = {"pyfunc", "sokalmichener", "matching", "jaccard"}
+        return {*METRIC_MAPPING.keys()}.difference(excluded)

From 39c4788b8edb2587ae1d6afc1eba9e8e688ef182 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 8 Jul 2021 17:16:09 +0200
Subject: [PATCH 058/290] Introduce utils functions for vector to ndarray
 coercion

---
 sklearn/metrics/_parallel_reductions.pyx | 51 ++++++++++++++----------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 840ec0aa10fb1..422d5e1c14214 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -63,6 +63,31 @@ cdef np.ndarray[DITYPE_t, ndim=1] buffer_to_numpy_array(DITYPE_t * ptr, np.npy_i
     PyArray_ENABLEFLAGS(arr, np.NPY_OWNDATA)
     return arr

+# TODO: this got duplicated because type covariance is not supported; i.e.
the following function
+#
+#   cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector[vector[DITYPE_t]] * vecs)
+#
+# cannot be dispatched for vector[vector[ITYPE_t]]* and vector[vector[DTYPE_t]]*
+cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays_ITYPE(vector[vector[ITYPE_t]]* vecs):
+    cdef ITYPE_t n = deref(vecs).size()
+    np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray)
+
+    for i in range(n):
+        np_arrays_of_np_arrays[i] = buffer_to_numpy_array(deref(vecs)[i].data(),
+                                                          deref(vecs)[i].size())
+
+    return np_arrays_of_np_arrays
+
+cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays_DTYPE(vector[vector[DTYPE_t]]* vecs):
+    cdef ITYPE_t n = deref(vecs).size()
+    np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray)
+
+    for i in range(n):
+        np_arrays_of_np_arrays[i] = buffer_to_numpy_array(deref(vecs)[i].data(),
+                                                          deref(vecs)[i].size())
+
+    return np_arrays_of_np_arrays
+
 #####################

@@ -873,30 +898,14 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         self._parallel_on_X()
         return self._finalise_compute(return_distance)

-
     def _finalise_compute(self,
         bint return_distance
     ):
-        # TODO: factorise this (currently set like so to avoid having a missing symbol
-        # in the generated shared library)
         if return_distance:
-            np_arrays_indices = []
-            np_arrays_distances = []
-
-            for i in range(self.n_X):
-                np_arrays_distances.append(buffer_to_numpy_array(deref(self.neigh_distances)[i].data(),
-                                                                 deref(self.neigh_distances)[i].size()))
-                np_arrays_indices.append(buffer_to_numpy_array(deref(self.neigh_indices)[i].data(),
-                                                               deref(self.neigh_indices)[i].size()))
-
-            return np.array(np_arrays_distances, dtype=np.ndarray), np.array(np_arrays_indices, dtype=np.ndarray)
+            return (_coerce_vectors_to_np_nd_arrays_DTYPE(self.neigh_distances),
+                    _coerce_vectors_to_np_nd_arrays_ITYPE(self.neigh_indices))

+        # We need to free the buffers here because they won't be
+        #
         free(self.neigh_distances)
-
-        np_arrays_indices = []
-
-        for i in range(self.n_X):
-            np_arrays_indices.append(buffer_to_numpy_array(deref(self.neigh_indices)[i].data(),
-                                                           deref(self.neigh_indices)[i].size()))
-
-        return np.array(np_arrays_indices, dtype=np.ndarray)
+        return _coerce_vectors_to_np_nd_arrays_ITYPE(self.neigh_indices)

From b58321f9def97e88af2589e3ac71489da6fbbe4a Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 8 Jul 2021 18:59:41 +0200
Subject: [PATCH 059/290] Introduce parallel_on_Y for RadiusNeighborhood

---
 sklearn/metrics/_parallel_reductions.pyx | 68 +++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 422d5e1c14214..992ccfe694054 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -44,6 +44,10 @@ from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t
 from ..utils._typedefs cimport ITYPECODE, DTYPECODE
 from ..utils._typedefs import ITYPE, DTYPE

+
+cdef extern from "<algorithm>" namespace "std" nogil:
+    OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first)
+
 ######################
 ## std::vector to np.ndarray coercion
 # TODO: for now using this simple solution: https://stackoverflow.com/a/23873586

@@ -348,7 +352,7 @@ cdef class PairwiseDistancesReduction:
         return

     cdef void _on_Y_finalize(self,
-        ITYPE_t thread_num,
+        ITYPE_t num_threads,
     ) nogil:
         return

@@ -537,7 +541,7 @@ cdef class ArgKmin(PairwiseDistancesReduction):
             free(self.heaps_indices_chunks[thread_num])

     cdef void
_on_Y_finalize(self, - ITYPE_t thread_num, + ITYPE_t num_threads, ) nogil: cdef: ITYPE_t idx @@ -812,6 +816,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): vector[vector[ITYPE_t]] * neigh_indices vector[vector[DTYPE_t]] * neigh_distances + vector[vector[ITYPE_t]] ** neigh_indices_chunks + vector[vector[DTYPE_t]] ** neigh_distances_chunks + bint sort_results @@ -866,6 +873,18 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): return 0 + cdef void _on_X_prange_iter_init(self, + ITYPE_t thread_num, + ITYPE_t X_chunk_idx, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread-local vectors' pointers to the main vectors'. + self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + cdef void _on_X_prange_iter_finalize(self, ITYPE_t thread_num, ITYPE_t X_chunk_idx, @@ -883,6 +902,51 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() ) + cdef void _on_Y_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X) + self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X) + + cdef void _on_Y_finalize(self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + + # Merge associated vectors into one and sort in parallel + # in ascending order w.r.t the distances if needed + for idx in range(self.n_X): #, schedule='static'): + idx_n_element = 0 + for jdx in range(num_threads): + idx_n_element += self.neigh_indices_chunks[jdx][idx].size() + + deref(self.neigh_distances)[idx].reserve(idx_n_element) + deref(self.neigh_indices)[idx].reserve(idx_n_element) + + for jdx in range(num_threads): + move(deref(self.neigh_distances_chunks[jdx])[idx].begin(), + deref(self.neigh_distances_chunks[jdx])[idx].end(), + deref(self.neigh_distances)[idx].end()) + move(deref(self.neigh_indices_chunks[jdx])[idx].begin(), + deref(self.neigh_indices_chunks[jdx])[idx].end(), + deref(self.neigh_indices)[idx].end()) + + for jdx in range(num_threads): + free(self.neigh_distances_chunks[jdx]) + free(self.neigh_indices_chunks[jdx]) + + if self.sort_results: + for idx in prange(self.n_X, schedule='static', + num_threads=self.effective_omp_n_thread, + nogil=True): + _simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + return def compute(self, str strategy = "auto", From c24d18476a8561470e8d667c102cfa4b9d9ba263 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 09:26:13 +0200 Subject: [PATCH 060/290] Sort returned valid distances --- sklearn/metrics/_parallel_reductions.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 992ccfe694054..f248680fac6fd 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -131,7 +131,7 @@ cdef class PairwiseDistancesReduction: @classmethod def valid_metrics(cls): excluded = {"pyfunc", "sokalmichener", "matching", "jaccard"} - return {*METRIC_MAPPING.keys()}.difference(excluded) + return sorted({*METRIC_MAPPING.keys()}.difference(excluded)) def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults From 
1ee3ae1d45aee187a3e6f1d64d03b9d26b35c9f7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 09:27:58 +0200 Subject: [PATCH 061/290] Update comments --- sklearn/metrics/_parallel_reductions.pyx | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index f248680fac6fd..dc085ee110790 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -202,7 +202,7 @@ cdef class PairwiseDistancesReduction: with nogil, parallel(num_threads=num_threads): thread_num = openmp.omp_get_thread_num() - # Allocating thread local datastructures + # Allocating thread datastructures self._on_X_parallel_init(thread_num) for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): @@ -212,7 +212,7 @@ cdef class PairwiseDistancesReduction: else: X_end = X_start + self.X_n_samples_chunk - # Reinitializing thread local datastructures for the new X chunk + # Reinitializing thread datastructures for the new X chunk self._on_X_prange_iter_init(thread_num, X_chunk_idx, X_start, X_end) for Y_chunk_idx in range(self.Y_n_chunks): @@ -230,12 +230,12 @@ cdef class PairwiseDistancesReduction: thread_num, ) - # Adjusting thread local datastructures on the full pass on Y + # Adjusting thread datastructures on the full pass on Y self._on_X_prange_iter_finalize(thread_num, X_chunk_idx, X_start, X_end) # end: for X_chunk_idx - # Deallocating thread local datastructures + # Deallocating thread datastructures self._on_X_parallel_finalize(thread_num) # end: with nogil, parallel @@ -263,10 +263,9 @@ cdef class PairwiseDistancesReduction: X_end = X_start + self.X_n_samples_chunk with nogil, parallel(num_threads=num_threads): - # Thread local buffers thread_num = openmp.omp_get_thread_num() - # Allocating thread local datastructures + # Initializing datastructures used in this thread self._on_Y_parallel_init(thread_num) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): @@ -286,12 +285,13 @@ cdef class PairwiseDistancesReduction: ) # end: prange - # Synchronizing thread local datastructures with the main ones + # Synchronizing the thread datastructures with the main ones # This can potentially block self._on_Y_parallel_finalize(thread_num, X_chunk_idx, X_start, X_end) # end: with nogil, parallel # end: for X_chunk_idx + # Deallocating temporary datastructures # Adjusting main datastructures before returning self._on_Y_finalize(num_threads) return @@ -422,7 +422,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) - # Pointers to thread local heaps used in threads for `parallel_on_Y` solely + # Pointers to thread heaps used in threads for `parallel_on_Y` solely self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) @@ -476,8 +476,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) nogil: # As this strategy is embarrassingly parallel, we can set the - # thread-local heaps pointers to the proper position - # on the main heaps + # thread heaps pointers to the proper position on the main heaps self.heaps_approx_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] @@ -969,7 +968,7 @@ cdef class 
RadiusNeighborhood(PairwiseDistancesReduction): return (_coerce_vectors_to_np_nd_arrays_DTYPE(self.neigh_distances), _coerce_vectors_to_np_nd_arrays_ITYPE(self.neigh_indices)) - # We need to free the buffers here because they won't be - # + # We need to free the buffers here because they won't be managed + # by a numpy array then. free(self.neigh_distances) return _coerce_vectors_to_np_nd_arrays_ITYPE(self.neigh_indices) From 6ecf3f3f664682dbc74841eab93d403f2aecd869 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 09:29:47 +0200 Subject: [PATCH 062/290] Change template for parallel_on_X --- sklearn/metrics/_parallel_reductions.pyx | 75 ++++++++++++++---------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index dc085ee110790..69ba6c9a9b9b4 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -213,7 +213,7 @@ cdef class PairwiseDistancesReduction: X_end = X_start + self.X_n_samples_chunk # Reinitializing thread datastructures for the new X chunk - self._on_X_prange_iter_init(thread_num, X_chunk_idx, X_start, X_end) + self._on_X_prange_iter_init(thread_num, X_start, X_end) for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -231,7 +231,7 @@ cdef class PairwiseDistancesReduction: ) # Adjusting thread datastructures on the full pass on Y - self._on_X_prange_iter_finalize(thread_num, X_chunk_idx, X_start, X_end) + self._on_X_prange_iter_finalize(thread_num, X_start, X_end) # end: for X_chunk_idx @@ -255,6 +255,10 @@ cdef class PairwiseDistancesReduction: ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) ITYPE_t thread_num + # TODO: put the "with nogil, parallel"-context here + # Allocating datastructures + self._on_Y_init(num_threads) + for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: @@ -287,7 +291,7 @@ cdef class PairwiseDistancesReduction: # Synchronizing the thread datastructures with the main ones # This can potentially block - self._on_Y_parallel_finalize(thread_num, X_chunk_idx, X_start, X_end) + self._on_Y_parallel_finalize(thread_num, X_start, X_end) # end: with nogil, parallel # end: for X_chunk_idx @@ -324,7 +328,6 @@ cdef class PairwiseDistancesReduction: cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, - ITYPE_t X_chunk_idx, ITYPE_t X_start, ITYPE_t X_end, ) nogil: @@ -332,12 +335,16 @@ cdef class PairwiseDistancesReduction: cdef void _on_X_prange_iter_finalize(self, ITYPE_t thread_num, - ITYPE_t X_chunk_idx, ITYPE_t X_start, ITYPE_t X_end, ) nogil: return + cdef void _on_Y_init(self, + ITYPE_t num_threads, + ) nogil: + return + cdef void _on_Y_parallel_init(self, ITYPE_t thread_num, ) nogil: @@ -345,7 +352,6 @@ cdef class PairwiseDistancesReduction: cdef void _on_Y_parallel_finalize(self, ITYPE_t thread_num, - ITYPE_t X_chunk_idx, ITYPE_t X_start, ITYPE_t X_end, ) nogil: @@ -470,7 +476,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, - ITYPE_t X_chunk_idx, ITYPE_t X_start, ITYPE_t X_end, ) nogil: @@ -482,7 +487,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): cdef void _on_X_prange_iter_finalize(self, ITYPE_t thread_num, - ITYPE_t X_chunk_idx, ITYPE_t X_start, ITYPE_t X_end, ) nogil: @@ -497,19 +501,25 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.k ) - 
cdef void _on_Y_parallel_init(self,
-        ITYPE_t thread_num,
+    cdef void _on_Y_init(self,
+        ITYPE_t num_threads,
     ) nogil:
         cdef:
             # number of scalar elements
             ITYPE_t heaps_size = self.X_n_samples_chunk * self.k
+            ITYPE_t thread_num

-        # As chunks of X are shared across threads, so must their
-        # heaps. To solve this, each thread has its own locals
-        # heaps which are then synchronised back in the main ones.
-        self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t *> malloc(heaps_size * sizeof(DTYPE_t))
-        self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(heaps_size * sizeof(ITYPE_t))
+        for thread_num in prange(num_threads, schedule='static', nogil=True,
+                                 num_threads=num_threads):
+            # As chunks of X are shared across threads, so must their
+            # heaps be. To solve this, each thread has its own heaps
+            # which are then synchronised back into the main ones.
+            self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t *> malloc(heaps_size * sizeof(DTYPE_t))
+            self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(heaps_size * sizeof(ITYPE_t))

+    cdef void _on_Y_parallel_init(self,
+        ITYPE_t thread_num,
+    ) nogil:
         # Initialising heaps (memset can't be used here)
         for idx in range(self.X_n_samples_chunk * self.k):
             self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
             self.heaps_indices_chunks[thread_num][idx] = -1

     cdef void _on_Y_parallel_finalize(self,
         ITYPE_t thread_num,
-        ITYPE_t X_chunk_idx,
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
         cdef:
             ITYPE_t idx, jdx
+        # TODO: see if synchronisation can be made in parallel samples-wise
+        # thanks to the context which wraps the call to this method
         with gil:
             # Synchronising the thread local heaps
             # with the main heaps
             for idx in range(X_end - X_start):
                 for jdx in range(self.k):
                     _push(
                         &self.argkmin_distances[X_start + idx, 0],
                         &self.argkmin_indices[X_start + idx, 0],
                         self.k,
                         self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx],
                         self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
                     )

-        free(self.heaps_approx_distances_chunks[thread_num])
-        free(self.heaps_indices_chunks[thread_num])

     cdef void _on_Y_finalize(self,
         ITYPE_t num_threads,
     ) nogil:
         cdef:
-            ITYPE_t idx
+            ITYPE_t idx, thread_num

-        # Sort the main heaps into arrays in parallel
-        # in ascending order w.r.t the distances
-        for idx in prange(self.n_X, schedule='static', nogil=True,
-                          num_threads=self.effective_omp_n_thread):
-            _simultaneous_sort(
-                &self.argkmin_distances[idx, 0],
-                &self.argkmin_indices[idx, 0],
-                self.k,
-            )
+        with nogil, parallel(num_threads=self.effective_omp_n_thread):
+            # Deallocating temporary datastructures
+            for thread_num in prange(num_threads, schedule='static'):
+                free(self.heaps_approx_distances_chunks[thread_num])
+                free(self.heaps_indices_chunks[thread_num])
+
+            # Sort the main heaps into arrays in parallel
+            # in ascending order w.r.t the distances
+            for idx in prange(self.n_X, schedule='static'):
+                _simultaneous_sort(
+                    &self.argkmin_distances[idx, 0],
+                    &self.argkmin_indices[idx, 0],
+                    self.k,
+                )
         return

     cdef void _exact_distances(self,
@@ -697,11 +711,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):

     cdef void _on_Y_parallel_finalize(self,
         ITYPE_t thread_num,
-        ITYPE_t X_chunk_idx,
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
-        ArgKmin._on_Y_parallel_finalize(self, thread_num, X_chunk_idx, X_start, X_end)
+        ArgKmin._on_Y_parallel_finalize(self, thread_num, X_start, X_end)
         free(self.dist_middle_terms_chunks[thread_num])

     cdef int _reduce_on_chunks(self,
         const DTYPE_t[:, ::1] X,
         const DTYPE_t[:, ::1] Y,
@@ -874,7 +887,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):

     cdef void _on_X_prange_iter_init(self,
         ITYPE_t thread_num,
-        ITYPE_t X_chunk_idx,
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
@@ -886,7 +898,6 @@ cdef class
RadiusNeighborhood(PairwiseDistancesReduction): cdef void _on_X_prange_iter_finalize(self, ITYPE_t thread_num, - ITYPE_t X_chunk_idx, ITYPE_t X_start, ITYPE_t X_end, ) nogil: From 4e0c46569c98245e0d18147dd5558360f2f43cb3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 10:04:15 +0200 Subject: [PATCH 063/290] Fix number of threads for parallel_on_Y --- sklearn/metrics/_parallel_reductions.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 69ba6c9a9b9b4..e3763d771dcad 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -252,7 +252,7 @@ cdef class PairwiseDistancesReduction: """ cdef: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t num_threads = min(self.Y_n_chunks, self.effective_omp_n_thread) ITYPE_t thread_num # TODO: put the "with nogil, parallel"-context here From 30dcbbaa0819bc8b06c26417886b7cb9d80c2241 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 10:27:58 +0200 Subject: [PATCH 064/290] Adapt test for 'brute' algorithm --- sklearn/metrics/tests/test_pairwise.py | 7 ++-- sklearn/neighbors/tests/test_neighbors.py | 46 ++++++++++++----------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 35a6d3dd0f9e9..e2fc7d70aa4b4 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -442,7 +442,6 @@ def test_pairwise_distances_argmin_min(): expected_idx = [0, 1] expected_vals = [2, 2] - expected_vals_sq = [4, 4] # euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") @@ -460,10 +459,12 @@ def test_pairwise_distances_argmin_min(): # euclidean metric squared idx, vals = pairwise_distances_argmin_min( - X, Y, metric="euclidean", metric_kwargs={"squared": True} + X, + Y, + metric="fast_sqeuclidean", ) assert_array_almost_equal(idx, expected_idx) - assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(vals, expected_vals) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 7e7ac3b985a3e..c1d8a2d2a068a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1285,9 +1285,11 @@ def test_neighbors_badargs(): def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): # Test computing the neighbors for various metrics - # create a symmetric matrix - V = rng.rand(n_features, n_features) - VI = np.dot(V, V.T) + + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features) + test = rng.rand(n_query_pts, n_features) + V = np.cov(X.T) metrics = [ ("euclidean", {}), @@ -1299,18 +1301,25 @@ def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbor ("chebyshev", {}), ("seuclidean", dict(V=rng.rand(n_features))), ("wminkowski", dict(p=3, w=rng.rand(n_features))), - ("mahalanobis", dict(VI=VI)), + ("mahalanobis", dict(V=V)), ("haversine", {}), ] algorithms = ["brute", "ball_tree", "kd_tree"] - X = rng.rand(n_samples, n_features) - - test = rng.rand(n_query_pts, n_features) for metric, metric_params in metrics: if metric == "wminkowski" and sp_version >= 
parse_version("1.8.0"): # wminkowski will be removed in SciPy 1.8.0 continue + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X[:, feature_sl]) + X_test = np.ascontiguousarray(test[:, feature_sl]) + else: + X_train = X + X_test = test + results = {} p = metric_params.pop("p", 2) for algorithm in algorithms: @@ -1330,20 +1339,15 @@ def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbor metric_params=metric_params, ) - # Haversine distance only accepts 2D data - feature_sl = slice(None, 2) if metric == "haversine" else slice(None) - - neigh.fit(X[:, feature_sl]) + neigh.fit(X_train) - results[algorithm] = neigh.kneighbors( - test[:, feature_sl], return_distance=True - ) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) - assert_array_almost_equal(results["brute"][0], results["ball_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["ball_tree"][1]) + assert_allclose(results["brute"][0], results["ball_tree"][0]) + assert_allclose(results["brute"][1], results["ball_tree"][1]) if "kd_tree" in results: - assert_array_almost_equal(results["brute"][0], results["kd_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["kd_tree"][1]) + assert_allclose(results["brute"][0], results["kd_tree"][0]) + assert_allclose(results["brute"][1], results["kd_tree"][1]) def test_callable_metric(): @@ -1575,16 +1579,16 @@ def test_k_and_radius_neighbors_duplicates(algorithm): nn.fit(X) dist, ind = nn.kneighbors() assert_array_equal(dist, np.zeros((3, 1))) - assert_array_equal(ind, [[1], [0], [1]]) + assert_array_equal(ind, [[2], [2], [0]]) # Test that zeros are explicitly marked in kneighbors_graph. kng = nn.kneighbors_graph(mode="distance") assert_array_equal(kng.A, np.zeros((3, 3))) assert_array_equal(kng.data, np.zeros(3)) - assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) + assert_array_equal(kng.indices, [2.0, 2.0, 0.0]) assert_array_equal( nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]), ) From 1a25476e90b73f41ea0e6aeb1de68bc83673dfe9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 11:30:41 +0200 Subject: [PATCH 065/290] Only allocate temporary buffer once --- sklearn/metrics/_parallel_reductions.pyx | 28 ++++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index e3763d771dcad..2d74fbff1394b 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -701,21 +701,25 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ArgKmin._on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) - cdef void _on_Y_parallel_init(self, - ITYPE_t thread_num, + cdef void _on_Y_init(self, + ITYPE_t num_threads, ) nogil: - ArgKmin._on_Y_parallel_init(self, thread_num) - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term - self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + cdef ITYPE_t thread_num + ArgKmin._on_Y_init(self, num_threads) - cdef void _on_Y_parallel_finalize(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + 
self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t))
+
+    cdef void _on_Y_finalize(self,
+        ITYPE_t num_threads,
+    ) nogil:
+        cdef ITYPE_t thread_num
+        ArgKmin._on_Y_finalize(self, num_threads)
+
+        for thread_num in range(num_threads):
+            free(self.dist_middle_terms_chunks[thread_num])

     cdef int _reduce_on_chunks(self,
         const DTYPE_t[:, ::1] X,

From 93db1eeefe01666cb92f7eaace8ae08afbe4864d Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Fri, 9 Jul 2021 11:59:29 +0200
Subject: [PATCH 066/290] Remove call to ArgKmin.__dealloc__ in subclass

It gets called automatically anyway.
---
 sklearn/metrics/_parallel_reductions.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 2d74fbff1394b..4d24fc3d5c6ed 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -681,7 +681,6 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
         self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)

     def __dealloc__(self):
-        ArgKmin.__dealloc__(self)
         if self.dist_middle_terms_chunks is not NULL:
             free(self.dist_middle_terms_chunks)
         else:

From 12d7dfd46b2bb66f6a757a30969c6536b2ee71bc Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Fri, 9 Jul 2021 12:01:57 +0200
Subject: [PATCH 067/290] [WIP] Introduce parallel_on_Y for RadiusNeighborhood

This reworks the synchronisation out of the first "with nogil,
parallel"-context, allowing more flexibility.

For instance, this allows removing the lock previously needed for
ArgKmin's synchronisation, as we now parallelize on samples.

This adds a simple merge of vectors which is not expected to work yet.
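As a side note on the lock-free synchronisation this message describes, here is a rough NumPy sketch of the idea (illustrative only, not the Cython code of this patch; the helper name and the array shapes are invented for the sketch): each thread owns its own k-best buffer per query sample, and the buffers are reduced per sample, so each query row is written independently and no lock is needed.

import numpy as np

def merge_thread_heaps(thread_dist, thread_idx, k):
    # thread_dist, thread_idx: (n_threads, n_queries, k) arrays holding
    # each thread's k smallest distances and the matching Y indices.
    n_threads, n_queries, _ = thread_dist.shape
    # Gather every thread's candidates for each query row...
    all_dist = thread_dist.transpose(1, 0, 2).reshape(n_queries, n_threads * k)
    all_idx = thread_idx.transpose(1, 0, 2).reshape(n_queries, n_threads * k)
    # ...and keep the k smallest per row; rows never share writes,
    # which is why a per-sample parallel merge needs no locking.
    order = np.argsort(all_dist, axis=1)[:, :k]
    rows = np.arange(n_queries)[:, None]
    return all_dist[rows, order], all_idx[rows, order]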
---
 sklearn/metrics/_parallel_reductions.pyx | 148 +++++++++++++++--------
 1 file changed, 99 insertions(+), 49 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 4d24fc3d5c6ed..1bada5c52df70 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -289,9 +289,9 @@ cdef class PairwiseDistancesReduction:
                 )
             # end: prange

-            # Synchronizing the thread datastructures with the main ones
-            # This can potentially block
-            self._on_Y_parallel_finalize(thread_num, X_start, X_end)
+        # Synchronizing the thread datastructures with the main ones
+        # This can potentially block
+        self._on_Y_after_parallel(num_threads, X_start, X_end)
         # end: with nogil, parallel
         # end: for X_chunk_idx
@@ -350,8 +350,8 @@ cdef class PairwiseDistancesReduction:
     ) nogil:
         return

-    cdef void _on_Y_parallel_finalize(self,
-        ITYPE_t thread_num,
+    cdef void _on_Y_after_parallel(self,
+        ITYPE_t num_threads,
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
         return
@@ -525,27 +525,28 @@ cdef class ArgKmin(PairwiseDistancesReduction):
             self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
             self.heaps_indices_chunks[thread_num][idx] = -1

-    cdef void _on_Y_parallel_finalize(self,
-        ITYPE_t thread_num,
+    cdef void _on_Y_after_parallel(self,
+        ITYPE_t num_threads,
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
         cdef:
-            ITYPE_t idx, jdx
-        # TODO: see if synchronisation can be made in parallel samples-wise
-        # thanks to the context which wraps the call to this method
-        with gil:
-            # Synchronising the thread local heaps
-            # with the main heaps
-            for idx in range(X_end - X_start):
-                for jdx in range(self.k):
-                    _push(
-                        &self.argkmin_distances[X_start + idx, 0],
-                        &self.argkmin_indices[X_start + idx, 0],
-                        self.k,
-                        self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx],
-                        self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
-                    )
+            ITYPE_t idx, jdx, thread_num
+        with nogil, parallel(num_threads=self.effective_omp_n_thread):
+            # Synchronising the thread local heaps with the main heaps
+            # This is done in parallel samples-wise (no need for locks)
+            #
+            # Note: can this lead to false sharing?
+            for idx in prange(X_end - X_start, schedule="static"):
+                for thread_num in range(num_threads):
+                    for jdx in range(self.k):
+                        _push(
+                            &self.argkmin_distances[X_start + idx, 0],
+                            &self.argkmin_indices[X_start + idx, 0],
+                            self.k,
+                            self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx],
+                            self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
+                        )

     cdef void _on_Y_finalize(self,
@@ -863,6 +864,16 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         self.radius = radius
         self.sort_results = False

+        # Pointers to datastructures used in threads for `parallel_on_Y` solely
+        self.neigh_distances_chunks = <vector[vector[DTYPE_t]] **> malloc(
+            sizeof(self.neigh_distances) * self.effective_omp_n_thread)
+        self.neigh_indices_chunks = <vector[vector[ITYPE_t]] **> malloc(
+            sizeof(self.neigh_indices) * self.effective_omp_n_thread)
+
+    def __dealloc__(self):
+        free(self.neigh_distances_chunks)
+        free(self.neigh_indices_chunks)
+
     cdef int _reduce_on_chunks(self,
         const DTYPE_t[:, ::1] X,
         const DTYPE_t[:, ::1] Y,
@@ -915,12 +926,64 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
                 )

     cdef void _on_Y_parallel_init(self,
         ITYPE_t thread_num,
     ) nogil:
+        # As chunks of X are shared across threads, so must their
+        # vectors.
To solve this, each thread has its own vectors
+        # which are then synchronised and merged back into the main ones.
         self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X)
         self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X)

+    cdef void _merge_vectors(self,
+        ITYPE_t idx,
+        ITYPE_t num_threads,
+    ) nogil:
+        cdef:
+            ITYPE_t thread_num, idx_n_elements = 0, last_element_idx = 0
+        for thread_num in range(num_threads):
+            idx_n_elements += self.neigh_indices_chunks[thread_num][idx].size()
+
+        deref(self.neigh_distances)[idx].reserve(idx_n_elements)
+        deref(self.neigh_indices)[idx].reserve(idx_n_elements)
+
+        for thread_num in range(num_threads):
+            move(deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
+                 deref(self.neigh_distances_chunks[thread_num])[idx].end(),
+                 deref(self.neigh_distances)[idx].begin() + last_element_idx)
+            move(deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
+                 deref(self.neigh_indices_chunks[thread_num])[idx].end(),
+                 deref(self.neigh_indices)[idx].begin() + last_element_idx)
+            last_element_idx += self.neigh_indices_chunks[thread_num][idx].size()
+
+            free(self.neigh_distances_chunks[thread_num])
+            free(self.neigh_indices_chunks[thread_num])
+
+    cdef void _on_Y_after_parallel(self,
+        ITYPE_t num_threads,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx, idx_n_element, idx_current
+
+        # Merge associated vectors into one and sort in parallel
+        # in ascending order w.r.t the distances if needed
+        with nogil, parallel(num_threads=self.effective_omp_n_thread):
+            for idx in prange(self.n_X, schedule='static'):
+                self._merge_vectors(idx, num_threads)
+
+            if self.sort_results:
+                for idx in prange(self.n_X, schedule='static'):
+                    _simultaneous_sort(
+                        deref(self.neigh_distances)[idx].data(),
+                        deref(self.neigh_indices)[idx].data(),
+                        deref(self.neigh_indices)[idx].size()
+                    )
+
+        return
+
     cdef void _on_Y_finalize(self,
         ITYPE_t num_threads,
     ) nogil:
         cdef:
@@ -929,30 +992,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):

         # Merge associated vectors into one and sort in parallel
         # in ascending order w.r.t the distances if needed
-        for idx in range(self.n_X): #, schedule='static'):
-            idx_n_element = 0
-            for jdx in range(num_threads):
-                idx_n_element += self.neigh_indices_chunks[jdx][idx].size()
-
-            deref(self.neigh_distances)[idx].reserve(idx_n_element)
-            deref(self.neigh_indices)[idx].reserve(idx_n_element)
-
-            for jdx in range(num_threads):
-                move(deref(self.neigh_distances_chunks[jdx])[idx].begin(),
-                     deref(self.neigh_distances_chunks[jdx])[idx].end(),
-                     deref(self.neigh_distances)[idx].end())
-                move(deref(self.neigh_indices_chunks[jdx])[idx].begin(),
-                     deref(self.neigh_indices_chunks[jdx])[idx].end(),
-                     deref(self.neigh_indices)[idx].end())
-
-        for jdx in range(num_threads):
-            free(self.neigh_distances_chunks[jdx])
-            free(self.neigh_indices_chunks[jdx])
-
         if self.sort_results:
-            for idx in prange(self.n_X, schedule='static',
-                              num_threads=self.effective_omp_n_thread,
-                              nogil=True):
+            for idx in prange(self.n_X, schedule='static', nogil=True,
+                              num_threads=self.effective_omp_n_thread):
                 _simultaneous_sort(
                     deref(self.neigh_distances)[idx].data(),
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
                 )

     def compute(self,
         str strategy = "auto",
         bint return_distance = False,
         bint sort_results = False
     ):
-        # Won't be freed for reasons stated at their definition.
- self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) + # This won't be freed for reasons stated at their definition. self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) - # TODO: setup thread local datastructures for supporting _parallel_on_Y + # This is freed then if return_distance = False + self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) + self.sort_results = sort_results - self._parallel_on_X() + + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + return self._finalise_compute(return_distance) def _finalise_compute(self, From 937c4f3a6de920f42ef41042b7eef33bd91013a2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 17:01:45 +0200 Subject: [PATCH 068/290] Introduce parallel_on_Y for RadiusNeighborhood Use std::move to copy data from temporary vector to the main ones. --- sklearn/metrics/_parallel_reductions.pyx | 58 +++++++++++++++--------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 1bada5c52df70..00ff5371f959d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -864,7 +864,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.radius = radius self.sort_results = False - # Pointers to datastructures used in threads for `parallel_on_Y` solely + # Pointers to datastructures used in threads self.neigh_distances_chunks = malloc( sizeof(self.neigh_distances) * self.effective_omp_n_thread) self.neigh_indices_chunks = malloc( @@ -891,11 +891,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - dist_i_j = self.distance_metric.dist( - &X_c[i, 0], &Y_c[j, 0], self.d) + dist_i_j = self.distance_metric.dist(&X_c[i, 0], &Y_c[j, 0], self.d) if dist_i_j <= self.radius: - deref(self.neigh_distances)[X_start + i].push_back(dist_i_j) - deref(self.neigh_indices)[X_start + i].push_back(Y_start + j) + deref(self.neigh_distances_chunks[thread_num])[X_start + i].push_back(dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[X_start + i].push_back(Y_start + j) return 0 @@ -941,24 +940,31 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ITYPE_t num_threads, ) nogil: cdef: - ITYPE_t thread_num, idx_n_elements = 0, last_element_idx = 0 + ITYPE_t thread_num + ITYPE_t idx_n_elements = 0 + ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers once for the given for thread_num in range(num_threads): - idx_n_elements += self.neigh_indices_chunks[thread_num][idx].size() + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() - deref(self.neigh_distances)[idx].reserve(idx_n_elements) - deref(self.neigh_indices)[idx].reserve(idx_n_elements) + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + # Moving the element at the right place for thread_num in range(num_threads): - move(deref(self.neigh_distances_chunks[thread_num])[idx].begin(), - deref(self.neigh_distances_chunks[thread_num])[idx].end(), - deref(self.neigh_distances)[idx].begin() + last_element_idx) - move(deref(self.neigh_indices_chunks[thread_num])[idx].begin(), - deref(self.neigh_indices_chunks[thread_num])[idx].end(), - 
deref(self.neigh_indices)[idx].begin() + last_element_idx) - last_element_idx += self.neigh_indices_chunks[thread_num][idx].size() + move( + deref(self.neigh_distances_chunks[thread_num])[idx].begin(), + deref(self.neigh_distances_chunks[thread_num])[idx].end(), + deref(self.neigh_distances)[idx].begin() + last_element_idx + ) + move( + deref(self.neigh_indices_chunks[thread_num])[idx].begin(), + deref(self.neigh_indices_chunks[thread_num])[idx].end(), + deref(self.neigh_indices)[idx].begin() + last_element_idx + ) + last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - free(self.neigh_distances_chunks[thread_num]) - free(self.neigh_indices_chunks[thread_num]) cdef void _on_Y_after_parallel(self, ITYPE_t num_threads, @@ -966,14 +972,17 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ITYPE_t X_end, ) nogil: cdef: - ITYPE_t idx, jdx, idx_n_element, idx_current - + ITYPE_t idx, thread_num # Merge associated vectors into one and sort in parallel # in ascending order w.r.t the distances if needed with nogil, parallel(num_threads=self.effective_omp_n_thread): for idx in prange(self.n_X, schedule='static'): self._merge_vectors(idx, num_threads) + for thread_num in prange(num_threads, schedule='static'): + del self.neigh_distances_chunks[thread_num] + del self.neigh_indices_chunks[thread_num] + if self.sort_results: for idx in prange(self.n_X, schedule='static'): _simultaneous_sort( @@ -981,7 +990,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() ) - return cdef void _on_Y_finalize(self, @@ -1016,6 +1024,14 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.sort_results = sort_results + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + if strategy == 'parallel_on_Y': self._parallel_on_Y() elif strategy == 'parallel_on_X': From 9780f288f5011396cb2bc6e9245615304898bf4b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 17:14:56 +0200 Subject: [PATCH 069/290] Remove unnecessary intermediary sorts Also add comments. 
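As a rough intuition for the `_merge_vectors` helper that the two patches above introduce and document, the same merge can be sketched in pure Python, with lists standing in for the per-thread C++ vectors (all names below are illustrative, not part of the patch; Python lists grow on `extend`, so the single up-front `resize` of the C++ version has no direct equivalent):

    def merge_vectors(main, chunks, idx, num_threads):
        # Gather, for sample `idx`, the neighbors found by each thread and
        # append them behind whatever the main buffer already holds.
        for thread_num in range(num_threads):
            main[idx].extend(chunks[thread_num][idx])
            # Mimic std::move semantics: a moved-from range must not be
            # read again, only cleared or destroyed.
            chunks[thread_num][idx].clear()

    # Two threads, two query samples:
    main = [[], []]
    chunks = [[[0.1], [0.5]], [[0.2], []]]
    merge_vectors(main, chunks, idx=0, num_threads=2)
    assert main[0] == [0.1, 0.2]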
---
 sklearn/metrics/_parallel_reductions.pyx | 36 +++++++++++-------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 00ff5371f959d..37da4af825016 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -44,9 +44,11 @@ from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t
 from ..utils._typedefs cimport ITYPECODE, DTYPECODE
 from ..utils._typedefs import ITYPE, DTYPE

-
+# TODO: `std::move` is only exposed by Cython since 3.0; switch to `libcpp.algorithm.move` once Cython 3 is used.
+# Introduction in Cython:
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47
 cdef extern from "<algorithm>" namespace "std" nogil:
-    OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first)
+    OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except +

 ######################
 ## std::vector to np.ndarray coercion
@@ -533,10 +535,10 @@ cdef class ArgKmin(PairwiseDistancesReduction):
         cdef:
             ITYPE_t idx, jdx, thread_num
         with nogil, parallel(num_threads=self.effective_omp_n_thread):
-            # Synchronising the thread local heaps with the main heaps
+            # Synchronising the thread heaps with the main heaps
            # This is done in parallel samples-wise (no need for locks)
             #
-            # Note: can this lead to false sharing?
+            # NOTE: can this lead to false sharing?
             for idx in prange(X_end - X_start, schedule="static"):
                 for thread_num in range(num_threads):
                     for jdx in range(self.k):
@@ -905,7 +907,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
     ) nogil:

         # As this strategy is embarrassingly parallel, we can set the
-        # thread-local vectors' pointers to the main vectors'.
+        # thread vectors' pointers to the main vectors'.
         self.neigh_distances_chunks[thread_num] = self.neigh_distances
         self.neigh_indices_chunks[thread_num] = self.neigh_indices

@@ -944,14 +946,15 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
             ITYPE_t idx_n_elements = 0
             ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size()

-        # Resizing buffers once for the given
+        # Resizing buffers only once for the given sample index
         for thread_num in range(num_threads):
             idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size()

         deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements)
         deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements)

-        # Moving the element at the right place
+        # Moving the elements by range, using the range's first element
+        # as the reference for the insertion
         for thread_num in range(num_threads):
             move(
                 deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
@@ -973,23 +976,19 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
     ) nogil:
         cdef:
             ITYPE_t idx, thread_num
-        # Merge associated vectors into one and sort in parallel
-        # in ascending order w.r.t the distances if needed
+        # Merge associated vectors into one
+        # This is done in parallel samples-wise (no need for locks)
         with nogil, parallel(num_threads=self.effective_omp_n_thread):
             for idx in prange(self.n_X, schedule='static'):
                 self._merge_vectors(idx, num_threads)

+            # The contents of the vectors have been std::move'd;
+            # hence they can't be used anymore and can only
+            # be deleted.
for thread_num in prange(num_threads, schedule='static'): del self.neigh_distances_chunks[thread_num] del self.neigh_indices_chunks[thread_num] - if self.sort_results: - for idx in prange(self.n_X, schedule='static'): - _simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) return cdef void _on_Y_finalize(self, @@ -998,8 +997,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): cdef: ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current - # Merge associated vectors into one and sort in parallel - # in ascending order w.r.t the distances if needed + # Sort in parallel in ascending order w.r.t the distances if needed if self.sort_results: for idx in prange(self.n_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): @@ -1019,7 +1017,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): # This won't be freed for reasons stated at their definition. self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) - # This is freed then if return_distance = False + # This will be freed then solely if return_distance = False self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) self.sort_results = sort_results From 40e369d5e5e712157807ac8fd22b36302b459bbd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 17:22:56 +0200 Subject: [PATCH 070/290] Remove duplicate code to vector-to-ndarray coercion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_parallel_reductions.pyx | 34 +++++++++--------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 37da4af825016..ab2de95223d01 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -44,6 +44,12 @@ from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE from ..utils._typedefs import ITYPE, DTYPE +# As type covariance is not supported for C++ container via Cython, +# we need to redefine a fused type +ctypedef fused vector_vector_DITYPE_t: + vector[vector[ITYPE_t]] + vector[vector[DTYPE_t]] + # TODO: This has been introduced in Cython 3.0, change for `libcpp.algorithm.move` once Cython 3 is used # Introduction in Cython: # https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 @@ -69,24 +75,10 @@ cdef np.ndarray[DITYPE_t, ndim=1] buffer_to_numpy_array(DITYPE_t * ptr, np.npy_i PyArray_ENABLEFLAGS(arr, np.NPY_OWNDATA) return arr -# TODO: this got duplicated because type covariance is not support; i.e. 
the following function -# -# cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector[vector[DITYPE_t]] * vecs) -# -# cannot be called dispatched for vector[vector[ITYPE_t]]* and vector[vector[DTYPE_t]]* -cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays_ITYPE(vector[vector[ITYPE_t]]* vecs): - cdef ITYPE_t n = deref(vecs).size() - np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray) - - for i in range(n): - np_arrays_of_np_arrays[i] = buffer_to_numpy_array(deref(vecs)[i].data(), - deref(vecs)[i].size()) - - return np_arrays_of_np_arrays - -cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays_DTYPE(vector[vector[DTYPE_t]]* vecs): - cdef ITYPE_t n = deref(vecs).size() - np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray) +cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector_vector_DITYPE_t* vecs): + cdef: + ITYPE_t n = deref(vecs).size() + np.ndarray[object, ndim=1] np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray) for i in range(n): np_arrays_of_np_arrays[i] = buffer_to_numpy_array(deref(vecs)[i].data(), @@ -1043,10 +1035,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint return_distance ): if return_distance: - return (_coerce_vectors_to_np_nd_arrays_DTYPE(self.neigh_distances), - _coerce_vectors_to_np_nd_arrays_ITYPE(self.neigh_indices)) + return (_coerce_vectors_to_np_nd_arrays(self.neigh_distances), + _coerce_vectors_to_np_nd_arrays(self.neigh_indices)) # We need to free the buffers here because they won't be managed # by a numpy array then. free(self.neigh_distances) - return _coerce_vectors_to_np_nd_arrays_ITYPE(self.neigh_indices) + return _coerce_vectors_to_np_nd_arrays(self.neigh_indices) From a3de08ab20dddc5480938a1afe1eaddb1192a24e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 17:37:10 +0200 Subject: [PATCH 071/290] Use consistent names --- sklearn/metrics/tests/test_pairwise.py | 12 ++--- sklearn/neighbors/tests/test_neighbors.py | 55 +++++++++++++---------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e2fc7d70aa4b4..170949655a160 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1469,19 +1469,21 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): assert_allclose(dist, expected_dist, rtol=rtol) -@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("d", [5, 10, 100]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.parametrize("sign", [1, -1]) -def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign): +def test_fast_sqeuclidean_correctness( + n_samples, n_features, X_translation, Y_translation, sign +): # The fast squared euclidean strategy must return results # that are close to the ones obtained with the euclidean distance rng = np.random.RandomState(1) spread = 100 - X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread - Y = Y_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread * sign + X = X_translation + rng.rand(n_samples, n_features) * spread + Y = Y_translation + rng.rand(n_samples, n_features) * spread * sign argmins, distances = pairwise_distances_argmin_min(X, Y, 
metric="euclidean") fsq_argmins, fsq_distances = pairwise_distances_argmin_min( diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index c1d8a2d2a068a..d9be6e81d1b66 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1826,31 +1826,37 @@ def test_pairwise_deprecated(NearestNeighbors): nn._pairwise -@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("d", [5, 10, 100]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) def test_fast_sqeuclidean_correctness( - n, - d, + n_samples, + n_features, ratio_train_test, n_neighbors, dtype=np.float64, ): # The fast squared euclidean strategy must return results # that are close to the ones obtained with the euclidean distance - if n < n_neighbors: + if n_samples < n_neighbors: pytest.skip( - f"Skipping as n (={n}) < n_neighbors (={n_neighbors})", + f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", allow_module_level=True, ) rng = np.random.RandomState(1) spread = 100 - X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread + X_train = ( + rng.rand(int(n_samples * n_features)).astype(dtype).reshape((-1, n_features)) + * spread + ) X_test = ( - rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) * spread + rng.rand(int(n_samples * n_features / ratio_train_test)) + .astype(dtype) + .reshape((-1, n_features)) + * spread ) neigh = NearestNeighbors( @@ -1871,8 +1877,8 @@ def test_fast_sqeuclidean_correctness( assert_array_equal(eucl_nn, fse_nn) -@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("d", [5, 10, 100, 500]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 10, 100, 500]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) @pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.skip( @@ -1880,23 +1886,23 @@ def test_fast_sqeuclidean_correctness( "have its own study: skipping for now" ) def test_fast_sqeuclidean_translation_invariance( - n, - d, + n_samples, + n_features, n_neighbors, translation, dtype=np.float64, ): # The fast squared euclidean strategy should be translation invariant. 
- if n < n_neighbors: + if n_samples < n_neighbors: pytest.skip( - f"Skipping as n (={n}) < n_neighbors (={n_neighbors})", + f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", allow_module_level=True, ) rng = np.random.RandomState(1) spread = 100 - X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread - X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread + X_train = rng.rand(n_samples, n_features).astype(dtype) * spread + X_test = rng.rand(n_samples, n_features).astype(dtype) * spread neigh = NearestNeighbors( n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean" @@ -1916,27 +1922,30 @@ def test_fast_sqeuclidean_translation_invariance( assert_array_equal(reference_nns, nns) -@pytest.mark.parametrize("n", [10 ** i for i in [3, 4]]) -@pytest.mark.parametrize("d", [2]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [3, 4]]) +@pytest.mark.parametrize("n_features", [2]) @pytest.mark.parametrize("ratio_train_test", [10, 1, 0.5]) @pytest.mark.parametrize("radius", [100, 500]) @pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) def test_fast_radius_neighborhood_reduction_consistency( - n, - d, + n_samples, + n_features, ratio_train_test, radius, metric, spread=1000, dtype=np.float64, ): - # Temporary transitionalconsistency check + # Temporary transitional consistency check # TODO: remove once the implementation is stabilized. rng = np.random.RandomState(1) - X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread + X_train = rng.rand(n_samples, n_features).astype(dtype) * spread X_test = ( - rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) * spread + rng.rand(int(n_samples * n_features / ratio_train_test)) + .astype(dtype) + .reshape((-1, n_features)) + * spread ) rn = RadiusNeighborhood.get_for(X=X_test, Y=X_train, radius=radius, metric=metric) From 81b1b1b95783d2cbb9f6c33df2d6e3c575d0373a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Jul 2021 17:54:52 +0200 Subject: [PATCH 072/290] Plug RadiusNeighborhood in RadiusNeighborsMixin.radius_neighbors Introduce PairwiseDistancesReduction.is_usable_for encapsulating the test for branching logic. Also removed unused code due to new implementation. 
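The branching test that `is_usable_for` encapsulates amounts to the following pure-Python predicate (a sketch reflecting the intended logic; `valid_metrics` is passed explicitly here for illustration, whereas the class method uses `cls.valid_metrics()`, and a duplicated `issparse(X)` check in the actual implementation is fixed later in the series):

    import numpy as np
    from scipy.sparse import issparse

    def is_usable_for(X, Y, metric, valid_metrics):
        # The fast path currently requires dense float64 inputs and one of
        # the supported metrics; anything else falls back to the previous
        # implementation.
        return (
            not issparse(X)
            and not issparse(Y)
            and X.dtype == Y.dtype == np.float64
            and metric in valid_metrics
        )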
--- sklearn/metrics/_parallel_reductions.pyx | 12 ++++++++- sklearn/metrics/pairwise.py | 7 +---- sklearn/neighbors/_base.py | 33 ++++++++++++++---------- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index ab2de95223d01..b99e603e2b2bf 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -17,9 +17,10 @@ np.import_array() from libc.stdlib cimport free, malloc from libcpp.vector cimport vector from cython.operator cimport dereference as deref - from cython.parallel cimport parallel, prange +from scipy.sparse import issparse + from ._dist_metrics cimport DistanceMetric from ._dist_metrics import METRIC_MAPPING from ..utils import check_array @@ -127,6 +128,15 @@ cdef class PairwiseDistancesReduction: excluded = {"pyfunc", "sokalmichener", "matching", "jaccard"} return sorted({*METRIC_MAPPING.keys()}.difference(excluded)) + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + # TODO: support sparse arrays + return (not issparse(X) and + not issparse(X) and + X.dtype == Y.dtype == np.float64 + and metric in cls.valid_metrics()) + + def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults # in rare cases where __init__ is not called diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index bcf42ac2ca9fa..2ef7c9fc00058 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -652,12 +652,7 @@ def pairwise_distances_argmin_min( if metric_kwargs is None: metric_kwargs = {} - if ( - # TODO: support sparse arrays - not issparse(X) - and not issparse(X) - and metric in ArgKmin.valid_metrics() - ): + if ArgKmin.is_usable_for(X, Y, metric): values, indices = ArgKmin.get_for( X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs ).compute(strategy="auto", return_distance=True) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 85c3cd743cb57..11097775f51bb 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._parallel_reductions import ArgKmin +from ..metrics._parallel_reductions import ArgKmin, RadiusNeighborhood from ..utils import ( check_array, gen_even_slices, @@ -736,12 +736,8 @@ class from an array representing our data set and ask who's X, n_neighbors=n_neighbors, return_distance=return_distance ) - elif ( - # TODO: support sparse arrays - not issparse(X) - and not issparse(self._fit_X) - and self._fit_method == "brute" - and self.effective_metric_ in ArgKmin.valid_metrics() + elif self._fit_method == "brute" and ArgKmin.is_usable_for( + X, self._fit_X, self.effective_metric_ ): results = ArgKmin.get_for( X=X, @@ -1059,13 +1055,22 @@ class from an array representing our data set and ask who's X, radius=radius, return_distance=return_distance ) + elif self._fit_method == "brute" and RadiusNeighborhood.is_usable_for( + X, self._fit_X, self.effective_metric_ + ): + results = RadiusNeighborhood.get_for( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + ).compute( + strategy="auto", + return_distance=return_distance, + sort_results=sort_results, + ) + elif self._fit_method == "brute": - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - 
radius *= radius - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ reduce_func = partial( self._radius_neighbors_reduce_func, @@ -1079,7 +1084,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, - **kwds, + **self.effective_metric_params_, ) if return_distance: neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) From b8fe6e1c8ed38cf6a7d25d4d661dfaae389ad6e0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Jul 2021 11:45:28 +0200 Subject: [PATCH 073/290] Correctly free vectors using del --- sklearn/metrics/_parallel_reductions.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index b99e603e2b2bf..57f31466643ea 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -1050,5 +1050,5 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): # We need to free the buffers here because they won't be managed # by a numpy array then. - free(self.neigh_distances) + del self.neigh_distances return _coerce_vectors_to_np_nd_arrays(self.neigh_indices) From 54fb2c5cbc39f394e450be769360dcee674591fc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Jul 2021 14:05:26 +0200 Subject: [PATCH 074/290] Use a sentinel for managing vectors' memory StdVectorSentinel makes a proper life-cycle management for std::vectors' buffers possible. Duplication seems needed as fused types can't be used as attributes. It's possible to obtain a missing symbol (`_ZSt28__throw_bad_array_new_lengthv`) at runtime. This is unrelated to the implementation here, and there are issues reporting the problem, e.g.: https://github.com/cython/cython/issues/4218. A temporary workaround: https://github.com/stan-dev/pystan/issues/294#issuecomment-878292636 --- sklearn/metrics/_parallel_reductions.pyx | 102 +++++++++++++++-------- 1 file changed, 67 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 57f31466643ea..a733bb45dda8d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -16,8 +16,10 @@ np.import_array() from libc.stdlib cimport free, malloc from libcpp.vector cimport vector +from cpython.object cimport PyObject from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange +from cpython.ref cimport Py_INCREF from scipy.sparse import issparse @@ -45,12 +47,6 @@ from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE from ..utils._typedefs import ITYPE, DTYPE -# As type covariance is not supported for C++ container via Cython, -# we need to redefine a fused type -ctypedef fused vector_vector_DITYPE_t: - vector[vector[ITYPE_t]] - vector[vector[DTYPE_t]] - # TODO: This has been introduced in Cython 3.0, change for `libcpp.algorithm.move` once Cython 3 is used # Introduction in Cython: # https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 @@ -59,21 +55,65 @@ cdef extern from "" namespace "std" nogil: ###################### ## std::vector to np.ndarray coercion -# TODO: for now using this simple solution: https://stackoverflow.com/a/23873586 -# A better solution would make sure of using the same allocator implementations. 
-# Those implementations depend on the runtimes' allocator which can be different
-# in some configuration and thus would make the program crash.
+# As type covariance is not supported for C++ containers via Cython,
+# we need to redefine fused types.
+ctypedef fused vector_DITYPE_t:
+    vector[ITYPE_t]
+    vector[DTYPE_t]
+
+ctypedef fused vector_vector_DITYPE_t:
+    vector[vector[ITYPE_t]]
+    vector[vector[DTYPE_t]]

 cdef extern from "numpy/arrayobject.h":
-    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)
+    int PyArray_SetBaseObject(np.ndarray arr, PyObject *obj) nogil except -1
+
+
+cdef class StdVectorSentinel:
+    """Wraps a reference to a vector which will be
+    deallocated with this object."""
+    pass
+
+
+cdef class StdVectorSentinelDTYPE(StdVectorSentinel):
+    cdef vector[DTYPE_t] vec
+
+    @staticmethod
+    cdef StdVectorSentinel create_for(vector[DTYPE_t] * vec_ptr):
+        sentinel = StdVectorSentinelDTYPE()
+        sentinel.vec.swap(deref(vec_ptr))
+        return sentinel

-cdef np.ndarray[DITYPE_t, ndim=1] buffer_to_numpy_array(DITYPE_t * ptr, np.npy_intp size):
-    """ Create a numpy ndarray given a buffer and its size. """
-    typenum = DTYPECODE if DITYPE_t is DTYPE_t else ITYPECODE
-    cdef np.ndarray[DITYPE_t, ndim=1] arr = np.PyArray_SimpleNewFromData(1, &size, typenum, ptr)
+
+cdef class StdVectorSentinelITYPE(StdVectorSentinel):
+    cdef vector[ITYPE_t] vec
+
+    @staticmethod
+    cdef StdVectorSentinel create_for(vector[ITYPE_t] * vec_ptr):
+        sentinel = StdVectorSentinelITYPE()
+        sentinel.vec.swap(deref(vec_ptr))
+        return sentinel
+
+
+cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr):
+    """ Create a numpy ndarray given a C++ vector.
+
+    This registers a StdVectorSentinel as the base object of the numpy
+    array, which frees the wrapped C++ vector when the array is collected.
+    """
+    typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE
+    cdef:
+        np.npy_intp size = deref(vect_ptr).size()
+        np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, deref(vect_ptr).data())
+        StdVectorSentinel sentinel
+
+    if vector_DITYPE_t is vector[DTYPE_t]:
+        sentinel = StdVectorSentinelDTYPE.create_for(vect_ptr)
+    else:
+        sentinel = StdVectorSentinelITYPE.create_for(vect_ptr)

     # Makes the numpy array responsible for the life-cycle of its buffer.
-    PyArray_ENABLEFLAGS(arr, np.NPY_OWNDATA)
+    # A reference to the sentinel will be stolen by the call below,
+    # so we increase its reference count.
+    Py_INCREF(sentinel)
+    PyArray_SetBaseObject(arr, <PyObject*>sentinel)
     return arr

 cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector_vector_DITYPE_t* vecs):
@@ -82,8 +122,7 @@ cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector_vector_DI
         np.ndarray[object, ndim=1] np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray)

     for i in range(n):
-        np_arrays_of_np_arrays[i] = buffer_to_numpy_array(deref(vecs)[i].data(),
-                                                          deref(vecs)[i].size())
+        np_arrays_of_np_arrays[i] = vector_to_numpy_array(&(deref(vecs)[i]))

     return np_arrays_of_np_arrays

@@ -823,16 +862,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         #   std::vector::data, their buffer can't be stolen: their
         #   life-time is tied to the buffer's.
         #
-        # To solve this, we allocate dynamically allocate vectors which won't be
-        # freed, but their buffer eventually will as the ownership will be
-        # transferred to numpy arrays.
- # - # TODO: Find a proper way to handle buffers' lifetime - # It's "OK-ish" as numpy arrays are then responsible for their buffer - # lifetime which consist of most of the vectors actual data (residual - # metadata exist, don't account but won't be deleted). - # - # Still, meh. + # To solve this, we dynamically allocate vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when needed vector[vector[ITYPE_t]] * neigh_indices vector[vector[DTYPE_t]] * neigh_distances @@ -1016,10 +1048,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint return_distance = False, bint sort_results = False ): - # This won't be freed for reasons stated at their definition. self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) - - # This will be freed then solely if return_distance = False self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) self.sort_results = sort_results @@ -1044,11 +1073,14 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): def _finalise_compute(self, bint return_distance ): + if return_distance: - return (_coerce_vectors_to_np_nd_arrays(self.neigh_distances), + res = (_coerce_vectors_to_np_nd_arrays(self.neigh_distances), _coerce_vectors_to_np_nd_arrays(self.neigh_indices)) + else: + res = _coerce_vectors_to_np_nd_arrays(self.neigh_indices) - # We need to free the buffers here because they won't be managed - # by a numpy array then. del self.neigh_distances - return _coerce_vectors_to_np_nd_arrays(self.neigh_indices) + del self.neigh_indices + + return res From 9df40472ee66d3a3c574c25c928959c5b11128d2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Jul 2021 16:09:41 +0200 Subject: [PATCH 075/290] Remove temporary consistency test --- sklearn/neighbors/tests/test_neighbors.py | 40 ----------------------- 1 file changed, 40 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index d9be6e81d1b66..bfffda0ab5867 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -19,7 +19,6 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError -from sklearn.metrics._parallel_reductions import RadiusNeighborhood from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split @@ -1920,42 +1919,3 @@ def test_fast_sqeuclidean_translation_invariance( assert_allclose(reference_dist, dist) assert_array_equal(reference_nns, nns) - - -@pytest.mark.parametrize("n_samples", [10 ** i for i in [3, 4]]) -@pytest.mark.parametrize("n_features", [2]) -@pytest.mark.parametrize("ratio_train_test", [10, 1, 0.5]) -@pytest.mark.parametrize("radius", [100, 500]) -@pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) -def test_fast_radius_neighborhood_reduction_consistency( - n_samples, - n_features, - ratio_train_test, - radius, - metric, - spread=1000, - dtype=np.float64, -): - # Temporary transitional consistency check - # TODO: remove once the implementation is stabilized. 
- rng = np.random.RandomState(1) - - X_train = rng.rand(n_samples, n_features).astype(dtype) * spread - X_test = ( - rng.rand(int(n_samples * n_features / ratio_train_test)) - .astype(dtype) - .reshape((-1, n_features)) - * spread - ) - - rn = RadiusNeighborhood.get_for(X=X_test, Y=X_train, radius=radius, metric=metric) - dists, indices = rn.compute(return_distance=True, sort_results=True) - nn = NearestNeighbors(radius=radius, metric=metric) - nn.fit(X_train) - reference_dists, references_indices = nn.radius_neighbors( - X_test, return_distance=True, sort_results=True - ) - - for i in range(X_test.shape[0]): - assert_allclose(reference_dists[i], dists[i]) - assert_array_equal(references_indices[i], indices[i]) From 7c713a13839fe0269841f302b8cba2a267746b17 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Jul 2021 16:29:37 +0200 Subject: [PATCH 076/290] Add comments Also removes _finalise_compute. --- sklearn/metrics/_parallel_reductions.pyx | 35 ++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index a733bb45dda8d..487b3128358ce 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -429,12 +429,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): cdef: ITYPE_t k - DTYPE_t ** heaps_approx_distances_chunks - ITYPE_t ** heaps_indices_chunks - ITYPE_t[:, ::1] argkmin_indices DTYPE_t[:, ::1] argkmin_distances + # Used as array of pointers to private datastructures used in threads. + DTYPE_t ** heaps_approx_distances_chunks + ITYPE_t ** heaps_indices_chunks + @classmethod def valid_metrics(cls): return {"fast_sqeuclidean", *PairwiseDistancesReduction.valid_metrics()} @@ -471,7 +472,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) - # Pointers to thread heaps used in threads for `parallel_on_Y` solely + # Allocating pointers to datastructures but not the datastructures themselves. + # There's potentially more pointers than actual thread used for the + # reduction but as many datastructures as threads. self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) @@ -674,11 +677,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): else: raise RuntimeError(f"strategy '{strategy}' not supported.") - return self._finalise_compute(return_distance) - - def _finalise_compute(self, - bint return_distance - ): if return_distance: # We need to recompute distances because we relied on # approximate distances. @@ -840,6 +838,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): cdef class RadiusNeighborhood(PairwiseDistancesReduction): + """Returns the indices of neighbors of a first set + of vectors (rows of X) present in another set of vectors + (rows of Y) for a given a radius and distance. + """ cdef: DTYPE_t radius @@ -868,6 +870,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): vector[vector[ITYPE_t]] * neigh_indices vector[vector[DTYPE_t]] * neigh_distances + # Used as array of pointers to private datastructures used in threads. 
vector[vector[ITYPE_t]] ** neigh_indices_chunks vector[vector[DTYPE_t]] ** neigh_distances_chunks @@ -900,7 +903,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.radius = radius self.sort_results = False - # Pointers to datastructures used in threads + # Allocating pointers to datastructures but not the datastructures themselves. + # There's potentially more pointers than actual thread used for the + # reduction but as many datastructures as threads. self.neigh_distances_chunks = malloc( sizeof(self.neigh_distances) * self.effective_omp_n_thread) self.neigh_indices_chunks = malloc( @@ -941,7 +946,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ) nogil: # As this strategy is embarrassingly parallel, we can set the - # thread vectors' pointers to the main vectors'. + # thread vectors' pointers to the main vectors'. self.neigh_distances_chunks[thread_num] = self.neigh_distances self.neigh_indices_chunks[thread_num] = self.neigh_indices @@ -1048,6 +1053,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint return_distance = False, bint sort_results = False ): + # Temporary datastructures which will be coerced to + # numpy arrays on return and then freed. self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) @@ -1068,12 +1075,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): else: raise RuntimeError(f"strategy '{strategy}' not supported.") - return self._finalise_compute(return_distance) - - def _finalise_compute(self, - bint return_distance - ): - if return_distance: res = (_coerce_vectors_to_np_nd_arrays(self.neigh_distances), _coerce_vectors_to_np_nd_arrays(self.neigh_indices)) From 1d3336d4895afadf06b0890e41c0be20e64146cf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Jul 2021 17:08:14 +0200 Subject: [PATCH 077/290] Revert to 'euclidean' when 'fast_sqeuclidean' can't be used --- sklearn/metrics/pairwise.py | 7 +++++++ sklearn/neighbors/_base.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 33ddebfe69a82..c13a5643d6f5e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -660,6 +660,13 @@ def pairwise_distances_argmin_min( values = np.ndarray.flatten(values) indices = np.ndarray.flatten(indices) else: + # TODO: support sparse matrices + # When ArgKmin is not supported and when the + # user asked for "fast_sqeuclidean", we need to + # revert to "euclidean" + if metric == "fast_sqeuclidean": + metric = "euclidean" + indices, values = zip( *pairwise_distances_chunked( X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 0d9ac359354cf..1d48840caedc6 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -750,6 +750,13 @@ class from an array representing our data set and ask who's ) elif self._fit_method == "brute": + # TODO: support sparse matrices + # When ArgKmin is not supported and when the + # user asked for "fast_sqeuclidean", we need to + # revert to "euclidean" + if self.effective_metric_ == "fast_sqeuclidean": + self.effective_metric_ = "euclidean" + reduce_func = partial( self._kneighbors_reduce_func, n_neighbors=n_neighbors, From d96b1632e8965c70230336ec23c43a02fdd020be Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Jul 2021 17:09:25 +0200 Subject: [PATCH 078/290] Use 'fast_sqeuclidean' for Birch internals --- 
sklearn/cluster/_birch.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 68a7a741a88aa..68afceb8c5d9f 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -12,7 +12,6 @@ from ..metrics import pairwise_distances_argmin from ..metrics.pairwise import euclidean_distances from ..base import TransformerMixin, ClusterMixin, BaseEstimator -from ..utils.extmath import row_norms from ..utils import deprecated from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -648,11 +647,10 @@ def predict(self, X): """ check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) - kwargs = {"Y_norm_squared": self._subcluster_norms} with config_context(assume_finite=True): argmin = pairwise_distances_argmin( - X, self.subcluster_centers_, metric_kwargs=kwargs + X, self.subcluster_centers_, metric="fast_sqeuclidean" ) return self.subcluster_labels_[argmin] @@ -698,9 +696,6 @@ def _global_clustering(self, X=None): "n_clusters should be an instance of ClusterMixin or an int" ) - # To use in predict to avoid recalculation. - self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) - if clusterer is None or not_enough_centroids: self.subcluster_labels_ = np.arange(len(centroids)) if not_enough_centroids: From f660dbf56061a1d8f1dc9cfd45d1b591268dcc29 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Jul 2021 15:00:42 +0200 Subject: [PATCH 079/290] Fix PairwiseDistancesReduction.is_usable --- sklearn/metrics/_parallel_reductions.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 487b3128358ce..c68211257054f 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -171,7 +171,7 @@ cdef class PairwiseDistancesReduction: def is_usable_for(cls, X, Y, metric) -> bool: # TODO: support sparse arrays return (not issparse(X) and - not issparse(X) and + not issparse(Y) and X.dtype == Y.dtype == np.float64 and metric in cls.valid_metrics()) From bb24d958dd120acafa67d3789bfe2cdd8c7efda5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Jul 2021 17:16:29 +0200 Subject: [PATCH 080/290] Make array C-ordered for test --- sklearn/cluster/tests/test_optics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 3f68f3b62df78..1449733fed5c0 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -784,7 +784,9 @@ def test_extract_dbscan(): def test_precomputed_dists(): - redX = X[::2] + # This slicing makes the array F-ordered. + # but we need C-ordering. 
+ redX = np.ascontiguousarray(X[::2]) dists = pairwise_distances(redX, metric="euclidean") clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists) clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) From 6eea1aaee29841040b62e7519d7137172deb2ae9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Jul 2021 15:19:11 +0200 Subject: [PATCH 081/290] Annotate with cython.final when relevant --- sklearn/metrics/_parallel_reductions.pyx | 25 ++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index c68211257054f..af252dae1449f 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -16,6 +16,7 @@ np.import_array() from libc.stdlib cimport free, malloc from libcpp.vector cimport vector +from cython cimport final from cpython.object cimport PyObject from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange @@ -228,6 +229,7 @@ cdef class PairwiseDistancesReduction: self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) ) + @final cdef void _parallel_on_X(self) nogil: """Computes the reduction of each vector (row) of X on Y by parallelizing computation on chunks of X. @@ -284,6 +286,7 @@ cdef class PairwiseDistancesReduction: # end: with nogil, parallel return + @final cdef void _parallel_on_Y(self) nogil: """Computes the reduction of each vector (row) of X on Y by parallelizing computation on chunks of Y. @@ -344,6 +347,7 @@ cdef class PairwiseDistancesReduction: return # Placeholder methods which have to be implemented + cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -520,6 +524,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): return 0 + @final cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, ITYPE_t X_start, @@ -531,6 +536,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_approx_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] + @final cdef void _on_X_prange_iter_finalize(self, ITYPE_t thread_num, ITYPE_t X_start, @@ -563,6 +569,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_approx_distances_chunks[thread_num] = malloc(heaps_size * sizeof(DTYPE_t)) self.heaps_indices_chunks[thread_num] = malloc(heaps_size * sizeof(ITYPE_t)) + @final cdef void _on_Y_parallel_init(self, ITYPE_t thread_num, ) nogil: @@ -571,6 +578,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF self.heaps_indices_chunks[thread_num][idx] = -1 + @final cdef void _on_Y_after_parallel(self, ITYPE_t num_threads, ITYPE_t X_start, @@ -594,7 +602,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) - cdef void _on_Y_finalize(self, ITYPE_t num_threads, ) nogil: @@ -617,6 +624,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) return + @final cdef void _exact_distances(self, ITYPE_t[:, ::1] Y_indices, # IN DTYPE_t[:, ::1] distances, # IN/OUT @@ -632,6 +640,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): &self.Y[Y_indices[i, j], 0], self.d) + @final def compute(self, str strategy = "auto", bint return_distance = False @@ -728,6 +737,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): else: raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") + 
@final cdef void _on_X_parallel_init(self, ITYPE_t thread_num, ) nogil: @@ -736,12 +746,14 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + @final cdef void _on_X_parallel_finalize(self, ITYPE_t thread_num ) nogil: ArgKmin._on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) + @final cdef void _on_Y_init(self, ITYPE_t num_threads, ) nogil: @@ -753,6 +765,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + @final cdef void _on_Y_finalize(self, ITYPE_t num_threads, ) nogil: @@ -762,6 +775,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): for thread_num in range(num_threads): free(self.dist_middle_terms_chunks[thread_num]) + @final cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -915,6 +929,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): free(self.neigh_distances_chunks) free(self.neigh_indices_chunks) + @final cdef int _reduce_on_chunks(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -939,6 +954,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): return 0 + @final cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, ITYPE_t X_start, @@ -950,6 +966,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.neigh_distances_chunks[thread_num] = self.neigh_distances self.neigh_indices_chunks[thread_num] = self.neigh_indices + @final cdef void _on_X_prange_iter_finalize(self, ITYPE_t thread_num, ITYPE_t X_start, @@ -967,6 +984,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_indices)[idx].size() ) + @final cdef void _on_Y_parallel_init(self, ITYPE_t thread_num, ) nogil: @@ -976,6 +994,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X) self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X) + @final cdef void _merge_vectors(self, ITYPE_t idx, ITYPE_t num_threads, @@ -1007,7 +1026,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ) last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - + @final cdef void _on_Y_after_parallel(self, ITYPE_t num_threads, ITYPE_t X_start, @@ -1030,6 +1049,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): return + @final cdef void _on_Y_finalize(self, ITYPE_t num_threads, ) nogil: @@ -1048,6 +1068,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): return + @final def compute(self, str strategy = "auto", bint return_distance = False, From 15c4150c911f463a9eaacf11e1b815f1d16345b9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Jul 2021 18:12:44 +0200 Subject: [PATCH 082/290] Black contains all the color that I like Co-authored-by: Olivier Grisel --- sklearn/metrics/_parallel_reductions.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index af252dae1449f..d3864830361f3 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -1097,8 +1097,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: - res = 
(_coerce_vectors_to_np_nd_arrays(self.neigh_distances), - _coerce_vectors_to_np_nd_arrays(self.neigh_indices)) + res = ( + _coerce_vectors_to_np_nd_arrays(self.neigh_distances), + _coerce_vectors_to_np_nd_arrays(self.neigh_indices), + ) else: res = _coerce_vectors_to_np_nd_arrays(self.neigh_indices) From a9706d6d0207545e00b74ae564dfc09b9e25e459 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Jul 2021 18:37:39 +0200 Subject: [PATCH 083/290] Use relative imports Co-authored-by: Olivier Grisel --- sklearn/neighbors/_binary_tree.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index bbafc27d8ca68..1c87edc723969 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -794,7 +794,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from sklearn.metrics._dist_metrics import get_valid_metric_ids +from ..metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) From 093597531d693bc780a80b754bdde4cfbf85fcb5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Jul 2021 18:39:14 +0200 Subject: [PATCH 084/290] Use method for flatten Co-authored-by: Olivier Grisel --- sklearn/metrics/pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c13a5643d6f5e..9dbc762a4891a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -657,8 +657,8 @@ def pairwise_distances_argmin_min( values, indices = ArgKmin.get_for( X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs ).compute(strategy="auto", return_distance=True) - values = np.ndarray.flatten(values) - indices = np.ndarray.flatten(indices) + values = values.flatten() + indices = indices.flatten() else: # TODO: support sparse matrices # When ArgKmin is not supported and when the From e2b5398393073b501008828c34d3c14ad12ce6c4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Jul 2021 18:47:56 +0200 Subject: [PATCH 085/290] Correct cross-referencing for metrics.DistanceMetric Co-authored-by: Olivier Grisel --- sklearn/neighbors/_classification.py | 8 ++++---- sklearn/neighbors/_graph.py | 8 ++++---- sklearn/neighbors/_regression.py | 9 +++++---- sklearn/neighbors/_unsupervised.py | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 3c13f5f941632..9bb6ccc7f0a73 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -67,8 +67,8 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`metrics.DistanceMetric` - for a list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
@@ -339,8 +339,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`metrics.DistanceMetric`
-        for a list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`, in which
         case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py
index e836da97721b3..3ad44bd55f7c5 100644
--- a/sklearn/neighbors/_graph.py
+++ b/sklearn/neighbors/_graph.py
@@ -68,8 +68,8 @@ def kneighbors_graph(
         The distance metric used to calculate the neighbors within a
         given radius for each sample point. The default distance is
         'euclidean' ('minkowski' metric with the param equal to 2.)
-        See the documentation of :class:`metrics.DistanceMetric`
-        for a list of available metrics.
+        For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.

     p : int, default=2
         Power parameter for the Minkowski metric. When p = 1, this is
@@ -161,8 +161,8 @@ def radius_neighbors_graph(
         The distance metric used to calculate the neighbors within a
         given radius for each sample point. The default distance is
         'euclidean' ('minkowski' metric with the param equal to 2.)
-        See the documentation of :class:`metrics.DistanceMetric`
-        for a list of available metrics.
+        For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.

     p : int, default=2
         Power parameter for the Minkowski metric. When p = 1, this is
diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py
index d5f6e9363629c..e8f524d525fa8 100644
--- a/sklearn/neighbors/_regression.py
+++ b/sklearn/neighbors/_regression.py
@@ -75,8 +75,9 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         The distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`metrics.DistanceMetric`
-        for a list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
+
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`, in which
         case only "nonzero" elements may be considered neighbors.
@@ -302,8 +303,8 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBa
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`metrics.DistanceMetric`
-        for a list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`, in which
         case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py
index 7a2e81a18aa98..4bd23367a7367 100644
--- a/sklearn/neighbors/_unsupervised.py
+++ b/sklearn/neighbors/_unsupervised.py
@@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         The distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`metrics.DistanceMetric`
-        for a list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.

From ce1ccdce101b7669e806a1c27d20dcf6db9e8f7c Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 15 Jul 2021 18:50:25 +0200
Subject: [PATCH 086/290] Clarify that p is the parameter used by 'minkowski'

Co-authored-by: Olivier Grisel
---
 sklearn/neighbors/_graph.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py
index 3ad44bd55f7c5..1e8d92cada599 100644
--- a/sklearn/neighbors/_graph.py
+++ b/sklearn/neighbors/_graph.py
@@ -67,7 +67,7 @@ def kneighbors_graph(
     metric : str, default='minkowski'
         The distance metric used to calculate the neighbors within a
         given radius for each sample point. The default distance is
-        'euclidean' ('minkowski' metric with the param equal to 2.)
+        'euclidean' ('minkowski' metric with the p param equal to 2.)
         For a list of available metrics, see the documentation of
         :class:`~sklearn.metrics.DistanceMetric`.

@@ -160,7 +160,7 @@ def radius_neighbors_graph(
     metric : str, default='minkowski'
         The distance metric used to calculate the neighbors within a
         given radius for each sample point. The default distance is
-        'euclidean' ('minkowski' metric with the param equal to 2.)
+        'euclidean' ('minkowski' metric with the p param equal to 2.)
         For a list of available metrics, see the documentation of
         :class:`~sklearn.metrics.DistanceMetric`.

From a9fe71fb20190ffcd622ab7805aa8f1b1bfcc208 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 20 Jul 2021 17:34:05 +0200
Subject: [PATCH 087/290] Prefer assert_allclose over assert_array_equal

---
 sklearn/neighbors/tests/test_neighbors.py | 55 +++++++++++++----------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index dd2d2d6f3910d..ee4abf7cac36f 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -119,23 +119,29 @@ def test_unsupervised_kneighbors(
         indices_no_dist = results_nodist[i]
         distances, next_distances = results[i][0], results[i + 1][0]
         indices, next_indices = results[i][1], results[i + 1][1]
-        assert_array_equal(
+        assert_allclose(
             indices_no_dist,
             indices,
-            err_msg=f"The '{algorithm}' algorithm returns different"
-            f"indices depending on 'return_distances'.",
+            err_msg=(
+                f"The '{algorithm}' algorithm returns different "
+                "indices depending on 'return_distances'."
+ ), ) - assert_array_equal( + assert_allclose( indices, next_indices, - err_msg=f"The '{algorithm}' and '{next_algorithm}' " - f"algorithms return different indices.", + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different indices." + ), ) - assert_array_equal( + assert_allclose( distances, next_distances, - err_msg=f"The '{algorithm}' and '{next_algorithm}' " - f"algorithms return different distances.", + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different distances." + ), ) @@ -1555,37 +1561,37 @@ def test_k_and_radius_neighbors_duplicates(algorithm): # Do not do anything special to duplicates. kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) - assert_array_equal(kng.data, [0.0, 0.0]) - assert_array_equal(kng.indices, [0, 1]) + assert_allclose(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) + assert_allclose(kng.data, [0.0, 0.0]) + assert_allclose(kng.indices, [0, 1]) dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) check_object_arrays(dist, [[0, 1], [1, 0]]) check_object_arrays(ind, [[0, 1], [0, 1]]) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_array_equal(rng.A, np.ones((2, 2))) + assert_allclose(rng.A, np.ones((2, 2))) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") rng.sort_indices() - assert_array_equal(rng.A, [[0, 1], [1, 0]]) - assert_array_equal(rng.indices, [0, 1, 0, 1]) - assert_array_equal(rng.data, [0, 1, 1, 0]) + assert_allclose(rng.A, [[0, 1], [1, 0]]) + assert_allclose(rng.indices, [0, 1, 0, 1]) + assert_allclose(rng.data, [0, 1, 1, 0]) # Mask the first duplicates when n_duplicates > n_neighbors. X = np.ones((3, 1)) nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") nn.fit(X) dist, ind = nn.kneighbors() - assert_array_equal(dist, np.zeros((3, 1))) - assert_array_equal(ind, [[2], [2], [0]]) + assert_allclose(dist, np.zeros((3, 1))) + assert_allclose(ind, [[2], [2], [0]]) # Test that zeros are explicitly marked in kneighbors_graph. 
kng = nn.kneighbors_graph(mode="distance") - assert_array_equal(kng.A, np.zeros((3, 3))) - assert_array_equal(kng.data, np.zeros(3)) - assert_array_equal(kng.indices, [2.0, 2.0, 0.0]) - assert_array_equal( + assert_allclose(kng.A, np.zeros((3, 3))) + assert_allclose(kng.data, np.zeros(3)) + assert_allclose(kng.indices, [2.0, 2.0, 0.0]) + assert_allclose( nn.kneighbors_graph().A, np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]), ) @@ -1881,8 +1887,9 @@ def test_fast_sqeuclidean_correctness( @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) @pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.skip( - reason="Long test, translation invariance should " - "have its own study: skipping for now" + reason=( + "Long test, translation invariance should have its own study: skipping for now" + ) ) def test_fast_sqeuclidean_translation_invariance( n_samples, From 1593dab18d116d3adcca52bff2886d5f1a9ca9a7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Jul 2021 17:38:27 +0200 Subject: [PATCH 088/290] Prefer csr_matrix.toarray over csr_matrix.A --- sklearn/neighbors/tests/test_neighbors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ee4abf7cac36f..9a49e9627210c 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1561,7 +1561,7 @@ def test_k_and_radius_neighbors_duplicates(algorithm): # Do not do anything special to duplicates. kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_allclose(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) + assert_allclose(kng.toarray(), np.array([[0.0, 0.0], [0.0, 0.0]])) assert_allclose(kng.data, [0.0, 0.0]) assert_allclose(kng.indices, [0, 1]) @@ -1570,11 +1570,11 @@ def test_k_and_radius_neighbors_duplicates(algorithm): check_object_arrays(ind, [[0, 1], [0, 1]]) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_allclose(rng.A, np.ones((2, 2))) + assert_allclose(rng.toarray(), np.ones((2, 2))) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") rng.sort_indices() - assert_allclose(rng.A, [[0, 1], [1, 0]]) + assert_allclose(rng.toarray(), [[0, 1], [1, 0]]) assert_allclose(rng.indices, [0, 1, 0, 1]) assert_allclose(rng.data, [0, 1, 1, 0]) @@ -1588,11 +1588,11 @@ def test_k_and_radius_neighbors_duplicates(algorithm): # Test that zeros are explicitly marked in kneighbors_graph. kng = nn.kneighbors_graph(mode="distance") - assert_allclose(kng.A, np.zeros((3, 3))) + assert_allclose(kng.toarray(), np.zeros((3, 3))) assert_allclose(kng.data, np.zeros(3)) assert_allclose(kng.indices, [2.0, 2.0, 0.0]) assert_allclose( - nn.kneighbors_graph().A, + nn.kneighbors_graph().toarray(), np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]), ) From 01c1294f0f9c892c1c7727e5bac148aeede39e21 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Jul 2021 17:52:08 +0200 Subject: [PATCH 089/290] Rework test for correct behavior regarding the radius Probably this should get its own test. 
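
For illustration, a minimal sketch of the behavior the reworked test
pins down, written against the public API (same [[0], [1], [3]] data
as below, assuming the brute-force backend):

    from sklearn import neighbors

    nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute")
    nn.fit([[0], [1], [3]])
    # With radius=1.5, [0] and [1] are within the radius of each other
    # (and of themselves), while [3] only reaches itself.
    print(nn.radius_neighbors_graph([[0], [1], [3]], radius=1.5).toarray())
    # [[1. 1. 0.]
    #  [1. 1. 0.]
    #  [0. 0. 1.]]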
--- sklearn/neighbors/tests/test_neighbors.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 9a49e9627210c..b5dbfadd8b2ac 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1557,24 +1557,30 @@ def test_k_and_radius_neighbors_X_None(): def test_k_and_radius_neighbors_duplicates(algorithm): # Test behavior of kneighbors when duplicates are present in query nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - nn.fit([[0], [1]]) + duplicates = [[0], [1], [3]] + + nn.fit(duplicates) # Do not do anything special to duplicates. - kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_allclose(kng.toarray(), np.array([[0.0, 0.0], [0.0, 0.0]])) - assert_allclose(kng.data, [0.0, 0.0]) - assert_allclose(kng.indices, [0, 1]) + kng = nn.kneighbors_graph(duplicates, mode="distance") + assert_allclose( + kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + ) + assert_allclose(kng.data, [0.0, 0.0, 0.0]) + assert_allclose(kng.indices, [0, 1, 2]) dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) check_object_arrays(dist, [[0, 1], [1, 0]]) check_object_arrays(ind, [[0, 1], [0, 1]]) - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_allclose(rng.toarray(), np.ones((2, 2))) + rng = nn.radius_neighbors_graph(duplicates, radius=1.5) + assert_allclose( + rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + ) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") rng.sort_indices() - assert_allclose(rng.toarray(), [[0, 1], [1, 0]]) + assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]]) assert_allclose(rng.indices, [0, 1, 0, 1]) assert_allclose(rng.data, [0, 1, 1, 0]) From 4b6a0414f36e62af2d21aa8df4bd831e5cbce07b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Jul 2021 17:57:05 +0200 Subject: [PATCH 090/290] Inline heap pushes --- sklearn/utils/_heap.pxd | 2 +- sklearn/utils/_heap.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index 05e4760994e33..be10b066386c9 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -10,7 +10,7 @@ cdef int _simultaneous_sort( ITYPE_t size ) nogil except -1 -cdef int _push( +cdef inline int _push( floating* dist, ITYPE_t* idx, ITYPE_t size, diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index ef2d393cc1a55..4b2b5641697be 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -84,7 +84,7 @@ cdef int _simultaneous_sort( return 0 -cdef int _push( +cdef inline int _push( floating* dist, ITYPE_t* idx, ITYPE_t size, From c26b583e9164ed60276e656b14f105d46c844b41 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Jul 2021 18:07:22 +0200 Subject: [PATCH 091/290] Mirror ValueError for incorrectly set sort_results and return_distances --- sklearn/metrics/_parallel_reductions.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index d3864830361f3..9af40611d8515 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -1074,6 +1074,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint return_distance = False, bint sort_results = False ): + if sort_results and not return_distance: + raise 
ValueError("return_distance must be True " + "if sort_results is True.") + # Temporary datastructures which will be coerced to # numpy arrays on return and then freed. self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) From 5bcea9f3b72631f195f99dc02a4f3eaaca46d565 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Jul 2021 18:21:11 +0200 Subject: [PATCH 092/290] Parametrise test_radius_neighbors_graph_sparse --- sklearn/neighbors/tests/test_neighbors.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index b5dbfadd8b2ac..4ca5c5b27efd4 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1209,21 +1209,19 @@ def test_radius_neighbors_graph(): ) -def test_radius_neighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_radius_neighbors_graph_sparse(n_neighbors, mode, seed=36): # Test radius_neighbors_graph to build the Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.radius_neighbors_graph( - Xcsr, n_neighbors, mode=mode - ).toarray(), - ) + assert_array_almost_equal( + neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_neighbors_badargs(): From 474804c00d10de788f665cfe654025ad96cc3615 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 09:05:12 +0200 Subject: [PATCH 093/290] Lighten and correct test --- sklearn/metrics/tests/test_pairwise.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 225bbd57780b0..5a2e645d442c5 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1467,13 +1467,11 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): assert_allclose(dist, expected_dist, rtol=rtol) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) @pytest.mark.parametrize("sign", [1, -1]) def test_fast_sqeuclidean_correctness( - n_samples, n_features, X_translation, Y_translation, sign + X_translation, Y_translation, sign, n_samples=10000, n_features=10 ): # The fast squared euclidean strategy must return results # that are close to the ones obtained with the euclidean distance @@ -1481,7 +1479,7 @@ def test_fast_sqeuclidean_correctness( spread = 100 X = X_translation + rng.rand(n_samples, n_features) * spread - Y = Y_translation + rng.rand(n_samples, n_features) * spread * sign + Y = (Y_translation + rng.rand(n_samples, n_features) * spread) * sign argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean") fsq_argmins, fsq_distances = pairwise_distances_argmin_min( From e84ff1b40d58479a488469f7ccb4cdb58dbb1fc0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 09:33:48 +0200 Subject: 
[PATCH 094/290] Allow other dtypes than np.float64 --- sklearn/metrics/_parallel_reductions.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 9af40611d8515..787dbffed1f3f 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -173,8 +173,7 @@ cdef class PairwiseDistancesReduction: # TODO: support sparse arrays return (not issparse(X) and not issparse(Y) and - X.dtype == Y.dtype == np.float64 - and metric in cls.valid_metrics()) + metric in cls.valid_metrics()) def __cinit__(self): From ac2ce70b0db98ba4bafd27f3a9c1f39cd65b9f3e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 09:36:53 +0200 Subject: [PATCH 095/290] Parametrise test_kneighbors_graph_sparse --- sklearn/neighbors/tests/test_neighbors.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 4ca5c5b27efd4..17b432b27dfdf 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1181,19 +1181,19 @@ def test_kneighbors_graph(): assert_array_almost_equal(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) -def test_kneighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_kneighbors_graph_sparse(n_neighbors, mode, seed=36): # Test kneighbors_graph to build the k-Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), - ) + assert_array_almost_equal( + neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_radius_neighbors_graph(): From 9f612a19f5d47ddd3ded162c8106294df74342df Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 15:06:24 +0200 Subject: [PATCH 096/290] fixup! 
Remove uncalled snippet --- sklearn/neighbors/_base.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1d48840caedc6..2b8a76be39fcb 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -635,10 +635,7 @@ def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): # argpartition doesn't guarantee sorted order, so we sort again neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == "euclidean": - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind + result = dist[sample_range, neigh_ind], neigh_ind else: result = neigh_ind return result @@ -959,10 +956,7 @@ def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): neigh_ind = [np.where(d <= radius)[0] for d in dist] if return_distance: - if self.effective_metric_ == "euclidean": - dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] - else: - dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] results = dist, neigh_ind else: results = neigh_ind From 74240bd6292969822231990a8dcaa8122b9ba868 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 15:15:04 +0200 Subject: [PATCH 097/290] Mark a test case as xfail for test_fast_sqeuclidean_correctness --- sklearn/metrics/tests/test_pairwise.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 5a2e645d442c5..0b20942965afa 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1473,6 +1473,14 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): def test_fast_sqeuclidean_correctness( X_translation, Y_translation, sign, n_samples=10000, n_features=10 ): + # This is the only failing test case, so we prefer xfailing. + numerical_edge_case = (1e7, 1e7, 1) + if (X_translation, Y_translation, sign) == numerical_edge_case: + pytest.xfail( + "Numerical edge-case for (X_translation, Y_translation," + f" sign)={numerical_edge_case}" + ) + # The fast squared euclidean strategy must return results # that are close to the ones obtained with the euclidean distance rng = np.random.RandomState(1) From 32b08aff5103113ec7d282fb71a55facf5aa1a3c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 15:32:21 +0200 Subject: [PATCH 098/290] fixup! Inline heap pushes `inline` comes only on definitions. --- sklearn/utils/_heap.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index be10b066386c9..05e4760994e33 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -10,7 +10,7 @@ cdef int _simultaneous_sort( ITYPE_t size ) nogil except -1 -cdef inline int _push( +cdef int _push( floating* dist, ITYPE_t* idx, ITYPE_t size, From 8cfabc855b4279039e063d6b1d471dee6d0026a8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 21 Jul 2021 15:43:41 +0200 Subject: [PATCH 099/290] fixup! 
Adapt cython submodule for heaps

---
 sklearn/neighbors/_binary_tree.pxi | 50 +++++----------------------------
 1 file changed, 5 insertions(+), 45 deletions(-)

diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index 1c87edc723969..d713dd491bdfd 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -542,52 +542,12 @@ cdef class NeighborsHeap:
     cdef int _push(self, ITYPE_t row, DTYPE_t val,
                    ITYPE_t i_val) nogil except -1:
         """push (val, i_val) into the given row"""
-        cdef ITYPE_t i, ic1, ic2, i_swap
-        cdef ITYPE_t size = self.distances.shape[1]
-        cdef DTYPE_t* dist_arr = &self.distances[row, 0]
-        cdef ITYPE_t* ind_arr = &self.indices[row, 0]
+        cdef:
+            ITYPE_t size = self.distances.shape[1]
+            DTYPE_t* dist_arr = &self.distances[row, 0]
+            ITYPE_t* ind_arr = &self.indices[row, 0]
+        return _push(dist_arr, ind_arr, size, val, i_val)

-        # check if val should be in heap
-        if val > dist_arr[0]:
-            return 0
-
-        # insert val at position zero
-        dist_arr[0] = val
-        ind_arr[0] = i_val
-
-        # descend the heap, swapping values until the max heap criterion is met
-        i = 0
-        while True:
-            ic1 = 2 * i + 1
-            ic2 = ic1 + 1
-
-            if ic1 >= size:
-                break
-            elif ic2 >= size:
-                if dist_arr[ic1] > val:
-                    i_swap = ic1
-                else:
-                    break
-            elif dist_arr[ic1] >= dist_arr[ic2]:
-                if val < dist_arr[ic1]:
-                    i_swap = ic1
-                else:
-                    break
-            else:
-                if val < dist_arr[ic2]:
-                    i_swap = ic2
-                else:
-                    break
-
-            dist_arr[i] = dist_arr[i_swap]
-            ind_arr[i] = ind_arr[i_swap]
-
-            i = i_swap
-
-        dist_arr[i] = val
-        ind_arr[i] = i_val
-
-        return 0

     cdef int _sort(self) except -1:
         """simultaneously sort the distances and indices"""

From dc1079f15fd207bd11b677864873fdfd214ebbac Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 21 Jul 2021 15:52:18 +0200
Subject: [PATCH 100/290] Do not push if the element is identical to the largest

ArgKmin context: should there be several candidates for the k-th
position (in the k-NN context, the furthest neighbors of the query
points), the first candidate is kept and is no longer replaced by
later candidates (which is what was happening before).

This allows matching the current behavior of k-th NN search using
trees.
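
For illustration, a minimal sketch of the intended tie-breaking, based
on the test added below (it assumes a backend that goes through this
heap push, e.g. the brute-force one):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    # Four points equidistant from the query: with `val >= dist[0]` as
    # the rejection test, later ties can no longer evict the candidate
    # already stored for the k-th position.
    X = np.array([[1, 0], [0, 1], [-1, 0], [0, -1]])
    nn = NearestNeighbors(algorithm="brute").fit(X)
    ind = nn.kneighbors([[0, 0]], n_neighbors=2, return_distance=False)
    print(np.sort(ind))  # expected with this change: [[0 1]]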
---
 sklearn/neighbors/tests/test_neighbors.py | 15 +++++++++++++++
 sklearn/utils/_heap.pyx                   |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 17b432b27dfdf..512e266430b58 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -777,6 +777,21 @@ def test_radius_neighbors_returns_array_of_objects():
     assert_array_equal(neigh_ind, expected_ind)


+@pytest.mark.parametrize("algorithm", ["ball_tree", "kd_tree", "brute"])
+def test_query_equidistant_kth_nn(algorithm):
+    # For several candidates for the k-th nearest neighbor position,
+    # the first candidate should be chosen
+    query_point = np.array([[0, 0]])
+    equidistant_points = np.array([[1, 0], [0, 1], [-1, 0], [0, -1]])
+    # The 3rd and 4th points should not replace the 2nd point
+    # for the 2nd nearest neighbor position
+    k = 2
+    knn_indices = np.array([[0, 1]])
+    nn = neighbors.NearestNeighbors(algorithm=algorithm).fit(equidistant_points)
+    indices = np.sort(nn.kneighbors(query_point, n_neighbors=k, return_distance=False))
+    assert_array_equal(indices, knn_indices)
+
+
 @pytest.mark.parametrize(
     ["algorithm", "metric"],
     [
diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx
index 4b2b5641697be..b9ae8049848a8 100644
--- a/sklearn/utils/_heap.pyx
+++ b/sklearn/utils/_heap.pyx
@@ -96,7 +96,7 @@ cdef inline int _push(
         ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx

     # check if val should be in heap
-    if val > dist[0]:
+    if val >= dist[0]:
         return 0

     # insert val at position zero

From f5308b07f32f3c5f7e623bfd5491d304ec4b59b5 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Mon, 12 Jul 2021 18:07:59 +0200
Subject: [PATCH 101/290] Remove X and Y from _reduce_on_chunks signature

---
 sklearn/metrics/_parallel_reductions.pyx | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 787dbffed1f3f..e6baf09dd511c 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -267,8 +267,6 @@ cdef class PairwiseDistancesReduction:
                 Y_end = Y_start + self.Y_n_samples_chunk

             self._reduce_on_chunks(
-                self.X,
-                self.Y,
                 X_start, X_end,
                 Y_start, Y_end,
                 thread_num,
@@ -326,8 +324,6 @@ cdef class PairwiseDistancesReduction:
                     Y_end = Y_start + self.Y_n_samples_chunk

                 self._reduce_on_chunks(
-                    self.X,
-                    self.Y,
                     X_start, X_end,
                     Y_start, Y_end,
                     thread_num,
@@ -348,8 +344,6 @@ cdef class PairwiseDistancesReduction:
     # Placeholder methods which have to be implemented

     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,
-        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -493,8 +487,6 @@ cdef class ArgKmin(PairwiseDistancesReduction):
             raise RuntimeError("Trying to free heaps_approx_distances_chunks which is NULL")

     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,
-        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -503,8 +495,8 @@
     ) nogil except -1:
         cdef:
             ITYPE_t i, j
-            const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
+            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = 
self.heaps_indices_chunks[thread_num]
@@ -776,8 +768,6 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):

     @final
     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,
-        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -792,8 +782,8 @@
         """
         cdef:
             ITYPE_t i, j
-            const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
+            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]
@@ -930,8 +920,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):

     @final
     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,
-        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -940,8 +928,8 @@
     ) nogil except -1:
         cdef:
             ITYPE_t i, j
-            const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
+            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             DTYPE_t dist_i_j

         for i in range(X_c.shape[0]):

From 19d461f18c50e3b0f6f0a7bdc50d15d39802cbc3 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 22 Jul 2021 14:12:25 +0200
Subject: [PATCH 102/290] Introduce DistanceMetric.sparse_{rdist,dist}

Motivation: extend computations to mixed pairs of dense and sparse
vectors.

Note that this only declares those interfaces: it does not implement
them.

---
 sklearn/metrics/_dist_metrics.pxd | 12 ++++++++++++
 sklearn/metrics/_dist_metrics.pyx | 29 +++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
index b5ad7969d27e3..917b0300f2387 100644
--- a/sklearn/metrics/_dist_metrics.pxd
+++ b/sklearn/metrics/_dist_metrics.pxd
@@ -65,6 +65,18 @@ cdef class DistanceMetric:
     cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                        ITYPE_t size) nogil except -1

+    cdef DTYPE_t sparse_dist(self, const DTYPE_t[:] x1_data,
+                             const ITYPE_t[:] x1_indices,
+                             const DTYPE_t[:] x2_data,
+                             const ITYPE_t[:] x2_indices,
+    ) nogil except -1
+
+    cdef DTYPE_t sparse_rdist(self, const DTYPE_t[:] x1_data,
+                              const ITYPE_t[:] x1_indices,
+                              const DTYPE_t[:] x2_data,
+                              const ITYPE_t[:] x2_indices,
+    ) nogil except -1
+
     cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1

     cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index 0a3eff6bcb9ff..25e04a422918d 100755
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -316,6 +316,35 @@ cdef class DistanceMetric:
         """
         return self.dist(x1, x2, size)

+    cdef DTYPE_t sparse_dist(self, const DTYPE_t[:] x1_data,
+                             const ITYPE_t[:] x1_indices,
+                             const DTYPE_t[:] x2_data,
+                             const ITYPE_t[:] x2_indices,
+    ) nogil except -1:
+        """Compute the distance between vectors x1 and x2
+        given non-null coordinates and their corresponding indices.
+
+        This should be overridden in a base class.
+        """
+        return -999
+
+    cdef DTYPE_t sparse_rdist(self, const DTYPE_t[:] x1_data,
+                              const ITYPE_t[:] x1_indices,
+                              const DTYPE_t[:] x2_data,
+                              const ITYPE_t[:] x2_indices,
+    ) nogil except -1:
+        """Compute the reduced distance between vectors x1 and x2
+        given non-null coordinates and their corresponding indices.
+
+        This can optionally be overridden in a base class.
+
+        The reduced distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, for the
+        Euclidean metric, the reduced distance is the squared-euclidean
+        distance.
+        """
+        return self.sparse_dist(x1_data, x1_indices, x2_data, x2_indices)
+
     cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1:
         """compute the pairwise distances between points in X"""
         cdef ITYPE_t i1, i2

From 444c4bce33868385a3889caaee96addc20f5f910 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 21 Jul 2021 18:09:54 +0200
Subject: [PATCH 103/290] Introduce DistanceComputer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows decoupling distance computations from their usage, paving
the way to easily supporting computations on pairs of array-likes in
"{dense, sparse}²".

---
 sklearn/metrics/_parallel_reductions.pyx | 412 +++++++++++++++++++----
 1 file changed, 338 insertions(+), 74 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index e6baf09dd511c..b0d7c6eb61e13 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -129,6 +129,309 @@

 #####################
+cdef class DistanceComputer:
+    """ Abstract class responsible to compute
+    distances between vectors of array-like.
+    """
+
+    @classmethod
+    def get_for(cls,
+      X,
+      Y,
+      str metric="euclidean",
+      dict metric_kwargs=dict(),
+    ) -> DistanceComputer:
+        cdef:
+            DistanceMetric distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs)
+
+        if X.shape[1] != Y.shape[1]:
+            raise RuntimeError("Vectors of X and Y must have the "
+                               "same dimension but currently are "
+                               f"respectively {X.shape[1]}-dimensional "
+                               f"and {Y.shape[1]}-dimensional.")
+
+        distance_metric._validate_data(X)
+        distance_metric._validate_data(Y)
+
+        if not issparse(X) and not issparse(Y):
+            return DenseDenseDistanceComputer(X, Y, distance_metric)
+        if issparse(X) and not issparse(Y):
+            return SparseDenseDistanceComputer(X, Y, distance_metric)
+        if not issparse(X) and issparse(Y):
+            return DenseSparseDistanceComputer(X, Y, distance_metric)
+        return SparseSparseDistanceComputer(X, Y, distance_metric)
+
+    @property
+    def n_X(self):
+        raise RuntimeError()
+
+    @property
+    def n_Y(self):
+        raise RuntimeError()
+
+    cdef DTYPE_t approx_dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        return self.dist(i, j)
+
+    cdef DTYPE_t dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        return -1
+
+cdef class DenseDenseDistanceComputer(DistanceComputer):
+    """ Compute distances between vectors of two arrays.
+
+    X: ndarray of shape (n_X, d)
+        Rows represent vectors
+    Y: ndarray of shape (n_Y, d)
+        Rows represent vectors
+    """
+    cdef:
+        DistanceMetric distance_metric
+
+        const DTYPE_t[:, ::1] X  # shape: (n_X, d)
+        const DTYPE_t[:, ::1] Y  # shape: (n_Y, d)
+        ITYPE_t d
+
+    def __cinit__(self):
+        # Initializing memory view to prevent memory errors and seg-faults
+        # in rare cases where __init__ is not called
+        self.X = np.empty((1, 1), dtype=DTYPE, order='c')
+        self.Y = np.empty((1, 1), dtype=DTYPE, order='c')
+
+    def __init__(self, X, Y, DistanceMetric distance_metric):
+        self.distance_metric = distance_metric
+        self.X = check_array(X, dtype=DTYPE, order='C')
+        self.Y = check_array(Y, dtype=DTYPE, order='C')
+        self.d = X.shape[1]
+
+    @property
+    @final
+    def n_X(self):
+        return self.X.shape[0]
+
+    @property
+    @final
+    def n_Y(self):
+        return self.Y.shape[0]
+
+    @final
+    cdef DTYPE_t approx_dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        return self.distance_metric.rdist(&self.X[i, 0],
+                                          &self.Y[j, 0],
+                                          self.d)
+
+    @final
+    cdef DTYPE_t dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        return self.distance_metric.dist(&self.X[i, 0],
+                                         &self.Y[j, 0],
+                                         self.d)
+
+cdef class SparseSparseDistanceComputer(DistanceComputer):
+    """ Compute distances between vectors of two sparse matrices.
+
+    X: sparse matrix of shape (n_X, d)
+        Rows represent vectors
+    Y: sparse matrix of shape (n_Y, d)
+        Rows represent vectors
+    """
+    cdef:
+        DistanceMetric distance_metric
+
+        const DTYPE_t[:] X_data
+        const ITYPE_t[:] X_indices,
+        const ITYPE_t[:] X_indptr,
+
+        const DTYPE_t[:] Y_data
+        const ITYPE_t[:] Y_indices
+        const ITYPE_t[:] Y_indptr
+
+    @property
+    @final
+    def n_X(self):
+        return self.X_indptr.shape[0] - 1
+
+    @property
+    @final
+    def n_Y(self):
+        return self.Y_indptr.shape[0] - 1
+
+    def __init__(self, X, Y, DistanceMetric distance_metric):
+        self.distance_metric = distance_metric
+
+        X = check_array(X, dtype=DTYPE, accept_sparse='csr')
+        Y = check_array(Y, dtype=DTYPE, accept_sparse='csr')
+
+        self.X_data = X.data
+        self.X_indices = X.indices
+        self.X_indptr = X.indptr
+
+        self.Y_data = Y.data
+        self.Y_indices = Y.indices
+        self.Y_indptr = Y.indptr
+
+    @final
+    cdef DTYPE_t approx_dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        cdef:
+            ITYPE_t xi_start = self.X_indptr[i]
+            ITYPE_t xi_end = self.X_indptr[i + 1]
+            ITYPE_t yj_start = self.Y_indptr[j]
+            ITYPE_t yj_end = self.Y_indptr[j + 1]
+
+        return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end],
+                                                 self.X_indices[xi_start:xi_end],
+                                                 self.Y_data[yj_start:yj_end],
+                                                 self.Y_indices[yj_start:yj_end])
+
+    @final
+    cdef DTYPE_t dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        cdef:
+            ITYPE_t xi_start = self.X_indptr[i]
+            ITYPE_t xi_end = self.X_indptr[i + 1]
+            ITYPE_t yj_start = self.Y_indptr[j]
+            ITYPE_t yj_end = self.Y_indptr[j + 1]
+
+        return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end],
+                                                self.X_indices[xi_start:xi_end],
+                                                self.Y_data[yj_start:yj_end],
+                                                self.Y_indices[yj_start:yj_end])
+
+
+cdef class SparseDenseDistanceComputer(DistanceComputer):
+    """ Compute distances between vectors of a sparse matrix and
+    vectors of an array.
+
+    X: sparse matrix of shape (n_X, d)
+        Rows represent vectors
+    Y: ndarray of shape (n_Y, d)
+        Rows represent vectors
+    """
+    cdef:
+        DistanceMetric distance_metric
+
+        const DTYPE_t[:] X_data
+        const ITYPE_t[:] X_indices,
+        const ITYPE_t[:] X_indptr,
+
+        const DTYPE_t[:, ::1] Y  # shape: (n_Y, d)
+        const ITYPE_t[:] Y_indices
+
+
+    def __init__(self, X, Y, DistanceMetric distance_metric):
+        self.distance_metric = distance_metric
+
+        X = check_array(X, dtype=DTYPE, accept_sparse='csr')
+        self.X_data = X.data
+        self.X_indices = X.indices
+        self.X_indptr = X.indptr
+
+        self.Y = check_array(Y, dtype=DTYPE)
+        self.Y_indices = np.arange(self.Y.shape[1])
+
+    @property
+    @final
+    def n_X(self):
+        return self.X_indptr.shape[0] - 1
+
+    @property
+    @final
+    def n_Y(self):
+        return self.Y.shape[0]
+
+    @final
+    cdef DTYPE_t approx_dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        cdef:
+            ITYPE_t xi_start = self.X_indptr[i]
+            ITYPE_t xi_end = self.X_indptr[i + 1]
+
+        # TODO: the 2D to 1D memory-view conversion might make computation slower, see:
+        #
+        # Ideally, we could pass pointers and indices and access elements
+        # then in distance_metric.dist
+        return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end],
+                                                 self.X_indices[xi_start:xi_end],
+                                                 self.Y[j, :],
+                                                 self.Y_indices)
+
+    @final
+    cdef DTYPE_t dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        cdef:
+            ITYPE_t xi_start = self.X_indptr[i]
+            ITYPE_t xi_end = self.X_indptr[i + 1]
+
+        # TODO: same as previous comment
+        return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end],
+                                                self.X_indices[xi_start:xi_end],
+                                                self.Y[j, :],
+                                                self.Y_indices)
+
+
+cdef class DenseSparseDistanceComputer(DistanceComputer):
+    """ Compute distances between vectors of an array and
+    vectors of a sparse matrix.
+
+    X: ndarray of shape (n_X, d)
+        Rows represent vectors
+    Y: sparse matrix of shape (n_Y, d)
+        Rows represent vectors
+    """
+    cdef:
+        # As distance metrics are symmetric functions, we can
+        # simply rely on the other strategy and swap arguments
+        DistanceComputer distance_computer
+
+    def __init__(self, X, Y, distance_metric):
+        # Swapping arguments on the constructor
+        self.distance_computer = SparseDenseDistanceComputer(Y, X, distance_metric)
+
+    @property
+    @final
+    def n_X(self):
+        # Swapping interface
+        return self.distance_computer.n_Y
+
+    @property
+    @final
+    def n_Y(self):
+        # Swapping interface
+        return self.distance_computer.n_X
+
+    @final
+    cdef DTYPE_t approx_dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        # Swapping arguments on the same interface
+        return self.distance_computer.approx_dist(j, i)
+
+    @final
+    cdef DTYPE_t dist(self,
+        ITYPE_t i,
+        ITYPE_t j,
+    ) nogil except -1:
+        # Swapping arguments on the same interface
+        return self.distance_computer.dist(j, i)
+
 cdef class PairwiseDistancesReduction:
     """Abstract class to computes a reduction on pairwise
@@ -139,10 +442,6 @@ cdef class PairwiseDistancesReduction:
     on chunks whose size can be set using ``chunk_size``.
Parameters ---------- - X: ndarray of shape (n, d) - Rows represent vectors - Y: ndarray of shape (m, d) - Rows represent vectors distance_metric: DistanceMetric The distance to use chunk_size: int @@ -150,16 +449,11 @@ cdef class PairwiseDistancesReduction: """ cdef: - const DTYPE_t[:, ::1] X # shape: (n_X, d) - const DTYPE_t[:, ::1] Y # shape: (n_Y, d) - - DistanceMetric distance_metric + DistanceComputer distance_computer ITYPE_t effective_omp_n_thread ITYPE_t n_samples_chunk, chunk_size - ITYPE_t d - ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem @@ -176,16 +470,8 @@ cdef class PairwiseDistancesReduction: metric in cls.valid_metrics()) - def __cinit__(self): - # Initializing memory view to prevent memory errors and seg-faults - # in rare cases where __init__ is not called - self.X = np.empty((1, 1), dtype=DTYPE, order='c') - self.Y = np.empty((1, 1), dtype=DTYPE, order='c') - def __init__(self, - X, - Y, - DistanceMetric distance_metric, + DistanceComputer distance_computer, ITYPE_t chunk_size = CHUNK_SIZE, ): cdef: @@ -193,28 +479,17 @@ cdef class PairwiseDistancesReduction: self.effective_omp_n_thread = _openmp_effective_n_threads() - self.X = check_array(X, dtype=DTYPE) - self.Y = check_array(Y, dtype=DTYPE) - - assert X.shape[1] == Y.shape[1], "Vectors of X and Y must have the " \ - "same dimension but currently are " \ - f"respectively {X.shape[1]}-dimensional " \ - f"and {Y.shape[1]}-dimensional." - distance_metric._validate_data(X) - distance_metric._validate_data(Y) - - self.d = X.shape[1] self.chunk_size = chunk_size self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - self.distance_metric = distance_metric + self.distance_computer = distance_computer - self.n_Y = Y.shape[0] + self.n_Y = distance_computer.n_Y self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk self.Y_n_samples_rem = self.n_Y % self.Y_n_samples_chunk - self.n_X = X.shape[0] + self.n_X = distance_computer.n_X self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) X_n_full_chunks = self.n_X // self.X_n_samples_chunk self.X_n_samples_rem = self.n_X % self.X_n_samples_chunk @@ -411,10 +686,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): Parameters ---------- - X: ndarray of shape (n, d) - Rows represent vectors - Y: ndarray of shape (m, d) - Rows represent vectors distance_metric: DistanceMetric The distance to use k: int @@ -447,21 +718,18 @@ cdef class ArgKmin(PairwiseDistancesReduction): dict metric_kwargs=dict(), ): # This factory comes to handle specialisation on fast_sqeuclidean. 
- if metric == "fast_sqeuclidean": + if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) - return ArgKmin(X=X, Y=Y, - distance_metric=DistanceMetric.get_metric(metric, **metric_kwargs), + return ArgKmin(distance_computer=DistanceComputer.get_for(X, Y, metric, metric_kwargs), k=k, chunk_size=chunk_size) def __init__(self, - X, - Y, - DistanceMetric distance_metric, + DistanceComputer distance_computer, ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, ): - PairwiseDistancesReduction.__init__(self, X, Y, distance_metric, chunk_size) + PairwiseDistancesReduction.__init__(self, distance_computer, chunk_size) self.k = k @@ -495,22 +763,20 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) nogil except -1: cdef: ITYPE_t i, j - const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] - const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start ITYPE_t k = self.k DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] # Pushing the distance and their associated indices on heaps # which keep tracks of the argkmin. - for i in range(X_c.shape[0]): - for j in range(Y_c.shape[0]): + for i in range(n_X): + for j in range(n_Y): _push(heaps_approx_distances + i * self.k, heaps_indices + i * self.k, k, - self.distance_metric.rdist(&X_c[i, 0], - &Y_c[j, 0], - self.d), + self.distance_computer.approx_dist(X_start + i, Y_start + j), Y_start + j) return 0 @@ -627,9 +893,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): for i in prange(self.n_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): for j in range(self.k): - distances[i, j] = self.distance_metric.dist(&self.X[i, 0], - &self.Y[Y_indices[i, j], 0], - self.d) + distances[i, j] = self.distance_computer.dist(i, Y_indices[i, j]) @final def compute(self, @@ -702,22 +966,26 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): """ cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y DTYPE_t[::1] Y_sq_norms # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks def __init__(self, - X, - Y, - ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, ): - ArgKmin.__init__(self, X, Y, - # The distance metric here is used for exact distances computations - distance_metric=DistanceMetric.get_metric("euclidean"), + ArgKmin.__init__(self, + # The distance computer here is used for exact distances computations + distance_computer=DistanceComputer.get_for(X, Y, metric="euclidean"), k=k, chunk_size=chunk_size) + self.X = X + self.Y = Y self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) @@ -889,19 +1157,17 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ): - return RadiusNeighborhood(X=X, Y=Y, - distance_metric=DistanceMetric.get_metric(metric, **metric_kwargs), + return RadiusNeighborhood( + distance_computer=DistanceComputer.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size) def __init__(self, - X, - Y, - DistanceMetric distance_metric, + DistanceComputer distance_computer, DTYPE_t radius, ITYPE_t chunk_size = CHUNK_SIZE, ): - PairwiseDistancesReduction.__init__(self, X, Y, distance_metric, 
chunk_size)
+        PairwiseDistancesReduction.__init__(self, distance_computer, chunk_size)

         self.radius = radius
         self.sort_results = False
@@ -928,16 +1194,14 @@
     ) nogil except -1:
         cdef:
             ITYPE_t i, j
-            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             DTYPE_t dist_i_j

-        for i in range(X_c.shape[0]):
-            for j in range(Y_c.shape[0]):
-                dist_i_j = self.distance_metric.dist(&X_c[i, 0], &Y_c[j, 0], self.d)
+        for i in range(X_start, X_end):
+            for j in range(Y_start, Y_end):
+                dist_i_j = self.distance_computer.dist(i, j)
                 if dist_i_j <= self.radius:
-                    deref(self.neigh_distances_chunks[thread_num])[X_start + i].push_back(dist_i_j)
-                    deref(self.neigh_indices_chunks[thread_num])[X_start + i].push_back(Y_start + j)
+                    deref(self.neigh_distances_chunks[thread_num])[i].push_back(dist_i_j)
+                    deref(self.neigh_indices_chunks[thread_num])[i].push_back(j)

         return 0

From cf24fd01c0bc508b763d93d24b76d6a2c2de2469 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 21 Jul 2021 18:25:39 +0200
Subject: [PATCH 104/290] Adapt tests for behavior on duplicates

---
 sklearn/neighbors/tests/test_neighbors.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 512e266430b58..b7e760163baf8 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1603,16 +1603,16 @@ def test_k_and_radius_neighbors_duplicates(algorithm):
     nn.fit(X)
     dist, ind = nn.kneighbors()
     assert_allclose(dist, np.zeros((3, 1)))
-    assert_allclose(ind, [[2], [2], [0]])
+    assert_allclose(ind, [[1], [0], [1]])

     # Test that zeros are explicitly marked in kneighbors_graph.
     kng = nn.kneighbors_graph(mode="distance")
     assert_allclose(kng.toarray(), np.zeros((3, 3)))
     assert_allclose(kng.data, np.zeros(3))
-    assert_allclose(kng.indices, [2.0, 2.0, 0.0])
+    assert_allclose(kng.indices, [1, 0, 1])
     assert_allclose(
         nn.kneighbors_graph().toarray(),
-        np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]),
+        np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
     )

From e7aaa711f146c072c210dd19d8585e012eb15186 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 22 Jul 2021 08:36:32 +0200
Subject: [PATCH 105/290] Free data structures if present and do not raise
 otherwise

In some cases, data structures aren't allocated. Hence we still check
for possible nullity, but we do not raise an error in this case.
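
A minimal sketch of the guarded pattern used below (`ptr` stands for a
hypothetical attribute, not one of the actual fields):

    def __dealloc__(self):
        # __dealloc__ may run even when allocation never happened
        # (e.g. if __init__ raised), so a NULL pointer is skipped
        # instead of triggering a RuntimeError as before.
        if self.ptr is not NULL:
            free(self.ptr)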
--- sklearn/metrics/_parallel_reductions.pyx | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index b0d7c6eb61e13..7f71df4052837 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -746,13 +746,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): def __dealloc__(self): if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) - else: - raise RuntimeError("Trying to free heaps_indices_chunks which is NULL") if self.heaps_approx_distances_chunks is not NULL: free(self.heaps_approx_distances_chunks) - else: - raise RuntimeError("Trying to free heaps_approx_distances_chunks which is NULL") cdef int _reduce_on_chunks(self, ITYPE_t X_start, @@ -993,8 +989,6 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): def __dealloc__(self): if self.dist_middle_terms_chunks is not NULL: free(self.dist_middle_terms_chunks) - else: - raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL") @final cdef void _on_X_parallel_init(self, @@ -1181,8 +1175,11 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): sizeof(self.neigh_indices) * self.effective_omp_n_thread) def __dealloc(self): - free(self.neigh_distances_chunks) - free(self.neigh_indices_chunks) + if self.neigh_distances_chunks is not NULL: + free(self.neigh_distances_chunks) + + if self.neigh_indices_chunks is not NULL: + free(self.neigh_indices_chunks) @final cdef int _reduce_on_chunks(self, From 718a4c2bf746a05b8f46841d53a40e6150fe9c53 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 22 Jul 2021 08:39:05 +0200 Subject: [PATCH 106/290] Adapt doctest to the new behavior --- sklearn/multioutput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 98c82c1a03467..61437928541d0 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -372,7 +372,7 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> X, y = make_multilabel_classification(n_classes=3, random_state=0) >>> clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y) >>> clf.predict(X[-2:]) - array([[1, 1, 0], [1, 1, 1]]) + array([[1, 1, 1], [1, 1, 1]]) """ def __init__(self, estimator, *, n_jobs=None): From b70db13926de3c7f80b2d2bdfeebc33e9fcc6570 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 22 Jul 2021 09:24:16 +0200 Subject: [PATCH 107/290] Monkey-patch neighbors to alias metrics.DistanceMetric --- sklearn/neighbors/DistanceMetric.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 sklearn/neighbors/DistanceMetric.py diff --git a/sklearn/neighbors/DistanceMetric.py b/sklearn/neighbors/DistanceMetric.py new file mode 100644 index 0000000000000..c0c175800e925 --- /dev/null +++ b/sklearn/neighbors/DistanceMetric.py @@ -0,0 +1,16 @@ +# TODO: Remove in 1.2 +import warnings + +from ..metrics import DistanceMetric +from .. import neighbors + +warnings.warn( + "sklearn.neighbors.DistanceMetric has been moved " + "to sklearn.metrics.DistanceMetric in 1.0. 
" + "This import path will be removed in 1.2", + category=FutureWarning, +) + +# Monkey-patching neighbors to alias sklearn.metrics.DistanceMetric +setattr(neighbors, "DistanceMetric", DistanceMetric) +neighbors.__all__ += ["DistanceMetric"] From 39e8189e81724f2aa9dbea81c90e9b207a902e47 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 22 Jul 2021 09:26:35 +0200 Subject: [PATCH 108/290] Revert "Make array C-ordered for test" This reverts commit bb24d958dd120acafa67d3789bfe2cdd8c7efda5. --- sklearn/cluster/tests/test_optics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 1449733fed5c0..3f68f3b62df78 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -784,9 +784,7 @@ def test_extract_dbscan(): def test_precomputed_dists(): - # This slicing makes the array F-ordered. - # but we need C-ordering. - redX = np.ascontiguousarray(X[::2]) + redX = X[::2] dists = pairwise_distances(redX, metric="euclidean") clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists) clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) From 71f0130198cfeec9156d9e065ec6d55375bcc0c8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 22 Jul 2021 10:01:24 +0200 Subject: [PATCH 109/290] Makes test parametrisation execution order fixed Otherwise, pytest-xdist workers aren't collecting tests in the same order. --- sklearn/neighbors/tests/test_neighbors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index b7e760163baf8..59abf01a6c6a1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -57,7 +57,9 @@ SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") -COMMON_VALID_METRICS = set.intersection(*map(set, neighbors.VALID_METRICS.values())) +COMMON_VALID_METRICS = sorted( + set.intersection(*map(set, neighbors.VALID_METRICS.values())) +) P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) From cd7e1a2c4eaf103ed77c2bb0a13dc41d4b7084eb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 22 Jul 2021 10:47:22 +0200 Subject: [PATCH 110/290] Introduce _openmp_thread_num in helpers --- sklearn/metrics/_parallel_reductions.pyx | 9 +++++---- sklearn/utils/_openmp_helpers.pxd | 6 ++++++ sklearn/utils/_openmp_helpers.pyx | 15 ++++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) create mode 100644 sklearn/utils/_openmp_helpers.pxd diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 7f71df4052837..4b5bf0422db18 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -10,7 +10,6 @@ import numpy as np cimport numpy as np -cimport openmp np.import_array() @@ -43,9 +42,11 @@ from ..utils._cython_blas cimport ( ) from ..utils._heap cimport _simultaneous_sort, _push -from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE + +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE # TODO: This has been introduced in Cython 3.0, change for 
`libcpp.algorithm.move` once Cython 3 is used
@@ -519,7 +520,7 @@ cdef class PairwiseDistancesReduction:
             ITYPE_t thread_num

         with nogil, parallel(num_threads=num_threads):
-            thread_num = openmp.omp_get_thread_num()
+            thread_num = _openmp_thread_num()

             # Allocating thread datastructures
             self._on_X_parallel_init(thread_num)
@@ -585,7 +586,7 @@ cdef class PairwiseDistancesReduction:
             X_end = X_start + self.X_n_samples_chunk

             with nogil, parallel(num_threads=num_threads):
-                thread_num = openmp.omp_get_thread_num()
+                thread_num = _openmp_thread_num()

                 # Initializing datastructures used in this thread
                 self._on_Y_parallel_init(thread_num)
diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd
new file mode 100644
index 0000000000000..e57fc9bfa6bf5
--- /dev/null
+++ b/sklearn/utils/_openmp_helpers.pxd
@@ -0,0 +1,6 @@
+# Helpers to access OpenMP threads information
+#
+# These interfaces act as indirections, allowing code written for OpenMP
+# to keep working when OpenMP support is not available.
+
+cdef int _openmp_thread_num() nogil
diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx
index fb8920074a84e..cddd77ac42746 100644
--- a/sklearn/utils/_openmp_helpers.pyx
+++ b/sklearn/utils/_openmp_helpers.pyx
@@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED:

 def _openmp_parallelism_enabled():
     """Determines whether scikit-learn has been built with OpenMP
-    
+
     It allows to retrieve at runtime the information gathered at compile time.
     """
     # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during
@@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None):
     - if the ``OMP_NUM_THREADS`` environment variable is set, return
      ``openmp.omp_get_max_threads()``
    - otherwise, return the minimum between ``openmp.omp_get_max_threads()``
-      and the number of cpus, taking cgroups quotas into account. Cgroups 
+      and the number of cpus, taking cgroups quotas into account. Cgroups
       quotas can typically be set by tools such as Docker.
     The result of ``omp_get_max_threads`` can be influenced by environment
     variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``.
@@ -59,4 +59,13 @@ cpdef _openmp_effective_n_threads(n_threads=None):

     # OpenMP disabled at build-time => sequential mode
     return 1
-    
+
+cdef inline int _openmp_thread_num() nogil:
+    """Return the number of the thread calling this function.
+
+    If scikit-learn is built without OpenMP support, always return 0.
+ """ + IF SKLEARN_OPENMP_PARALLELISM_ENABLED: + return openmp.omp_get_thread_num() + ELSE: + return 0 From 29e5c7bb13990d4a9747cc0ede4e5a26d3822303 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 22 Jul 2021 10:55:19 +0200 Subject: [PATCH 111/290] Reorder macros, cython and python imports --- sklearn/metrics/_parallel_reductions.pyx | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 4b5bf0422db18..cade1f11d1c98 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -8,6 +8,10 @@ # cython: binding=False # distutils: define_macros=CYTHON_TRACE_NOGIL=0 +DEF CHUNK_SIZE = 256 # number of vectors +DEF MIN_CHUNK_SAMPLES = 20 +DEF FLOAT_INF = 1e36 + import numpy as np cimport numpy as np @@ -21,16 +25,7 @@ from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange from cpython.ref cimport Py_INCREF -from scipy.sparse import issparse - from ._dist_metrics cimport DistanceMetric -from ._dist_metrics import METRIC_MAPPING -from ..utils import check_array - -DEF CHUNK_SIZE = 256 # number of vectors -DEF MIN_CHUNK_SAMPLES = 20 -DEF FLOAT_INF = 1e36 - from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, @@ -40,12 +35,15 @@ from ..utils._cython_blas cimport ( Trans, _gemm, ) - from ..utils._heap cimport _simultaneous_sort, _push from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE + +from scipy.sparse import issparse +from ._dist_metrics import METRIC_MAPPING +from ..utils import check_array from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE From 550bb76c6c67afb33c62a5798019e9fc0c9cd5fe Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 17:19:47 +0200 Subject: [PATCH 112/290] Improve comments for DistanceComputer Co-authored-by: Olivier Grisel --- sklearn/metrics/_parallel_reductions.pyx | 177 ++++++++++++++--------- 1 file changed, 106 insertions(+), 71 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index cade1f11d1c98..08be48937fcd7 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -8,10 +8,6 @@ # cython: binding=False # distutils: define_macros=CYTHON_TRACE_NOGIL=0 -DEF CHUNK_SIZE = 256 # number of vectors -DEF MIN_CHUNK_SAMPLES = 20 -DEF FLOAT_INF = 1e36 - import numpy as np cimport numpy as np @@ -47,7 +43,13 @@ from ..utils import check_array from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE -# TODO: This has been introduced in Cython 3.0, change for `libcpp.algorithm.move` once Cython 3 is used + +DEF CHUNK_SIZE = 256 # number of vectors +DEF MIN_CHUNK_SAMPLES = 20 +DEF FLOAT_INF = 1e36 + +# TODO: This has been introduced in Cython 3.0, change for +# `libcpp.algorithm.move` once Cython 3 is used # Introduction in Cython: # https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 cdef extern from "" namespace "std" nogil: @@ -61,18 +63,22 @@ ctypedef fused vector_DITYPE_t: vector[ITYPE_t] vector[DTYPE_t] + ctypedef fused vector_vector_DITYPE_t: vector[vector[ITYPE_t]] vector[vector[DTYPE_t]] + cdef extern from "numpy/arrayobject.h": int 
PyArray_SetBaseObject(np.ndarray arr, PyObject *obj) nogil except -1 + cdef class StdVectorSentinel: """Wraps a reference to a vector which will be deallocated with this object.""" pass + cdef class StdVectorSentinelDTYPE(StdVectorSentinel): cdef vector[DTYPE_t] vec @@ -82,6 +88,7 @@ cdef class StdVectorSentinelDTYPE(StdVectorSentinel): sentinel.vec.swap(deref(vec_ptr)) return sentinel + cdef class StdVectorSentinelITYPE(StdVectorSentinel): cdef vector[ITYPE_t] vec @@ -101,7 +108,8 @@ cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr): typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE cdef: np.npy_intp size = deref(vect_ptr).size() - np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, deref(vect_ptr).data()) + np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, + deref(vect_ptr).data()) StdVectorSentinel sentinel if vector_DITYPE_t is vector[DTYPE_t]: @@ -116,10 +124,14 @@ cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr): PyArray_SetBaseObject(arr, sentinel) return arr -cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector_vector_DITYPE_t* vecs): + +cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays( + vector_vector_DITYPE_t* vecs + ): cdef: ITYPE_t n = deref(vecs).size() - np.ndarray[object, ndim=1] np_arrays_of_np_arrays = np.empty(n, dtype=np.ndarray) + np.ndarray[object, ndim=1] np_arrays_of_np_arrays = np.empty(n, + dtype=np.ndarray) for i in range(n): np_arrays_of_np_arrays[i] = vector_to_numpy_array(&(deref(vecs)[i])) @@ -129,8 +141,17 @@ cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(vector_vector_DI ##################### cdef class DistanceComputer: - """ Abstract class responsible to compute - distances between vectors of array-like. + """Abstract class to compute distances between vectors. + + Compute distances for one pair of vectors at a time. + Vectors can be stored as rows of np.ndarrays or CSR matrices. + + This class avoids the overhead of dispatching distance computations + based on the physical representation of the vectors (sparse vs. dense) + for each row of the collection. + + This makes use of cython.final to remove the overhead of method calls' + dispatch. """ @classmethod @@ -141,7 +162,8 @@ cdef class DistanceComputer: dict metric_kwargs=dict(), ) -> DistanceComputer: cdef: - DistanceMetric distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs) + DistanceMetric distance_metric = DistanceMetric.get_metric(metric, + **metric_kwargs) if X.shape[1] != Y.shape[1]: raise RuntimeError("Vectors of X and Y must have the " @@ -180,6 +202,7 @@ cdef class DistanceComputer: ) nogil except -1: return -1 + cdef class DenseDenseDistanceComputer(DistanceComputer): """ Compute distances between vectors of two arrays. @@ -235,6 +258,7 @@ cdef class DenseDenseDistanceComputer(DistanceComputer): &self.Y[j, 0], self.d) + cdef class SparseSparseDistanceComputer(DistanceComputer): """ Compute distances between vectors of two sparse matrices. 
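The sentinel machinery above exists so that each per-query C++ vector can be exposed to Python as a 1D numpy array without copying, the vector's lifetime being tied to the array's base object. A rough Python-level sketch of the resulting container shape (illustrative values, not part of the patch):

    import numpy as np

    # Ragged per-query results (e.g. radius-based neighborhoods) end up as a
    # 1D object array holding one 1D numpy array per query vector.
    neighborhoods = [
        np.asarray([0, 3, 7]),          # three neighbors for query 0
        np.asarray([1]),                # one neighbor for query 1
        np.asarray([], dtype=np.intp),  # none within radius for query 2
    ]
    out = np.empty(len(neighborhoods), dtype=object)
    for i, neighborhood in enumerate(neighborhoods):
        out[i] = neighborhood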
@@ -401,7 +425,7 @@ cdef class DenseSparseDistanceComputer(DistanceComputer): def __init__(self, X, Y, distance_metric): # Swapping arguments on the constructor - self.distance_computer = SparseSparseDistanceComputer(Y, X, distance_metric) + self.distance_computer = SparseDenseDistanceComputer(Y, X, distance_metric) @property @final @@ -453,11 +477,12 @@ cdef class PairwiseDistancesReduction: ITYPE_t effective_omp_n_thread ITYPE_t n_samples_chunk, chunk_size - ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem - ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem + ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder + ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder @classmethod def valid_metrics(cls): + # TODO: support those distances excluded = {"pyfunc", "sokalmichener", "matching", "jaccard"} return sorted({*METRIC_MAPPING.keys()}.difference(excluded)) @@ -486,12 +511,12 @@ cdef class PairwiseDistancesReduction: self.n_Y = distance_computer.n_Y self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk - self.Y_n_samples_rem = self.n_Y % self.Y_n_samples_chunk + self.Y_n_samples_remainder = self.n_Y % self.Y_n_samples_chunk self.n_X = distance_computer.n_X self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) X_n_full_chunks = self.n_X // self.X_n_samples_chunk - self.X_n_samples_rem = self.n_X % self.X_n_samples_chunk + self.X_n_samples_remainder = self.n_X % self.X_n_samples_chunk # Counting remainder chunk in total number of chunks self.Y_n_chunks = Y_n_full_chunks + ( @@ -525,8 +550,9 @@ cdef class PairwiseDistancesReduction: for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: - X_end = X_start + self.X_n_samples_rem + if (X_chunk_idx == self.X_n_chunks - 1 + and self.X_n_samples_remainder > 0): + X_end = X_start + self.X_n_samples_remainder else: X_end = X_start + self.X_n_samples_chunk @@ -535,8 +561,9 @@ cdef class PairwiseDistancesReduction: for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0: - Y_end = Y_start + self.Y_n_samples_rem + if (Y_chunk_idx == self.Y_n_chunks - 1 + and self.Y_n_samples_remainder > 0): + Y_end = Y_start + self.Y_n_samples_remainder else: Y_end = Y_start + self.Y_n_samples_chunk @@ -578,8 +605,8 @@ cdef class PairwiseDistancesReduction: for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0: - X_end = X_start + self.X_n_samples_rem + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: + X_end = X_start + self.X_n_samples_remainder else: X_end = X_start + self.X_n_samples_chunk @@ -592,8 +619,8 @@ cdef class PairwiseDistancesReduction: for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk if Y_chunk_idx == self.Y_n_chunks - 1 \ - and self.Y_n_samples_rem > 0: - Y_end = Y_start + self.Y_n_samples_rem + and self.Y_n_samples_remainder > 0: + Y_end = Y_start + self.Y_n_samples_remainder else: Y_end = Y_start + self.Y_n_samples_chunk @@ -603,11 +630,11 @@ cdef class PairwiseDistancesReduction: thread_num, ) # end: prange + # end: with nogil, parallel # Synchronizing the thread datastructures with the main ones # This can potentially block 
self._on_Y_after_parallel(num_threads, X_start, X_end) - # end: with nogil, parallel # end: for X_chunk_idx # Deallocating temporary datastructures @@ -631,26 +658,26 @@ cdef class PairwiseDistancesReduction: # Placeholder methods which can be implemented cdef void _on_X_parallel_init(self, - ITYPE_t thread_num, + ITYPE_t thread_num, ) nogil: return cdef void _on_X_parallel_finalize(self, - ITYPE_t thread_num + ITYPE_t thread_num ) nogil: return cdef void _on_X_prange_iter_init(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: return cdef void _on_X_prange_iter_finalize(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: return @@ -716,7 +743,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ): - # This factory comes to handle specialisation on fast_sqeuclidean. + # This factory comes to handle specialisations. + # TODO: take the size of X vs chunk_size into account for this choice. if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) return ArgKmin(distance_computer=DistanceComputer.get_for(X, Y, metric, metric_kwargs), @@ -724,9 +752,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): chunk_size=chunk_size) def __init__(self, - DistanceComputer distance_computer, - ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, + DistanceComputer distance_computer, + ITYPE_t k, + ITYPE_t chunk_size = CHUNK_SIZE, ): PairwiseDistancesReduction.__init__(self, distance_computer, chunk_size) @@ -739,8 +767,10 @@ cdef class ArgKmin(PairwiseDistancesReduction): # Allocating pointers to datastructures but not the datastructures themselves. # There's potentially more pointers than actual thread used for the # reduction but as many datastructures as threads. - self.heaps_approx_distances_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) - self.heaps_indices_chunks = malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread) + self.heaps_approx_distances_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread) + self.heaps_indices_chunks = malloc( + sizeof(ITYPE_t *) * self.effective_omp_n_thread) def __dealloc__(self): if self.heaps_indices_chunks is not NULL: @@ -778,9 +808,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): @final cdef void _on_X_prange_iter_init(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the @@ -790,9 +820,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): @final cdef void _on_X_prange_iter_finalize(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: cdef: ITYPE_t idx, jdx @@ -806,7 +836,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) cdef void _on_Y_init(self, - ITYPE_t num_threads, + ITYPE_t num_threads, ) nogil: cdef: # number of scalar elements @@ -818,12 +848,14 @@ cdef class ArgKmin(PairwiseDistancesReduction): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. 
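The thread-local heaps described in the comment above can be pictured with Python's heapq; this is only a hedged sketch of the merge idea, not the fixed-size Cython max-heaps used by the patch:

    import heapq

    def push(heap, dist, idx, k):
        # heapq is a min-heap, so store negated distances: the root is then
        # the current worst (largest) distance and gets evicted first.
        if len(heap) < k:
            heapq.heappush(heap, (-dist, idx))
        elif dist < -heap[0][0]:
            heapq.heapreplace(heap, (-dist, idx))

    # Merging thread-local heaps into the main one is just pushing every
    # (distance, index) candidate again.
    k, main = 2, []
    for thread_local in ([(0.5, 4), (0.9, 1)], [(0.2, 7), (0.8, 3)]):
        for dist, idx in thread_local:
            push(main, dist, idx, k)
    assert sorted((-d, i) for d, i in main) == [(0.2, 7), (0.5, 4)]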
- self.heaps_approx_distances_chunks[thread_num] = malloc(heaps_size * sizeof(DTYPE_t)) - self.heaps_indices_chunks[thread_num] = malloc(heaps_size * sizeof(ITYPE_t)) + self.heaps_approx_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(DTYPE_t)) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(ITYPE_t)) @final cdef void _on_Y_parallel_init(self, - ITYPE_t thread_num, + ITYPE_t thread_num, ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): @@ -832,9 +864,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): @final cdef void _on_Y_after_parallel(self, - ITYPE_t num_threads, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t num_threads, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: cdef: ITYPE_t idx, jdx, thread_num @@ -892,8 +924,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): @final def compute(self, - str strategy = "auto", - bint return_distance = False + str strategy = "auto", + bint return_distance = False ): """Computes the reduction of vectors (rows) of X on Y. @@ -944,6 +976,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): return np.asarray(self.argkmin_indices) + cdef class FastSquaredEuclideanArgKmin(ArgKmin): """Fast specialized alternative for ArgKmin on EuclideanDistance. @@ -975,15 +1008,16 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t chunk_size = CHUNK_SIZE, ): ArgKmin.__init__(self, - # The distance computer here is used for exact distances computations - distance_computer=DistanceComputer.get_for(X, Y, metric="euclidean"), - k=k, - chunk_size=chunk_size) + # The distance computer here is used for exact distances computations + distance_computer=DistanceComputer.get_for(X, Y, metric="euclidean"), + k=k, + chunk_size=chunk_size) self.X = X self.Y = Y self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) # Temporary datastructures used in threads - self.dist_middle_terms_chunks = malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread) + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread) def __dealloc__(self): if self.dist_middle_terms_chunks is not NULL: @@ -994,6 +1028,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t thread_num, ) nogil: ArgKmin._on_X_parallel_init(self, thread_num) + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) @@ -1156,9 +1191,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): chunk_size=chunk_size) def __init__(self, - DistanceComputer distance_computer, - DTYPE_t radius, - ITYPE_t chunk_size = CHUNK_SIZE, + DistanceComputer distance_computer, + DTYPE_t radius, + ITYPE_t chunk_size = CHUNK_SIZE, ): PairwiseDistancesReduction.__init__(self, distance_computer, chunk_size) @@ -1203,9 +1238,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): @final cdef void _on_X_prange_iter_init(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the @@ -1215,9 +1250,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): @final cdef void _on_X_prange_iter_finalize(self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: cdef: ITYPE_t idx, jdx @@ -1317,9 +1352,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): @final def compute(self, - 
str strategy = "auto", - bint return_distance = False, - bint sort_results = False + str strategy = "auto", + bint return_distance = False, + bint sort_results = False ): if sort_results and not return_distance: raise ValueError("return_distance must be True " From 0d5480cecfbc8ace726622ccc3f26df12f0e1b38 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 17:39:01 +0200 Subject: [PATCH 113/290] Use 'proxy' instead for 'approx' as a wording MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olivier Grisel Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_parallel_reductions.pyx | 52 ++++++++++++------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 08be48937fcd7..2ee6513e6663d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -190,7 +190,7 @@ cdef class DistanceComputer: def n_Y(self): raise RuntimeError() - cdef DTYPE_t approx_dist(self, + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j, ) nogil except -1: @@ -241,7 +241,7 @@ cdef class DenseDenseDistanceComputer(DistanceComputer): return self.Y.shape[0] @final - cdef DTYPE_t approx_dist(self, + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j, ) nogil except -1: @@ -303,7 +303,7 @@ cdef class SparseSparseDistanceComputer(DistanceComputer): self.Y_indptr = Y.indptr @final - cdef DTYPE_t approx_dist(self, + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j, ) nogil except -1: @@ -376,7 +376,7 @@ cdef class SparseDenseDistanceComputer(DistanceComputer): return self.Y.shape[0] @final - cdef DTYPE_t approx_dist(self, + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j, ) nogil except -1: @@ -440,12 +440,12 @@ cdef class DenseSparseDistanceComputer(DistanceComputer): return self.distance_computer.n_X @final - cdef DTYPE_t approx_dist(self, + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j, ) nogil except -1: # Swapping arguments on the same interface - return self.distance_computer.approx_dist(j, i) + return self.distance_computer.proxy_dist(j, i) @final cdef DTYPE_t dist(self, @@ -727,7 +727,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): DTYPE_t[:, ::1] argkmin_distances # Used as array of pointers to private datastructures used in threads. - DTYPE_t ** heaps_approx_distances_chunks + DTYPE_t ** heaps_proxy_distances_chunks ITYPE_t ** heaps_indices_chunks @classmethod @@ -767,7 +767,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): # Allocating pointers to datastructures but not the datastructures themselves. # There's potentially more pointers than actual thread used for the # reduction but as many datastructures as threads. 
- self.heaps_approx_distances_chunks = malloc( + self.heaps_proxy_distances_chunks = malloc( sizeof(DTYPE_t *) * self.effective_omp_n_thread) self.heaps_indices_chunks = malloc( sizeof(ITYPE_t *) * self.effective_omp_n_thread) @@ -776,8 +776,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) - if self.heaps_approx_distances_chunks is not NULL: - free(self.heaps_approx_distances_chunks) + if self.heaps_proxy_distances_chunks is not NULL: + free(self.heaps_proxy_distances_chunks) cdef int _reduce_on_chunks(self, ITYPE_t X_start, @@ -791,17 +791,17 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start ITYPE_t k = self.k - DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] + DTYPE_t *heaps_proxy_distances = self.heaps_proxy_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] # Pushing the distance and their associated indices on heaps # which keep tracks of the argkmin. for i in range(n_X): for j in range(n_Y): - _push(heaps_approx_distances + i * self.k, + _push(heaps_proxy_distances + i * self.k, heaps_indices + i * self.k, k, - self.distance_computer.approx_dist(X_start + i, Y_start + j), + self.distance_computer.proxy_dist(X_start + i, Y_start + j), Y_start + j) return 0 @@ -815,7 +815,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): # As this strategy is embarrassingly parallel, we can set the # thread heaps pointers to the proper position on the main heaps - self.heaps_approx_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_proxy_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] @final @@ -830,7 +830,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): # Sorting indices of the argkmin for each query vector of X for idx in range(X_end - X_start): _simultaneous_sort( - self.heaps_approx_distances_chunks[thread_num] + idx * self.k, + self.heaps_proxy_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k ) @@ -848,7 +848,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. 
- self.heaps_approx_distances_chunks[thread_num] = malloc( + self.heaps_proxy_distances_chunks[thread_num] = malloc( heaps_size * sizeof(DTYPE_t)) self.heaps_indices_chunks[thread_num] = malloc( heaps_size * sizeof(ITYPE_t)) @@ -859,7 +859,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF + self.heaps_proxy_distances_chunks[thread_num][idx] = FLOAT_INF self.heaps_indices_chunks[thread_num][idx] = -1 @final @@ -882,7 +882,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): &self.argkmin_distances[X_start + idx, 0], &self.argkmin_indices[X_start + idx, 0], self.k, - self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_proxy_distances_chunks[thread_num][idx * self.k + jdx], self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) @@ -895,7 +895,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_omp_n_thread): # Deallocating temporary datastructures for thread_num in prange(num_threads, schedule='static'): - free(self.heaps_approx_distances_chunks[thread_num]) + free(self.heaps_proxy_distances_chunks[thread_num]) free(self.heaps_indices_chunks[thread_num]) # Sort the main heaps into arrays in parallel @@ -913,7 +913,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t[:, ::1] Y_indices, # IN DTYPE_t[:, ::1] distances, # IN/OUT ) nogil: - """Convert approximate distances to pairwise distances in parallel.""" + """Convert proxy distances to pairwise distances in parallel.""" cdef: ITYPE_t i, j @@ -970,7 +970,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): if return_distance: # We need to recompute distances because we relied on - # approximate distances. + # proxy distances. self._exact_distances(self.argkmin_indices, self.argkmin_distances) return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) @@ -1082,7 +1082,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] ITYPE_t k = self.k DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] - DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num] + DTYPE_t *heaps_proxy_distances = self.heaps_proxy_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] # Instead of computing the full pairwise squared distances matrix, @@ -1127,10 +1127,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): # which keep tracks of the argkmin. for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - _push(heaps_approx_distances + i * k, + _push(heaps_proxy_distances + i * k, heaps_indices + i * k, k, - # approximate distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # proxy distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||² dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start], j + Y_start) return 0 @@ -1145,12 +1145,12 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): cdef: DTYPE_t radius - # Distances metrics compute approximated distance + # Distances metrics compute rank preserving distance # ("reduced distance" in the original wording), # which are proxies necessitating less computations. # We get the proxy for the radius to be able to compare - # NOTE: not used for now. + # TODO: use it? 
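The "proxy" wording refers to rank-preserving reduced distances (rdist). A hedged numpy illustration of why such proxies are safe for both ranking neighbors and radius tests: squaring is monotonic on non-negative values, so ranking by squared euclidean distance (no sqrt) selects the same neighbors, and d <= r holds iff d² <= r².

    import numpy as np

    rng = np.random.default_rng(0)
    x, Y, radius = rng.random(3), rng.random((10, 3)), 0.8

    sq_dists = ((Y - x) ** 2).sum(axis=1)
    assert np.array_equal(np.argsort(np.sqrt(sq_dists)), np.argsort(sq_dists))
    assert np.array_equal(np.sqrt(sq_dists) <= radius, sq_dists <= radius ** 2)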
DTYPE_t radius_proxy # We want resizable buffers which we will to wrapped within numpy From 305f0074677c7fcdfc3e5ccdbd00956ecac9b406 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 17:53:26 +0200 Subject: [PATCH 114/290] Follow PEP 257 Co-authored-by: Olivier Grisel --- sklearn/metrics/_parallel_reductions.pyx | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 2ee6513e6663d..22745daa78c64 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -74,8 +74,7 @@ cdef extern from "numpy/arrayobject.h": cdef class StdVectorSentinel: - """Wraps a reference to a vector which will be - deallocated with this object.""" + """Wraps a reference to a vector which will be deallocated with this object.""" pass @@ -100,7 +99,7 @@ cdef class StdVectorSentinelITYPE(StdVectorSentinel): cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr): - """ Create a numpy ndarray given a C++ vector. + """Create a numpy ndarray given a C++ vector. This registers a Sentinel as the base object for the numpy array freeing the C++ vector it encapsulates when it must. @@ -204,7 +203,7 @@ cdef class DistanceComputer: cdef class DenseDenseDistanceComputer(DistanceComputer): - """ Compute distances between vectors of two arrays. + """Compute distances between vectors of two arrays. X: ndarray of shape (n_X, d) Rows represent vectors @@ -260,7 +259,7 @@ cdef class DenseDenseDistanceComputer(DistanceComputer): cdef class SparseSparseDistanceComputer(DistanceComputer): - """ Compute distances between vectors of two sparse matrices. + """Compute distances between vectors of two sparse matrices. X: sparse matrix of shape (n_X, d) Rows represent vectors @@ -336,7 +335,7 @@ cdef class SparseSparseDistanceComputer(DistanceComputer): cdef class SparseDenseDistanceComputer(DistanceComputer): - """ Compute distances between vectors of two sparse matrices. + """Compute distances between vectors of two sparse matrices. X: sparse matrix of shape (n_X, d) Rows represent vectors @@ -410,8 +409,7 @@ cdef class SparseDenseDistanceComputer(DistanceComputer): cdef class DenseSparseDistanceComputer(DistanceComputer): - """ Compute distances between vectors of a sparse matrix and - vectors of an array. + """Compute distances between vectors of a sparse matrix and vectors of an array. 
X: ndarray of shape (n_X, d) Rows represent vectors @@ -651,7 +649,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t Y_end, ITYPE_t thread_num, ) nogil except -1: - """ Abstract method: Sub-classes implemented the reduction + """Abstract method: Sub-classes implemented the reduction on a pair of chunks""" return -1 From ba8953215ea5b16133838e6c1b5a15b9ae61025f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 18:05:46 +0200 Subject: [PATCH 115/290] Fix RadiusNeighborhood.__dealloc__ --- sklearn/metrics/_parallel_reductions.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 22745daa78c64..d8de80b1ceef5 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -1206,7 +1206,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.neigh_indices_chunks = malloc( sizeof(self.neigh_indices) * self.effective_omp_n_thread) - def __dealloc(self): + def __dealloc__(self): if self.neigh_distances_chunks is not NULL: free(self.neigh_distances_chunks) From 3b1e98c8289d5a35ea58d95b2337a8d06950e184 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 18:06:33 +0200 Subject: [PATCH 116/290] Rename DistanceComputer for DatasetPairs And adapt comments. --- sklearn/metrics/_parallel_reductions.pyx | 96 ++++++++++++++---------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index d8de80b1ceef5..b5ad9eccf168c 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -139,11 +139,13 @@ cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays( ##################### -cdef class DistanceComputer: - """Abstract class to compute distances between vectors. +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). - Compute distances for one pair of vectors at a time. - Vectors can be stored as rows of np.ndarrays or CSR matrices. + X and Y can be stored as rows of np.ndarrays or CSR matrices in subclasses. + + This class allows compute distances via :class:`sklearn.metrics.DistanceMetric` + for one pair of vectors at a time given the pair of their indices (i, j). This class avoids the overhead of dispatching distance computations based on the physical representation of the vectors (sparse vs. 
dense) @@ -159,7 +161,7 @@ cdef class DistanceComputer: Y, str metric="euclidean", dict metric_kwargs=dict(), - ) -> DistanceComputer: + ) -> DatasetsPair: cdef: DistanceMetric distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs) @@ -174,12 +176,12 @@ cdef class DistanceComputer: distance_metric._validate_data(Y) if not issparse(X) and not issparse(Y): - return DenseDenseDistanceComputer(X, Y, distance_metric) + return DenseDenseDatasetsPair(X, Y, distance_metric) if issparse(X) and not issparse(Y): - return SparseDenseDistanceComputer(X, Y, distance_metric) + return SparseDenseDatasetsPair(X, Y, distance_metric) if not issparse(X) and issparse(Y): - return DenseSparseDistanceComputer(X, Y, distance_metric) - return SparseSparseDistanceComputer(X, Y, distance_metric) + return DenseSparseDatasetsPair(X, Y, distance_metric) + return SparseSparseDatasetsPair(X, Y, distance_metric) @property def n_X(self): @@ -202,7 +204,7 @@ cdef class DistanceComputer: return -1 -cdef class DenseDenseDistanceComputer(DistanceComputer): +cdef class DenseDenseDatasetsPair(DatasetsPair): """Compute distances between vectors of two arrays. X: ndarray of shape (n_X, d) @@ -258,7 +260,7 @@ cdef class DenseDenseDistanceComputer(DistanceComputer): self.d) -cdef class SparseSparseDistanceComputer(DistanceComputer): +cdef class SparseSparseDatasetsPair(DatasetsPair): """Compute distances between vectors of two sparse matrices. X: sparse matrix of shape (n_X, d) @@ -334,7 +336,7 @@ cdef class SparseSparseDistanceComputer(DistanceComputer): self.Y_indices[yj_start:yj_end]) -cdef class SparseDenseDistanceComputer(DistanceComputer): +cdef class SparseDenseDatasetsPair(DatasetsPair): """Compute distances between vectors of two sparse matrices. X: sparse matrix of shape (n_X, d) @@ -384,7 +386,7 @@ cdef class SparseDenseDistanceComputer(DistanceComputer): ITYPE_t xi_end = self.X_indptr[i + 1] # TODO: the 2D to 1D memory-view conversion might make computation slower, see: - # + # https://github.com/scikit-learn/scikit-learn/issues/17299 # Ideally, we could pass pointers and indices and access elements # then in distance_metric.dist return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end], @@ -408,7 +410,7 @@ cdef class SparseDenseDistanceComputer(DistanceComputer): self.Y_indices) -cdef class DenseSparseDistanceComputer(DistanceComputer): +cdef class DenseSparseDatasetsPair(DatasetsPair): """Compute distances between vectors of a sparse matrix and vectors of an array. 
X: ndarray of shape (n_X, d) @@ -419,23 +421,23 @@ cdef class DenseSparseDistanceComputer(DistanceComputer): cdef: # As distance metrics are commutative functions, we can # simply rely on the other strategy and swap arguments - DistanceComputer distance_computer + DatasetsPair datasets_pair def __init__(self, X, Y, distance_metric): # Swapping arguments on the constructor - self.distance_computer = SparseDenseDistanceComputer(Y, X, distance_metric) + self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) @property @final def n_X(self): # Swapping interface - return self.distance_computer.n_Y + return self.datasets_pair.n_Y @property @final def n_Y(self): # Swapping interface - return self.distance_computer.n_X + return self.datasets_pair.n_X @final cdef DTYPE_t proxy_dist(self, @@ -443,7 +445,7 @@ cdef class DenseSparseDistanceComputer(DistanceComputer): ITYPE_t j, ) nogil except -1: # Swapping arguments on the same interface - return self.distance_computer.proxy_dist(j, i) + return self.datasets_pair.proxy_dist(j, i) @final cdef DTYPE_t dist(self, @@ -451,7 +453,7 @@ cdef class DenseSparseDistanceComputer(DistanceComputer): ITYPE_t j, ) nogil except -1: # Swapping arguments on the same interface - return self.distance_computer.dist(j, i) + return self.datasets_pair.dist(j, i) cdef class PairwiseDistancesReduction: @@ -470,7 +472,7 @@ cdef class PairwiseDistancesReduction: """ cdef: - DistanceComputer distance_computer + DatasetsPair datasets_pair ITYPE_t effective_omp_n_thread ITYPE_t n_samples_chunk, chunk_size @@ -493,7 +495,7 @@ cdef class PairwiseDistancesReduction: def __init__(self, - DistanceComputer distance_computer, + DatasetsPair datasets_pair, ITYPE_t chunk_size = CHUNK_SIZE, ): cdef: @@ -504,14 +506,14 @@ cdef class PairwiseDistancesReduction: self.chunk_size = chunk_size self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - self.distance_computer = distance_computer + self.datasets_pair = datasets_pair - self.n_Y = distance_computer.n_Y + self.n_Y = datasets_pair.n_Y self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk self.Y_n_samples_remainder = self.n_Y % self.Y_n_samples_chunk - self.n_X = distance_computer.n_X + self.n_X = datasets_pair.n_X self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) X_n_full_chunks = self.n_X // self.X_n_samples_chunk self.X_n_samples_remainder = self.n_X % self.X_n_samples_chunk @@ -710,8 +712,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): Parameters ---------- - distance_metric: DistanceMetric - The distance to use + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction k: int The k for the argkmin reduction chunk_size: int @@ -745,16 +747,16 @@ cdef class ArgKmin(PairwiseDistancesReduction): # TODO: take the size of X vs chunk_size into account for this choice. 
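Before the factory dispatch continues below, here is a hedged plain-Python sketch of the chunking arithmetic (X_n_samples_remainder and friends) that these reductions rely on; the helper name is hypothetical:

    def chunk_bounds(n, chunk):
        # n samples are processed in fixed-size chunks; the last chunk
        # carries the remainder when chunk does not divide n.
        n_full, remainder = divmod(n, chunk)
        n_chunks = n_full + (remainder > 0)
        for c in range(n_chunks):
            start = c * chunk
            end = start + (remainder if (c == n_chunks - 1 and remainder) else chunk)
            yield start, end

    assert list(chunk_bounds(10, 4)) == [(0, 4), (4, 8), (8, 10)]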
if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) - return ArgKmin(distance_computer=DistanceComputer.get_for(X, Y, metric, metric_kwargs), + return ArgKmin(datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), k=k, chunk_size=chunk_size) def __init__(self, - DistanceComputer distance_computer, + DatasetsPair datasets_pair, ITYPE_t k, ITYPE_t chunk_size = CHUNK_SIZE, ): - PairwiseDistancesReduction.__init__(self, distance_computer, chunk_size) + PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) self.k = k @@ -799,7 +801,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): _push(heaps_proxy_distances + i * self.k, heaps_indices + i * self.k, k, - self.distance_computer.proxy_dist(X_start + i, Y_start + j), + self.datasets_pair.proxy_dist(X_start + i, Y_start + j), Y_start + j) return 0 @@ -918,7 +920,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): for i in prange(self.n_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): for j in range(self.k): - distances[i, j] = self.distance_computer.dist(i, Y_indices[i, j]) + distances[i, j] = self.datasets_pair.dist(i, Y_indices[i, j]) @final def compute(self, @@ -1006,8 +1008,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t chunk_size = CHUNK_SIZE, ): ArgKmin.__init__(self, - # The distance computer here is used for exact distances computations - distance_computer=DistanceComputer.get_for(X, Y, metric="euclidean"), + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), k=k, chunk_size=chunk_size) self.X = X @@ -1135,15 +1137,27 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): cdef class RadiusNeighborhood(PairwiseDistancesReduction): - """Returns the indices of neighbors of a first set - of vectors (rows of X) present in another set of vectors + """Returns indices in a vector-set Y of radius-based neighbors of vector-set X. + + The neighbors of a first set of vectors X present in + the second in present another. + + present in another set of vectors (rows of Y) for a given a radius and distance. + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction + radius: int + The radius defining the neighborhood + chunk_size: int + The number of vectors per chunk """ cdef: DTYPE_t radius - # Distances metrics compute rank preserving distance + # DistanceMetric compute rank preserving distance via rdist # ("reduced distance" in the original wording), # which are proxies necessitating less computations. 
# We get the proxy for the radius to be able to compare @@ -1184,16 +1198,16 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): dict metric_kwargs=dict(), ): return RadiusNeighborhood( - distance_computer=DistanceComputer.get_for(X, Y, metric, metric_kwargs), + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size) def __init__(self, - DistanceComputer distance_computer, + DatasetsPair datasets_pair, DTYPE_t radius, ITYPE_t chunk_size = CHUNK_SIZE, ): - PairwiseDistancesReduction.__init__(self, distance_computer, chunk_size) + PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) self.radius = radius self.sort_results = False @@ -1227,7 +1241,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): for i in range(X_start, X_end): for j in range(Y_start, Y_end): - dist_i_j = self.distance_computer.dist(i, j) + dist_i_j = self.datasets_pair.dist(i, j) if dist_i_j <= self.radius: deref(self.neigh_distances_chunks[thread_num])[i].push_back(dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) From 7ae5091435b62e110057bebc60b21c5cf2684584 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 18:16:14 +0200 Subject: [PATCH 117/290] Minimalistically document template methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olivier Grisel Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_parallel_reductions.pyx | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index b5ad9eccf168c..fd14f43d9b78d 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -651,8 +651,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t Y_end, ITYPE_t thread_num, ) nogil except -1: - """Abstract method: Sub-classes implemented the reduction - on a pair of chunks""" + """Implemented the reduction on a pair of chunks.""" return -1 # Placeholder methods which can be implemented @@ -660,11 +659,7 @@ cdef class PairwiseDistancesReduction: cdef void _on_X_parallel_init(self, ITYPE_t thread_num, ) nogil: - return - - cdef void _on_X_parallel_finalize(self, - ITYPE_t thread_num - ) nogil: + """Allocate datastructures used in a thread given its number.""" return cdef void _on_X_prange_iter_init(self, @@ -672,6 +667,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t X_start, ITYPE_t X_end, ) nogil: + """Initialise datastructures used in a thread given its number.""" return cdef void _on_X_prange_iter_finalize(self, @@ -679,16 +675,25 @@ cdef class PairwiseDistancesReduction: ITYPE_t X_start, ITYPE_t X_end, ) nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _on_X_parallel_finalize(self, + ITYPE_t thread_num + ) nogil: + """Interact with datastructures after executing all the reductions.""" return cdef void _on_Y_init(self, ITYPE_t num_threads, ) nogil: + """Allocate datastructures used in threads.""" return cdef void _on_Y_parallel_init(self, ITYPE_t thread_num, ) nogil: + """Initialise datastructures used in a thread given its number.""" return cdef void _on_Y_after_parallel(self, @@ -696,11 +701,13 @@ cdef class PairwiseDistancesReduction: ITYPE_t X_start, ITYPE_t X_end, ) nogil: + """Interact with datastructures after a threads parallel region.""" return cdef void _on_Y_finalize(self, ITYPE_t num_threads, ) nogil: + """Interact with 
datastructures after executing all the reductions.""" return cdef class ArgKmin(PairwiseDistancesReduction): From 2f9dc025e79c71652004aee038919dd3bdc4cba8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 18:29:59 +0200 Subject: [PATCH 118/290] Reallocate datastructures for results at each new call --- sklearn/metrics/_parallel_reductions.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index fd14f43d9b78d..18d984b28043f 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -751,7 +751,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): dict metric_kwargs=dict(), ): # This factory comes to handle specialisations. - # TODO: take the size of X vs chunk_size into account for this choice. if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) return ArgKmin(datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), @@ -767,10 +766,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.k = k - # Results returned by ArgKmin.compute used as the main heaps - self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) - self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) - # Allocating pointers to datastructures but not the datastructures themselves. # There's potentially more pointers than actual thread used for the # reduction but as many datastructures as threads. @@ -960,6 +955,11 @@ cdef class ArgKmin(PairwiseDistancesReduction): indices: ndarray of shape (n, k) Indices of each X vector argkmin in Y. """ + + # Results returned by ArgKmin.compute used as the main heaps + self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) + if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. From 4f3bd4cb22ac63a25811a56a65e037c6e6960176 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 27 Jul 2021 18:34:19 +0200 Subject: [PATCH 119/290] Avoid thread over-subscription for BLAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_parallel_reductions.pyx | 31 +++++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 18d984b28043f..6c21890892a56 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -38,6 +38,7 @@ from ..utils._typedefs cimport ITYPECODE, DTYPECODE from scipy.sparse import issparse +from threadpoolctl import threadpool_limits from ._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._openmp_helpers import _openmp_effective_n_threads @@ -968,12 +969,15 @@ cdef class ArgKmin(PairwiseDistancesReduction): else: strategy = 'parallel_on_Y' - if strategy == 'parallel_on_Y': - self._parallel_on_Y() - elif strategy == 'parallel_on_X': - self._parallel_on_X() - else: - raise RuntimeError(f"strategy '{strategy}' not supported.") + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
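The guard added on the next lines uses threadpoolctl. A hedged, standalone usage sketch of the same API: when the outer loop is already parallelized (here with OpenMP via prange), capping BLAS at one thread per call prevents nested over-subscription in the underlying GEMM.

    import numpy as np
    from threadpoolctl import threadpool_limits

    X, Y = np.random.rand(1000, 50), np.random.rand(1000, 50)
    with threadpool_limits(limits=1, user_api="blas"):
        # Single-threaded BLAS call computing the -2 X.Y^T middle term.
        middle_term = -2.0 * (X @ Y.T)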
+ with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: # We need to recompute distances because we relied on @@ -1394,12 +1398,15 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): else: strategy = 'parallel_on_Y' - if strategy == 'parallel_on_Y': - self._parallel_on_Y() - elif strategy == 'parallel_on_X': - self._parallel_on_X() - else: - raise RuntimeError(f"strategy '{strategy}' not supported.") + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: res = ( From 9d6b83baf3bd3a6fd58515b2c59e52f95b3f9fde Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 28 Jul 2021 08:43:04 +0200 Subject: [PATCH 120/290] Introduce FastSquaredEuclideanRadiusNeighborhood --- sklearn/metrics/_parallel_reductions.pyx | 203 ++++++++++++++++++++-- sklearn/neighbors/tests/test_neighbors.py | 34 +++- 2 files changed, 217 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 6c21890892a56..ecbebb5412941 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -657,6 +657,10 @@ cdef class PairwiseDistancesReduction: # Placeholder methods which can be implemented + cdef void compute_exact_distances(self) nogil: + """Convert proxy distances to exact distances or recompute them.""" + return + cdef void _on_X_parallel_init(self, ITYPE_t thread_num, ) nogil: @@ -911,14 +915,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) return - @final - cdef void _exact_distances(self, - ITYPE_t[:, ::1] Y_indices, # IN - DTYPE_t[:, ::1] distances, # IN/OUT - ) nogil: - """Convert proxy distances to pairwise distances in parallel.""" + # TODO: annotating with 'final' here makes the compilation fails but it should not + # @final + cdef void compute_exact_distances(self) nogil: cdef: ITYPE_t i, j + ITYPE_t[:, ::1] Y_indices = self.argkmin_indices + DTYPE_t[:, ::1] distances = self.argkmin_distances for i in prange(self.n_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): @@ -980,9 +983,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: - # We need to recompute distances because we relied on - # proxy distances. - self._exact_distances(self.argkmin_indices, self.argkmin_distances) + # We eventually need to recompute distances because we relied on proxy distances. 
+ self.compute_exact_distances() return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) return np.asarray(self.argkmin_indices) @@ -1198,16 +1200,24 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint sort_results + @classmethod + def valid_metrics(cls): + return {"fast_sqeuclidean", *PairwiseDistancesReduction.valid_metrics()} @classmethod def get_for(cls, X, Y, DTYPE_t radius, - str metric="euclidean", + str metric="fast_sqeuclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ): + # This factory comes to handle specialisations. + if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): + return FastSquaredEuclideanRadiusNeighborhood(X=X, Y=Y, + radius=radius, + chunk_size=chunk_size) return RadiusNeighborhood( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), radius=radius, @@ -1238,7 +1248,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): if self.neigh_indices_chunks is not NULL: free(self.neigh_indices_chunks) - @final cdef int _reduce_on_chunks(self, ITYPE_t X_start, ITYPE_t X_end, @@ -1354,7 +1363,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): return - @final cdef void _on_Y_finalize(self, ITYPE_t num_threads, ) nogil: @@ -1409,6 +1417,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: + self.compute_exact_distances() res = ( _coerce_vectors_to_np_nd_arrays(self.neigh_distances), _coerce_vectors_to_np_nd_arrays(self.neigh_indices), @@ -1420,3 +1429,173 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): del self.neigh_indices return res + + +cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): + """Fast specialized alternative for RadiusNeighborhood on EuclideanDistance. + + Notes + ----- + This implementation has an superior arithmetic intensity + and hence running time, but it can suffer from numerical + instability. RadiusNeighborhood with EuclideanDistance + must be used when exact precision is needed. 
+ """ + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + DTYPE_t[::1] X_sq_norms + DTYPE_t[::1] Y_sq_norms + + DTYPE_t squared_radius + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + + def __init__(self, + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, + DTYPE_t radius, + ITYPE_t chunk_size = CHUNK_SIZE, + ): + RadiusNeighborhood.__init__(self, + # The distance computer here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size) + self.X = X + self.Y = Y + self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) + self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + self.squared_radius = self.radius ** 2 + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread) + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @final + cdef void _on_X_parallel_init(self, + ITYPE_t thread_num, + ) nogil: + RadiusNeighborhood._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + + @final + cdef void _on_X_parallel_finalize(self, + ITYPE_t thread_num + ) nogil: + RadiusNeighborhood._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init(self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + RadiusNeighborhood._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + + @final + cdef void _on_Y_finalize(self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + RadiusNeighborhood._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + + @final + cdef void compute_exact_distances(self) nogil: + """Convert proxy distances to pairwise distances in parallel.""" + cdef: + ITYPE_t i, j + + for i in prange(self.n_X, nogil=True, num_threads=self.effective_omp_n_thread): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self.datasets_pair.dist(i, deref(self.neigh_indices)[i][j]) + ) + + + @final + cdef int _reduce_on_chunks(self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil except -1: + """ + Critical part of the computation of pairwise distances. + + "Fast Squared Euclidean" distances strategy relying + on the gemm-trick. + """ + cdef: + ITYPE_t i, j + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + # + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # + # The middle term gets computed efficiently bellow using GEMM from BLAS Level 3. 
+ # + # Careful: LDA, LDB and LDC are given for F-ordered arrays in BLAS documentations, + # for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # TODO: necessarily casting because APIs exposed + # via scipy.linalg.cython_blas aren't reflecting + # the const-identifier for arguments + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + DTYPE_t squared_dist_i_j + + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + squared_dist_i_j = (self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start]) + if squared_dist_i_j <= self.squared_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) + + return 0 diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 59abf01a6c6a1..7d588add7e3d6 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1855,12 +1855,15 @@ def test_pairwise_deprecated(NearestNeighbors): @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5]) -@pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) -def test_fast_sqeuclidean_correctness( +@pytest.mark.parametrize( + "n_neighbors, radius", [(1, 0), (10, 1), (100, 10), (1000, 100)] +) +def test_kneighbors_fast_sqeuclidean_correctness( n_samples, n_features, ratio_train_test, n_neighbors, + radius, dtype=np.float64, ): # The fast squared euclidean strategy must return results @@ -1885,16 +1888,17 @@ def test_fast_sqeuclidean_correctness( * spread ) - neigh = NearestNeighbors( - n_neighbors=n_neighbors, algorithm="brute", metric="euclidean" - ).fit(X_train) + neigh = NearestNeighbors(algorithm="brute", metric="euclidean").fit(X_train) + + fse_neigh = NearestNeighbors(algorithm="brute", metric="fast_sqeuclidean").fit( + X_train + ) + + # Results for KNN should be consistent eucl_dist, eucl_nn = neigh.kneighbors( X=X_test, n_neighbors=n_neighbors, return_distance=True ) - fse_neigh = NearestNeighbors( - n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean" - ).fit(X_train) fse_dist, fse_nn = fse_neigh.kneighbors( X=X_test, n_neighbors=n_neighbors, return_distance=True ) @@ -1902,6 +1906,20 @@ def test_fast_sqeuclidean_correctness( assert_allclose(eucl_dist, fse_dist) assert_array_equal(eucl_nn, fse_nn) + # Results for KNN should be consistent + eucl_dist, eucl_nn = neigh.radius_neighbors( + X=X_test, radius=radius, return_distance=True, sort_results=True + ) + + fse_dist, fse_nn = fse_neigh.radius_neighbors( + X=X_test, radius=radius, return_distance=True, 
sort_results=True
+ )
+
+ # We get arrays of arrays and we need to check for individual pairs
+ for i in range(eucl_dist.shape[0]):
+ assert_allclose(eucl_dist[i], fse_dist[i])
+ assert_array_equal(eucl_nn[i], fse_nn[i])
+
 
 @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]])
 @pytest.mark.parametrize("n_features", [5, 10, 100, 500])
From a319358955975cf514914907b529850e3883dbb5 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 28 Jul 2021 12:10:38 +0200
Subject: [PATCH 121/290] Adapt internal checks

---
 sklearn/metrics/_parallel_reductions.pyx | 62 +++++++++++++-----------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index ecbebb5412941..8098b3b6f190b 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -7,6 +7,7 @@
 # cython: initializedcheck=False
 # cython: binding=False
 # distutils: define_macros=CYTHON_TRACE_NOGIL=0
+import numbers
 import numpy as np

 cimport numpy as np
@@ -37,10 +38,10 @@
 from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t
 from ..utils._typedefs cimport ITYPECODE, DTYPECODE

-from scipy.sparse import issparse
+from scipy.sparse import issparse, spmatrix
 from threadpoolctl import threadpool_limits
 from ._dist_metrics import METRIC_MAPPING
-from ..utils import check_array
+from ..utils import check_array, check_scalar
 from ..utils._openmp_helpers import _openmp_effective_n_threads
 from ..utils._typedefs import ITYPE, DTYPE

@@ -166,6 +167,8 @@ cdef class DatasetsPair:
 cdef:
 DistanceMetric distance_metric = DistanceMetric.get_metric(metric,
 **metric_kwargs)
+ X = check_array(X, accept_sparse='csr')
+ Y = check_array(Y, accept_sparse='csr')

 if X.shape[1] != Y.shape[1]:
 raise RuntimeError("Vectors of X and Y must have the "
 "same dimension but currently are "
 f"respectively {X.shape[1]}-dimensional "
 f"and {Y.shape[1]}-dimensional.")
@@ -184,6 +187,13 @@ cdef class DatasetsPair:
 return DenseSparseDatasetsPair(X, Y, distance_metric)
 return SparseSparseDatasetsPair(X, Y, distance_metric)

+ @classmethod
+ def unpack_csr_matrix(cls, X: spmatrix):
+ X_data = check_array(X.data, dtype=DTYPE, ensure_2d=False)
+ X_indices = check_array(X.indices, dtype=ITYPE, ensure_2d=False)
+ X_indptr = check_array(X.indptr, dtype=ITYPE, ensure_2d=False)
+ return X_data, X_indices, X_indptr
+
 @property
 def n_X(self):
 raise RuntimeError()
@@ -296,13 +306,8 @@ cdef class SparseSparseDatasetsPair(DatasetsPair):
 X = check_array(X, dtype=DTYPE, accept_sparse='csr')
 Y = check_array(Y, dtype=DTYPE, accept_sparse='csr')

- self.X_data = X.data
- self.X_indices = X.indices
- self.X_indptr = X.indptr
-
- self.Y_data = Y.data
- self.Y_indices = Y.indices
- self.Y_indptr = Y.indptr
+ self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+ self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)

 @final
 cdef DTYPE_t proxy_dist(self,
@@ -360,9 +365,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair):
 self.distance_metric = distance_metric

 X = check_array(X, dtype=DTYPE, accept_sparse='csr')
- self.X_data = X.data
- self.X_indices = X.indices
- self.X_indptr = X.indptr
+ self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)

 self.Y = check_array(Y, dtype=DTYPE)
 self.Y_indices = np.arange(self.Y.shape[1])
@@ -466,14 +469,14 @@ cdef class PairwiseDistancesReduction:
 on chunks whose size can be set using ``chunk_size``.
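 Chunking the computation keeps intermediate results small enough to
 benefit from CPU caches.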
 Parameters
 ----------
- distance_metric: DistanceMetric
- The distance to use
+ datasets_pair: DatasetsPair
+ The pair of datasets to use
 chunk_size: int
 The number of vectors per chunk
 """
 cdef:
- DatasetsPair datasets_pair
+ readonly DatasetsPair datasets_pair
 ITYPE_t effective_omp_n_thread
 ITYPE_t n_samples_chunk, chunk_size
@@ -504,6 +507,7 @@ cdef class PairwiseDistancesReduction:

 self.effective_omp_n_thread = _openmp_effective_n_threads()

+ check_scalar(chunk_size, "chunk_size", numbers.Integral, min_val=1)
 self.chunk_size = chunk_size
 self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)

@@ -756,7 +760,7 @@ cdef class ArgKmin(PairwiseDistancesReduction):
 dict metric_kwargs=dict(),
 ):
 # This factory handles specialisations.
- if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y):
+ if metric == "fast_sqeuclidean":
 return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size)
 return ArgKmin(datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
 k=k,
@@ -769,11 +773,14 @@ cdef class ArgKmin(PairwiseDistancesReduction):
 ):
 PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size)

+ check_scalar(k, "k", numbers.Integral, min_val=1)
 self.k = k

 # Allocating pointers to datastructures but not the datastructures themselves.
- # There's potentially more pointers than actual thread used for the
- # reduction but as many datastructures as threads.
+ # There are as many pointers as available threads.
+ # When reducing on small datasets, there can be more pointers than threads
+ # actually used for the reduction, but no unused datastructures
+ # are allocated.
 self.heaps_proxy_distances_chunks = malloc(
 sizeof(DTYPE_t *) * self.effective_omp_n_thread)
 self.heaps_indices_chunks = malloc(
@@ -1015,8 +1022,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
 DTYPE_t ** dist_middle_terms_chunks

 def __init__(self,
- const DTYPE_t[:, ::1] X,
- const DTYPE_t[:, ::1] Y,
+ X,
+ Y,
 ITYPE_t k,
 ITYPE_t chunk_size = CHUNK_SIZE,
 ):
@@ -1025,8 +1032,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
 datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),
 k=k,
 chunk_size=chunk_size)
- self.X = X
- self.Y = Y
+ self.X = check_array(X, dtype=DTYPE)
+ self.Y = check_array(Y, dtype=DTYPE)
 self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y)
 # Temporary datastructures used in threads
 self.dist_middle_terms_chunks = malloc(
@@ -1214,7 +1221,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 dict metric_kwargs=dict(),
 ):
 # This factory handles specialisations.
- if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): + if metric == "fast_sqeuclidean": return FastSquaredEuclideanRadiusNeighborhood(X=X, Y=Y, radius=radius, chunk_size=chunk_size) @@ -1230,6 +1237,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ): PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) + check_scalar(radius, "radius", numbers.Real, min_val=0) self.radius = radius self.sort_results = False @@ -1454,8 +1462,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): DTYPE_t ** dist_middle_terms_chunks def __init__(self, - const DTYPE_t[:, ::1] X, - const DTYPE_t[:, ::1] Y, + X, + Y, DTYPE_t radius, ITYPE_t chunk_size = CHUNK_SIZE, ): @@ -1464,8 +1472,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), radius=radius, chunk_size=chunk_size) - self.X = X - self.Y = Y + self.X = check_array(X, dtype=DTYPE) + self.Y = check_array(Y, dtype=DTYPE) self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) self.squared_radius = self.radius ** 2 From 3ebb200d88ae01f2c5ae5400c73c2be0ca3c8367 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 28 Jul 2021 12:12:16 +0200 Subject: [PATCH 122/290] Add test suite for PairwiseDistancesReduction --- .../metrics/tests/test_pairwise_reduction.py | 361 ++++++++++++++++++ sklearn/neighbors/tests/test_neighbors.py | 116 ------ 2 files changed, 361 insertions(+), 116 deletions(-) create mode 100644 sklearn/metrics/tests/test_pairwise_reduction.py diff --git a/sklearn/metrics/tests/test_pairwise_reduction.py b/sklearn/metrics/tests/test_pairwise_reduction.py new file mode 100644 index 0000000000000..956a88e368ad1 --- /dev/null +++ b/sklearn/metrics/tests/test_pairwise_reduction.py @@ -0,0 +1,361 @@ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_array_equal, assert_allclose +from scipy.sparse import ( + lil_matrix, + csc_matrix, + dok_matrix, + dia_matrix, + coo_matrix, + bsr_matrix, +) + +from sklearn.metrics._parallel_reductions import ( + ArgKmin, + RadiusNeighborhood, + FastSquaredEuclideanArgKmin, + SparseSparseDatasetsPair, + DenseDenseDatasetsPair, + DenseSparseDatasetsPair, + SparseDenseDatasetsPair, + FastSquaredEuclideanRadiusNeighborhood, +) + + +def assert_radius_neighborhood_equality(ref_dist, dist, ref_indices, indices): + # We get arrays of arrays and we need to check for individual pairs + for i in range(ref_dist.shape[0]): + assert_array_equal( + ref_dist[i], + dist[i], + err_msg=f"Query vector #{i} has different neighbors' distances", + ) + assert_array_equal( + ref_indices[i], + indices[i], + err_msg=f"Query vector #{i} has different neighbors' indices", + ) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + with pytest.raises(ValueError, match="`k`= -1, must be >= 1."): + ArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="`k`= 0, must be >= 1."): + ArgKmin.get_for(X=X, Y=Y, k=0.1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + ArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): + ArgKmin.get_for(X=[1, 2], Y=Y, k=k, metric=metric) + + with pytest.raises(ValueError, match="Expected 2D array, got 1D array 
instead"): + ArgKmin.get_for(X=X, Y=[1, 2], k=k, metric=metric) + + +def test_radius_neighborhood_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "euclidean" + + with pytest.raises(ValueError, match="`radius`= -1.0, must be >= 0."): + RadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + RadiusNeighborhood.get_for(X=X, Y=Y, radius=radius, metric="wrong metric") + + with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): + RadiusNeighborhood.get_for(X=[1, 2], Y=Y, radius=radius, metric=metric) + + with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): + RadiusNeighborhood.get_for(X=X, Y=[1, 2], radius=radius, metric=metric) + + +@pytest.mark.parametrize( + "PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction", + [ + (ArgKmin, FastSquaredEuclideanArgKmin), + (RadiusNeighborhood, FastSquaredEuclideanRadiusNeighborhood), + ], +) +def test_paiwise_disances_reduction_factory_method( + PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction +): + # Test all the combinations of DatasetsPair for creation + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + + # Dummy value for k or radius + dummy_arg = 5 + + dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric) + assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair) + + sparse_matrix_constructors = [ + lil_matrix, + csc_matrix, + bsr_matrix, + coo_matrix, + dia_matrix, + dok_matrix, + ] + + for c_X, c_Y in itertools.combinations_with_replacement( + sparse_matrix_constructors, r=2 + ): + sparse_sparse_instance = PairwiseDistancesReduction.get_for( + c_X(X), c_Y(Y), dummy_arg, metric + ) + assert isinstance( + sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair + ) + + for constructor in sparse_matrix_constructors: + dense_sparse_instance = PairwiseDistancesReduction.get_for( + X, constructor(Y), dummy_arg, metric=metric + ) + assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair) + + sparse_dense_instance = PairwiseDistancesReduction.get_for( + constructor(X), Y, dummy_arg, metric=metric + ) + assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair) + + # Test specialisations creation + fast_sqeuclidean_instance = PairwiseDistancesReduction.get_for( + X, Y, dummy_arg, metric="fast_sqeuclidean" + ) + assert isinstance(fast_sqeuclidean_instance, PairwiseDistancesReduction) + assert isinstance(fast_sqeuclidean_instance, FastSquaredPairwiseDistancesReduction) + + +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("k", [1, 10, 100]) +@pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) +@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +def test_argkmin_chunk_size_agnosticism( + n_samples, k, chunk_size, metric, n_features=100, dtype=np.float64 +): + # ArgKmin results should not depend on the chunk size + rng = np.random.RandomState(1) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + ref_dist, ref_indices = ArgKmin.get_for(X, Y, k=k, metric="euclidean").compute( + return_distance=True + ) + + dist, indices = ArgKmin.get_for( + X, Y, k=k, metric=metric, chunk_size=chunk_size + 
).compute(return_distance=True) + + assert_array_equal(ref_dist, dist) + assert_array_equal(ref_indices, indices) + + +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("radius", [1, 10, 100]) +@pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) +@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +def test_radius_neighborhood_chunk_size_agnosticism( + n_samples, radius, chunk_size, metric, n_features=100, dtype=np.float64 +): + # RadiusNeighborhood results should not depend on the chunk size + rng = np.random.RandomState(1) + spread = 100 + + # Scaling the radius with the dimensions + scaled_radius = radius * np.log(n_features) + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + ref_dist, ref_indices = RadiusNeighborhood.get_for( + X, Y, radius=scaled_radius, metric="euclidean" + ).compute(return_distance=True) + + dist, indices = RadiusNeighborhood.get_for( + X, Y, radius=scaled_radius, metric=metric, chunk_size=chunk_size + ).compute(return_distance=True) + + assert_radius_neighborhood_equality(ref_dist, dist, ref_indices, indices) + + +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 100, 500]) +@pytest.mark.parametrize("k", [1, 10, 100]) +@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +def test_argkmin_strategies_consistency( + n_samples, + n_features, + k, + metric, + dtype=np.float64, +): + # ArgKmin results obtained using both parallelization strategies + # must be identical + + rng = np.random.RandomState(1) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + argkmin_reduction = ArgKmin.get_for(X, Y, k=k, metric=metric) + + dist_par_X, indices_par_X = argkmin_reduction.compute( + strategy="parallel_on_X", return_distance=True + ) + + dist_par_Y, indices_par_Y = argkmin_reduction.compute( + strategy="parallel_on_Y", return_distance=True + ) + + assert_array_equal(dist_par_X, dist_par_Y) + assert_array_equal(indices_par_X, indices_par_Y) + + +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 100, 500]) +@pytest.mark.parametrize("radius", [1, 10, 100]) +@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +def test_radius_neighborhood_strategies_consistency( + n_samples, + n_features, + radius, + metric, + dtype=np.float64, +): + # RadiusNeighborhood results obtained using both parallelization strategies + # must be identical + + rng = np.random.RandomState(1) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + radius_neigh_reduction = RadiusNeighborhood.get_for( + X, + Y, + # Scaling the radius with the dimensions + radius=radius ** np.log(n_features), + metric=metric, + ) + + dist_par_X, indices_par_X = radius_neigh_reduction.compute( + strategy="parallel_on_X", return_distance=True + ) + + dist_par_Y, indices_par_Y = radius_neigh_reduction.compute( + strategy="parallel_on_Y", return_distance=True + ) + + assert_radius_neighborhood_equality( + dist_par_X, dist_par_Y, indices_par_X, indices_par_Y + ) + + +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("sample_imbalance", [10, 2, 1, 0.5]) 
+@pytest.mark.parametrize("k, radius", [(1, 0), (10, 1), (100, 10), (1000, 100)]) +def test_fast_sqeuclidean_correctness( + n_samples, + n_features, + sample_imbalance, + k, + radius, + dtype=np.float64, +): + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + if n_samples < k: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < k (={k})", + allow_module_level=True, + ) + + rng = np.random.RandomState(1) + + spread = 100 + X = ( + rng.rand(int(n_samples * n_features / sample_imbalance)) + .astype(dtype) + .reshape((-1, n_features)) + * spread + ) + Y = ( + rng.rand(int(n_samples * n_features)).astype(dtype).reshape((-1, n_features)) + * spread + ) + + eucl_dist, eucl_indices = ArgKmin.get_for(X, Y, k, metric="euclidean").compute( + return_distance=True + ) + fse_dist, fse_indices = ArgKmin.get_for(X, Y, k, metric="fast_sqeuclidean").compute( + return_distance=True + ) + + assert_array_equal(eucl_dist, fse_dist) + assert_array_equal(eucl_indices, fse_indices) + + eucl_dist, eucl_indices = RadiusNeighborhood.get_for( + X, Y, radius, metric="euclidean" + ).compute(return_distance=True) + fse_dist, fse_indices = RadiusNeighborhood.get_for( + X, Y, radius, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + assert_radius_neighborhood_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) + + +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_features", [5, 10, 100, 500]) +@pytest.mark.parametrize("k", [1, 10, 100, 1000]) +@pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.skip( + reason=( + "Long test, translation invariance should have its own study: skipping for now" + ) +) +def test_fast_sqeuclidean_translation_invariance( + n_samples, + n_features, + k, + translation, + dtype=np.float64, +): + # The fast squared euclidean strategy should be translation invariant. 
+ if n_samples < k: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < n_neighbors (={k})", + allow_module_level=True, + ) + + rng = np.random.RandomState(1) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + reference_dist, reference_indices = ArgKmin.get_for( + X, Y, k, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + dist, indices = ArgKmin.get_for( + X + translation, X + translation, k, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + assert_array_equal(reference_indices, indices) + assert_allclose(reference_dist, dist) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 7d588add7e3d6..a75f2e5cc62e8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -23,7 +23,6 @@ from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors import ( - NearestNeighbors, VALID_METRICS_SPARSE, VALID_METRICS, ) @@ -1850,118 +1849,3 @@ def test_pairwise_deprecated(NearestNeighbors): msg = r"Attribute `_pairwise` was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): nn._pairwise - - -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5]) -@pytest.mark.parametrize( - "n_neighbors, radius", [(1, 0), (10, 1), (100, 10), (1000, 100)] -) -def test_kneighbors_fast_sqeuclidean_correctness( - n_samples, - n_features, - ratio_train_test, - n_neighbors, - radius, - dtype=np.float64, -): - # The fast squared euclidean strategy must return results - # that are close to the ones obtained with the euclidean distance - if n_samples < n_neighbors: - pytest.skip( - f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", - allow_module_level=True, - ) - - rng = np.random.RandomState(1) - - spread = 100 - X_train = ( - rng.rand(int(n_samples * n_features)).astype(dtype).reshape((-1, n_features)) - * spread - ) - X_test = ( - rng.rand(int(n_samples * n_features / ratio_train_test)) - .astype(dtype) - .reshape((-1, n_features)) - * spread - ) - - neigh = NearestNeighbors(algorithm="brute", metric="euclidean").fit(X_train) - - fse_neigh = NearestNeighbors(algorithm="brute", metric="fast_sqeuclidean").fit( - X_train - ) - - # Results for KNN should be consistent - eucl_dist, eucl_nn = neigh.kneighbors( - X=X_test, n_neighbors=n_neighbors, return_distance=True - ) - - fse_dist, fse_nn = fse_neigh.kneighbors( - X=X_test, n_neighbors=n_neighbors, return_distance=True - ) - - assert_allclose(eucl_dist, fse_dist) - assert_array_equal(eucl_nn, fse_nn) - - # Results for KNN should be consistent - eucl_dist, eucl_nn = neigh.radius_neighbors( - X=X_test, radius=radius, return_distance=True, sort_results=True - ) - - fse_dist, fse_nn = fse_neigh.radius_neighbors( - X=X_test, radius=radius, return_distance=True, sort_results=True - ) - - # We get arrays of arrays and we need to check for individual pairs - for i in range(eucl_dist.shape[0]): - assert_allclose(eucl_dist[i], fse_dist[i]) - assert_array_equal(eucl_nn[i], fse_nn[i]) - - -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("n_features", [5, 10, 100, 500]) -@pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000]) -@pytest.mark.parametrize("translation", [10 ** i for i in 
[2, 3, 4, 5, 6, 7]])
-@pytest.mark.skip(
- reason=(
- "Long test, translation invariance should have its own study: skipping for now"
- )
-)
-def test_fast_sqeuclidean_translation_invariance(
- n_samples,
- n_features,
- n_neighbors,
- translation,
- dtype=np.float64,
-):
- # The fast squared euclidean strategy should be translation invariant.
- if n_samples < n_neighbors:
- pytest.skip(
- f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})",
- allow_module_level=True,
- )
-
- rng = np.random.RandomState(1)
- spread = 100
- X_train = rng.rand(n_samples, n_features).astype(dtype) * spread
- X_test = rng.rand(n_samples, n_features).astype(dtype) * spread
-
- neigh = NearestNeighbors(
- n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean"
- ).fit(X_train)
- reference_dist, reference_nns = neigh.kneighbors(
- X=X_test, n_neighbors=n_neighbors, return_distance=True
- )
-
- neigh = NearestNeighbors(
- n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean"
- ).fit(X_train + translation)
- dist, nns = neigh.kneighbors(
- X=X_test + translation, n_neighbors=n_neighbors, return_distance=True
- )
-
- assert_allclose(reference_dist, dist)
- assert_array_equal(reference_nns, nns)
From f63692a65d64eb5a01ceeb11bcbfec7c54301f41 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 28 Jul 2021 17:13:42 +0200
Subject: [PATCH 123/290] Merge vectors at the very end using dynamic
 scheduling

---
 sklearn/metrics/_parallel_reductions.pyx | 65 +++++++++++------------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 8098b3b6f190b..41bd1153ec433 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -1306,15 +1306,18 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 deref(self.neigh_indices)[idx].size()
 )

- @final
- cdef void _on_Y_parallel_init(self,
- ITYPE_t thread_num,
+ cdef void _on_Y_init(self,
+ ITYPE_t num_threads,
 ) nogil:
- # As chunks of X are shared across threads, so must their
- # vectors. To solve this, each thread has its own vectors
- # which are then synchronised merged back in the main ones.
- self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X)
- self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X)
+ cdef:
+ ITYPE_t thread_num
+ # As chunks of X are shared across threads, each thread needs its own
+ # datastructures to avoid race conditions.
+ # Each thread gets its own set of n_X vectors, which are then merged
+ # back into the main n_X vectors.
+ for thread_num in range(num_threads):
+ self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X)
+ self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X)

 @final
 cdef void _merge_vectors(self,
@@ -1348,44 +1351,36 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 )
 last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()

- @final
- cdef void _on_Y_after_parallel(self,
+
+ cdef void _on_Y_finalize(self,
 ITYPE_t num_threads,
- ITYPE_t X_start,
- ITYPE_t X_end,
 ) nogil:
 cdef:
- ITYPE_t idx, thread_num
- # Merge associated vectors into one
- # This is done in parallel samples-wise (no need for locks)
+ ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
+
 with nogil, parallel(num_threads=self.effective_omp_n_thread):
- for idx in prange(self.n_X, schedule='static'):
+ # Merge vectors used in threads into the main ones.
+ # This is done in parallel sample-wise (no need for locks)
+ # using dynamic scheduling because we generally do not have
+ # the same number of neighbors for each query vector.
+ # TODO: compare 'dynamic' vs 'static' vs 'guided'
+ for idx in prange(self.n_X, schedule='dynamic'):
 self._merge_vectors(idx, num_threads)

 # The contents of the vectors have been std::moved,
- # Hence they can't be used anymore and can only
- # be deleted.
+ # Hence they can't be used anymore and can only be deleted.
 for thread_num in prange(num_threads, schedule='static'):
 del self.neigh_distances_chunks[thread_num]
 del self.neigh_indices_chunks[thread_num]
- return
-
- cdef void _on_Y_finalize(self,
- ITYPE_t num_threads,
- ) nogil:
- cdef:
- ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
-
- # Sort in parallel in ascending order w.r.t the distances if needed
- if self.sort_results:
- for idx in prange(self.n_X, schedule='static', nogil=True,
- num_threads=self.effective_omp_n_thread):
- _simultaneous_sort(
- deref(self.neigh_distances)[idx].data(),
- deref(self.neigh_indices)[idx].data(),
- deref(self.neigh_indices)[idx].size()
- )
+ # Sort in parallel in ascending order w.r.t the distances if needed
+ if self.sort_results:
+ for idx in prange(self.n_X, schedule='static'):
+ _simultaneous_sort(
+ deref(self.neigh_distances)[idx].data(),
+ deref(self.neigh_indices)[idx].data(),
+ deref(self.neigh_indices)[idx].size()
+ )

 return
From 5f1d3a0a8550eaee39a9c5f8c0d8370f7029e01e Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 28 Jul 2021 17:34:06 +0200
Subject: [PATCH 124/290] Pull the distance metric up and make it readonly

---
 sklearn/metrics/_parallel_reductions.pyx | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 41bd1153ec433..444e9ad89d7d6 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -156,6 +156,8 @@ cdef class DatasetsPair:
 This makes use of cython.final to remove the overhead of method
 calls' dispatch.
""" + cdef: + readonly DistanceMetric distance_metric @classmethod def get_for(cls, @@ -187,6 +189,9 @@ cdef class DatasetsPair: return DenseSparseDatasetsPair(X, Y, distance_metric) return SparseSparseDatasetsPair(X, Y, distance_metric) + def __init__(self, DistanceMetric distance_metric): + self.distance_metric = distance_metric + @classmethod def unpack_csr_matrix(cls, X: spmatrix): X_data = check_array(X.data, dtype=DTYPE, ensure_2d=False) @@ -224,8 +229,6 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): Rows represent vectors """ cdef: - DistanceMetric distance_metric - const DTYPE_t[:, ::1] X # shape: (n_X, d) const DTYPE_t[:, ::1] Y # shape: (n_Y, d) ITYPE_t d @@ -237,7 +240,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): self.Y = np.empty((1, 1), dtype=DTYPE, order='c') def __init__(self, X, Y, DistanceMetric distance_metric): - self.distance_metric = distance_metric + DatasetsPair.__init__(self, distance_metric) self.X = check_array(X, dtype=DTYPE, order='C') self.Y = check_array(Y, dtype=DTYPE, order='C') self.d = X.shape[1] @@ -280,8 +283,6 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): Rows represent vectors """ cdef: - DistanceMetric distance_metric - const DTYPE_t[:] X_data const ITYPE_t[:] X_indices, const ITYPE_t[:] X_indptr, @@ -301,7 +302,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): return self.Y_indptr.shape[0] -1 def __init__(self, X, Y, DistanceMetric distance_metric): - self.distance_metric = distance_metric + DatasetsPair.__init__(self, distance_metric) X = check_array(X, dtype=DTYPE, accept_sparse='csr') Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') @@ -351,8 +352,6 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): Rows represent vectors """ cdef: - DistanceMetric distance_metric - const DTYPE_t[:] X_data const ITYPE_t[:] X_indices, const ITYPE_t[:] X_indptr, @@ -362,7 +361,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): def __init__(self, X, Y, DistanceMetric distance_metric): - self.distance_metric = distance_metric + DatasetsPair.__init__(self, distance_metric) X = check_array(X, dtype=DTYPE, accept_sparse='csr') self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) @@ -428,6 +427,7 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): DatasetsPair datasets_pair def __init__(self, X, Y, distance_metric): + DatasetsPair.__init__(self, distance_metric) # Swapping arguments on the constructor self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) From bd812453a18edd20f02f5812b5eaf2cd0167d19c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 28 Jul 2021 17:57:45 +0200 Subject: [PATCH 125/290] Use proxy distance for RadiusNeighborhood reduction We loose a bit of precision for a lot of speed because we don't recompute distances, which can be computationally prohibitive. 
---
 sklearn/metrics/_parallel_reductions.pyx | 68 ++++++++++++++----------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 444e9ad89d7d6..114e21c752c30 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -922,8 +922,6 @@ cdef class ArgKmin(PairwiseDistancesReduction):
 )
 return

- # TODO: annotating with 'final' here makes the compilation fails but it should not
- # @final
 cdef void compute_exact_distances(self) nogil:
 cdef:
 ITYPE_t i, j
@@ -933,7 +931,7 @@ cdef class ArgKmin(PairwiseDistancesReduction):
 for i in prange(self.n_X, schedule='static', nogil=True,
 num_threads=self.effective_omp_n_thread):
 for j in range(self.k):
- distances[i, j] = self.datasets_pair.dist(i, Y_indices[i, j])
+ distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(distances[i, j])

 @final
 def compute(self,
@@ -1082,6 +1080,22 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
 for thread_num in range(num_threads):
 free(self.dist_middle_terms_chunks[thread_num])

+ @final
+ cdef void compute_exact_distances(self) nogil:
+ cdef:
+ ITYPE_t i, j
+ ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
+ DTYPE_t[:, ::1] distances = self.argkmin_distances
+
+ for i in prange(self.n_X, schedule='static', nogil=True,
+ num_threads=self.effective_omp_n_thread):
+ for j in range(self.k):
+ # This time we have no other choice but to recompute distances
+ # because we don't take ||X_c||² into account in the reduction
+ # TODO: introduce ||X_c||² for FastSquaredEuclideanArgKmin
+ # and factorise code shared with FastSquaredEuclideanRadiusNeighborhood?
+ distances[i, j] = self.datasets_pair.dist(i, Y_indices[i, j])
+
 @final
 cdef int _reduce_on_chunks(self,
 ITYPE_t X_start,
@@ -1181,9 +1195,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 # ("reduced distance" in the original wording),
 # which are proxies necessitating fewer computations.
 # We get the proxy for the radius to be able to compare
-
- # TODO: use it?
- DTYPE_t radius_proxy
+ DTYPE_t proxy_radius

 # We want resizable buffers which we will wrap within numpy
 # arrays at the end.
@@ -1239,6 +1251,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):

 check_scalar(radius, "radius", numbers.Real, min_val=0)
 self.radius = radius
+ self.proxy_radius = self.datasets_pair.distance_metric._dist_to_rdist(self.radius)
 self.sort_results = False

 # Allocating pointers to datastructures but not the datastructures themselves.
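 # (The datastructures themselves are created per thread in the
 # _on_* initialisation callbacks, e.g. _on_Y_init.)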
@@ -1265,13 +1278,13 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ) nogil except -1: cdef: ITYPE_t i, j - DTYPE_t dist_i_j + DTYPE_t proxy_dist_i_j for i in range(X_start, X_end): for j in range(Y_start, Y_end): - dist_i_j = self.datasets_pair.dist(i, j) - if dist_i_j <= self.radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(dist_i_j) + proxy_dist_i_j = self.datasets_pair.proxy_dist(i, j) + if proxy_dist_i_j <= self.proxy_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(proxy_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) return 0 @@ -1384,6 +1397,22 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): return + # TODO: annotating with 'final' here makes the compilation fails but it should not + # @final + cdef void compute_exact_distances(self) nogil: + """Convert proxy distances to pairwise distances in parallel.""" + cdef: + ITYPE_t i, j + + for i in prange(self.n_X, nogil=True, schedule='static', + num_threads=self.effective_omp_n_thread): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self.datasets_pair.distance_metric._rdist_to_dist( + deref(self.neigh_distances)[i][j] + ) + ) + @final def compute(self, str strategy = "auto", @@ -1451,8 +1480,6 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): DTYPE_t[::1] X_sq_norms DTYPE_t[::1] Y_sq_norms - DTYPE_t squared_radius - # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks @@ -1471,7 +1498,6 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): self.Y = check_array(Y, dtype=DTYPE) self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) - self.squared_radius = self.radius ** 2 # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( @@ -1520,20 +1546,6 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): for thread_num in range(num_threads): free(self.dist_middle_terms_chunks[thread_num]) - - @final - cdef void compute_exact_distances(self) nogil: - """Convert proxy distances to pairwise distances in parallel.""" - cdef: - ITYPE_t i, j - - for i in prange(self.n_X, nogil=True, num_threads=self.effective_omp_n_thread): - for j in range(deref(self.neigh_indices)[i].size()): - deref(self.neigh_distances)[i][j] = ( - self.datasets_pair.dist(i, deref(self.neigh_indices)[i][j]) - ) - - @final cdef int _reduce_on_chunks(self, ITYPE_t X_start, @@ -1597,7 +1609,7 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): squared_dist_i_j = (self.X_sq_norms[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start]) - if squared_dist_i_j <= self.squared_radius: + if squared_dist_i_j <= self.proxy_radius: deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) From 2e48051485c6b17d7089fe4bb119f8ee699e4f48 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 09:42:54 +0200 Subject: [PATCH 126/290] fixup! 
Correct cross-referencing for metrics.DistanceMetric

---
 doc/glossary.rst | 7 +++----
 doc/modules/classes.rst | 11 ++++++++++-
 doc/modules/density.rst | 2 +-
 doc/whats_new/v1.0.rst | 2 +-
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/doc/glossary.rst b/doc/glossary.rst
index 21d0c947f3e64..fcea1bf1ec378 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -644,9 +644,8 @@ General Concepts
 Note that for most distance metrics, we rely on implementations from
 :mod:`scipy.spatial.distance`, but may reimplement for efficiency in
- our context. The :mod:`neighbors` module also duplicates some metric
- implementations for integration with efficient binary tree search data
- structures.
+ our context. The :class:`metrics.DistanceMetric` is used to implement
+ distance metrics for integration with efficient neighbors search.

 pd
 A shorthand for `Pandas `_ due to the
@@ -1022,7 +1021,7 @@ such as:

 Further examples:

-* :class:`neighbors.DistanceMetric`
+* :class:`metrics.DistanceMetric`
* :class:`gaussian_process.kernels.Kernel`
* ``tree.Criterion``

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index ddcbe36bb1b33..de95d14cbc064 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1057,6 +1057,16 @@ further details.

 metrics.consensus_score

+Distance metrics
+----------------
+
+.. currentmodule:: sklearn
+
+.. autosummary::
+ :toctree: generated/
+ :template: class.rst
+
+ metrics.DistanceMetric

 Pairwise metrics
 ----------------
@@ -1316,7 +1326,6 @@ Model validation
 :template: class.rst

 neighbors.BallTree
- neighbors.DistanceMetric
 neighbors.KDTree
 neighbors.KernelDensity
 neighbors.KNeighborsClassifier
diff --git a/doc/modules/density.rst b/doc/modules/density.rst
index 115d318183577..6440bf79ab729 100644
--- a/doc/modules/density.rst
+++ b/doc/modules/density.rst
@@ -136,7 +136,7 @@ The form of these kernels is as follows:
 :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h`

 The kernel density estimator can be used with any of the valid distance
-metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though
+metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though
 the results are properly normalized only for the Euclidean metric. One
 particularly useful metric is the
 `Haversine distance `_
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index a5beb23a48d3f..2baf4c94dc988 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -565,7 +565,7 @@ Changelog
 :pr:`19473` by :user:`jiefangxuanyan ` and
 :user:`Julien Jerphanion `.

-- |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly
+- |FIX| :class:`metrics.DistanceMetric` subclasses now support readonly
 memory-mapped datasets.
 :pr:`19883` by :user:`Julien Jerphanion `.
- |FIX| :class:`neighbors.NearestNeighbors`, :class:`neighbors.KNeighborsClassifier`, From 53ba89dd81891fcd616dd43a7ad5103750c579af Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 10:55:10 +0200 Subject: [PATCH 127/290] Improve inputs checks --- sklearn/metrics/_parallel_reductions.pyx | 36 +++++--- .../metrics/tests/test_pairwise_reduction.py | 84 ++++++++++++++++--- 2 files changed, 95 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx index 114e21c752c30..d71ac9290a421 100644 --- a/sklearn/metrics/_parallel_reductions.pyx +++ b/sklearn/metrics/_parallel_reductions.pyx @@ -168,13 +168,21 @@ cdef class DatasetsPair: ) -> DatasetsPair: cdef: DistanceMetric distance_metric = DistanceMetric.get_metric(metric, - **metric_kwargs) - X = check_array(X, accept_sparse='csr') - Y = check_array(Y, accept_sparse='csr') + **metric_kwargs) + + # TODO: what's the best coercion for lists? + X = np.asarray(X) if isinstance(X, (tuple, list)) else X + Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y + + if X.dtype.itemsize != 8 or Y.dtype.itemsize != 8: + raise ValueError("32bits datasets aren't supported for X and Y yet.") + + X = check_array(X, dtype=DTYPE, accept_sparse='csr') + Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') if X.shape[1] != Y.shape[1]: - raise RuntimeError("Vectors of X and Y must have the " - "same dimension but currently are " + raise ValueError("Vectors of X and Y must have the same " + "number of dimensions but currently are " f"respectively {X.shape[1]}-dimensional " f"and {Y.shape[1]}-dimensional.") @@ -492,12 +500,14 @@ cdef class PairwiseDistancesReduction: @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - # TODO: support sparse arrays - return (not issparse(X) and - not issparse(Y) and + # TODO: what's the best coercion for lists? 
+ X = np.asarray(X) if isinstance(X, (tuple, list)) else X + Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y + # TODO: support sparse arrays and 32 bits + return (not issparse(X) and X.dtype.itemsize == 8 and X.ndim == 2 and + not issparse(Y) and Y.dtype.itemsize == 8 and Y.ndim == 2 and metric in cls.valid_metrics()) - def __init__(self, DatasetsPair datasets_pair, ITYPE_t chunk_size = CHUNK_SIZE, @@ -1030,8 +1040,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), k=k, chunk_size=chunk_size) - self.X = check_array(X, dtype=DTYPE) - self.Y = check_array(Y, dtype=DTYPE) + self.X = check_array(X, dtype=DTYPE, order='C') + self.Y = check_array(Y, dtype=DTYPE, order='C') self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( @@ -1494,8 +1504,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), radius=radius, chunk_size=chunk_size) - self.X = check_array(X, dtype=DTYPE) - self.Y = check_array(Y, dtype=DTYPE) + self.X = check_array(X, dtype=DTYPE, order='C') + self.Y = check_array(Y, dtype=DTYPE, order='C') self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) diff --git a/sklearn/metrics/tests/test_pairwise_reduction.py b/sklearn/metrics/tests/test_pairwise_reduction.py index 956a88e368ad1..f7fc589887d58 100644 --- a/sklearn/metrics/tests/test_pairwise_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_reduction.py @@ -4,39 +4,64 @@ import pytest from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import ( - lil_matrix, + bsr_matrix, + coo_matrix, csc_matrix, - dok_matrix, + csr_matrix, dia_matrix, - coo_matrix, - bsr_matrix, + dok_matrix, + lil_matrix, ) from sklearn.metrics._parallel_reductions import ( + PairwiseDistancesReduction, ArgKmin, RadiusNeighborhood, FastSquaredEuclideanArgKmin, - SparseSparseDatasetsPair, + FastSquaredEuclideanRadiusNeighborhood, DenseDenseDatasetsPair, DenseSparseDatasetsPair, SparseDenseDatasetsPair, - FastSquaredEuclideanRadiusNeighborhood, + SparseSparseDatasetsPair, ) def assert_radius_neighborhood_equality(ref_dist, dist, ref_indices, indices): # We get arrays of arrays and we need to check for individual pairs for i in range(ref_dist.shape[0]): - assert_array_equal( - ref_dist[i], - dist[i], - err_msg=f"Query vector #{i} has different neighbors' distances", - ) assert_array_equal( ref_indices[i], indices[i], err_msg=f"Query vector #{i} has different neighbors' indices", ) + assert_allclose( + ref_dist[i], + dist[i], + err_msg=f"Query vector #{i} has different neighbors' distances", + ) + + +def test_pairwise_distances_reduction_is_usable_for(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + assert PairwiseDistancesReduction.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) + + assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") + # TODO: remove once 32 bits datasets are supported + assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, 
Y.astype(np.int32), metric) + + # TODO: remove once sparse matrices are supported + assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, csc_matrix(Y), metric) def test_argkmin_factory_method_wrong_usages(): @@ -46,6 +71,16 @@ def test_argkmin_factory_method_wrong_usages(): k = 5 metric = "euclidean" + with pytest.raises( + ValueError, match="32bits datasets aren't supported for X and Y yet." + ): + ArgKmin.get_for(X=X.astype(np.float32), Y=Y, k=k, metric=metric) + + with pytest.raises( + ValueError, match="32bits datasets aren't supported for X and Y yet." + ): + ArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + with pytest.raises(ValueError, match="`k`= -1, must be >= 1."): ArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) @@ -61,6 +96,11 @@ def test_argkmin_factory_method_wrong_usages(): with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): ArgKmin.get_for(X=X, Y=[1, 2], k=k, metric=metric) + with pytest.raises( + ValueError, match="Vectors of X and Y must have the same number of dimensions" + ): + ArgKmin.get_for(X=X[:, ::2], Y=Y, k=k, metric=metric) + def test_radius_neighborhood_factory_method_wrong_usages(): rng = np.random.RandomState(1) @@ -69,6 +109,20 @@ def test_radius_neighborhood_factory_method_wrong_usages(): radius = 5 metric = "euclidean" + with pytest.raises( + ValueError, match="32bits datasets aren't supported for X and Y yet." + ): + RadiusNeighborhood.get_for( + X=X.astype(np.float32), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises( + ValueError, match="32bits datasets aren't supported for X and Y yet." + ): + RadiusNeighborhood.get_for( + X=X, Y=Y.astype(np.int32), radius=radius, metric=metric + ) + with pytest.raises(ValueError, match="`radius`= -1.0, must be >= 0."): RadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) @@ -81,6 +135,11 @@ def test_radius_neighborhood_factory_method_wrong_usages(): with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): RadiusNeighborhood.get_for(X=X, Y=[1, 2], radius=radius, metric=metric) + with pytest.raises( + ValueError, match="Vectors of X and Y must have the same number of dimensions" + ): + RadiusNeighborhood.get_for(X=X[:, ::2], Y=Y, radius=radius, metric=metric) + @pytest.mark.parametrize( "PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction", @@ -89,7 +148,7 @@ def test_radius_neighborhood_factory_method_wrong_usages(): (RadiusNeighborhood, FastSquaredEuclideanRadiusNeighborhood), ], ) -def test_paiwise_disances_reduction_factory_method( +def test_pairwise_distances_reduction_factory_method( PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction ): # Test all the combinations of DatasetsPair for creation @@ -107,6 +166,7 @@ def test_paiwise_disances_reduction_factory_method( sparse_matrix_constructors = [ lil_matrix, csc_matrix, + csr_matrix, bsr_matrix, coo_matrix, dia_matrix, From 1632e1428e6f90cc7b1b5fab1aa00571c7493ff3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 11:07:30 +0200 Subject: [PATCH 128/290] Rename submodule for pairwise distances reductions --- ...allel_reductions.pyx => _pairwise_distances_reduction.pyx} | 0 sklearn/metrics/pairwise.py | 2 +- sklearn/metrics/setup.py | 4 ++-- ...wise_reduction.py => test_pairwise_distances_reduction.py} | 2 +- sklearn/neighbors/_base.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename sklearn/metrics/{_parallel_reductions.pyx => 
_pairwise_distances_reduction.pyx} (100%) rename sklearn/metrics/tests/{test_pairwise_reduction.py => test_pairwise_distances_reduction.py} (99%) diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx similarity index 100% rename from sklearn/metrics/_parallel_reductions.pyx rename to sklearn/metrics/_pairwise_distances_reduction.pyx diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9dbc762a4891a..23a8d89aed25e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,7 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version -from ._parallel_reductions import ArgKmin +from ._pairwise_distances_reduction import ArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 01f8056319408..cd32817574dd3 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -20,8 +20,8 @@ def configuration(parent_package="", top_path=None): ) config.add_extension( - "_parallel_reductions", - sources=["_parallel_reductions.pyx"], + "_pairwise_distances_reduction", + sources=["_pairwise_distances_reduction.pyx"], language="c++", libraries=libraries, ) diff --git a/sklearn/metrics/tests/test_pairwise_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py similarity index 99% rename from sklearn/metrics/tests/test_pairwise_reduction.py rename to sklearn/metrics/tests/test_pairwise_distances_reduction.py index f7fc589887d58..2fd3128ff0375 100644 --- a/sklearn/metrics/tests/test_pairwise_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -13,7 +13,7 @@ lil_matrix, ) -from sklearn.metrics._parallel_reductions import ( +from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, ArgKmin, RadiusNeighborhood, diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 31be56a3ff53b..f6c7fcc7df344 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._parallel_reductions import ArgKmin, RadiusNeighborhood +from ..metrics._pairwise_distances_reduction import ArgKmin, RadiusNeighborhood from ..utils import ( check_array, gen_even_slices, From 906b1e451651eed8fdd41888c0ceae19ae6de767 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 11:37:56 +0200 Subject: [PATCH 129/290] Move DatasetsPair closer to DistanceMetrics --- sklearn/metrics/_dist_metrics.pxd | 10 + sklearn/metrics/_dist_metrics.pyx | 322 ++++++++++++++++- .../metrics/_pairwise_distances_reduction.pyx | 329 +----------------- .../test_pairwise_distances_reduction.py | 11 +- 4 files changed, 337 insertions(+), 335 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index e7581571ca0a8..17423f67be8b9 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -81,3 +81,13 @@ cdef class DistanceMetric: cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1 cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 + + +###################################################################### +# DatasetsPair base class +cdef class DatasetsPair: + cdef readonly DistanceMetric distance_metric + + cdef 
DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1 + + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1 diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index d736482df216c..d18958e9b2540 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -10,6 +10,8 @@ import numpy as np cimport numpy as np +from cython cimport final + np.import_array() # required in order to use C-API @@ -33,9 +35,11 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf +from scipy.sparse import csr_matrix, issparse + from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE from ..utils._typedefs import DTYPE, ITYPE - +from ..utils import check_array ###################################################################### # newObj function @@ -197,8 +201,8 @@ cdef class DistanceMetric: """ def __cinit__(self): self.p = 2 - self.vec = np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -1174,3 +1178,315 @@ cdef class PyFuncDistance(DistanceMetric): cdef inline double fmax(double a, double b) nogil: return max(a, b) + + +###################################################################### +# Datasets Pair Classes +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between two vectors (X_i, Y_j) + (rows of X and Y) at a time given the pair of their indices (i, j). + + X and Y can be stored as np.ndarrays or CSR matrices in subclasses. + + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of method calls' dispatch. + """ + + @classmethod + def get_for(cls, X, Y, str metric="euclidean", + dict metric_kwargs=dict()) -> DatasetsPair: + cdef: + DistanceMetric distance_metric = DistanceMetric.get_metric(metric, + **metric_kwargs) + + # TODO: what's the best coercion for lists? 
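+ # Lists are coerced with np.asarray first because they do not expose
+ # a dtype, which the itemsize check below relies on.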
+ X = np.asarray(X) if isinstance(X, (tuple, list)) else X
+ Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y
+
+ if X.dtype.itemsize != 8 or Y.dtype.itemsize != 8:
+ raise ValueError("32bits datasets aren't supported for X and Y yet.")
+
+ X = check_array(X, dtype=DTYPE, accept_sparse='csr')
+ Y = check_array(Y, dtype=DTYPE, accept_sparse='csr')
+
+ if X.shape[1] != Y.shape[1]:
+ raise ValueError("Vectors of X and Y must have the same "
+ "number of dimensions but currently are "
+ f"respectively {X.shape[1]}-dimensional "
+ f"and {Y.shape[1]}-dimensional.")
+
+ distance_metric._validate_data(X)
+ distance_metric._validate_data(Y)
+
+ if not issparse(X) and not issparse(Y):
+ return DenseDenseDatasetsPair(X, Y, distance_metric)
+ if issparse(X) and not issparse(Y):
+ return SparseDenseDatasetsPair(X, Y, distance_metric)
+ if not issparse(X) and issparse(Y):
+ return DenseSparseDatasetsPair(X, Y, distance_metric)
+ return SparseSparseDatasetsPair(X, Y, distance_metric)
+
+ def __init__(self, DistanceMetric distance_metric):
+ self.distance_metric = distance_metric
+
+ @classmethod
+ def unpack_csr_matrix(cls, X: csr_matrix):
+ X_data = check_array(X.data, dtype=DTYPE, ensure_2d=False)
+ X_indices = check_array(X.indices, dtype=ITYPE, ensure_2d=False)
+ X_indptr = check_array(X.indptr, dtype=ITYPE, ensure_2d=False)
+ return X_data, X_indices, X_indptr
+
+ @property
+ def n_X(self):
+ raise RuntimeError()
+
+ @property
+ def n_Y(self):
+ raise RuntimeError()
+
+ cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ return self.dist(i, j)
+
+ cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ return -1
+
+
+cdef class DenseDenseDatasetsPair(DatasetsPair):
+ """Compute distances between vectors of two arrays.
+
+ X: ndarray of shape (n_X, d)
+ Rows represent vectors
+ Y: ndarray of shape (n_Y, d)
+ Rows represent vectors
+ """
+ cdef:
+ const DTYPE_t[:, ::1] X # shape: (n_X, d)
+ const DTYPE_t[:, ::1] Y # shape: (n_Y, d)
+ ITYPE_t d
+
+ def __cinit__(self):
+ # Initializing memory view to prevent memory errors and seg-faults
+ # in rare cases where __init__ is not called
+ self.X = np.empty((1, 1), dtype=DTYPE, order='C')
+ self.Y = np.empty((1, 1), dtype=DTYPE, order='C')
+
+ def __init__(self, X, Y, DistanceMetric distance_metric):
+ DatasetsPair.__init__(self, distance_metric)
+ self.X = check_array(X, dtype=DTYPE, order='C')
+ self.Y = check_array(Y, dtype=DTYPE, order='C')
+ self.d = X.shape[1]
+
+ @property
+ @final
+ def n_X(self):
+ return self.X.shape[0]
+
+ @property
+ @final
+ def n_Y(self):
+ return self.Y.shape[0]
+
+ @final
+ cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ return self.distance_metric.rdist(&self.X[i, 0],
+ &self.Y[j, 0],
+ self.d)
+
+ @final
+ cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ return self.distance_metric.dist(&self.X[i, 0],
+ &self.Y[j, 0],
+ self.d)
+
+
+cdef class SparseSparseDatasetsPair(DatasetsPair):
+ """Compute distances between vectors of two sparse matrices.
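+ Both matrices are expected to be in the CSR format (see unpack_csr_matrix).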
+
+    X: sparse matrix of shape (n_X, d)
+        Rows represent vectors
+    Y: sparse matrix of shape (n_Y, d)
+        Rows represent vectors
+    """
+    cdef:
+        const DTYPE_t[:] X_data
+        const ITYPE_t[:] X_indices,
+        const ITYPE_t[:] X_indptr,
+
+        const DTYPE_t[:] Y_data
+        const ITYPE_t[:] Y_indices
+        const ITYPE_t[:] Y_indptr
+
+    def __cinit__(self):
+        # Initializing memory view to prevent memory errors and seg-faults
+        # in rare cases where __init__ is not called
+        self.X_data = np.empty((1, 1), dtype=DTYPE, order='C')
+        self.X_indices = np.empty((1, 1), dtype=ITYPE, order='C')
+        self.X_indptr = np.empty((1, 1), dtype=ITYPE, order='C')
+
+        self.Y_data = np.empty((1, 1), dtype=DTYPE, order='C')
+        self.Y_indices = np.empty((1, 1), dtype=ITYPE, order='C')
+        self.Y_indptr = np.empty((1, 1), dtype=ITYPE, order='C')
+
+    @property
+    @final
+    def n_X(self):
+        return self.X_indptr.shape[0] - 1
+
+    @property
+    @final
+    def n_Y(self):
+        return self.Y_indptr.shape[0] - 1
+
+    def __init__(self, X, Y, DistanceMetric distance_metric):
+        DatasetsPair.__init__(self, distance_metric)
+
+        X = check_array(X, dtype=DTYPE, accept_sparse='csr')
+        Y = check_array(Y, dtype=DTYPE, accept_sparse='csr')
+
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
+    @final
+    cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+        cdef:
+            ITYPE_t xi_start = self.X_indptr[i]
+            ITYPE_t xi_end = self.X_indptr[i + 1]
+            ITYPE_t yj_start = self.Y_indptr[j]
+            ITYPE_t yj_end = self.Y_indptr[j + 1]
+
+        return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end],
+                                                 self.X_indices[xi_start:xi_end],
+                                                 self.Y_data[yj_start:yj_end],
+                                                 self.Y_indices[yj_start:yj_end])
+
+    @final
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+        cdef:
+            ITYPE_t xi_start = self.X_indptr[i]
+            ITYPE_t xi_end = self.X_indptr[i + 1]
+            ITYPE_t yj_start = self.Y_indptr[j]
+            ITYPE_t yj_end = self.Y_indptr[j + 1]
+
+        return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end],
+                                                self.X_indices[xi_start:xi_end],
+                                                self.Y_data[yj_start:yj_end],
+                                                self.Y_indices[yj_start:yj_end])
+
+
+cdef class SparseDenseDatasetsPair(DatasetsPair):
+    """Compute distances between vectors of a sparse matrix and a dense array.
+ + X: sparse matrix of shape (n_X, d) + Rows represent vectors + Y: ndarray of shape (n_Y, d) + Rows represent vectors + """ + cdef: + const DTYPE_t[:] X_data + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + + const DTYPE_t[:, ::1] Y # shape: (n_Y, d) + const ITYPE_t[:] Y_indices + + def __cinit__(self): + # Initializing memory view to prevent memory errors and seg-faults + # in rare cases where __init__ is not called + self.X_data = np.empty((1, 1), dtype=DTYPE, order='C') + self.X_indices = np.empty((1, 1), dtype=ITYPE, order='C') + self.X_indptr = np.empty((1, 1), dtype=ITYPE, order='C') + + self.Y = np.empty((1, 1), dtype=DTYPE, order='C') + self.Y_indices = np.empty((1, 1), dtype=ITYPE, order='C') + + + def __init__(self, X, Y, DistanceMetric distance_metric): + DatasetsPair.__init__(self, distance_metric) + + X = check_array(X, dtype=DTYPE, accept_sparse='csr') + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + self.Y = check_array(Y, dtype=DTYPE) + self.Y_indices = np.arange(self.Y.shape[1]) + + @property + @final + def n_X(self): + return self.X_indptr.shape[0] - 1 + + @property + @final + def n_Y(self): + return self.Y.shape[0] + + @final + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + + # TODO: the 2D to 1D memory-view conversion might make computation slower, see: + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # Ideally, we could pass pointers and indices and access elements + # then in distance_metric.dist + return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + + # TODO: same as previous comment + return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices) + + +cdef class DenseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a dense array and a sparse matrix. + + X: ndarray of shape (n_X, d) + Rows represent vectors + Y: sparse matrix of shape (n_Y, d) + Rows represent vectors + """ + cdef: + # As distance metrics are symmetric functions, we can + # simply rely on the other DatasetsPair and swap arguments. 
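+        # Concretely, dist(i, j) below simply returns
+        # SparseDenseDatasetsPair.dist(j, i): for a symmetric metric,
+        # d(X[i], Y[j]) == d(Y[j], X[i]), so no dedicated dense-sparse
+        # code path has to be duplicated here.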
+ DatasetsPair datasets_pair + + def __init__(self, X, Y, distance_metric): + DatasetsPair.__init__(self, distance_metric) + # Swapping arguments on the constructor + self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) + + @property + @final + def n_X(self): + # Swapping interface + return self.datasets_pair.n_Y + + @property + @final + def n_Y(self): + # Swapping interface + return self.datasets_pair.n_X + + @final + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + # Swapping arguments on the same interface + return self.datasets_pair.proxy_dist(j, i) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + # Swapping arguments on the same interface + return self.datasets_pair.dist(j, i) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index d71ac9290a421..1c5989fc48b1e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -22,7 +22,7 @@ from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange from cpython.ref cimport Py_INCREF -from ._dist_metrics cimport DistanceMetric +from ._dist_metrics cimport DatasetsPair from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, @@ -141,333 +141,6 @@ cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays( ##################### -cdef class DatasetsPair: - """Abstract class which wraps a pair of datasets (X, Y). - - X and Y can be stored as rows of np.ndarrays or CSR matrices in subclasses. - - This class allows compute distances via :class:`sklearn.metrics.DistanceMetric` - for one pair of vectors at a time given the pair of their indices (i, j). - - This class avoids the overhead of dispatching distance computations - based on the physical representation of the vectors (sparse vs. dense) - for each row of the collection. - - This makes use of cython.final to remove the overhead of method calls' - dispatch. - """ - cdef: - readonly DistanceMetric distance_metric - - @classmethod - def get_for(cls, - X, - Y, - str metric="euclidean", - dict metric_kwargs=dict(), - ) -> DatasetsPair: - cdef: - DistanceMetric distance_metric = DistanceMetric.get_metric(metric, - **metric_kwargs) - - # TODO: what's the best coercion for lists? 
- X = np.asarray(X) if isinstance(X, (tuple, list)) else X - Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y - - if X.dtype.itemsize != 8 or Y.dtype.itemsize != 8: - raise ValueError("32bits datasets aren't supported for X and Y yet.") - - X = check_array(X, dtype=DTYPE, accept_sparse='csr') - Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') - - if X.shape[1] != Y.shape[1]: - raise ValueError("Vectors of X and Y must have the same " - "number of dimensions but currently are " - f"respectively {X.shape[1]}-dimensional " - f"and {Y.shape[1]}-dimensional.") - - distance_metric._validate_data(X) - distance_metric._validate_data(Y) - - if not issparse(X) and not issparse(Y): - return DenseDenseDatasetsPair(X, Y, distance_metric) - if issparse(X) and not issparse(Y): - return SparseDenseDatasetsPair(X, Y, distance_metric) - if not issparse(X) and issparse(Y): - return DenseSparseDatasetsPair(X, Y, distance_metric) - return SparseSparseDatasetsPair(X, Y, distance_metric) - - def __init__(self, DistanceMetric distance_metric): - self.distance_metric = distance_metric - - @classmethod - def unpack_csr_matrix(cls, X: spmatrix): - X_data = check_array(X.data, dtype=DTYPE, ensure_2d=False) - X_indices = check_array(X.indices, dtype=ITYPE, ensure_2d=False) - X_indptr = check_array(X.indptr, dtype=ITYPE, ensure_2d=False) - return X_data, X_indptr, X_indptr - - @property - def n_X(self): - raise RuntimeError() - - @property - def n_Y(self): - raise RuntimeError() - - cdef DTYPE_t proxy_dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - return self.dist(i, j) - - cdef DTYPE_t dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - return -1 - - -cdef class DenseDenseDatasetsPair(DatasetsPair): - """Compute distances between vectors of two arrays. - - X: ndarray of shape (n_X, d) - Rows represent vectors - Y: ndarray of shape (n_Y, d) - Rows represent vectors - """ - cdef: - const DTYPE_t[:, ::1] X # shape: (n_X, d) - const DTYPE_t[:, ::1] Y # shape: (n_Y, d) - ITYPE_t d - - def __cinit__(self): - # Initializing memory view to prevent memory errors and seg-faults - # in rare cases where __init__ is not called - self.X = np.empty((1, 1), dtype=DTYPE, order='c') - self.Y = np.empty((1, 1), dtype=DTYPE, order='c') - - def __init__(self, X, Y, DistanceMetric distance_metric): - DatasetsPair.__init__(self, distance_metric) - self.X = check_array(X, dtype=DTYPE, order='C') - self.Y = check_array(Y, dtype=DTYPE, order='C') - self.d = X.shape[1] - - @property - @final - def n_X(self): - return self.X.shape[0] - - @property - @final - def n_Y(self): - return self.Y.shape[0] - - @final - cdef DTYPE_t proxy_dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - return self.distance_metric.rdist(&self.X[i, 0], - &self.Y[j, 0], - self.d) - - @final - cdef DTYPE_t dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - return self.distance_metric.dist(&self.X[i, 0], - &self.Y[j, 0], - self.d) - - -cdef class SparseSparseDatasetsPair(DatasetsPair): - """Compute distances between vectors of two sparse matrices. 
- - X: sparse matrix of shape (n_X, d) - Rows represent vectors - Y: sparse matrix of shape (n_X, d) - Rows represent vectors - """ - cdef: - const DTYPE_t[:] X_data - const ITYPE_t[:] X_indices, - const ITYPE_t[:] X_indptr, - - const DTYPE_t[:] Y_data - const ITYPE_t[:] Y_indices - const ITYPE_t[:] Y_indptr - - @property - @final - def n_X(self): - return self.X_indptr.shape[0] - 1 - - @property - @final - def n_Y(self): - return self.Y_indptr.shape[0] -1 - - def __init__(self, X, Y, DistanceMetric distance_metric): - DatasetsPair.__init__(self, distance_metric) - - X = check_array(X, dtype=DTYPE, accept_sparse='csr') - Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') - - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) - - @final - cdef DTYPE_t proxy_dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - ITYPE_t yj_start = self.Y_indptr[j] - ITYPE_t yj_end = self.Y_indptr[j + 1] - - return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y_data[yj_start:yj_end], - self.Y_indices[yj_start:yj_end]) - - @final - cdef DTYPE_t dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - ITYPE_t yj_start = self.Y_indptr[j] - ITYPE_t yj_end = self.Y_indptr[j + 1] - - return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y_data[yj_start:yj_end], - self.Y_indices[yj_start:yj_end]) - - -cdef class SparseDenseDatasetsPair(DatasetsPair): - """Compute distances between vectors of two sparse matrices. 
- - X: sparse matrix of shape (n_X, d) - Rows represent vectors - Y: ndarray of shape (n_Y, d) - Rows represent vectors - """ - cdef: - const DTYPE_t[:] X_data - const ITYPE_t[:] X_indices, - const ITYPE_t[:] X_indptr, - - const DTYPE_t[:, ::1] Y # shape: (n_Y, d) - const ITYPE_t[:] Y_indices - - - def __init__(self, X, Y, DistanceMetric distance_metric): - DatasetsPair.__init__(self, distance_metric) - - X = check_array(X, dtype=DTYPE, accept_sparse='csr') - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - - self.Y = check_array(Y, dtype=DTYPE) - self.Y_indices = np.arange(self.Y.shape[1]) - - @property - @final - def n_X(self): - return self.X_indptr.shape[0] - 1 - - @property - @final - def n_Y(self): - return self.Y.shape[0] - - @final - cdef DTYPE_t proxy_dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - - # TODO: the 2D to 1D memory-view conversion might make computation slower, see: - # https://github.com/scikit-learn/scikit-learn/issues/17299 - # Ideally, we could pass pointers and indices and access elements - # then in distance_metric.dist - return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y[j, :], - self.Y_indices) - - @final - cdef DTYPE_t dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - - # TODO: same as previous comment - return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y[j, :], - self.Y_indices) - - -cdef class DenseSparseDatasetsPair(DatasetsPair): - """Compute distances between vectors of a sparse matrix and vectors of an array. 
- - X: ndarray of shape (n_X, d) - Rows represent vectors - Y: sparse matrix of shape (n_Y, d) - Rows represent vectors - """ - cdef: - # As distance metrics are commutative functions, we can - # simply rely on the other strategy and swap arguments - DatasetsPair datasets_pair - - def __init__(self, X, Y, distance_metric): - DatasetsPair.__init__(self, distance_metric) - # Swapping arguments on the constructor - self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) - - @property - @final - def n_X(self): - # Swapping interface - return self.datasets_pair.n_Y - - @property - @final - def n_Y(self): - # Swapping interface - return self.datasets_pair.n_X - - @final - cdef DTYPE_t proxy_dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - # Swapping arguments on the same interface - return self.datasets_pair.proxy_dist(j, i) - - @final - cdef DTYPE_t dist(self, - ITYPE_t i, - ITYPE_t j, - ) nogil except -1: - # Swapping arguments on the same interface - return self.datasets_pair.dist(j, i) - - cdef class PairwiseDistancesReduction: """Abstract class to computes a reduction on pairwise distances between a set of vectors (rows) X and another diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 2fd3128ff0375..360afa1ecba30 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -13,16 +13,19 @@ lil_matrix, ) +from sklearn.metrics._dist_metrics import ( + DenseDenseDatasetsPair, + DenseSparseDatasetsPair, + SparseDenseDatasetsPair, + SparseSparseDatasetsPair, +) + from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, ArgKmin, RadiusNeighborhood, FastSquaredEuclideanArgKmin, FastSquaredEuclideanRadiusNeighborhood, - DenseDenseDatasetsPair, - DenseSparseDatasetsPair, - SparseDenseDatasetsPair, - SparseSparseDatasetsPair, ) From eb988eac9447919495f1d2c1da853ddc1ca921ad Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 11:42:20 +0200 Subject: [PATCH 130/290] Add const qualifier on squared norms memoryviews --- sklearn/metrics/_pairwise_distances_reduction.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 1c5989fc48b1e..2d0aa8ed34717 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -697,7 +697,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): cdef: const DTYPE_t[:, ::1] X const DTYPE_t[:, ::1] Y - DTYPE_t[::1] Y_sq_norms + const DTYPE_t[::1] Y_sq_norms # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks @@ -1160,8 +1160,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): cdef: const DTYPE_t[:, ::1] X const DTYPE_t[:, ::1] Y - DTYPE_t[::1] X_sq_norms - DTYPE_t[::1] Y_sq_norms + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks From 9fc5e95e5d802dc2a82a4e92ce4fb2138f4b47c3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 11:56:51 +0200 Subject: [PATCH 131/290] Improve style Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx 
index 2d0aa8ed34717..fe133f698ea4b 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -445,9 +445,12 @@ cdef class ArgKmin(PairwiseDistancesReduction):
         # This factory comes to handle specialisations.
         if metric == "fast_sqeuclidean":
             return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size)
-        return ArgKmin(datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
-                       k=k,
-                       chunk_size=chunk_size)
+
+        return ArgKmin(
+            datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+            k=k,
+            chunk_size=chunk_size,
+        )
 
     def __init__(self,
                  DatasetsPair datasets_pair,
@@ -921,9 +924,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
                                                    radius=radius,
                                                    chunk_size=chunk_size)
         return RadiusNeighborhood(
-                datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
-                radius=radius,
-                chunk_size=chunk_size)
+            datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+            radius=radius,
+            chunk_size=chunk_size,
+        )
 
     def __init__(self,
                  DatasetsPair datasets_pair,

From 4f73848e32f2b2a7a58284a72e8c91b8bb27cc95 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 29 Jul 2021 11:56:51 +0200
Subject: [PATCH 132/290] Improve docstring notes for FastSquaredEuclidean
 alternatives

Co-authored-by: Olivier Grisel
---
 .../metrics/_pairwise_distances_reduction.pyx | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index fe133f698ea4b..02016932d5be1 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -682,19 +682,16 @@ cdef class ArgKmin(PairwiseDistancesReduction):
 
 cdef class FastSquaredEuclideanArgKmin(ArgKmin):
-    """Fast specialized alternative for ArgKmin on
-    EuclideanDistance.
-
-    Computes the argkmin of vectors (rows) of a set of
-    vectors (rows) of X on another set of vectors (rows) of Y
-    using the GEMM-trick.
+    """Fast specialized alternative for ArgKmin on EuclideanDistance.
 
     Notes
     -----
-    This implementation has an superior arithmetic intensity
-    and hence running time, but it can suffer from numerical
-    instability. ArgKmin with EuclideanDistance must be
-    used when exact precision is needed.
+    This implementation has a superior arithmetic intensity and hence
+    better running time when the alternative is IO bound, but it can suffer
+    from numerical instability.
+
+    ArgKmin with EuclideanDistance must be used when higher numerical precision
+    is needed.
     """
 
     cdef:
@@ -1155,10 +1152,12 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
 
     Notes
     -----
-    This implementation has an superior arithmetic intensity
-    and hence running time, but it can suffer from numerical
-    instability. RadiusNeighborhood with EuclideanDistance
-    must be used when exact precision is needed.
+    This implementation has a superior arithmetic intensity and hence
+    better running time when the alternative is IO bound, but it can suffer
+    from numerical instability.
+
+    RadiusNeighborhood with EuclideanDistance must be used when higher
+    numerical precision is needed.
""" cdef: From ea3d791e4bf9cd8b6b7569775a67e1ead70f128c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 13:25:17 +0200 Subject: [PATCH 133/290] Fix __cinit__ --- sklearn/metrics/_dist_metrics.pyx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index d18958e9b2540..87aa6b5423460 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1323,13 +1323,13 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults # in rare cases where __init__ is not called - self.X_data = np.empty((1, 1), dtype=DTYPE, order='C') - self.X_indices = np.empty((1, 1), dtype=ITYPE, order='C') - self.X_indptr = np.empty((1, 1), dtype=ITYPE, order='C') + self.X_data = np.empty((1), dtype=DTYPE, order='C') + self.X_indices = np.empty((1), dtype=ITYPE, order='C') + self.X_indptr = np.empty((1), dtype=ITYPE, order='C') - self.Y_data = np.empty((1, 1), dtype=DTYPE, order='C') - self.Y_indices = np.empty((1, 1), dtype=ITYPE, order='C') - self.Y_indptr = np.empty((1, 1), dtype=ITYPE, order='C') + self.Y_data = np.empty((1), dtype=DTYPE, order='C') + self.Y_indices = np.empty((1), dtype=ITYPE, order='C') + self.Y_indptr = np.empty((1), dtype=ITYPE, order='C') @property @final @@ -1396,12 +1396,12 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults # in rare cases where __init__ is not called - self.X_data = np.empty((1, 1), dtype=DTYPE, order='C') - self.X_indices = np.empty((1, 1), dtype=ITYPE, order='C') - self.X_indptr = np.empty((1, 1), dtype=ITYPE, order='C') + self.X_data = np.empty((1), dtype=DTYPE, order='C') + self.X_indices = np.empty((1), dtype=ITYPE, order='C') + self.X_indptr = np.empty((1), dtype=ITYPE, order='C') self.Y = np.empty((1, 1), dtype=DTYPE, order='C') - self.Y_indices = np.empty((1, 1), dtype=ITYPE, order='C') + self.Y_indices = np.empty((1), dtype=ITYPE, order='C') def __init__(self, X, Y, DistanceMetric distance_metric): From 445a22d91335ee09154420d3798dba744fa1db22 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 13:32:38 +0200 Subject: [PATCH 134/290] Skip for tests for 32 bits --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 360afa1ecba30..cad5c8a905d40 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -28,6 +28,8 @@ FastSquaredEuclideanRadiusNeighborhood, ) +from sklearn.utils._testing import skip_if_32bit + def assert_radius_neighborhood_equality(ref_dist, dist, ref_indices, indices): # We get arrays of arrays and we need to check for individual pairs @@ -209,6 +211,7 @@ def test_pairwise_distances_reduction_factory_method( @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) @pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +@skip_if_32bit def test_argkmin_chunk_size_agnosticism( n_samples, k, chunk_size, metric, n_features=100, dtype=np.float64 ): @@ -234,6 +237,7 @@ def test_argkmin_chunk_size_agnosticism( @pytest.mark.parametrize("radius", [1, 10, 100]) 
@pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) @pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +@skip_if_32bit def test_radius_neighborhood_chunk_size_agnosticism( n_samples, radius, chunk_size, metric, n_features=100, dtype=np.float64 ): @@ -261,6 +265,7 @@ def test_radius_neighborhood_chunk_size_agnosticism( @pytest.mark.parametrize("n_features", [5, 100, 500]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +@skip_if_32bit def test_argkmin_strategies_consistency( n_samples, n_features, @@ -294,6 +299,7 @@ def test_argkmin_strategies_consistency( @pytest.mark.parametrize("n_features", [5, 100, 500]) @pytest.mark.parametrize("radius", [1, 10, 100]) @pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) +@skip_if_32bit def test_radius_neighborhood_strategies_consistency( n_samples, n_features, @@ -334,6 +340,7 @@ def test_radius_neighborhood_strategies_consistency( @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("sample_imbalance", [10, 2, 1, 0.5]) @pytest.mark.parametrize("k, radius", [(1, 0), (10, 1), (100, 10), (1000, 100)]) +@skip_if_32bit def test_fast_sqeuclidean_correctness( n_samples, n_features, From 2647327218365c06ac957c2db1bab396beca093b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Jul 2021 19:16:27 +0200 Subject: [PATCH 135/290] Adapt tests for better parametrisation variability --- sklearn/metrics/_dist_metrics.pyx | 10 ++ .../metrics/_pairwise_distances_reduction.pyx | 10 +- sklearn/metrics/tests/test_dist_metrics.py | 11 +- .../test_pairwise_distances_reduction.py | 146 +++++++++++------- 4 files changed, 104 insertions(+), 73 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 87aa6b5423460..2ba11aa20c119 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -77,6 +77,16 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance, 'haversine': HaversineDistance, 'pyfunc': PyFuncDistance} +BOOL_METRICS = [ + "matching", + "jaccard", + "dice", + "kulsinski", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] def get_valid_metric_ids(L): """Given an iterable of metric class names or class identifiers, diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 02016932d5be1..a8bc96eb7a1cb 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -38,9 +38,9 @@ from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE -from scipy.sparse import issparse, spmatrix +from scipy.sparse import issparse from threadpoolctl import threadpool_limits -from ._dist_metrics import METRIC_MAPPING +from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING from ..utils import check_array, check_scalar from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE @@ -167,8 +167,10 @@ cdef class PairwiseDistancesReduction: @classmethod def valid_metrics(cls): - # TODO: support those distances - excluded = {"pyfunc", "sokalmichener", "matching", "jaccard"} + # TODO: support those boolean metrics. 
+        # In order for them to be supported, we need to have a
+        # simultaneous sort which breaks ties on distances
+        excluded = {"pyfunc", *BOOL_METRICS}
+        return sorted({*METRIC_MAPPING.keys()}.difference(excluded))
 
     @classmethod
diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
index 9440abba6f848..28ed45ac5cf33 100644
--- a/sklearn/metrics/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -8,6 +8,7 @@
 from scipy.spatial.distance import cdist
 
 from sklearn.metrics import DistanceMetric
+from sklearn.metrics._dist_metrics import BOOL_METRICS
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import create_memmap_backed_data
 from sklearn.utils.fixes import sp_version, parse_version
@@ -36,16 +37,6 @@ def dist_func(x1, x2, p):
 V = rng.random_sample((d, d))
 VI = np.dot(V, V.T)
 
-BOOL_METRICS = [
-    "matching",
-    "jaccard",
-    "dice",
-    "kulsinski",
-    "rogerstanimoto",
-    "russellrao",
-    "sokalmichener",
-    "sokalsneath",
-]
 
 METRICS_DEFAULT_PARAMS = {
     "euclidean": {},
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index cad5c8a905d40..d5b3dc3f4986c 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -28,10 +28,8 @@
     FastSquaredEuclideanRadiusNeighborhood,
 )
 
-from sklearn.utils._testing import skip_if_32bit
-
 
-def assert_radius_neighborhood_equality(ref_dist, dist, ref_indices, indices):
+def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices):
     # We get arrays of arrays and we need to check for individual pairs
     for i in range(ref_dist.shape[0]):
         assert_array_equal(
@@ -43,9 +41,24 @@
             ref_dist[i],
             dist[i],
             err_msg=f"Query vector #{i} has different neighbors' distances",
+            rtol=1e-7,
         )
 
 
+def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices):
+    assert_array_equal(
+        ref_indices,
+        indices,
+        err_msg="Query vectors have different neighbors' indices",
+    )
+    assert_allclose(
+        ref_dist,
+        dist,
+        err_msg="Query vectors have different neighbors' distances",
+        rtol=1e-7,
+    )
+
+
 def test_pairwise_distances_reduction_is_usable_for():
     rng = np.random.RandomState(1)
     X = rng.rand(100, 10)
@@ -96,10 +109,10 @@ def test_argkmin_factory_method_wrong_usages():
         ArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric")
 
     with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
-        ArgKmin.get_for(X=[1, 2], Y=Y, k=k, metric=metric)
+        ArgKmin.get_for(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric)
 
     with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
-        ArgKmin.get_for(X=X, Y=[1, 2], k=k, metric=metric)
+        ArgKmin.get_for(X=X, Y=np.array([1.0, 2.0]), k=k, metric=metric)
 
     with pytest.raises(
         ValueError, match="Vectors of X and Y must have the same number of dimensions"
@@ -135,10 +148,14 @@ def test_radius_neighborhood_factory_method_wrong_usages():
         RadiusNeighborhood.get_for(X=X, Y=Y, radius=radius, metric="wrong metric")
 
     with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
-        RadiusNeighborhood.get_for(X=[1, 2], Y=Y, radius=radius, metric=metric)
+        RadiusNeighborhood.get_for(
+            X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
+        )
 
     with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
-        RadiusNeighborhood.get_for(X=X, Y=[1, 2],
radius=radius, metric=metric) + RadiusNeighborhood.get_for( + X=X, Y=np.array([1.0, 2.0]), radius=radius, metric=metric + ) with pytest.raises( ValueError, match="Vectors of X and Y must have the same number of dimensions" @@ -210,13 +227,18 @@ def test_pairwise_distances_reduction_factory_method( @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) -@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) -@skip_if_32bit +@pytest.mark.parametrize("seed", range(10)) def test_argkmin_chunk_size_agnosticism( - n_samples, k, chunk_size, metric, n_features=100, dtype=np.float64 + n_samples, + k, + chunk_size, + seed, + metric="fast_sqeuclidean", + n_features=100, + dtype=np.float64, ): # ArgKmin results should not depend on the chunk size - rng = np.random.RandomState(1) + rng = np.random.RandomState(seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread @@ -229,20 +251,24 @@ def test_argkmin_chunk_size_agnosticism( X, Y, k=k, metric=metric, chunk_size=chunk_size ).compute(return_distance=True) - assert_array_equal(ref_dist, dist) - assert_array_equal(ref_indices, indices) + assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("radius", [1, 10, 100]) @pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) -@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) -@skip_if_32bit +@pytest.mark.parametrize("seed", range(10)) def test_radius_neighborhood_chunk_size_agnosticism( - n_samples, radius, chunk_size, metric, n_features=100, dtype=np.float64 + n_samples, + radius, + chunk_size, + seed, + metric="fast_sqeuclidean", + n_features=100, + dtype=np.float64, ): # RadiusNeighborhood results should not depend on the chunk size - rng = np.random.RandomState(1) + rng = np.random.RandomState(seed) spread = 100 # Scaling the radius with the dimensions @@ -258,29 +284,35 @@ def test_radius_neighborhood_chunk_size_agnosticism( X, Y, radius=scaled_radius, metric=metric, chunk_size=chunk_size ).compute(return_distance=True) - assert_radius_neighborhood_equality(ref_dist, dist, ref_indices, indices) + assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 100, 500]) @pytest.mark.parametrize("k", [1, 10, 100]) -@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) -@skip_if_32bit +@pytest.mark.parametrize("metric", ArgKmin.valid_metrics()) +@pytest.mark.parametrize("seed", range(10)) def test_argkmin_strategies_consistency( n_samples, n_features, k, metric, + seed, dtype=np.float64, ): # ArgKmin results obtained using both parallelization strategies # must be identical - rng = np.random.RandomState(1) + rng = np.random.RandomState(seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread + # Haversine distance only accepts 2D data + if metric == "haversine": + X = X[:, :2] + Y = Y[:, :2] + argkmin_reduction = ArgKmin.get_for(X, Y, k=k, metric=metric) dist_par_X, indices_par_X = 
argkmin_reduction.compute( @@ -291,30 +323,37 @@ def test_argkmin_strategies_consistency( strategy="parallel_on_Y", return_distance=True ) - assert_array_equal(dist_par_X, dist_par_Y) - assert_array_equal(indices_par_X, indices_par_Y) + assert_argkmin_results_equality( + dist_par_X, dist_par_Y, indices_par_X, indices_par_Y + ) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 100, 500]) @pytest.mark.parametrize("radius", [1, 10, 100]) -@pytest.mark.parametrize("metric", ["euclidean", "fast_sqeuclidean"]) -@skip_if_32bit +@pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) +@pytest.mark.parametrize("seed", range(10)) def test_radius_neighborhood_strategies_consistency( n_samples, n_features, radius, metric, + seed, dtype=np.float64, ): # RadiusNeighborhood results obtained using both parallelization strategies # must be identical - rng = np.random.RandomState(1) + rng = np.random.RandomState(seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread + # Haversine distance only accepts 2D data + if metric == "haversine": + X = X[:, :2] + Y = Y[:, :2] + radius_neigh_reduction = RadiusNeighborhood.get_for( X, Y, @@ -331,22 +370,21 @@ def test_radius_neighborhood_strategies_consistency( strategy="parallel_on_Y", return_distance=True ) - assert_radius_neighborhood_equality( + assert_radius_neighborhood_results_equality( dist_par_X, dist_par_Y, indices_par_X, indices_par_Y ) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("sample_imbalance", [10, 2, 1, 0.5]) -@pytest.mark.parametrize("k, radius", [(1, 0), (10, 1), (100, 10), (1000, 100)]) -@skip_if_32bit +@pytest.mark.parametrize("k, radius", [(50, 100)]) +@pytest.mark.parametrize("seed", range(10)) def test_fast_sqeuclidean_correctness( n_samples, n_features, - sample_imbalance, k, radius, + seed, dtype=np.float64, ): # The fast squared euclidean strategy must return results @@ -357,19 +395,11 @@ def test_fast_sqeuclidean_correctness( allow_module_level=True, ) - rng = np.random.RandomState(1) + rng = np.random.RandomState(seed) spread = 100 - X = ( - rng.rand(int(n_samples * n_features / sample_imbalance)) - .astype(dtype) - .reshape((-1, n_features)) - * spread - ) - Y = ( - rng.rand(int(n_samples * n_features)).astype(dtype).reshape((-1, n_features)) - * spread - ) + X = rng.rand(n_samples, n_features).astype(dtype).reshape((-1, n_features)) * spread + Y = rng.rand(n_samples, n_features).astype(dtype).reshape((-1, n_features)) * spread eucl_dist, eucl_indices = ArgKmin.get_for(X, Y, k, metric="euclidean").compute( return_distance=True @@ -388,23 +418,22 @@ def test_fast_sqeuclidean_correctness( X, Y, radius, metric="fast_sqeuclidean" ).compute(return_distance=True) - assert_radius_neighborhood_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) + assert_radius_neighborhood_results_equality( + eucl_dist, fse_dist, eucl_indices, fse_indices + ) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 10, 100, 500]) -@pytest.mark.parametrize("k", [1, 10, 100, 1000]) -@pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 
6, 7]]) -@pytest.mark.skip( - reason=( - "Long test, translation invariance should have its own study: skipping for now" - ) -) +@pytest.mark.parametrize("k", [1, 10, 100]) +@pytest.mark.parametrize("translation", [10 ** i for i in [4]]) +@pytest.mark.parametrize("seed", range(10)) def test_fast_sqeuclidean_translation_invariance( n_samples, n_features, k, translation, + seed, dtype=np.float64, ): # The fast squared euclidean strategy should be translation invariant. @@ -414,7 +443,7 @@ def test_fast_sqeuclidean_translation_invariance( allow_module_level=True, ) - rng = np.random.RandomState(1) + rng = np.random.RandomState(seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread @@ -424,8 +453,7 @@ def test_fast_sqeuclidean_translation_invariance( ).compute(return_distance=True) dist, indices = ArgKmin.get_for( - X + translation, X + translation, k, metric="fast_sqeuclidean" + X + translation, Y + translation, k, metric="fast_sqeuclidean" ).compute(return_distance=True) - assert_array_equal(reference_indices, indices) - assert_allclose(reference_dist, dist) + assert_argkmin_results_equality(reference_dist, dist, reference_indices, indices) From 4b2c165d8aab8ce6621907d0c0b57d8524a5ead9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 10:19:41 +0200 Subject: [PATCH 136/290] Adaptation for DistanceMetrics --- .../metrics/_pairwise_distances_reduction.pyx | 38 ++++++++----------- .../test_pairwise_distances_reduction.py | 30 ++++++++++++++- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index a8bc96eb7a1cb..f2486dbf66c03 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -8,7 +8,6 @@ # cython: binding=False # distutils: define_macros=CYTHON_TRACE_NOGIL=0 import numbers - import numpy as np cimport numpy as np @@ -37,7 +36,7 @@ from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE - +from typing import List from scipy.sparse import issparse from threadpoolctl import threadpool_limits from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING @@ -166,12 +165,18 @@ cdef class PairwiseDistancesReduction: ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder @classmethod - def valid_metrics(cls): - # TODO: support those boolean metrics. - # In order for them to be supported, we need to have a - # simultaneous sort which breaks ties on distances - excluded = {"pyfunc", *BOOL_METRICS} - return sorted({*METRIC_MAPPING.keys()}.difference(excluded)) + def valid_metrics(cls) -> List[str]: + excluded = { + "pyfunc", # is relatively slow because we need to coerce data as numpy arrays + "mahalanobis", # is numerically unstable + # TODO: support those last distances. + # In order for them to be supported, we need to have a + # simultaneous sort which breaks ties on distances. + # The best might be using std::sort and a Comparator. 
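+            # (ties are frequent for these metrics because they only take a
+            # few distinct values; e.g. hamming distances are multiples of
+            # 1 / n_features)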
+ "hamming", + *BOOL_METRICS, + } + return sorted({"fast_sqeuclidean", *METRIC_MAPPING.keys()}.difference(excluded)) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -289,7 +294,6 @@ cdef class PairwiseDistancesReduction: ITYPE_t num_threads = min(self.Y_n_chunks, self.effective_omp_n_thread) ITYPE_t thread_num - # TODO: put the "with nogil, parallel"-context here # Allocating datastructures self._on_Y_init(num_threads) @@ -323,12 +327,10 @@ cdef class PairwiseDistancesReduction: # end: with nogil, parallel # Synchronizing the thread datastructures with the main ones - # This can potentially block self._on_Y_after_parallel(num_threads, X_start, X_end) # end: for X_chunk_idx - # Deallocating temporary datastructures - # Adjusting main datastructures before returning + # Deallocating temporary datastructures and adjusting main datastructures before returning self._on_Y_finalize(num_threads) return @@ -431,10 +433,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): DTYPE_t ** heaps_proxy_distances_chunks ITYPE_t ** heaps_indices_chunks - @classmethod - def valid_metrics(cls): - return {"fast_sqeuclidean", *PairwiseDistancesReduction.valid_metrics()} - @classmethod def get_for(cls, X, @@ -443,7 +441,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): str metric="fast_sqeuclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), - ): + ) -> ArgKmin: # This factory comes to handle specialisations. if metric == "fast_sqeuclidean": return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) @@ -904,10 +902,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint sort_results - @classmethod - def valid_metrics(cls): - return {"fast_sqeuclidean", *PairwiseDistancesReduction.valid_metrics()} - @classmethod def get_for(cls, X, @@ -916,7 +910,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): str metric="fast_sqeuclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), - ): + ) -> RadiusNeighborhood: # This factory comes to handle specialisations. 
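        # A usage sketch (illustrative values only, mirroring the calls made
        # in test_pairwise_distances_reduction.py):
        #
        #   rn = RadiusNeighborhood.get_for(X, Y, radius=10.0, metric="euclidean")
        #   dist, indices = rn.compute(return_distance=True)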
if metric == "fast_sqeuclidean": return FastSquaredEuclideanRadiusNeighborhood(X=X, Y=Y, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index d5b3dc3f4986c..f5e504c4a88e8 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -29,6 +29,27 @@ ) +def get_dummy_metric_kwargs(metric: str, n_features: int): + """Return dummy DistanceMetric kwargs for tests.""" + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + kwargs = { + "minkowski": dict(p=1.5), + "seuclidean": dict(V=weights), + "wminkowski": dict(p=1.5, w=weights), + "mahalanobis": dict(VI=VI), + } + + return kwargs.get(metric, {}) + + def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): # We get arrays of arrays and we need to check for individual pairs for i in range(ref_dist.shape[0]): @@ -313,7 +334,13 @@ def test_argkmin_strategies_consistency( X = X[:, :2] Y = Y[:, :2] - argkmin_reduction = ArgKmin.get_for(X, Y, k=k, metric=metric) + argkmin_reduction = ArgKmin.get_for( + X, + Y, + k=k, + metric=metric, + metric_kwargs=get_dummy_metric_kwargs(metric, n_features), + ) dist_par_X, indices_par_X = argkmin_reduction.compute( strategy="parallel_on_X", return_distance=True @@ -360,6 +387,7 @@ def test_radius_neighborhood_strategies_consistency( # Scaling the radius with the dimensions radius=radius ** np.log(n_features), metric=metric, + metric_kwargs=get_dummy_metric_kwargs(metric, n_features), ) dist_par_X, indices_par_X = radius_neigh_reduction.compute( From 3c71fd6ae5d8d793e6a0427149ac074323edcac4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 10:40:40 +0200 Subject: [PATCH 137/290] Improve style --- sklearn/metrics/_dist_metrics.pyx | 8 +++-- .../metrics/_pairwise_distances_reduction.pyx | 32 +++++++++---------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 2ba11aa20c119..f339961ad7c3d 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1207,8 +1207,12 @@ cdef class DatasetsPair: """ @classmethod - def get_for(cls, X, Y, str metric="euclidean", - dict metric_kwargs=dict()) -> DatasetsPair: + def get_for(cls, + X, + Y, + str metric="euclidean", + dict metric_kwargs=dict(), + ) -> DatasetsPair: cdef: DistanceMetric distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index f2486dbf66c03..8f31d8813bdc4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -189,8 +189,8 @@ cdef class PairwiseDistancesReduction: metric in cls.valid_metrics()) def __init__(self, - DatasetsPair datasets_pair, - ITYPE_t chunk_size = CHUNK_SIZE, + DatasetsPair datasets_pair, + ITYPE_t chunk_size = CHUNK_SIZE, ): cdef: ITYPE_t X_n_full_chunks, Y_n_full_chunks @@ -435,13 +435,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): @classmethod def get_for(cls, - X, - Y, - ITYPE_t k, - str metric="fast_sqeuclidean", - ITYPE_t chunk_size=CHUNK_SIZE, - dict metric_kwargs=dict(), - ) -> ArgKmin: + X, + 
Y, + ITYPE_t k, + str metric="fast_sqeuclidean", + ITYPE_t chunk_size=CHUNK_SIZE, + dict metric_kwargs=dict(), + ) -> ArgKmin: # This factory comes to handle specialisations. if metric == "fast_sqeuclidean": return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) @@ -904,13 +904,13 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): @classmethod def get_for(cls, - X, - Y, - DTYPE_t radius, - str metric="fast_sqeuclidean", - ITYPE_t chunk_size=CHUNK_SIZE, - dict metric_kwargs=dict(), - ) -> RadiusNeighborhood: + X, + Y, + DTYPE_t radius, + str metric="fast_sqeuclidean", + ITYPE_t chunk_size=CHUNK_SIZE, + dict metric_kwargs=dict(), + ) -> RadiusNeighborhood: # This factory comes to handle specialisations. if metric == "fast_sqeuclidean": return FastSquaredEuclideanRadiusNeighborhood(X=X, Y=Y, From eaa892f911d07038e3aaefeabef25203df3f5934 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 10:47:28 +0200 Subject: [PATCH 138/290] Parametrise tests by seed last --- .../test_pairwise_distances_reduction.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index f5e504c4a88e8..fbe400a4e36e0 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -245,15 +245,15 @@ def test_pairwise_distances_reduction_factory_method( assert isinstance(fast_sqeuclidean_instance, FastSquaredPairwiseDistancesReduction) +@pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) -@pytest.mark.parametrize("seed", range(10)) def test_argkmin_chunk_size_agnosticism( + seed, n_samples, k, chunk_size, - seed, metric="fast_sqeuclidean", n_features=100, dtype=np.float64, @@ -275,15 +275,15 @@ def test_argkmin_chunk_size_agnosticism( assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices) +@pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("radius", [1, 10, 100]) @pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) -@pytest.mark.parametrize("seed", range(10)) def test_radius_neighborhood_chunk_size_agnosticism( + seed, n_samples, radius, chunk_size, - seed, metric="fast_sqeuclidean", n_features=100, dtype=np.float64, @@ -308,17 +308,17 @@ def test_radius_neighborhood_chunk_size_agnosticism( assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices) +@pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 100, 500]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("metric", ArgKmin.valid_metrics()) -@pytest.mark.parametrize("seed", range(10)) def test_argkmin_strategies_consistency( + seed, n_samples, n_features, k, metric, - seed, dtype=np.float64, ): # ArgKmin results obtained using both parallelization strategies @@ -355,17 +355,17 @@ def test_argkmin_strategies_consistency( ) +@pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 100, 500]) @pytest.mark.parametrize("radius", [1, 10, 100]) @pytest.mark.parametrize("metric", 
RadiusNeighborhood.valid_metrics()) -@pytest.mark.parametrize("seed", range(10)) def test_radius_neighborhood_strategies_consistency( + seed, n_samples, n_features, radius, metric, - seed, dtype=np.float64, ): # RadiusNeighborhood results obtained using both parallelization strategies @@ -403,16 +403,16 @@ def test_radius_neighborhood_strategies_consistency( ) +@pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("k, radius", [(50, 100)]) -@pytest.mark.parametrize("seed", range(10)) def test_fast_sqeuclidean_correctness( + seed, n_samples, n_features, k, radius, - seed, dtype=np.float64, ): # The fast squared euclidean strategy must return results @@ -451,17 +451,17 @@ def test_fast_sqeuclidean_correctness( ) +@pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 10, 100, 500]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("translation", [10 ** i for i in [4]]) -@pytest.mark.parametrize("seed", range(10)) def test_fast_sqeuclidean_translation_invariance( + seed, n_samples, n_features, k, translation, - seed, dtype=np.float64, ): # The fast squared euclidean strategy should be translation invariant. From 7d5f6036d32aa3325c7e0bc2bd8dc96481b2dd19 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 10:49:26 +0200 Subject: [PATCH 139/290] Lighten tests parametrization --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index fbe400a4e36e0..2a3d72fa06a4f 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -310,7 +310,7 @@ def test_radius_neighborhood_chunk_size_agnosticism( @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 100, 500]) +@pytest.mark.parametrize("n_features", [5, 100]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("metric", ArgKmin.valid_metrics()) def test_argkmin_strategies_consistency( @@ -357,7 +357,7 @@ def test_argkmin_strategies_consistency( @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 100, 500]) +@pytest.mark.parametrize("n_features", [5, 100]) @pytest.mark.parametrize("radius", [1, 10, 100]) @pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) def test_radius_neighborhood_strategies_consistency( @@ -453,7 +453,7 @@ def test_fast_sqeuclidean_correctness( @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 10, 100, 500]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("k", [1, 10, 100]) @pytest.mark.parametrize("translation", [10 ** i for i in [4]]) def test_fast_sqeuclidean_translation_invariance( From 91e3b27ecf20f24d8e42433da16d9372f92cb4e2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 11:04:25 +0200 Subject: [PATCH 140/290] Use squared norms of X vectors in FastSquaredEuclideanArgKmin This allow relying on proxy 
distances for getting exact distances instead of recomputing them from
scratch, which is more expensive.
---
 .../metrics/_pairwise_distances_reduction.pyx | 46 +++++++------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 8f31d8813bdc4..439a764cccbb7 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -608,6 +608,8 @@ cdef class ArgKmin(PairwiseDistancesReduction):
             )
         return
 
+    # TODO: annotating with 'final' here makes the compilation fail, but it should not
+    # @final
     cdef void compute_exact_distances(self) nogil:
         cdef:
             ITYPE_t i, j
@@ -697,6 +699,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
     cdef:
         const DTYPE_t[:, ::1] X
         const DTYPE_t[:, ::1] Y
+        const DTYPE_t[::1] X_sq_norms
         const DTYPE_t[::1] Y_sq_norms
 
         # Buffers for GEMM
@@ -715,6 +718,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
                         chunk_size=chunk_size)
         self.X = check_array(X, dtype=DTYPE, order='C')
         self.Y = check_array(Y, dtype=DTYPE, order='C')
+        self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X)
         self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y)
         # Temporary datastructures used in threads
         self.dist_middle_terms_chunks = malloc(
@@ -763,22 +767,6 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
         for thread_num in range(num_threads):
             free(self.dist_middle_terms_chunks[thread_num])
 
-    @final
-    cdef void compute_exact_distances(self) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
-            DTYPE_t[:, ::1] distances = self.argkmin_distances
-
-        for i in prange(self.n_X, schedule='static', nogil=True,
-                        num_threads=self.effective_omp_n_thread):
-            for j in range(self.k):
-                # This time we have no other choice but to recompute distances
-                # because we don't take ||X_c||² in the reduction
-                # TODO: introduce ||X_c||² for FastSquaredEuclideanArgKmin
-                # and factorise code shared with FastSquaredEuclideanRadiusNeighborhood?
-                distances[i, j] = self.datasets_pair.dist(i, Y_indices[i, j])
-
     @final
     cdef int _reduce_on_chunks(self,
         ITYPE_t X_start,
@@ -795,24 +783,19 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
         """
         cdef:
             ITYPE_t i, j
+            ITYPE_t k = self.k
+
             const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
             const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             DTYPE_t *heaps_proxy_distances = self.heaps_proxy_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
-        # Instead of computing the full pairwise squared distances matrix,
+        # We compute the full pairwise squared distances matrix as follows
         #
         #      ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
         #
-        # we only need to store the
-        #      - 2 X_c.Y_c^T + ||Y_c||²
-        #
-        # term since the argkmin for a given sample X_c^{i} does not depend on
-        # ||X_c^{i}||²
-        #
-        # This term gets computed efficiently bellow using GEMM from BLAS Level 3.
+        # The middle term gets computed efficiently below using GEMM from BLAS Level 3.
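+        #
+        # For intuition, the same expansion written in plain NumPy (an
+        # illustrative sketch only; the hot path below uses _gemm instead):
+        #
+        #   sq_dists = ((X_c ** 2).sum(axis=1)[:, None]
+        #               - 2 * X_c @ Y_c.T
+        #               + (Y_c ** 2).sum(axis=1)[None, :])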
        #
        # Careful: LDA, LDB and LDC are given for F-ordered arrays in BLAS documentations,
        # for instance:
@@ -847,8 +830,12 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
                 _push(heaps_proxy_distances + i * k,
                       heaps_indices + i * k,
                       k,
-                      # proxy distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                      dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start],
+                      # proxy distance: ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                      (
+                          self.X_sq_norms[i + X_start] +
+                          dist_middle_terms[i * Y_c.shape[0] + j] +
+                          self.Y_sq_norms[j + Y_start]
+                      ),
                       j + Y_start)
 
         return 0
@@ -1244,6 +1231,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
         """
         cdef:
             ITYPE_t i, j
+            DTYPE_t squared_dist_i_j
+
             const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
             const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
@@ -1277,9 +1266,6 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
             DTYPE_t * C = dist_middle_terms
             ITYPE_t ldc = Y_c.shape[0]
 
-        DTYPE_t squared_dist_i_j
-
-        # dist_middle_terms = -2 * X_c.dot(Y_c.T)
         _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc)

From c4add770bb458a4c7a1dd54f8eede05b6d4da878 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 4 Aug 2021 11:09:30 +0200
Subject: [PATCH 141/290] fixup! Adapt tests for better parametrisation
 variability

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 2a3d72fa06a4f..b6a94b20ec5ca 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -436,8 +436,7 @@ def test_fast_sqeuclidean_correctness(
         return_distance=True
     )
 
-    assert_array_equal(eucl_dist, fse_dist)
-    assert_array_equal(eucl_indices, fse_indices)
+    assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices)
 
     eucl_dist, eucl_indices = RadiusNeighborhood.get_for(
         X, Y, radius, metric="euclidean"

From caa7faad9c0753670c6b03f982049c192b1252fa Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 4 Aug 2021 11:18:05 +0200
Subject: [PATCH 142/290] Simplify tests

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index b6a94b20ec5ca..177c90db52625 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -424,10 +424,9 @@ def test_fast_sqeuclidean_correctness(
     )
 
     rng = np.random.RandomState(seed)
-
     spread = 100
-    X = rng.rand(n_samples, n_features).astype(dtype).reshape((-1, n_features)) * spread
-    Y = rng.rand(n_samples, n_features).astype(dtype).reshape((-1, n_features)) * spread
+    X = rng.rand(n_samples, n_features).astype(dtype) * spread
+    Y = rng.rand(n_samples, n_features).astype(dtype) * spread
 
     eucl_dist, eucl_indices = ArgKmin.get_for(X, Y, k, metric="euclidean").compute(
         return_distance=True

From c48c8861ef887a5cc86ba997a1cdec97698cde9a Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 4 Aug 2021 13:50:25 +0200
Subject: [PATCH 143/290] Improve test parametrisation on DistanceMetric

---
 .../test_pairwise_distances_reduction.py      |  21 +-
sklearn/neighbors/tests/test_neighbors.py | 223 ++++++++---------- sklearn/utils/_testing.py | 21 ++ 3 files changed, 115 insertions(+), 150 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 177c90db52625..ed89dc873c550 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -28,26 +28,7 @@ FastSquaredEuclideanRadiusNeighborhood, ) - -def get_dummy_metric_kwargs(metric: str, n_features: int): - """Return dummy DistanceMetric kwargs for tests.""" - rng = np.random.RandomState(1) - weights = rng.random_sample(n_features) - weights /= weights.sum() - - V = rng.random_sample((n_features, n_features)) - - # VI is positive-semidefinite, preferred for precision matrix - VI = np.dot(V, V.T) + 3 * np.eye(n_features) - - kwargs = { - "minkowski": dict(p=1.5), - "seuclidean": dict(V=weights), - "wminkowski": dict(p=1.5, w=weights), - "mahalanobis": dict(VI=VI), - } - - return kwargs.get(metric, {}) +from sklearn.utils._testing import get_dummy_metric_kwargs def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a75f2e5cc62e8..90d508d80420d 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -24,7 +24,6 @@ from sklearn.model_selection import train_test_split from sklearn.neighbors import ( VALID_METRICS_SPARSE, - VALID_METRICS, ) from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed from sklearn.pipeline import make_pipeline @@ -32,6 +31,7 @@ assert_allclose, assert_array_almost_equal, assert_array_equal, + get_dummy_metric_kwargs, ) from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state @@ -1302,71 +1302,50 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") -def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +def test_neighbors_metrics( + metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 +): # Test computing the neighbors for various metrics + if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): + pytest.skip("wminkowski will be removed in SciPy 1.8.0") rng = np.random.RandomState(0) X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) - V = np.cov(X.T) - - metrics = [ - ("euclidean", {}), - ("manhattan", {}), - ("minkowski", dict(p=1)), - ("minkowski", dict(p=2)), - ("minkowski", dict(p=3)), - ("minkowski", dict(p=np.inf)), - ("chebyshev", {}), - ("seuclidean", dict(V=rng.rand(n_features))), - ("wminkowski", dict(p=3, w=rng.rand(n_features))), - ("mahalanobis", dict(V=V)), - ("haversine", {}), - ] - algorithms = ["brute", "ball_tree", "kd_tree"] - for metric, metric_params in metrics: - if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): - # wminkowski will be removed in SciPy 1.8.0 - continue - - # Haversine distance only accepts 2D data - if metric == "haversine": - feature_sl = slice(None, 2) - X_train = np.ascontiguousarray(X[:, feature_sl]) - X_test = np.ascontiguousarray(test[:, feature_sl]) - else: - X_train = X - X_test = test - - results = {} - p = metric_params.pop("p", 2) - for algorithm in algorithms: - # KD tree doesn't support all metrics - if algorithm == 
"kd_tree" and metric not in neighbors.KDTree.valid_metrics: - est = neighbors.NearestNeighbors( - algorithm=algorithm, metric=metric, metric_params=metric_params - ) - with pytest.raises(ValueError): - est.fit(X) - continue - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, - algorithm=algorithm, - metric=metric, - p=p, - metric_params=metric_params, - ) - - neigh.fit(X_train) + algorithms = ["brute", "ball_tree", "kd_tree"] + metric_params = get_dummy_metric_kwargs(metric, n_features) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X[:, feature_sl]) + X_test = np.ascontiguousarray(test[:, feature_sl]) + else: + X_train = X + X_test = test + + results = {} + p = metric_params.pop("p", 2) + for algorithm in algorithms: + # KD tree doesn't support all metrics + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm=algorithm, + metric=metric, + p=p, + metric_params=metric_params, + ) - results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + neigh.fit(X_train) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) - assert_allclose(results["brute"][0], results["ball_tree"][0]) - assert_allclose(results["brute"][1], results["ball_tree"][1]) - if "kd_tree" in results: - assert_allclose(results["brute"][0], results["kd_tree"][0]) - assert_allclose(results["brute"][1], results["kd_tree"][1]) + assert_allclose(results["brute"][0], results["ball_tree"][0]) + assert_allclose(results["brute"][1], results["ball_tree"][1]) + if "kd_tree" in results: + assert_allclose(results["brute"][0], results["kd_tree"][0]) + assert_allclose(results["brute"][1], results["kd_tree"][1]) def test_callable_metric(): @@ -1390,59 +1369,44 @@ def custom_metric(x1, x2): assert_array_almost_equal(dist1, dist2) -def test_valid_brute_metric_for_auto_algorithm(): - X = rng.rand(12, 12) +@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features=12): + X = rng.rand(n_samples, n_features) Xcsr = csr_matrix(X) - # check that there is a metric that is valid for brute - # but not ball_tree (so we actually test something) - assert "cosine" in VALID_METRICS["brute"] - assert "cosine" not in VALID_METRICS["ball_tree"] + metric_params = get_dummy_metric_kwargs(metric, n_features) - # Metric which don't required any additional parameter - require_params = ["mahalanobis", "wminkowski", "seuclidean"] - for metric in VALID_METRICS["brute"]: - if metric != "precomputed" and metric not in require_params: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric - ) - if metric != "haversine": - nn.fit(X) - nn.kneighbors(X) - else: - nn.fit(X[:, :2]) - nn.kneighbors(X[:, :2]) - elif metric == "precomputed": - X_precomputed = rng.random_sample((10, 4)) - Y_precomputed = rng.random_sample((3, 4)) - DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") - DYX = metrics.pairwise_distances( - Y_precomputed, X_precomputed, metric="euclidean" - ) - nb_p = neighbors.NearestNeighbors(n_neighbors=3) - nb_p.fit(DXX) - nb_p.kneighbors(DYX) + if metric == "precomputed": + X_precomputed = rng.random_sample((10, 4)) + Y_precomputed = rng.random_sample((3, 4)) + DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") + DYX = metrics.pairwise_distances( + Y_precomputed, X_precomputed, metric="euclidean" + ) + nb_p = neighbors.NearestNeighbors(n_neighbors=3, 
metric="precomputed") + nb_p.fit(DXX) + nb_p.kneighbors(DYX) + + else: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric, metric_params=metric_params + ) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) + else: + X = X - for metric in VALID_METRICS_SPARSE["brute"]: - if metric != "precomputed" and metric not in require_params: + nn.fit(X) + nn.kneighbors(X) + + if metric in VALID_METRICS_SPARSE["brute"]: nn = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=metric ).fit(Xcsr) nn.kneighbors(Xcsr) - # Metric with parameter - VI = np.dot(X, X.T) - list_metrics = [ - ("seuclidean", dict(V=rng.rand(12))), - ("wminkowski", dict(w=rng.rand(12))), - ("mahalanobis", dict(VI=VI)), - ] - for metric, params in list_metrics: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric, metric_params=params - ).fit(X) - nn.kneighbors(X) - def test_metric_params_interface(): X = rng.rand(5, 5) @@ -1534,37 +1498,36 @@ def test_k_and_radius_neighbors_train_is_not_query(): assert_array_equal(rng.A, [[0, 1], [1, 1]]) -def test_k_and_radius_neighbors_X_None(): +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_X_None(algorithm): # Test kneighbors et.al when query is None - for algorithm in ALGORITHMS: - - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - - X = [[0], [1]] - nn.fit(X) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - dist, ind = nn.kneighbors() - assert_array_equal(dist, [[1], [1]]) - assert_array_equal(ind, [[1], [0]]) - dist, ind = nn.radius_neighbors(None, radius=1.5) - check_object_arrays(dist, [[1], [1]]) - check_object_arrays(ind, [[1], [0]]) + X = [[0], [1]] + nn.fit(X) - # Test the graph variants. - rng = nn.radius_neighbors_graph(None, radius=1.5) - kng = nn.kneighbors_graph(None) - for graph in [rng, kng]: - assert_array_equal(graph.A, [[0, 1], [1, 0]]) - assert_array_equal(graph.data, [1, 1]) - assert_array_equal(graph.indices, [1, 0]) - - X = [[0, 1], [0, 1], [1, 1]] - nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) - nn.fit(X) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), - ) + dist, ind = nn.kneighbors() + assert_array_equal(dist, [[1], [1]]) + assert_array_equal(ind, [[1], [0]]) + dist, ind = nn.radius_neighbors(None, radius=1.5) + check_object_arrays(dist, [[1], [1]]) + check_object_arrays(ind, [[1], [0]]) + + # Test the graph variants. 
+ rng = nn.radius_neighbors_graph(None, radius=1.5) + kng = nn.kneighbors_graph(None) + for graph in [rng, kng]: + assert_array_equal(graph.A, [[0, 1], [1, 0]]) + assert_array_equal(graph.data, [1, 1]) + assert_array_equal(graph.indices, [1, 0]) + + X = [[0, 1], [0, 1], [1, 1]] + nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) + nn.fit(X) + assert_array_equal( + nn.kneighbors_graph().A, + np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), + ) @pytest.mark.parametrize("algorithm", ALGORITHMS) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 7d2308a6d28e3..0685f503e3a1e 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -1039,3 +1039,24 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) + + +def get_dummy_metric_kwargs(metric: str, n_features: int): + """Return dummy DistanceMetric kwargs for tests.""" + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + kwargs = { + "minkowski": dict(p=1.5), + "seuclidean": dict(V=weights), + "wminkowski": dict(p=1.5, w=weights), + "mahalanobis": dict(VI=VI), + } + + return kwargs.get(metric, {}) From 4f06c3a51ae92229c1c1a758d9910372c585e234 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 13:51:31 +0200 Subject: [PATCH 144/290] Improve heap routines' interfaces --- .../metrics/_pairwise_distances_reduction.pyx | 45 +++++---- sklearn/neighbors/_binary_tree.pxi | 4 +- sklearn/utils/_heap.pxd | 11 ++- sklearn/utils/_heap.pyx | 98 ++++++++++--------- 4 files changed, 83 insertions(+), 75 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 439a764cccbb7..396ce5e3dc812 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -31,7 +31,7 @@ from ..utils._cython_blas cimport ( Trans, _gemm, ) -from ..utils._heap cimport _simultaneous_sort, _push +from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE @@ -169,9 +169,8 @@ cdef class PairwiseDistancesReduction: excluded = { "pyfunc", # is relatively slow because we need to coerce data as numpy arrays "mahalanobis", # is numerically unstable - # TODO: support those last distances. - # In order for them to be supported, we need to have a - # simultaneous sort which breaks ties on distances. + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. # The best might be using std::sort and a Comparator. "hamming", *BOOL_METRICS, @@ -498,11 +497,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): # which keep tracks of the argkmin. 
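+        # (heap_push only inserts a candidate when its distance is strictly
+        # smaller than the heap root, which holds the current k-th best
+        # distance, so each heap keeps the k smallest distances seen so far.)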
        for i in range(n_X):
             for j in range(n_Y):
-                _push(heaps_proxy_distances + i * self.k,
-                      heaps_indices + i * self.k,
-                      k,
-                      self.datasets_pair.proxy_dist(X_start + i, Y_start + j),
-                      Y_start + j)
+                heap_push(
+                    heaps_proxy_distances + i * self.k,
+                    heaps_indices + i * self.k,
+                    k,
+                    self.datasets_pair.proxy_dist(X_start + i, Y_start + j),
+                    Y_start + j,
+                )
 
         return 0
 
@@ -529,7 +530,7 @@ cdef class ArgKmin(PairwiseDistancesReduction):
 
             # Sorting indices of the argkmin for each query vector of X
             for idx in range(X_end - X_start):
-                _simultaneous_sort(
+                simultaneous_sort(
                     self.heaps_proxy_distances_chunks[thread_num] + idx * self.k,
                     self.heaps_indices_chunks[thread_num] + idx * self.k,
                     self.k
@@ -578,7 +579,7 @@ cdef class ArgKmin(PairwiseDistancesReduction):
             for idx in prange(X_end - X_start, schedule="static"):
                 for thread_num in range(num_threads):
                     for jdx in range(self.k):
-                        _push(
+                        heap_push(
                             &self.argkmin_distances[X_start + idx, 0],
                             &self.argkmin_indices[X_start + idx, 0],
                             self.k,
@@ -601,7 +602,7 @@ cdef class ArgKmin(PairwiseDistancesReduction):
         # Sort the main heaps into arrays in parallel
         # in ascending order w.r.t the distances
         for idx in prange(self.n_X, schedule='static'):
-            _simultaneous_sort(
+            simultaneous_sort(
                 &self.argkmin_distances[idx, 0],
                 &self.argkmin_indices[idx, 0],
                 self.k,
@@ -827,16 +828,18 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
         # which keep tracks of the argkmin.
         for i in range(X_c.shape[0]):
             for j in range(Y_c.shape[0]):
-                _push(heaps_proxy_distances + i * k,
-                      heaps_indices + i * k,
-                      k,
-                      # proxy distance: |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                      (
+                heap_push(
+                    heaps_proxy_distances + i * k,
+                    heaps_indices + i * k,
+                    k,
+                    # proxy distance: ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                    (
                         self.X_sq_norms[i + X_start] +
                         dist_middle_terms[i * Y_c.shape[0] + j] +
                         self.Y_sq_norms[j + Y_start]
-                      ),
-                      j + Y_start)
+                    ),
+                    j + Y_start,
+                )
 
         return 0
 
@@ -980,7 +983,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
             # Sorting neighbors for each query vector of X
             if self.sort_results:
                 for idx in range(X_start, X_end):
-                    _simultaneous_sort(
+                    simultaneous_sort(
                         deref(self.neigh_distances)[idx].data(),
                         deref(self.neigh_indices)[idx].data(),
                         deref(self.neigh_indices)[idx].size()
@@ -1056,7 +1059,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
         # Sort in parallel in ascending order w.r.t the distances if needed
         if self.sort_results:
             for idx in prange(self.n_X, schedule='static'):
-                _simultaneous_sort(
+                simultaneous_sort(
                     deref(self.neigh_distances)[idx].data(),
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index 03b88baf5ddf5..3a55219cf97c7 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -158,7 +158,7 @@ from ._partition_nodes cimport partition_node_indices
 from ..utils import check_array
 from ..utils._typedefs cimport DTYPE_t, ITYPE_t
 from ..utils._typedefs import DTYPE, ITYPE
-from ..utils._heap cimport _simultaneous_sort, _push
+from ..utils._heap cimport simultaneous_sort as _simultaneous_sort, heap_push
 
 cdef extern from "numpy/arrayobject.h":
     void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)
@@ -546,7 +546,7 @@ cdef class NeighborsHeap:
             ITYPE_t size = self.distances.shape[1]
             DTYPE_t* dist_arr = &self.distances[row, 0]
             ITYPE_t* ind_arr = &self.indices[row, 0]
-        return _push(dist_arr, ind_arr, size, val, i_val)
+        return heap_push(dist_arr, ind_arr, size, val, i_val)
 
     cdef int _sort(self) except -1:
        """simultaneously sort the distances and indices"""
diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd
index 05e4760994e33..f085b28a38417 100644
--- a/sklearn/utils/_heap.pxd
+++ b/sklearn/utils/_heap.pxd
@@ -1,19 +1,20 @@
 # cython: language_level=3
+# Heap routines, used in various Cython implementations.
 
 from cython cimport floating
 
 from ._typedefs cimport ITYPE_t
 
-cdef int _simultaneous_sort(
+cdef int simultaneous_sort(
     floating* dist,
     ITYPE_t* idx,
     ITYPE_t size
 ) nogil except -1
 
-cdef int _push(
-    floating* dist,
-    ITYPE_t* idx,
+cdef inline int heap_push(
+    floating* values,
+    ITYPE_t* indices,
     ITYPE_t size,
     floating val,
-    ITYPE_t i_val,
+    ITYPE_t val_idx,
 ) nogil except -1
diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx
index b9ae8049848a8..fcc885c0f93a0 100644
--- a/sklearn/utils/_heap.pyx
+++ b/sklearn/utils/_heap.pyx
@@ -10,7 +10,7 @@ from ._typedefs cimport ITYPE_t
 
 cdef inline void dual_swap(floating* darr, ITYPE_t* iarr,
                            ITYPE_t i1, ITYPE_t i2) nogil:
-    """swap the values at inex i1 and i2 of both darr and iarr"""
+    """Swap the values at index i1 and i2 of both darr and iarr"""
     cdef floating dtmp = darr[i1]
     darr[i1] = darr[i2]
     darr[i2] = dtmp
@@ -19,19 +19,18 @@ cdef inline void dual_swap(floating* darr, ITYPE_t* iarr,
     iarr[i1] = iarr[i2]
     iarr[i2] = itmp
 
-cdef int _simultaneous_sort(
-    floating* dist,
-    ITYPE_t* idx,
+cdef int simultaneous_sort(
+    floating* values,
+    ITYPE_t* indices,
     ITYPE_t size
 ) nogil except -1:
     """
-    Perform a recursive quicksort on the dist array, simultaneously
-    performing the same swaps on the idx array.
-
-    TODO: test if the following algorithms are better:
-      - introselect via std::nth_element
-      - heap-sort-like
+    Perform a recursive quicksort on the values array, simultaneously
+    performing the same swaps on the indices array.
     """
+    # TODO: In order to support discrete distance metrics, we need to have a
+    # simultaneous sort which breaks ties on indices when distances are identical.
+    # The best might be using std::sort and a Comparator.
     cdef:
         ITYPE_t pivot_idx, i, store_idx
         floating pivot_val
@@ -40,68 +39,73 @@ cdef int _simultaneous_sort(
     # in the small-array case, do things efficiently
     if size <= 1:
         pass
     elif size == 2:
-        if dist[0] > dist[1]:
-            dual_swap(dist, idx, 0, 1)
+        if values[0] > values[1]:
+            dual_swap(values, indices, 0, 1)
     elif size == 3:
-        if dist[0] > dist[1]:
-            dual_swap(dist, idx, 0, 1)
-        if dist[1] > dist[2]:
-            dual_swap(dist, idx, 1, 2)
-        if dist[0] > dist[1]:
-            dual_swap(dist, idx, 0, 1)
+        if values[0] > values[1]:
+            dual_swap(values, indices, 0, 1)
+        if values[1] > values[2]:
+            dual_swap(values, indices, 1, 2)
+        if values[0] > values[1]:
+            dual_swap(values, indices, 0, 1)
     else:
         # Determine the pivot using the median-of-three rule.
         # The smallest of the three is moved to the beginning of the array,
         # the middle (the pivot value) is moved to the end, and the largest
         # is moved to the pivot index.
         pivot_idx = size // 2
-        if dist[0] > dist[size - 1]:
-            dual_swap(dist, idx, 0, size - 1)
-        if dist[size - 1] > dist[pivot_idx]:
-            dual_swap(dist, idx, size - 1, pivot_idx)
-        if dist[0] > dist[size - 1]:
-            dual_swap(dist, idx, 0, size - 1)
-        pivot_val = dist[size - 1]
+        if values[0] > values[size - 1]:
+            dual_swap(values, indices, 0, size - 1)
+        if values[size - 1] > values[pivot_idx]:
+            dual_swap(values, indices, size - 1, pivot_idx)
+        if values[0] > values[size - 1]:
+            dual_swap(values, indices, 0, size - 1)
+        pivot_val = values[size - 1]
 
         # partition indices about pivot.
At the end of this operation, # pivot_idx will contain the pivot value, everything to the left # will be smaller, and everything to the right will be larger. store_idx = 0 for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) + if values[i] < pivot_val: + dual_swap(values, indices, i, store_idx) store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) + dual_swap(values, indices, store_idx, size - 1) pivot_idx = store_idx # recursively sort each side of the pivot if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) + simultaneous_sort(values, indices, pivot_idx) if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, + simultaneous_sort(values + pivot_idx + 1, + indices + pivot_idx + 1, size - pivot_idx - 1) return 0 -cdef inline int _push( - floating* dist, - ITYPE_t* idx, +cdef inline int heap_push( + floating* values, + ITYPE_t* indices, ITYPE_t size, floating val, - ITYPE_t i_val, + ITYPE_t val_idx, ) nogil except -1: - """push (val, i_val) into the heap (dist, idx) of the given size""" + """Push a tuple (val, val_idx) into a fixed-size max-heap. + + The max-heap is represented as a struct of arrays where: + - values is the array containing the data to construct the heap on + - indices is the array containing the indices (meta-data) of each value. + """ cdef: ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx # check if val should be in heap - if val >= dist[0]: + if val >= values[0]: return 0 # insert val at position zero - dist[0] = val - idx[0] = i_val + values[0] = val + indices[0] = val_idx # descend the heap, swapping values until the max heap criterion is met current_idx = 0 @@ -112,27 +116,27 @@ cdef inline int _push( if left_child_idx >= size: break elif right_child_idx >= size: - if dist[left_child_idx] > val: + if values[left_child_idx] > val: swap_idx = left_child_idx else: break - elif dist[left_child_idx] >= dist[right_child_idx]: - if val < dist[left_child_idx]: + elif values[left_child_idx] >= values[right_child_idx]: + if val < values[left_child_idx]: swap_idx = left_child_idx else: break else: - if val < dist[right_child_idx]: + if val < values[right_child_idx]: swap_idx = right_child_idx else: break - dist[current_idx] = dist[swap_idx] - idx[current_idx] = idx[swap_idx] + values[current_idx] = values[swap_idx] + indices[current_idx] = indices[swap_idx] current_idx = swap_idx - dist[current_idx] = val - idx[current_idx] = i_val + values[current_idx] = val + indices[current_idx] = val_idx return 0 From 962f5350e3827150b42f867094aa22aff6b576bb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 14:33:34 +0200 Subject: [PATCH 145/290] fixup! 
Improve heap routines' interfaces --- sklearn/utils/_heap.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index f085b28a38417..a33c47f416b53 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -11,7 +11,7 @@ cdef int simultaneous_sort( ITYPE_t size ) nogil except -1 -cdef inline int heap_push( +cdef int heap_push( floating* values, ITYPE_t* indices, ITYPE_t size, From 1b712829cc7a73028d1e8577e6f4b095e790f1d0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 16:39:32 +0200 Subject: [PATCH 146/290] Fix docstring --- sklearn/neighbors/_regression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 14247d8c837b3..7bf309b827398 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -76,8 +76,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. For a list of available metrics, see the documentation of - - + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. From dc8ddf4314f97e7c345a3e442c496747df67be7e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 16:45:59 +0200 Subject: [PATCH 147/290] Add missing dtype for indices --- sklearn/metrics/_dist_metrics.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index f339961ad7c3d..a3507b46d88f8 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1425,7 +1425,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) self.Y = check_array(Y, dtype=DTYPE) - self.Y_indices = np.arange(self.Y.shape[1]) + self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) @property @final From bd1b0d96523fdbc266bcd5701a6e569dc437f395 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 4 Aug 2021 18:01:52 +0200 Subject: [PATCH 148/290] Better deprecate neighbors.DistanceMetric This avoids making pytest inspection raise the FutureWarning. 
Co-authored-by: Olivier Grisel --- sklearn/metrics/tests/test_dist_metrics.py | 10 +++++++++ sklearn/neighbors/DistanceMetric.py | 16 -------------- sklearn/neighbors/__init__.py | 2 ++ sklearn/neighbors/_distance_metric.py | 25 ++++++++++++++++++++++ 4 files changed, 37 insertions(+), 16 deletions(-) delete mode 100644 sklearn/neighbors/DistanceMetric.py create mode 100644 sklearn/neighbors/_distance_metric.py diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 28ed45ac5cf33..9f0750fd75669 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -52,6 +52,16 @@ def dist_func(x1, x2, p): } +# TODO: remove this test in 1.2 +def test_neighbors_distance_metric_deprecation(): + from sklearn.neighbors import DistanceMetric as DeprecatedDistanceMetric + + with pytest.warns( + FutureWarning, match="sklearn.neighbors.DistanceMetric has been moved" + ): + DeprecatedDistanceMetric.get_metric("euclidean") + + @pytest.mark.parametrize("metric", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_cdist(metric, X1, X2): diff --git a/sklearn/neighbors/DistanceMetric.py b/sklearn/neighbors/DistanceMetric.py deleted file mode 100644 index c0c175800e925..0000000000000 --- a/sklearn/neighbors/DistanceMetric.py +++ /dev/null @@ -1,16 +0,0 @@ -# TODO: Remove in 1.2 -import warnings - -from ..metrics import DistanceMetric -from .. import neighbors - -warnings.warn( - "sklearn.neighbors.DistanceMetric has been moved " - "to sklearn.metrics.DistanceMetric in 1.0. " - "This import path will be removed in 1.2", - category=FutureWarning, -) - -# Monkey-patching neighbors to alias sklearn.metrics.DistanceMetric -setattr(neighbors, "DistanceMetric", DistanceMetric) -neighbors.__all__ += ["DistanceMetric"] diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 3cd1d7925acf6..ff5ad4875d77d 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -4,6 +4,7 @@ """ from ._ball_tree import BallTree +from ._distance_metric import DistanceMetric from ._kd_tree import KDTree from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer @@ -18,6 +19,7 @@ __all__ = [ "BallTree", + "DistanceMetric", "KDTree", "KNeighborsClassifier", "KNeighborsRegressor", diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py new file mode 100644 index 0000000000000..d7992ca833cd8 --- /dev/null +++ b/sklearn/neighbors/_distance_metric.py @@ -0,0 +1,25 @@ +# TODO: Remove this file in 1.2 +import warnings + +from ..metrics import DistanceMetric as _DistanceMetric + + +class DistanceMetric(_DistanceMetric): + @classmethod + def _warn(cls): + warnings.warn( + "sklearn.neighbors.DistanceMetric has been moved " + "to sklearn.metrics.DistanceMetric in 1.0. 
" + "This import path will be removed in 1.2", + category=FutureWarning, + ) + + def __init__(self): + DistanceMetric._warn() + if self.__class__ is DistanceMetric: + raise NotImplementedError("DistanceMetric is an abstract class") + + @classmethod + def get_metric(cls, metric, **kwargs): + DistanceMetric._warn() + return _DistanceMetric.get_metric(metric, **kwargs) From c0dbc97c0a645d53ea97721202be968279abfcd4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 5 Aug 2021 11:58:01 +0200 Subject: [PATCH 149/290] Add link to CPython docs regarding reference stealing --- sklearn/metrics/_pairwise_distances_reduction.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 396ce5e3dc812..19db4b9373513 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -120,6 +120,7 @@ cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr): # Makes the numpy array responsible to the life-cycle of its buffer. # A reference to the sentinel will be stolen by the call bellow, # so we increase its reference count. + # See: https://docs.python.org/3/c-api/intro.html#reference-count-details Py_INCREF(sentinel) PyArray_SetBaseObject(arr, sentinel) return arr From 2b0d3a67196d3b3888d8e86737a9ab3fa776784a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 5 Aug 2021 14:50:52 +0200 Subject: [PATCH 150/290] Force the coretype to be armv8 on linux-arm64 Currently, there are some problems with the neoversen1 kernel, which makes computations using BLAS via scipy unstable for this architecture. See this comment: https://github.com/numpy/numpy/issues/19411#issuecomment-875884604 --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index b730ae0ff595a..f629985a16083 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -137,6 +137,7 @@ jobs: environment: - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 + - OPENBLAS_CORETYPE: 'armv8' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' From fb3866c45976a214eac5ce8040d03a894282dded Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 10:12:43 +0200 Subject: [PATCH 151/290] Revert "Force the coretype to be armv8 on linux-arm64" This reverts commit 2b0d3a67196d3b3888d8e86737a9ab3fa776784a. 
--- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f629985a16083..b730ae0ff595a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -137,7 +137,6 @@ jobs: environment: - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 - - OPENBLAS_CORETYPE: 'armv8' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' From e55fd945d4afa02f5bddf0bc26f2542319fb618f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 09:22:00 +0200 Subject: [PATCH 152/290] Use conda-forge to test arm64 --- build_tools/circle/build_test_arm.sh | 29 ++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 3d555f66227c4..fa8b08805cdf4 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -21,10 +21,27 @@ source build_tools/shared.sh sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update -sudo apt-get install python3-virtualenv ccache -python3 -m virtualenv --system-site-packages --python=python3 testenv -source testenv/bin/activate -pip install --upgrade pip + +# Setup conda environment + +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-arm64.sh" + +# Install Miniconda +wget $MINICONDA_URL -O miniconda.sh +MINICONDA_PATH=$HOME/miniconda +chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH +export PATH=$MINICONDA_PATH/bin:$PATH +conda update --yes conda + +# Create environment and install dependencies +conda create -n testenv --yes python=3.7 +source activate testenv + +# Use the latest by default +conda install --verbose -c conda-forge -y ccache numpy scipy cython pip +pip install joblib threadpoolctl +pip install $(get_dep pytest $PYTEST_VERSION) pytest-xdist + setup_ccache python -m pip install $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) @@ -47,13 +64,13 @@ if [[ "$TEST_DOCSTRINGS" == "true" ]]; then fi python --version +conda list # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 -python -m pip list -pip install --verbose --editable . +pip install --verbose --editable . 
--no-build-isolation ccache -s python -c "import sklearn; sklearn.show_versions()" python -m threadpoolctl --import sklearn From 50d2669ce3e94884c047b38163249bed67e05cf4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 10:17:22 +0200 Subject: [PATCH 153/290] Use Mambaforge instead --- build_tools/circle/build_test_arm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index fa8b08805cdf4..08d46b22b49db 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -24,7 +24,7 @@ sudo apt-get update # Setup conda environment -MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-arm64.sh" +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" # Install Miniconda wget $MINICONDA_URL -O miniconda.sh @@ -38,7 +38,7 @@ conda create -n testenv --yes python=3.7 source activate testenv # Use the latest by default -conda install --verbose -c conda-forge -y ccache numpy scipy cython pip +mamba install --verbose -y ccache numpy scipy cython pip pip install joblib threadpoolctl pip install $(get_dep pytest $PYTEST_VERSION) pytest-xdist From 84c4315d05c19417cf527e4fa6aa8b1c9eafb32a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 11:48:42 +0200 Subject: [PATCH 154/290] Install all dependencies in a row via mamba --- build_tools/circle/build_test_arm.sh | 44 +++++++++++++--------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 08d46b22b49db..2ddf8e88faf16 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -24,43 +24,41 @@ sudo apt-get update # Setup conda environment -MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh" -# Install Miniconda -wget $MINICONDA_URL -O miniconda.sh +# Install Mambaforge +wget $MINICONDA_URL -O mambaforge.sh MINICONDA_PATH=$HOME/miniconda -chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH +chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH -conda update --yes conda +mamba update --yes conda # Create environment and install dependencies -conda create -n testenv --yes python=3.7 -source activate testenv +mamba create -n testenv --yes python=3.7 +conda activate testenv # Use the latest by default -mamba install --verbose -y ccache numpy scipy cython pip -pip install joblib threadpoolctl -pip install $(get_dep pytest $PYTEST_VERSION) pytest-xdist - +mamba install --verbose -y ccache \ + pip \ + numpy \ + scipy \ + cython \ + pip \ + $(get_dep cython $CYTHON_VERSION) \ + $(get_dep joblib $JOBLIB_VERSION) \ + $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ + $(get_dep pytest $PYTEST_VERSION) \ + $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) setup_ccache -python -m pip install $(get_dep cython $CYTHON_VERSION) \ - $(get_dep joblib $JOBLIB_VERSION) -python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ - $(get_dep pytest $PYTEST_VERSION) \ - $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) if [[ "$COVERAGE" == "true" ]]; then - python -m pip install codecov pytest-cov -fi - -if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - 
python -m pip install pytest-xdist + mamba install --verbose -y codecov pytest-cov fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx - python -m pip install sphinx - python -m pip install numpydoc + mamba install --verbose -y sphinx + mamba install --verbose -y numpydoc fi python --version From 2411ffc0b2367f44fd3f806445b9f55d9dc8c86c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 14:16:14 +0200 Subject: [PATCH 155/290] Mark tests as xfail when in unstable OpenBLAS configuration --- .../metrics/_pairwise_distances_reduction.pyx | 2 +- .../test_pairwise_distances_reduction.py | 19 ++++++++++++- sklearn/utils/__init__.py | 27 +++++++++++++++++++ sklearn/utils/_testing.py | 11 +++++++- 4 files changed, 56 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 19db4b9373513..94f47c897d239 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -40,7 +40,7 @@ from typing import List from scipy.sparse import issparse from threadpoolctl import threadpool_limits from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ..utils import check_array, check_scalar +from ..utils import check_array, check_scalar, in_unstable_openblas_configuration from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index ed89dc873c550..489d06ebaa68a 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -28,7 +28,11 @@ FastSquaredEuclideanRadiusNeighborhood, ) -from sklearn.utils._testing import get_dummy_metric_kwargs +from sklearn.utils import in_unstable_openblas_configuration +from sklearn.utils._testing import ( + fails_if_unstable_openblas, + get_dummy_metric_kwargs, +) def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): @@ -165,6 +169,7 @@ def test_radius_neighborhood_factory_method_wrong_usages(): RadiusNeighborhood.get_for(X=X[:, ::2], Y=Y, radius=radius, metric=metric) +@fails_if_unstable_openblas @pytest.mark.parametrize( "PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction", [ @@ -226,6 +231,7 @@ def test_pairwise_distances_reduction_factory_method( assert isinstance(fast_sqeuclidean_instance, FastSquaredPairwiseDistancesReduction) +@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) @pytest.mark.parametrize("k", [1, 10, 100]) @@ -256,6 +262,7 @@ def test_argkmin_chunk_size_agnosticism( assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices) +@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("radius", [1, 10, 100]) @@ -304,6 +311,10 @@ def test_argkmin_strategies_consistency( ): # ArgKmin results obtained using both parallelization strategies # must be identical + if in_unstable_openblas_configuration and metric == "fast_sqeuclidean": + pytest.xfail( + "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration" + ) rng = np.random.RandomState(seed) spread = 100 @@ -351,6 +362,10 @@ def test_radius_neighborhood_strategies_consistency( ): # RadiusNeighborhood 
results obtained using both parallelization strategies # must be identical + if in_unstable_openblas_configuration and metric == "fast_sqeuclidean": + pytest.xfail( + "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration" + ) rng = np.random.RandomState(seed) spread = 100 @@ -384,6 +399,7 @@ def test_radius_neighborhood_strategies_consistency( ) +@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @@ -430,6 +446,7 @@ def test_fast_sqeuclidean_correctness( ) +@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("n_features", [5, 10, 100]) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a389bba30ca4a..863107d049b07 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,6 +3,7 @@ """ import pkgutil import inspect +from distutils.version import LooseVersion from importlib import import_module from operator import itemgetter from collections.abc import Sequence @@ -18,6 +19,7 @@ import warnings import numpy as np from scipy.sparse import issparse +from threadpoolctl import threadpool_info from .murmurhash import murmurhash3_32 from .class_weight import compute_class_weight, compute_sample_weight @@ -79,6 +81,31 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 +def in_unstable_openblas_configuration(): + """Return True if in an unstable configuration for OpenBLAS. + + See discussions in https://github.com/numpy/numpy/issues/19411 + """ + + # Make sure numpy and scipy are imported + import numpy # noqa + import scipy # noqa + + modules_info = threadpool_info() + + open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) + if not open_blas_used: + return False + + open_blas_version = next( + info["version"] for info in modules_info if info["internal_api"] == "openblas" + ) + + return platform.machine() == "neoversen1" and open_blas_version < LooseVersion( + "0.3.16" + ) + + class Bunch(dict): """Container object exposing keys as attributes. 
diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 0685f503e3a1e..eb635d8f45113 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,7 +48,12 @@ import joblib import sklearn -from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated +from sklearn.utils import ( + IS_PYPY, + _IS_32BIT, + deprecated, + in_unstable_openblas_configuration, +) from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -446,6 +451,10 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") + fails_if_unstable_openblas = pytest.mark.xfail( + in_unstable_openblas_configuration(), + reason="OpenBLAS is unstable for this configuration", + ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) From b7bbd060e11d51113ee6076beba584428c238d22 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 14:44:09 +0200 Subject: [PATCH 156/290] Lighten tests' parametrizations --- .../test_pairwise_distances_reduction.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 489d06ebaa68a..810d3650cdabc 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -232,15 +232,14 @@ def test_pairwise_distances_reduction_factory_method( @fails_if_unstable_openblas -@pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3, 4]]) -@pytest.mark.parametrize("k", [1, 10, 100]) -@pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) def test_argkmin_chunk_size_agnosticism( seed, n_samples, - k, chunk_size, + k=10, metric="fast_sqeuclidean", n_features=100, dtype=np.float64, @@ -263,15 +262,14 @@ def test_argkmin_chunk_size_agnosticism( @fails_if_unstable_openblas -@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("radius", [1, 10, 100]) -@pytest.mark.parametrize("chunk_size", [512, 1024, 1337, 19301]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) def test_radius_neighborhood_chunk_size_agnosticism( seed, n_samples, - radius, chunk_size, + radius=10.0, metric="fast_sqeuclidean", n_features=100, dtype=np.float64, @@ -296,22 +294,20 @@ def test_radius_neighborhood_chunk_size_agnosticism( assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices) -@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 100]) -@pytest.mark.parametrize("k", [1, 10, 100]) -@pytest.mark.parametrize("metric", ArgKmin.valid_metrics()) +@pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) def test_argkmin_strategies_consistency( - seed, - n_samples, - n_features, - k, metric, + n_samples, + seed, + n_features=10, + k=10, dtype=np.float64, ): # ArgKmin results obtained using both 
parallelization strategies # must be identical - if in_unstable_openblas_configuration and metric == "fast_sqeuclidean": + if in_unstable_openblas_configuration() and metric == "fast_sqeuclidean": pytest.xfail( "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration" ) @@ -332,6 +328,8 @@ def test_argkmin_strategies_consistency( k=k, metric=metric, metric_kwargs=get_dummy_metric_kwargs(metric, n_features), + # To be sure to use parallelization + chunk_size=n_samples // 4, ) dist_par_X, indices_par_X = argkmin_reduction.compute( @@ -347,22 +345,20 @@ def test_argkmin_strategies_consistency( ) -@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 100]) -@pytest.mark.parametrize("radius", [1, 10, 100]) @pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) def test_radius_neighborhood_strategies_consistency( seed, n_samples, - n_features, - radius, metric, + n_features=10, + radius=10.0, dtype=np.float64, ): # RadiusNeighborhood results obtained using both parallelization strategies # must be identical - if in_unstable_openblas_configuration and metric == "fast_sqeuclidean": + if in_unstable_openblas_configuration() and metric == "fast_sqeuclidean": pytest.xfail( "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration" ) @@ -384,6 +380,8 @@ def test_radius_neighborhood_strategies_consistency( radius=radius ** np.log(n_features), metric=metric, metric_kwargs=get_dummy_metric_kwargs(metric, n_features), + # To be sure to use parallelization + chunk_size=n_samples // 4, ) dist_par_X, indices_par_X = radius_neigh_reduction.compute( From 042e228d0b51026318bc00813c8e71d4d3287ac0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 15:04:24 +0200 Subject: [PATCH 157/290] Improve checks for unstable OpenBLAS configuration Co-authored-by: Olivier Grisel --- sklearn/utils/__init__.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 863107d049b07..fcba392bde647 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -82,12 +82,9 @@ def in_unstable_openblas_configuration(): - """Return True if in an unstable configuration for OpenBLAS. + """Return True if in an unstable configuration for OpenBLAS""" - See discussions in https://github.com/numpy/numpy/issues/19411 - """ - - # Make sure numpy and scipy are imported + # Import libraries which might load OpenBLAS. import numpy # noqa import scipy # noqa @@ -97,13 +94,24 @@ def in_unstable_openblas_configuration(): if not open_blas_used: return False - open_blas_version = next( - info["version"] for info in modules_info if info["internal_api"] == "openblas" - ) - - return platform.machine() == "neoversen1" and open_blas_version < LooseVersion( - "0.3.16" - ) + # OpenBLAS 0.3.16 fixed unstability for arm64, see: + # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa + openblas_arm64_stable_version = LooseVersion("0.3.16") + for info in modules_info: + if info["internal_api"] != "openblas": + continue + openblas_version = info.get("version") + openblas_architecture = info.get("architecture") + if openblas_version is None or openblas_architecture is None: + # Cannot be sure that OpenBLAS is good enough. 
Assume unstable: + return True + if ( + openblas_architecture == "neoversen1" + and openblas_version < openblas_arm64_stable_version + ): + # See discussions in https://github.com/numpy/numpy/issues/19411 + return True + return False class Bunch(dict): From 394d9dc98b78ecdecfd23f04047a75398319e476 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 16:34:12 +0200 Subject: [PATCH 158/290] fixup! Use Mambaforge instead --- build_tools/circle/build_test_arm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 2ddf8e88faf16..6f34247d34af6 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -23,8 +23,7 @@ sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update # Setup conda environment - -MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh" +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" # Install Mambaforge wget $MINICONDA_URL -O mambaforge.sh From b2d80dcd6d244fc0c201417153154e638e5c48ac Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 16:40:30 +0200 Subject: [PATCH 159/290] Check against now made privated in_unstable_openblas_configuration --- sklearn/metrics/_pairwise_distances_reduction.pyx | 10 +++++++++- .../metrics/tests/test_pairwise_distances_reduction.py | 6 +++--- sklearn/utils/__init__.py | 2 +- sklearn/utils/_testing.py | 4 ++-- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 94f47c897d239..83a44b27dd3b2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -40,7 +40,7 @@ from typing import List from scipy.sparse import issparse from threadpoolctl import threadpool_limits from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ..utils import check_array, check_scalar, in_unstable_openblas_configuration +from ..utils import check_array, check_scalar, _in_unstable_openblas_configuration from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE @@ -707,6 +707,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return ArgKmin.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration() + def __init__(self, X, Y, @@ -1156,6 +1160,10 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return RadiusNeighborhood.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration() + def __init__(self, X, Y, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 810d3650cdabc..45189e01d3316 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -28,7 +28,7 @@ FastSquaredEuclideanRadiusNeighborhood, ) -from sklearn.utils import in_unstable_openblas_configuration +from sklearn.utils import _in_unstable_openblas_configuration from sklearn.utils._testing import ( 
fails_if_unstable_openblas, get_dummy_metric_kwargs, @@ -307,7 +307,7 @@ def test_argkmin_strategies_consistency( ): # ArgKmin results obtained using both parallelization strategies # must be identical - if in_unstable_openblas_configuration() and metric == "fast_sqeuclidean": + if _in_unstable_openblas_configuration() and metric == "fast_sqeuclidean": pytest.xfail( "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration" ) @@ -358,7 +358,7 @@ def test_radius_neighborhood_strategies_consistency( ): # RadiusNeighborhood results obtained using both parallelization strategies # must be identical - if in_unstable_openblas_configuration() and metric == "fast_sqeuclidean": + if _in_unstable_openblas_configuration() and metric == "fast_sqeuclidean": pytest.xfail( "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration" ) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index fcba392bde647..86b4b1a4f1209 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -81,7 +81,7 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 -def in_unstable_openblas_configuration(): +def _in_unstable_openblas_configuration(): """Return True if in an unstable configuration for OpenBLAS""" # Import libraries which might load OpenBLAS. diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index eb635d8f45113..81eed5e035d82 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -52,7 +52,7 @@ IS_PYPY, _IS_32BIT, deprecated, - in_unstable_openblas_configuration, + _in_unstable_openblas_configuration, ) from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( @@ -452,7 +452,7 @@ def set_random_state(estimator, random_state=0): ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") fails_if_unstable_openblas = pytest.mark.xfail( - in_unstable_openblas_configuration(), + _in_unstable_openblas_configuration(), reason="OpenBLAS is unstable for this configuration", ) skip_if_no_parallel = pytest.mark.skipif( From ae097cf361b9e21ae833e5ab86463fd1c233ca63 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 6 Aug 2021 17:23:08 +0200 Subject: [PATCH 160/290] fixup! 
Use conda-forge to test arm64 --- build_tools/circle/build_test_arm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 6f34247d34af6..5169c4871ba04 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -34,7 +34,7 @@ mamba update --yes conda # Create environment and install dependencies mamba create -n testenv --yes python=3.7 -conda activate testenv +source activate testenv # Use the latest by default mamba install --verbose -y ccache \ From 1821aadb2cfafc44ef9daabec0c95840527ec9b1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 11 Aug 2021 11:32:37 +0200 Subject: [PATCH 161/290] Remove SparseEfficiencyWarning --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 45189e01d3316..61ac8295873c3 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -170,6 +170,7 @@ def test_radius_neighborhood_factory_method_wrong_usages(): @fails_if_unstable_openblas +@pytest.mark.filterwarnings("ignore:Constructing a DIA matrix") @pytest.mark.parametrize( "PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction", [ From ca942a5a9ce513f0b235d21003ef99de0700e19d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 11 Aug 2021 11:35:09 +0200 Subject: [PATCH 162/290] Apply suggestions from reviews comments and discussions Co-authored-by: Guillaume Lemaitre --- sklearn/metrics/_dist_metrics.pxd | 2 +- sklearn/metrics/_dist_metrics.pyx | 72 ++++- .../metrics/_pairwise_distances_reduction.pyx | 245 +++++++++++------- sklearn/utils/_heap.pyx | 3 +- 4 files changed, 215 insertions(+), 107 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 17423f67be8b9..f5f93f88ac7e9 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -4,7 +4,7 @@ #cython: cdivision=True cimport numpy as np -from libc.math cimport sqrt, exp +from libc.math cimport sqrt, exp from ..utils._typedefs cimport DTYPE_t, ITYPE_t diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index a3507b46d88f8..5a0019e1b065c 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -31,7 +31,6 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -# some handy constants from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf @@ -1204,8 +1203,16 @@ cdef class DatasetsPair: to :class:`sklearn.metrics.DistanceMetric` based on the physical representation of the vectors (sparse vs. dense). It makes use of cython.final to remove the overhead of method calls' dispatch. + + Parameters + ---------- + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). """ + # The `distance_metric` attribute is defined in _dist_metrics.pxd + @classmethod def get_for(cls, X, @@ -1213,11 +1220,34 @@ cdef class DatasetsPair: str metric="euclidean", dict metric_kwargs=dict(), ) -> DatasetsPair: + """Return the DatasetsPair implementation for the given arguments. + + X : array-like of shape (n_X, d) + Input data. 
+ + Y : array-like of shape (n_Y, d) + Input data. + + metric : str, default='fast_sqeuclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair + The suited DatasetsPair implementation. + """ cdef: DistanceMetric distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs) - # TODO: what's the best coercion for lists? + # check_array can be expensive, and we prefer to simply coerce from lists + # to ndarrays eventually to get their dtype itemsize X = np.asarray(X) if isinstance(X, (tuple, list)) else X Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y @@ -1272,10 +1302,15 @@ cdef class DatasetsPair: cdef class DenseDenseDatasetsPair(DatasetsPair): """Compute distances between vectors of two arrays. + Parameters + ---------- X: ndarray of shape (n_X, d) - Rows represent vectors + Rows represent vectors. Y: ndarray of shape (n_Y, d) - Rows represent vectors + Rows represent vectors. + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). """ cdef: const DTYPE_t[:, ::1] X # shape: (n_X, d) @@ -1320,10 +1355,15 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): cdef class SparseSparseDatasetsPair(DatasetsPair): """Compute distances between vectors of two sparse matrices. + Parameters + ---------- X: sparse matrix of shape (n_X, d) - Rows represent vectors + Rows represent vectors. Y: sparse matrix of shape (n_X, d) - Rows represent vectors + Rows represent vectors. + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). """ cdef: const DTYPE_t[:] X_data @@ -1394,10 +1434,15 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): cdef class SparseDenseDatasetsPair(DatasetsPair): """Compute distances between vectors of a sparse matrix and a dense array. + Parameters + ---------- X: sparse matrix of shape (n_X, d) - Rows represent vectors + Rows represent vectors. Y: ndarray of shape (n_Y, d) - Rows represent vectors + Rows represent vectors. + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). """ cdef: const DTYPE_t[:] X_data @@ -1468,17 +1513,22 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): cdef class DenseSparseDatasetsPair(DatasetsPair): """Compute distances between vectors of a dense array and a sparse matrix. + Parameters + ---------- X: ndarray of shape (n_X, d) - Rows represent vectors + Rows represent vectors. Y: sparse matrix of shape (n_Y, d) - Rows represent vectors + Rows represent vectors. + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). """ cdef: # As distance metrics are symmetric functions, we can # simply rely on the other DatasetsPair and swap arguments. 
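The argument-swapping delegation used here keeps the dense-sparse case free of duplicated distance code. A minimal pure-Python sketch of the same pattern follows; the classes below are illustrative stand-ins, not the Cython extension types from this patch:

# Hypothetical sketch of the argument-swapping delegation used by
# DenseSparseDatasetsPair: rather than re-implementing the distance for the
# (dense, sparse) case, it reuses the (sparse, dense) implementation with
# X and Y swapped at construction time and (i, j) swapped at call time.
# Distances are symmetric functions, so this is exact.

class SparseDensePair:
    def __init__(self, X_sparse, Y_dense):
        self.X, self.Y = X_sparse, Y_dense

    def dist(self, i, j):
        # Placeholder for the real sparse-dense distance computation;
        # assumes every index of X[i] lies within range(len(Y[j])).
        xi = self.X[i]  # e.g. a dict {feature_index: value}
        yj = self.Y[j]  # e.g. a list of floats
        return sum((xi.get(k, 0.0) - v) ** 2 for k, v in enumerate(yj)) ** 0.5

class DenseSparsePair:
    def __init__(self, X_dense, Y_sparse):
        # Swap the datasets once at construction time ...
        self._pair = SparseDensePair(Y_sparse, X_dense)

    def dist(self, i, j):
        # ... and swap the indices on every call.
        return self._pair.dist(j, i)

X_dense = [[1.0, 0.0], [0.0, 2.0]]
Y_sparse = [{0: 1.0}, {1: 1.0}]
pair = DenseSparsePair(X_dense, Y_sparse)
print(pair.dist(0, 0))  # 0.0: X_dense[0] and Y_sparse[0] are the same vector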
DatasetsPair datasets_pair - def __init__(self, X, Y, distance_metric): + def __init__(self, X, Y, DistanceMetric distance_metric): DatasetsPair.__init__(self, distance_metric) # Swapping arguments on the constructor self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 83a44b27dd3b2..39eded68f31e8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -7,6 +7,17 @@ # cython: initializedcheck=False # cython: binding=False # distutils: define_macros=CYTHON_TRACE_NOGIL=0 + +# Pairwise Distances Reductions +# ============================= +# +# Author: Julien Jerphanion +# +# +# The routines defined here are used in various algorithms realising +# the same structure of operations on distances between vectors +# of a datasets pair (X, Y). + import numbers import numpy as np cimport numpy as np @@ -44,21 +55,21 @@ from ..utils import check_array, check_scalar, _in_unstable_openblas_configurati from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE - +# Those constants have been chosen for modern laptops' caches and architecture. DEF CHUNK_SIZE = 256 # number of vectors DEF MIN_CHUNK_SAMPLES = 20 + DEF FLOAT_INF = 1e36 -# TODO: This has been introduced in Cython 3.0, change for -# `libcpp.algorithm.move` once Cython 3 is used +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used # Introduction in Cython: -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa cdef extern from "" namespace "std" nogil: - OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + + OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa ###################### ## std::vector to np.ndarray coercion -# As type covariance is not supported for C++ container via Cython, +# As type covariance is not supported for C++ containers via Cython, # we need to redefine fused types. ctypedef fused vector_DITYPE_t: vector[ITYPE_t] @@ -75,10 +86,16 @@ cdef extern from "numpy/arrayobject.h": cdef class StdVectorSentinel: - """Wraps a reference to a vector which will be deallocated with this object.""" - pass + """Wraps a reference to a vector which will be deallocated with this object. + When created, the StdVectorSentinel swaps the reference of its internal + vectors with the provided one (vec_ptr), thus making the StdVectorSentinel + manage the provided one's lifetime. + """ + pass +# We necessarily need to define two extension types extending StdVectorSentinel +# because we need to provide the dtype of the vector but can't use numeric fused types. cdef class StdVectorSentinelDTYPE(StdVectorSentinel): cdef vector[DTYPE_t] vec @@ -99,11 +116,12 @@ cdef class StdVectorSentinelITYPE(StdVectorSentinel): return sentinel -cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr): +cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): """Create a numpy ndarray given a C++ vector. - This registers a Sentinel as the base object for the numpy array - freeing the C++ vector it encapsulates when it must. + The numpy array buffer is the one of the C++ vector. 
+ A StdVectorSentinel is registered as the base object for the numpy array,
+ freeing the C++ vector it encapsulates when the numpy array is freed.
 """
 typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE
 cdef:
@@ -118,26 +136,27 @@ cdef np.ndarray vector_to_numpy_array(vector_DITYPE_t * vect_ptr):
 sentinel = StdVectorSentinelITYPE.create_for(vect_ptr)

 # Makes the numpy array responsible to the life-cycle of its buffer.
- # A reference to the sentinel will be stolen by the call bellow,
- # so we increase its reference count.
+ # A reference to the StdVectorSentinel will be stolen by the call below,
+ # so we increase its reference counter.
 # See: https://docs.python.org/3/c-api/intro.html#reference-count-details
 Py_INCREF(sentinel)
 PyArray_SetBaseObject(arr, sentinel)
 return arr


-cdef np.ndarray[object, ndim=1] _coerce_vectors_to_np_nd_arrays(
+cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
 vector_vector_DITYPE_t* vecs
 ):
+ """Coerce a std::vector of std::vector to a ndarray of ndarray."""
 cdef:
 ITYPE_t n = deref(vecs).size()
- np.ndarray[object, ndim=1] np_arrays_of_np_arrays = np.empty(n,
+ np.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n,
 dtype=np.ndarray)

 for i in range(n):
- np_arrays_of_np_arrays[i] = vector_to_numpy_array(&(deref(vecs)[i]))
+ nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))

- return np_arrays_of_np_arrays
+ return nd_arrays_of_nd_arrays


#####################
@@ -148,12 +167,13 @@ cdef class PairwiseDistancesReduction:

 The implementation of the reduction is done parallelized on chunks
 whose size can be set using ``chunk_size``.
+
 Parameters
 ----------
 datasets_pair: DatasetsPair
- The pair of dataset to use
+ The pair of datasets to use.
 chunk_size: int
- The number of vectors per chunk
+ The number of vectors per chunk.
 """

 cdef:
@@ -168,11 +188,12 @@ cdef class PairwiseDistancesReduction:
 @classmethod
 def valid_metrics(cls) -> List[str]:
 excluded = {
- "pyfunc", # is relatively slow because we need to coerce data as numpy arrays
+ "pyfunc", # is relatively slow because we need to coerce data as np arrays
 "mahalanobis", # is numerically unstable
 # TODO: In order to support discrete distance metrics, we need to have a
 # simultaneous sort which breaks ties on indices when distances are identical.
- # The best might be using std::sort and a Comparator.
+ # The best might be using a std::sort and a Comparator which might need
+ # AoS instead of SoA (currently used).
 "hamming",
 *BOOL_METRICS,
 }
@@ -330,21 +351,21 @@ cdef class PairwiseDistancesReduction:
 self._on_Y_after_parallel(num_threads, X_start, X_end)

 # end: for X_chunk_idx
- # Deallocating temporary datastructures and adjusting main datastructures before returning
+ # Deallocating temporary datastructures and adjusting main datastructures
 self._on_Y_finalize(num_threads)
 return

 # Placeholder methods which have to be implemented

- cdef int _reduce_on_chunks(self,
+ cdef void _reduce_on_chunks(self,
 ITYPE_t X_start,
 ITYPE_t X_end,
 ITYPE_t Y_start,
 ITYPE_t Y_end,
 ITYPE_t thread_num,
- ) nogil except -1:
+ ) nogil:
 """Implemented the reduction on a pair of chunks."""
- return -1
+ return

 # Placeholder methods which can be implemented

@@ -416,15 +437,15 @@ cdef class ArgKmin(PairwiseDistancesReduction):

 Parameters
 ----------
 datasets_pair: DatasetsPair
- The dataset pairs (X, Y) for the reduction
+ The dataset pairs (X, Y) for the reduction.
 k: int
- The k for the argkmin reduction
+ The k for the argkmin reduction.
chunk_size: int - The number of vectors per chunk + The number of vectors per chunk. """ cdef: - ITYPE_t k + readonly ITYPE_t k ITYPE_t[:, ::1] argkmin_indices DTYPE_t[:, ::1] argkmin_distances @@ -442,8 +463,36 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ) -> ArgKmin: + """Return the ArgKmin implementation for the given arguments. + + X : array-like of shape (n_X, d) + Input data. + + Y : array-like of shape (n_Y, d) + Input data. + + k : int + The k for the argkmin reduction. + + metric : str, default='fast_sqeuclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size: int, default=256, + The number of vectors per chunk. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + argkmin: ArgKmin + The suited ArgKmin implementation. + """ # This factory comes to handle specialisations. - if metric == "fast_sqeuclidean": + if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) return ArgKmin( @@ -479,13 +528,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): if self.heaps_proxy_distances_chunks is not NULL: free(self.heaps_proxy_distances_chunks) - cdef int _reduce_on_chunks(self, + cdef void _reduce_on_chunks(self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, ITYPE_t Y_end, ITYPE_t thread_num, - ) nogil except -1: + ) nogil: cdef: ITYPE_t i, j ITYPE_t n_X = X_end - X_start @@ -506,15 +555,12 @@ cdef class ArgKmin(PairwiseDistancesReduction): Y_start + j, ) - return 0 - @final cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, ) nogil: - # As this strategy is embarrassingly parallel, we can set the # thread heaps pointers to the proper position on the main heaps self.heaps_proxy_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] @@ -541,7 +587,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t num_threads, ) nogil: cdef: - # number of scalar elements + # Maximum number of scalar elements (the last chunks can be smaller) ITYPE_t heaps_size = self.X_n_samples_chunk * self.k ITYPE_t thread_num @@ -575,8 +621,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_omp_n_thread): # Synchronising the thread heaps with the main heaps # This is done in parallel samples-wise (no need for locks) - # - # NOTE: can this lead to false sharing? for idx in prange(X_end - X_start, schedule="static"): for thread_num in range(num_threads): for jdx in range(self.k): @@ -678,7 +722,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): raise RuntimeError(f"strategy '{strategy}' not supported.") if return_distance: - # We eventually need to recompute distances because we relied on proxy distances. + # We eventually need to recompute distances because we relied on proxies. 
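The heap bookkeeping above is the core of the ArgKmin reduction: each candidate (distance, index) pair is pushed into a fixed-size max-heap that keeps only the k best candidates seen so far, and exact distances are recomputed from the proxies at the very end. A standalone Python sketch of the same accumulation pattern, using the standard-library heapq instead of the hand-rolled Cython heap:

# Illustrative sketch, not the Cython heap from this PR: keep the k
# smallest distances from a stream of (distance, index) candidates.
import heapq

def argkmin_stream(distances_with_indices, k):
    # heapq is a min-heap, so store negated distances to obtain a max-heap
    # over distances: heap[0] is then the current worst kept candidate.
    heap = []
    for dist, idx in distances_with_indices:
        if len(heap) < k:
            heapq.heappush(heap, (-dist, idx))
        elif -heap[0][0] > dist:  # strictly better than the current worst
            heapq.heapreplace(heap, (-dist, idx))
    # Sort the k kept candidates by increasing distance, as the final
    # simultaneous sort does in the Cython implementation.
    return sorted((-d, i) for d, i in heap)

pairs = [(3.0, 0), (1.0, 1), (2.5, 2), (0.5, 3), (4.0, 4)]
print(argkmin_stream(pairs, k=2))  # [(0.5, 3), (1.0, 1)]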
self.compute_exact_distances() return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) @@ -709,7 +753,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return ArgKmin.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration() + return (ArgKmin.is_usable_for(X, Y, metric) and + not _in_unstable_openblas_configuration()) def __init__(self, X, @@ -726,6 +771,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): self.Y = check_array(Y, dtype=DTYPE, order='C') self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( sizeof(DTYPE_t *) * self.effective_omp_n_thread) @@ -774,19 +820,13 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): free(self.dist_middle_terms_chunks[thread_num]) @final - cdef int _reduce_on_chunks(self, + cdef void _reduce_on_chunks(self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, ITYPE_t Y_end, ITYPE_t thread_num, - ) nogil except -1: - """ - Critical part of the computation of pairwise distances. - - "Fast Squared Euclidean" distances strategy relying - on the gemm-trick. - """ + ) nogil: cdef: ITYPE_t i, j ITYPE_t k = self.k @@ -801,11 +841,11 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): # # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², # - # The middle term gets computed efficiently bellow using GEMM from BLAS Level 3. + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. # - # Careful: LDA, LDB and LDC are given for F-ordered arrays in BLAS documentations, - # for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa # # Here, we use their counterpart values to work with C-ordered arrays. BLAS_Order order = RowMajor @@ -815,9 +855,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ITYPE_t n = Y_c.shape[0] ITYPE_t K = X_c.shape[1] DTYPE_t alpha = - 2. - # TODO: necessarily casting because APIs exposed - # via scipy.linalg.cython_blas aren't reflecting - # the const-identifier for arguments + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. DTYPE_t * A = & X_c[0, 0] ITYPE_t lda = X_c.shape[1] DTYPE_t * B = & Y_c[0, 0] @@ -837,7 +876,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): heaps_proxy_distances + i * k, heaps_indices + i * k, k, - # proxy distance: |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # proxy distance: ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² ( self.X_sq_norms[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + @@ -845,7 +884,6 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): ), j + Y_start, ) - return 0 cdef class RadiusNeighborhood(PairwiseDistancesReduction): @@ -859,15 +897,15 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): Parameters ---------- datasets_pair: DatasetsPair - The dataset pairs (X, Y) for the reduction - radius: int - The radius defining the neighborhood + The dataset pairs (X, Y) for the reduction. + radius: float + The radius defining the neighborhood. 
chunk_size: int
- The number of vectors per chunk
+ The number of vectors per chunk.
 """

 cdef:
- DTYPE_t radius
+ readonly DTYPE_t radius

 # DistanceMetric compute rank preserving distance via rdist
 # ("reduced distance" in the original wording),
@@ -906,8 +944,36 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 ITYPE_t chunk_size=CHUNK_SIZE,
 dict metric_kwargs=dict(),
 ) -> RadiusNeighborhood:
+ """Return the RadiusNeighborhood implementation for the given arguments.
+
+ X : array-like of shape (n_X, d)
+ Input data.
+
+ Y : array-like of shape (n_Y, d)
+ Input data.
+
+ radius : float
+ The radius defining the neighborhood.
+
+ metric : str, default='fast_sqeuclidean'
+ The distance metric to use for the reduction. The default metric is
+ a fast implementation of the standard Euclidean metric.
+ For a list of available metrics, see the documentation of
+ :class:`~sklearn.metrics.DistanceMetric`.
+
+ chunk_size : int, default=256
+ The number of vectors per chunk.
+
+ metric_kwargs : dict, default=None
+ Keyword arguments to pass to specified metric function.
+
+ Returns
+ -------
+ radius_neighborhood: RadiusNeighborhood
+ The suited RadiusNeighborhood implementation.
+ """
 # This factory comes to handle specialisations.
- if metric == "fast_sqeuclidean":
+ if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y):
 return FastSquaredEuclideanRadiusNeighborhood(X=X, Y=Y, radius=radius,
 chunk_size=chunk_size)
@@ -926,12 +992,14 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 check_scalar(radius, "radius", numbers.Real, min_val=0)
 self.radius = radius
- self.proxy_radius = self.datasets_pair.distance_metric._dist_to_rdist(self.radius)
+ self.proxy_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
 self.sort_results = False

 # Allocating pointers to datastructures but not the datastructures themselves.
- # There's potentially more pointers than actual thread used for the
- # reduction but as many datastructures as threads.
+ # There are as many pointers as available threads.
+ # When reducing on small datasets, there can be more pointers than the
+ # threads actually used for the reduction; the extra datastructures are
+ # then simply never allocated.
self.neigh_distances_chunks = malloc( sizeof(self.neigh_distances) * self.effective_omp_n_thread) self.neigh_indices_chunks = malloc( @@ -944,13 +1012,13 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): if self.neigh_indices_chunks is not NULL: free(self.neigh_indices_chunks) - cdef int _reduce_on_chunks(self, + cdef void _reduce_on_chunks(self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, ITYPE_t Y_end, ITYPE_t thread_num, - ) nogil except -1: + ) nogil: cdef: ITYPE_t i, j DTYPE_t proxy_dist_i_j @@ -962,8 +1030,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_distances_chunks[thread_num])[i].push_back(proxy_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) - return 0 - @final cdef void _on_X_prange_iter_init(self, ITYPE_t thread_num, @@ -1126,11 +1192,11 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): if return_distance: self.compute_exact_distances() res = ( - _coerce_vectors_to_np_nd_arrays(self.neigh_distances), - _coerce_vectors_to_np_nd_arrays(self.neigh_indices), + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), ) else: - res = _coerce_vectors_to_np_nd_arrays(self.neigh_indices) + res = coerce_vectors_to_nd_arrays(self.neigh_indices) del self.neigh_distances del self.neigh_indices @@ -1171,10 +1237,10 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): ITYPE_t chunk_size = CHUNK_SIZE, ): RadiusNeighborhood.__init__(self, - # The distance computer here is used for exact distances computations - datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), - radius=radius, - chunk_size=chunk_size) + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size) self.X = check_array(X, dtype=DTYPE, order='C') self.Y = check_array(Y, dtype=DTYPE, order='C') self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) @@ -1228,19 +1294,13 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): free(self.dist_middle_terms_chunks[thread_num]) @final - cdef int _reduce_on_chunks(self, + cdef void _reduce_on_chunks(self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, ITYPE_t Y_end, ITYPE_t thread_num, - ) nogil except -1: - """ - Critical part of the computation of pairwise distances. - - "Fast Squared Euclidean" distances strategy relying - on the gemm-trick. - """ + ) nogil: cdef: ITYPE_t i, j DTYPE_t squared_dist_i_j @@ -1253,11 +1313,11 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): # # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², # - # The middle term gets computed efficiently bellow using GEMM from BLAS Level 3. + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. # - # Careful: LDA, LDB and LDC are given for F-ordered arrays in BLAS documentations, - # for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa # # Here, we use their counterpart values to work with C-ordered arrays. 
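The expansion relied on in both `_reduce_on_chunks` specializations can be checked independently of this Cython code. A standalone NumPy sketch mirroring the `X_sq_norms`/`Y_sq_norms` terms and the `-2 X_c.Y_c^T` middle term that the BLAS GEMM call computes:

# Standalone NumPy check of the "GEMM trick": the squared Euclidean
# distance matrix is assembled from squared norms plus a single
# matrix-matrix product.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 3))
Y = rng.standard_normal((4, 3))

X_sq_norms = np.einsum("ij,ij->i", X, X)
Y_sq_norms = np.einsum("ij,ij->i", Y, Y)

# ||X_i - Y_j||^2 = ||X_i||^2 - 2 X_i.Y_j + ||Y_j||^2
middle_term = -2.0 * (X @ Y.T)
sq_dists = X_sq_norms[:, None] + middle_term + Y_sq_norms[None, :]

# Compare against the direct, unexpanded computation.
reference = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(axis=-1)
np.testing.assert_allclose(sq_dists, reference, atol=1e-12)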
BLAS_Order order = RowMajor @@ -1267,9 +1327,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): ITYPE_t n = Y_c.shape[0] ITYPE_t K = X_c.shape[1] DTYPE_t alpha = - 2. - # TODO: necessarily casting because APIs exposed - # via scipy.linalg.cython_blas aren't reflecting - # the const-identifier for arguments + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. DTYPE_t * A = & X_c[0, 0] ITYPE_t lda = X_c.shape[1] DTYPE_t * B = & Y_c[0, 0] @@ -1292,5 +1351,3 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): if squared_dist_i_j <= self.proxy_radius: deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) - - return 0 diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index fcc885c0f93a0..16a96f449e568 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -30,7 +30,8 @@ cdef int simultaneous_sort( """ # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using std::sort and a Comparator. + # The best might be using a std::sort and a Comparator whic might need + # AoS instead of SoA (currently used). cdef: ITYPE_t pivot_idx, i, store_idx floating pivot_val From 89f909ce805d2be6ce3644ad08e89c7ad9b12967 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 11 Aug 2021 11:36:57 +0200 Subject: [PATCH 163/290] Correct string alignement Co-authored-by: Guillaume Lemaitre --- sklearn/metrics/_dist_metrics.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 5a0019e1b065c..43171acd9a385 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1259,9 +1259,9 @@ cdef class DatasetsPair: if X.shape[1] != Y.shape[1]: raise ValueError("Vectors of X and Y must have the same " - "number of dimensions but currently are " - f"respectively {X.shape[1]}-dimensional " - f"and {Y.shape[1]}-dimensional.") + "number of dimensions but currently are " + f"respectively {X.shape[1]}-dimensional " + f"and {Y.shape[1]}-dimensional.") distance_metric._validate_data(X) distance_metric._validate_data(Y) From 6887a373f973988c5d508ad3c87fdcc1df986da5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 11 Aug 2021 15:42:48 +0200 Subject: [PATCH 164/290] Improve comment for 32 bits fallback Co-authored-by: Olivier Grisel --- sklearn/metrics/pairwise.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 23a8d89aed25e..a767280ea4a35 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -660,7 +660,8 @@ def pairwise_distances_argmin_min( values = values.flatten() indices = indices.flatten() else: - # TODO: support sparse matrices + # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. 
# When ArgKmin is not supported and when the # user asked for "fast_sqeuclidean", we need to # revert to "euclidean" From 0832dc477f9b636e9e6af254fb5cce3786dc2449 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 12 Aug 2021 11:34:38 +0200 Subject: [PATCH 165/290] Remove some checks and interactions with python Explicitly set directives to remove some checks. Still some checks might be costly but currently needed. --- sklearn/metrics/_dist_metrics.pxd | 22 +++- sklearn/metrics/_dist_metrics.pyx | 104 +++++++++--------- .../metrics/_pairwise_distances_reduction.pyx | 52 +++++---- sklearn/utils/_heap.pxd | 4 +- sklearn/utils/_heap.pyx | 4 +- 5 files changed, 102 insertions(+), 84 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index f5f93f88ac7e9..6cd3545feaa26 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -1,7 +1,13 @@ #!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True +# cython: annotate=False +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: profile=False +# cython: linetrace=False +# cython: initializedcheck=False +# cython: binding=False +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 cimport numpy as np from libc.math cimport sqrt, exp @@ -86,8 +92,12 @@ cdef class DistanceMetric: ###################################################################### # DatasetsPair base class cdef class DatasetsPair: - cdef readonly DistanceMetric distance_metric + cdef DistanceMetric distance_metric - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1 + cdef ITYPE_t n_X(self) nogil - cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1 + cdef ITYPE_t n_Y(self) nogil + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 43171acd9a385..2c3109c5181eb 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1,8 +1,14 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: initializedcheck=False -#cython: cdivision=True +# cython: language_level=3 +# cython: annotate=False +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: profile=False +# cython: linetrace=False +# cython: initializedcheck=False +# cython: binding=False +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 + # By Jake Vanderplas (2013) # written for the scikit-learn project @@ -21,7 +27,7 @@ cdef extern from "arrayobject.h": int typenum, void* data) -cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): +cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n): # Wrap a memory buffer with an ndarray. Warning: this is not robust. # In particular, if x is deallocated before the returned array goes # out of scope, this could cause memory errors. Since there is not @@ -1274,31 +1280,31 @@ cdef class DatasetsPair: return DenseSparseDatasetsPair(X, Y, distance_metric) return SparseSparseDatasetsPair(X, Y, distance_metric) - def __init__(self, DistanceMetric distance_metric): - self.distance_metric = distance_metric - @classmethod def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + # TODO: this adds another level of checks and conversion, could we remove it? 
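The conversion this TODO refers to exists because SciPy typically stores CSR index arrays as int32, while the Cython code indexes with ITYPE (np.intp, 64-bit on 64-bit builds). An illustrative snippet, separate from the patch:

# Why CSR attributes need dtype coercion before being handed to the
# ITYPE-typed memoryviews: SciPy's index arrays are usually int32.
import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.eye(3))
print(X.indices.dtype, X.indptr.dtype)  # typically int32 int32
indices = X.indices.astype(np.intp, copy=False)
indptr = X.indptr.astype(np.intp, copy=False)
print(indices.dtype, indptr.dtype)      # intp intp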
X_data = check_array(X.data, dtype=DTYPE, ensure_2d=False)
 X_indices = check_array(X.indices, dtype=ITYPE, ensure_2d=False)
 X_indptr = check_array(X.indptr, dtype=ITYPE, ensure_2d=False)
 return X_data, X_indices, X_indptr

- @property
- def n_X(self):
- raise RuntimeError()
+ def __init__(self, DistanceMetric distance_metric):
+ self.distance_metric = distance_metric

- @property
- def n_Y(self):
- raise RuntimeError()
+ cdef ITYPE_t n_X(self) nogil:
+ return -999

- cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ cdef ITYPE_t n_Y(self) nogil:
+ return -999
+
+ cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil:
 return self.dist(i, j)

- cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
 return -1

-
+@final
cdef class DenseDenseDatasetsPair(DatasetsPair):
 """Compute distances between vectors of two arrays.

@@ -1329,29 +1335,27 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
 self.Y = check_array(Y, dtype=DTYPE, order='C')
 self.d = X.shape[1]

- @property
 @final
- def n_X(self):
+ cdef ITYPE_t n_X(self) nogil:
 return self.X.shape[0]

- @property
 @final
- def n_Y(self):
+ cdef ITYPE_t n_Y(self) nogil:
 return self.Y.shape[0]

 @final
- cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil:
 return self.distance_metric.rdist(&self.X[i, 0],
 &self.Y[j, 0],
 self.d)

 @final
- cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
 return self.distance_metric.dist(&self.X[i, 0],
 &self.Y[j, 0],
 self.d)

-
+@final
cdef class SparseSparseDatasetsPair(DatasetsPair):
 """Compute distances between vectors of two sparse matrices.

@@ -1385,16 +1389,6 @@ cdef class SparseSparseDatasetsPair(DatasetsPair):
 self.Y_indices = np.empty((1), dtype=ITYPE, order='C')
 self.Y_indptr = np.empty((1), dtype=ITYPE, order='C')

- @property
- @final
- def n_X(self):
- return self.X_indptr.shape[0] - 1
-
- @property
- @final
- def n_Y(self):
- return self.Y_indptr.shape[0] -1
-
 def __init__(self, X, Y, DistanceMetric distance_metric):
 DatasetsPair.__init__(self, distance_metric)

@@ -1405,7 +1399,15 @@ cdef class SparseSparseDatasetsPair(DatasetsPair):
 self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)

 @final
- cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ cdef ITYPE_t n_X(self) nogil:
+ return self.X_indptr.shape[0] - 1
+
+ @final
+ cdef ITYPE_t n_Y(self) nogil:
+ return self.Y_indptr.shape[0] - 1
+
+ @final
+ cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil:
 cdef:
 ITYPE_t xi_start = self.X_indptr[i]
 ITYPE_t xi_end = self.X_indptr[i + 1]
@@ -1418,7 +1420,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair):
 self.Y_indices[yj_start:yj_end])

 @final
- cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1:
+ cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
 cdef:
 ITYPE_t xi_start = self.X_indptr[i]
 ITYPE_t xi_end = self.X_indptr[i + 1]
@@ -1430,7 +1432,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair):
 self.Y_data[yj_start:yj_end],
 self.Y_indices[yj_start:yj_end])

-
+@final
cdef class SparseDenseDatasetsPair(DatasetsPair):
 """Compute distances between vectors of a sparse matrix and a dense array.
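For readers unfamiliar with the CSR slicing used in the methods above, the distance between two CSR rows reduces to a merge over their sorted index arrays. A hypothetical pure-Python analogue for the squared Euclidean case (the helper name is mine, not an API from this patch):

# Two-pointer merge over (data, indices) slices of two CSR rows; entries
# missing from one row match against implicit zeros in the other.
def sqeuclidean_csr_rows(x_data, x_indices, y_data, y_indices):
    i = j = 0
    acc = 0.0
    while i < len(x_indices) and j < len(y_indices):
        if x_indices[i] == y_indices[j]:
            d = x_data[i] - y_data[j]
            acc += d * d
            i += 1
            j += 1
        elif x_indices[i] < y_indices[j]:
            acc += x_data[i] * x_data[i]
            i += 1
        else:
            acc += y_data[j] * y_data[j]
            j += 1
    # Remaining entries of either row match against implicit zeros.
    acc += sum(v * v for v in x_data[i:])
    acc += sum(v * v for v in y_data[j:])
    return acc

# x = [0, 1, 0, 2] and y = [3, 1, 0, 0] in dense form:
print(sqeuclidean_csr_rows([1.0, 2.0], [1, 3], [3.0, 1.0], [0, 1]))  # 13.0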
@@ -1472,18 +1474,16 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): self.Y = check_array(Y, dtype=DTYPE) self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) - @property @final - def n_X(self): + cdef ITYPE_t n_X(self) nogil: return self.X_indptr.shape[0] - 1 - @property @final - def n_Y(self): + cdef ITYPE_t n_Y(self) nogil: return self.Y.shape[0] @final - cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1498,7 +1498,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): self.Y_indices) @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1509,7 +1509,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): self.Y[j, :], self.Y_indices) - +@final cdef class DenseSparseDatasetsPair(DatasetsPair): """Compute distances between vectors of a dense array and a sparse matrix. @@ -1533,24 +1533,22 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): # Swapping arguments on the constructor self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) - @property @final - def n_X(self): + cdef ITYPE_t n_X(self) nogil: # Swapping interface - return self.datasets_pair.n_Y + return self.datasets_pair.n_Y() - @property @final - def n_Y(self): + cdef ITYPE_t n_Y(self) nogil: # Swapping interface - return self.datasets_pair.n_X + return self.datasets_pair.n_X() @final - cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil: # Swapping arguments on the same interface return self.datasets_pair.proxy_dist(j, i) @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil except -1: + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: # Swapping arguments on the same interface return self.datasets_pair.dist(j, i) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 39eded68f31e8..011ada92b1ce0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -1,4 +1,5 @@ # cython: language_level=3 +# cython: annotate=False # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -6,6 +7,7 @@ # cython: linetrace=False # cython: initializedcheck=False # cython: binding=False +# distutils: language=c++ # distutils: define_macros=CYTHON_TRACE_NOGIL=0 # Pairwise Distances Reductions @@ -18,7 +20,6 @@ # the same structure of operations on distances between vectors # of a datasets pair (X, Y). 
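That shared structure is essentially a template method iterating over chunks of (X, Y) and deferring the actual work to a per-chunk hook. A minimal pure-Python skeleton of the pattern, with names mirroring the Cython classes purely for illustration:

# Illustrative sketch, not the implementation: a chunked reduction
# template with a _reduce_on_chunks hook, as described in the header above.
class ChunkedReduction:
    def __init__(self, n_X, n_Y, chunk_size=256):
        self.n_X, self.n_Y, self.chunk_size = n_X, n_Y, chunk_size

    def _reduce_on_chunks(self, X_start, X_end, Y_start, Y_end):
        raise NotImplementedError

    def compute(self):
        # Serial stand-in for the prange-based parallel loops.
        for X_start in range(0, self.n_X, self.chunk_size):
            X_end = min(X_start + self.chunk_size, self.n_X)
            for Y_start in range(0, self.n_Y, self.chunk_size):
                Y_end = min(Y_start + self.chunk_size, self.n_Y)
                self._reduce_on_chunks(X_start, X_end, Y_start, Y_end)

class CountPairs(ChunkedReduction):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.n_pairs = 0

    def _reduce_on_chunks(self, X_start, X_end, Y_start, Y_end):
        self.n_pairs += (X_end - X_start) * (Y_end - Y_start)

r = CountPairs(n_X=1000, n_Y=300, chunk_size=256)
r.compute()
assert r.n_pairs == 1000 * 300  # every (x, y) pair visited exactly once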
-import numbers import numpy as np cimport numpy as np @@ -47,6 +48,7 @@ from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t from ..utils._typedefs cimport ITYPECODE, DTYPECODE +from numbers import Integral, Real from typing import List from scipy.sparse import issparse from threadpoolctl import threadpool_limits @@ -101,7 +103,8 @@ cdef class StdVectorSentinelDTYPE(StdVectorSentinel): @staticmethod cdef StdVectorSentinel create_for(vector[DTYPE_t] * vec_ptr): - sentinel = StdVectorSentinelDTYPE() + # This initializes the object directly without calling __init__ + cdef StdVectorSentinelDTYPE sentinel = StdVectorSentinelDTYPE.__new__(StdVectorSentinelDTYPE) sentinel.vec.swap(deref(vec_ptr)) return sentinel @@ -111,7 +114,8 @@ cdef class StdVectorSentinelITYPE(StdVectorSentinel): @staticmethod cdef StdVectorSentinel create_for(vector[ITYPE_t] * vec_ptr): - sentinel = StdVectorSentinelITYPE() + # This initializes the object directly without calling __init__ + cdef StdVectorSentinelITYPE sentinel = StdVectorSentinelITYPE.__new__(StdVectorSentinelITYPE) sentinel.vec.swap(deref(vec_ptr)) return sentinel @@ -126,6 +130,8 @@ cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE cdef: np.npy_intp size = deref(vect_ptr).size() + # TODO: use PyArray_SimpleNewFromData from the Numpy C API directly + # I've tried, but Cython fails when parsing the C API np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, deref(vect_ptr).data()) StdVectorSentinel sentinel @@ -177,7 +183,7 @@ cdef class PairwiseDistancesReduction: """ cdef: - readonly DatasetsPair datasets_pair + DatasetsPair _datasets_pair ITYPE_t effective_omp_n_thread ITYPE_t n_samples_chunk, chunk_size @@ -209,6 +215,10 @@ cdef class PairwiseDistancesReduction: not issparse(Y) and Y.dtype.itemsize == 8 and Y.ndim == 2 and metric in cls.valid_metrics()) + @property + def datasets_pair(self) ->DatasetsPair: + return self._datasets_pair + def __init__(self, DatasetsPair datasets_pair, ITYPE_t chunk_size = CHUNK_SIZE, @@ -218,18 +228,18 @@ cdef class PairwiseDistancesReduction: self.effective_omp_n_thread = _openmp_effective_n_threads() - check_scalar(chunk_size, "chunk_size", numbers.Integral, min_val=1) + check_scalar(chunk_size, "chunk_size", Integral, min_val=1) self.chunk_size = chunk_size self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) - self.datasets_pair = datasets_pair + self._datasets_pair = datasets_pair - self.n_Y = datasets_pair.n_Y + self.n_Y = datasets_pair.n_Y() self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk self.Y_n_samples_remainder = self.n_Y % self.Y_n_samples_chunk - self.n_X = datasets_pair.n_X + self.n_X = datasets_pair.n_X() self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) X_n_full_chunks = self.n_X // self.X_n_samples_chunk self.X_n_samples_remainder = self.n_X % self.X_n_samples_chunk @@ -445,7 +455,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): """ cdef: - readonly ITYPE_t k + ITYPE_t k ITYPE_t[:, ::1] argkmin_indices DTYPE_t[:, ::1] argkmin_distances @@ -508,7 +518,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ): PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) - check_scalar(k, "k", numbers.Integral, min_val=1) + check_scalar(k, "k", Integral, min_val=1) self.k = k # Allocating pointers to datastructures but not the datastructures themselves. 
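The chunk bookkeeping set up in `__init__` above comes down to integer division with a possibly smaller trailing chunk; the same arithmetic in a standalone sketch:

# Standalone sketch of the chunk-count arithmetic used by the reduction.
n_Y, chunk_size, min_chunk = 1000, 256, 20
n_samples_chunk = max(min_chunk, chunk_size)

Y_n_samples_chunk = min(n_Y, n_samples_chunk)
Y_n_full_chunks = n_Y // Y_n_samples_chunk
Y_n_samples_remainder = n_Y % Y_n_samples_chunk

# A last, smaller chunk only exists when the division is not exact.
Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0)
print(Y_n_full_chunks, Y_n_samples_remainder, Y_n_chunks)  # 3 232 4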
@@ -551,7 +561,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): heaps_proxy_distances + i * self.k, heaps_indices + i * self.k, k, - self.datasets_pair.proxy_dist(X_start + i, Y_start + j), + self._datasets_pair.proxy_dist(X_start + i, Y_start + j), Y_start + j, ) @@ -665,7 +675,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): for i in prange(self.n_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): for j in range(self.k): - distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(distances[i, j]) + distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist(distances[i, j]) @final def compute(self, @@ -767,8 +777,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), k=k, chunk_size=chunk_size) - self.X = check_array(X, dtype=DTYPE, order='C') - self.Y = check_array(Y, dtype=DTYPE, order='C') + # X and Y are checked by the DatasetsPair + self.X, self.Y = X, Y self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) @@ -905,7 +915,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): """ cdef: - readonly DTYPE_t radius + DTYPE_t radius # DistanceMetric compute rank preserving distance via rdist # ("reduced distance" in the original wording), @@ -990,9 +1000,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ): PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) - check_scalar(radius, "radius", numbers.Real, min_val=0) + check_scalar(radius, "radius", Real, min_val=0) self.radius = radius - self.proxy_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) + self.proxy_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) self.sort_results = False # Allocating pointers to datastructures but not the datastructures themselves. 
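The `proxy_radius` conversion above is what lets the radius search compare rank-preserving quantities: for the Euclidean metric, rdist is the squared distance, so the threshold is squared once and square roots are taken only for reported results. A minimal sketch of the idea, assuming the Euclidean case:

# Sketch of the "proxy distance" trick: compare squared quantities,
# defer the sqrt to the distances actually returned.
import math

def radius_neighbors_proxy(x, Y, radius):
    proxy_radius = radius * radius  # _dist_to_rdist for euclidean
    indices, distances = [], []
    for j, y in enumerate(Y):
        sq = sum((a - b) ** 2 for a, b in zip(x, y))  # rdist: no sqrt
        if sq <= proxy_radius:
            indices.append(j)
            distances.append(math.sqrt(sq))           # _rdist_to_dist
    return indices, distances

Y = [(0.0, 0.0), (3.0, 4.0), (1.0, 1.0)]
print(radius_neighbors_proxy((0.0, 0.0), Y, radius=2.0))
# ([0, 2], [0.0, 1.4142135623730951])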
@@ -1025,7 +1035,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): for i in range(X_start, X_end): for j in range(Y_start, Y_end): - proxy_dist_i_j = self.datasets_pair.proxy_dist(i, j) + proxy_dist_i_j = self._datasets_pair.proxy_dist(i, j) if proxy_dist_i_j <= self.proxy_radius: deref(self.neigh_distances_chunks[thread_num])[i].push_back(proxy_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) @@ -1149,7 +1159,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): num_threads=self.effective_omp_n_thread): for j in range(deref(self.neigh_indices)[i].size()): deref(self.neigh_distances)[i][j] = ( - self.datasets_pair.distance_metric._rdist_to_dist( + self._datasets_pair.distance_metric._rdist_to_dist( deref(self.neigh_distances)[i][j] ) ) @@ -1241,8 +1251,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), radius=radius, chunk_size=chunk_size) - self.X = check_array(X, dtype=DTYPE, order='C') - self.Y = check_array(Y, dtype=DTYPE, order='C') + # X and Y are checked by the DatasetsPair + self.X, self.Y = X, Y self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index a33c47f416b53..65227c0d30b70 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -9,7 +9,7 @@ cdef int simultaneous_sort( floating* dist, ITYPE_t* idx, ITYPE_t size -) nogil except -1 +) nogil cdef int heap_push( floating* values, @@ -17,4 +17,4 @@ cdef int heap_push( ITYPE_t size, floating val, ITYPE_t val_idx, -) nogil except -1 +) nogil diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 16a96f449e568..b8878ad402dbb 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -23,7 +23,7 @@ cdef int simultaneous_sort( floating* values, ITYPE_t* indices, ITYPE_t size -) nogil except -1: +) nogil: """ Perform a recursive quicksort on the values array, simultaneously performing the same swaps on the indices array. @@ -90,7 +90,7 @@ cdef inline int heap_push( ITYPE_t size, floating val, ITYPE_t val_idx, -) nogil except -1: +) nogil: """Push a tuple (val, val_idx) into a fixed-size max-heap. 
The max-heap is represented as a struct of arrays where: From 4fba30a9b183bae9aea4df183d0df0b4812c617d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 09:40:51 +0200 Subject: [PATCH 166/290] CI Specify latest lib versions for linux-arm64 Co-authored-by: Olivier Grisel --- .circleci/config.yml | 2 ++ build_tools/circle/build_test_arm.sh | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b730ae0ff595a..c9ca1987b9dff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -137,6 +137,8 @@ jobs: environment: - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 + - NUMPY_VERSION: 'latest' + - SCIPY_VERSION: 'latest' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 5169c4871ba04..9ad7418e855ca 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -39,10 +39,8 @@ source activate testenv # Use the latest by default mamba install --verbose -y ccache \ pip \ - numpy \ - scipy \ - cython \ - pip \ + $(get_dep numpy $NUMPY_VERSION) \ + $(get_dep scipy $SCIPY_VERSION) \ $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) \ $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ From 2a8c0265880d1c485ea6573d2171b74fbacc46f7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 09:42:44 +0200 Subject: [PATCH 167/290] DOC Fix glossary --- doc/glossary.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index fcea1bf1ec378..274672b64cf9b 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -644,7 +644,7 @@ General Concepts Note that for most distance metrics, we rely on implementations from :mod:`scipy.spatial.distance`, but may reimplement for efficiency in - our context. The :class:`metrics.Distance` is used to implement + our context. The :class:`metrics.DistanceMetric` interface is used to implement distance metrics for integration with efficient neighbors search. 
pd From f4c5b64918810804eabf3b07d7422f574b3efab2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 10:39:27 +0200 Subject: [PATCH 168/290] Remove neighbors.DistanceMetric.__init__ --- sklearn/neighbors/_distance_metric.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py index d7992ca833cd8..10d6e24139068 100644 --- a/sklearn/neighbors/_distance_metric.py +++ b/sklearn/neighbors/_distance_metric.py @@ -14,11 +14,6 @@ def _warn(cls): category=FutureWarning, ) - def __init__(self): - DistanceMetric._warn() - if self.__class__ is DistanceMetric: - raise NotImplementedError("DistanceMetric is an abstract class") - @classmethod def get_metric(cls, metric, **kwargs): DistanceMetric._warn() From dfd9661cfb059f7cb7315aa5cbe56406623a6b9f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 11:35:25 +0200 Subject: [PATCH 169/290] Format Parameters section --- sklearn/metrics/_dist_metrics.pyx | 10 ++++++++++ sklearn/metrics/_pairwise_distances_reduction.pyx | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 2c3109c5181eb..6158e8a9f15a7 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1228,6 +1228,8 @@ cdef class DatasetsPair: ) -> DatasetsPair: """Return the DatasetsPair implementation for the given arguments. + Parameters + ---------- X : array-like of shape (n_X, d) Input data. @@ -1312,8 +1314,10 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): ---------- X: ndarray of shape (n_X, d) Rows represent vectors. + Y: ndarray of shape (n_Y, d) Rows represent vectors. + distance_metric: DistanceMetric The distance metric responsible for computing distances between two vectors of (X, Y). @@ -1363,8 +1367,10 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): ---------- X: sparse matrix of shape (n_X, d) Rows represent vectors. + Y: sparse matrix of shape (n_X, d) Rows represent vectors. + distance_metric: DistanceMetric The distance metric responsible for computing distances between two vectors of (X, Y). @@ -1440,8 +1446,10 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): ---------- X: sparse matrix of shape (n_X, d) Rows represent vectors. + Y: ndarray of shape (n_Y, d) Rows represent vectors. + distance_metric: DistanceMetric The distance metric responsible for computing distances between two vectors of (X, Y). @@ -1517,8 +1525,10 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): ---------- X: ndarray of shape (n_X, d) Rows represent vectors. + Y: sparse matrix of shape (n_Y, d) Rows represent vectors. + distance_metric: DistanceMetric The distance metric responsible for computing distances between two vectors of (X, Y). diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 011ada92b1ce0..ecc410848a4f8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -178,6 +178,7 @@ cdef class PairwiseDistancesReduction: ---------- datasets_pair: DatasetsPair The pair of dataset to use. + chunk_size: int The number of vectors per chunk. """ @@ -448,8 +449,10 @@ cdef class ArgKmin(PairwiseDistancesReduction): ---------- datasets_pair: DatasetsPair The dataset pairs (X, Y) for the reduction. + k: int The k for the argkmin reduction. + chunk_size: int The number of vectors per chunk. 
""" @@ -475,6 +478,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) -> ArgKmin: """Return the ArgKmin implementation for the given arguments. + Parameters + ---------- X : array-like of shape (n_X, d) Input data. @@ -684,6 +689,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): ): """Computes the reduction of vectors (rows) of X on Y. + Parameters + ---------- strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'} The chunking strategy defining which dataset parallelization are made on. @@ -956,6 +963,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ) -> RadiusNeighborhood: """Return the RadiusNeighborhood implementation for the given arguments. + Parameters + ---------- X : array-like of shape (n_X, d) Input data. From afc8bf863050af1fbc1e9bc833c6a6353b3cc7b7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 11:35:50 +0200 Subject: [PATCH 170/290] Improve some docstrings and comments --- .../metrics/_pairwise_distances_reduction.pyx | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index ecc410848a4f8..96d66a01173d9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -167,7 +167,7 @@ cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ##################### cdef class PairwiseDistancesReduction: - """Abstract class to computes a reduction on pairwise + """Abstract class computing a reduction on pairwise distances between a set of vectors (rows) X and another set of vectors (rows) of Y. @@ -904,19 +904,17 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): cdef class RadiusNeighborhood(PairwiseDistancesReduction): - """Returns indices in a vector-set Y of radius-based neighbors of vector-set X. + """Returns radius-based neighbors vectors' indices in a dataset Y of + of vectors in a dataset X. - The neighbors of a first set of vectors X present in - the second in present another. - - present in another set of vectors - (rows of Y) for a given a radius and distance. Parameters ---------- datasets_pair: DatasetsPair The dataset pairs (X, Y) for the reduction. + radius: float The radius defining the neighborhood. + chunk_size: int The number of vectors per chunk. """ @@ -930,19 +928,20 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): # We get the proxy for the radius to be able to compare DTYPE_t proxy_radius - # We want resizable buffers which we will to wrapped within numpy - # arrays at the end. + # Neighbors informations are returned as np.ndarray or np.ndarray. # - # std::vector comes as a handy interface for efficient resizable - # buffers. + # We want resizable buffers which we will to wrapped within numpy + # arrays at the end. std::vector comes as a handy interface for + # interacting efficiently with resizable buffers. # # Though it is possible to access their buffer address with - # std::vector::data, their buffer can't be stolen: their - # life-time is tight to the buffer's. + # std::vector::data, they can't be stolen: buffers lifetime + # is tight to their std::vector and are deallocated when + # std::vectors are. # - # To solve this, we dynamically allocate vectors and then + # To solve this, we dynamically allocate std::vectors and then # encapsulate them in a StdVectorSentinel responsible for - # freeing them when needed + # freeing them when the associated np.ndarray is freed. 
vector[vector[ITYPE_t]] * neigh_indices vector[vector[DTYPE_t]] * neigh_distances @@ -1247,7 +1246,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return RadiusNeighborhood.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration() + return (RadiusNeighborhood.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) def __init__(self, X, From de2dbf6c06d9558b60413f41a00cb75fc5f9aefd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 11:39:50 +0200 Subject: [PATCH 171/290] Use libc.float.DBL_MAX instead of constant defined via macro --- sklearn/metrics/_pairwise_distances_reduction.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 96d66a01173d9..87f108b92ab4b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -26,6 +26,7 @@ cimport numpy as np np.import_array() from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX from libcpp.vector cimport vector from cython cimport final from cpython.object cimport PyObject @@ -61,7 +62,6 @@ from ..utils._typedefs import ITYPE, DTYPE DEF CHUNK_SIZE = 256 # number of vectors DEF MIN_CHUNK_SAMPLES = 20 -DEF FLOAT_INF = 1e36 # TODO: change for `libcpp.algorithm.move` once Cython 3 is used # Introduction in Cython: @@ -622,7 +622,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_proxy_distances_chunks[thread_num][idx] = FLOAT_INF + self.heaps_proxy_distances_chunks[thread_num][idx] = DBL_MAX self.heaps_indices_chunks[thread_num][idx] = -1 @final @@ -718,7 +718,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): # Results returned by ArgKmin.compute used as the main heaps self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) - self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE) + self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) if strategy == 'auto': # This is a simple heuristic whose constant for the From 36f7b6eb5ff1ad6ae408b540e6666849f2540fd1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 31 Aug 2021 13:31:21 +0200 Subject: [PATCH 172/290] Change default metric for 'fast_sqeuclidean' --- sklearn/metrics/_dist_metrics.pxd | 7 ++++++ sklearn/metrics/_dist_metrics.pyx | 6 ++--- .../metrics/_pairwise_distances_reduction.pyx | 14 +++++++----- sklearn/neighbors/_base.py | 10 ++++++++- sklearn/neighbors/_classification.py | 22 ++++++++++--------- sklearn/neighbors/_graph.py | 18 ++++++++------- sklearn/neighbors/_regression.py | 2 +- sklearn/neighbors/_unsupervised.py | 11 +++++----- 8 files changed, 56 insertions(+), 34 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 6cd3545feaa26..146629bb0311b 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -101,3 +101,10 @@ cdef class DatasetsPair: cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil + + +cdef class DenseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + ITYPE_t d diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 
6158e8a9f15a7..d473ee19beacc 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1322,10 +1322,8 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): The distance metric responsible for computing distances between two vectors of (X, Y). """ - cdef: - const DTYPE_t[:, ::1] X # shape: (n_X, d) - const DTYPE_t[:, ::1] Y # shape: (n_Y, d) - ITYPE_t d + + # The `X`, `Y` and `d` attributes are defined in _dist_metrics.pxd def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 87f108b92ab4b..25cac4e66a4d7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -34,7 +34,7 @@ from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange from cpython.ref cimport Py_INCREF -from ._dist_metrics cimport DatasetsPair +from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, @@ -784,8 +784,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), k=k, chunk_size=chunk_size) - # X and Y are checked by the DatasetsPair - self.X, self.Y = X, Y + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) @@ -1260,8 +1262,10 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood): datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), radius=radius, chunk_size=chunk_size) - # X and Y are checked by the DatasetsPair - self.X, self.Y = X, Y + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index f6c7fcc7df344..c606deb3c839d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -334,7 +334,7 @@ def __init__( radius=None, algorithm="auto", leaf_size=30, - metric="minkowski", + metric="fast_sqeuclidean", p=2, metric_params=None, n_jobs=None, @@ -363,6 +363,10 @@ def _check_algorithm_metric(self): else: alg_check = self.algorithm + if alg_check != "brute" and self.metric == "fast_sqeuclidean": + # The fast euclidean alternative is only available for 'brute'. + self.metric = "euclidean" + if callable(self.metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree @@ -501,6 +505,10 @@ def _fit(self, X, y=None): if issparse(X): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") + if self.metric == "fast_sqeuclidean": + # TODO: support sparse datasets + # The fast euclidean alternative is only available for dense datasets. 
+ self.effective_metric_ = "euclidean" if self.effective_metric_ not in VALID_METRICS_SPARSE[ "brute" ] and not callable(self.effective_metric_): diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 9bb6ccc7f0a73..b3457c3a998ca 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -64,10 +64,11 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='minkowski' - The distance metric to use for the tree. The default metric is - minkowski, and with p=2 is equivalent to the standard Euclidean - metric. For a list of available metrics, see the documentation of + metric : str or callable, default='fast_sqeuclidean' + The distance metric to use for the tree. The default distance is + 'fast_sqeuclidean' as fast alternative for the Euclidean distance + metric. If exact results are needed, prefer 'euclidean'. + For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, @@ -154,7 +155,7 @@ def __init__( algorithm="auto", leaf_size=30, p=2, - metric="minkowski", + metric="fast_sqeuclidean", metric_params=None, n_jobs=None, ): @@ -336,10 +337,11 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='minkowski' - the distance metric to use for the tree. The default metric is - minkowski, and with p=2 is equivalent to the standard Euclidean - metric. For a list of available metrics, see the documentation of + metric : str or callable, default='fast_sqeuclidean' + the distance metric to use for the tree. The default distance is + 'fast_sqeuclidean' as fast alternative for the Euclidean distance + metric. If exact results are needed, prefer 'euclidean'. + For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, @@ -430,7 +432,7 @@ def __init__( algorithm="auto", leaf_size=30, p=2, - metric="minkowski", + metric="fast_sqeuclidean", outlier_label=None, metric_params=None, n_jobs=None, diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 1e8d92cada599..7eb710db0c7c0 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -40,7 +40,7 @@ def kneighbors_graph( n_neighbors, *, mode="connectivity", - metric="minkowski", + metric="fast_sqeuclidean", p=2, metric_params=None, include_self=False, @@ -64,10 +64,11 @@ def kneighbors_graph( matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - metric : str, default='minkowski' - The distance metric used to calculate the neighbors within a - given radius for each sample point. The default distance is - 'euclidean' ('minkowski' metric with the p param equal to 2.) + metric : str, default='fast_sqeuclidean' + The distance metric used to calculate the k nearest neighbors for + each sample point. 
The default distance is + 'fast_sqeuclidean' as fast alternative for the Euclidean distance + metric. If exact results are needed, prefer 'euclidean'. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. @@ -130,7 +131,7 @@ def radius_neighbors_graph( radius, *, mode="connectivity", - metric="minkowski", + metric="fast_sqeuclidean", p=2, metric_params=None, include_self=False, @@ -157,10 +158,11 @@ def radius_neighbors_graph( matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - metric : str, default='minkowski' + metric : str, default='fast_sqeuclidean' The distance metric used to calculate the neighbors within a given radius for each sample point. The default distance is - 'euclidean' ('minkowski' metric with the p param equal to 2.) + 'fast_sqeuclidean' as fast alternative for the Euclidean distance + metric. If exact results are needed, prefer 'euclidean'. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 7bf309b827398..e32a961a99f1a 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -154,7 +154,7 @@ def __init__( algorithm="auto", leaf_size=30, p=2, - metric="minkowski", + metric="fast_sqeuclidean", metric_params=None, n_jobs=None, ): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 4bd23367a7367..1f04e6c579e22 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -38,10 +38,11 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): required to store the tree. The optimal value depends on the nature of the problem. - metric : str or callable, default='minkowski' - The distance metric to use for the tree. The default metric is - minkowski, and with p=2 is equivalent to the standard Euclidean - metric. For a list of available metrics, see the documentation of + metric : str or callable, default='fast_sqeuclidean' + The distance metric to use for the tree. The default distance is + 'fast_sqeuclidean' as fast alternative for the Euclidean distance + metric. If exact results are needed, prefer 'euclidean'. + For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. 
X may be a :term:`sparse graph`, @@ -124,7 +125,7 @@ def __init__( radius=1.0, algorithm="auto", leaf_size=30, - metric="minkowski", + metric="fast_sqeuclidean", p=2, metric_params=None, n_jobs=None, From 16bf24a20edd18efab9bed6002356a5e886ec440 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Sep 2021 10:32:43 +0200 Subject: [PATCH 173/290] Adapt error message in test --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 61ac8295873c3..af05805f2a9fd 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -105,10 +105,10 @@ def test_argkmin_factory_method_wrong_usages(): ): ArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) - with pytest.raises(ValueError, match="`k`= -1, must be >= 1."): + with pytest.raises(ValueError, match="k == -1, must be >= 1."): ArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) - with pytest.raises(ValueError, match="`k`= 0, must be >= 1."): + with pytest.raises(ValueError, match="k == 0, must be >= 1."): ArgKmin.get_for(X=X, Y=Y, k=0.1, metric=metric) with pytest.raises(ValueError, match="Unrecognized metric"): @@ -147,7 +147,7 @@ def test_radius_neighborhood_factory_method_wrong_usages(): X=X, Y=Y.astype(np.int32), radius=radius, metric=metric ) - with pytest.raises(ValueError, match="`radius`= -1.0, must be >= 0."): + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): RadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) with pytest.raises(ValueError, match="Unrecognized metric"): From 5353794427bfd27b8fe60cf6af272f646c0f7106 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Sep 2021 10:34:23 +0200 Subject: [PATCH 174/290] Add guard against negative zeros when computing exact distances --- sklearn/metrics/_pairwise_distances_reduction.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 25cac4e66a4d7..aefeee368852e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -676,11 +676,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t i, j ITYPE_t[:, ::1] Y_indices = self.argkmin_indices DTYPE_t[:, ::1] distances = self.argkmin_distances - for i in prange(self.n_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): for j in range(self.k): - distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist(distances[i, j]) + distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + distances[i, j] if distances[i, j] > 0. else 0. + ) @final def compute(self, @@ -1170,7 +1172,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): for j in range(deref(self.neigh_indices)[i].size()): deref(self.neigh_distances)[i][j] = ( self._datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. deref(self.neigh_distances)[i][j] + if deref(self.neigh_distances)[i][j] > 0. 
+ else 0 ) ) From d7b984dfd2a6b13ec76a3f8f61462a6d9a391404 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Sep 2021 14:17:37 +0200 Subject: [PATCH 175/290] Introduce 'fast_euclidean' and adapt KNeighborsMixins accordingly --- sklearn/metrics/_dist_metrics.pyx | 2 +- .../metrics/_pairwise_distances_reduction.pyx | 61 +++++++++++++------ sklearn/metrics/tests/test_pairwise.py | 4 +- .../test_pairwise_distances_reduction.py | 44 +++++++------ sklearn/neighbors/_base.py | 48 +++++++++++---- sklearn/neighbors/_classification.py | 24 ++++---- sklearn/neighbors/_graph.py | 22 +++---- sklearn/neighbors/_regression.py | 2 +- sklearn/neighbors/_unsupervised.py | 10 +-- 9 files changed, 133 insertions(+), 84 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index d473ee19beacc..cfe17b6ee599e 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1236,7 +1236,7 @@ cdef class DatasetsPair: Y : array-like of shape (n_Y, d) Input data. - metric : str, default='fast_sqeuclidean' + metric : str, default='euclidean' The distance metric to use for argkmin. The default metric is a fast implementation of the standard Euclidean metric. For a list of available metrics, see the documentation of diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index aefeee368852e..e8ba9cf7cc338 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -204,7 +204,7 @@ cdef class PairwiseDistancesReduction: "hamming", *BOOL_METRICS, } - return sorted({"fast_sqeuclidean", *METRIC_MAPPING.keys()}.difference(excluded)) + return sorted({"fast_euclidean", "fast_sqeuclidean", *METRIC_MAPPING.keys()}.difference(excluded)) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -472,7 +472,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): X, Y, ITYPE_t k, - str metric="fast_sqeuclidean", + str metric="fast_euclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ) -> ArgKmin: @@ -489,7 +489,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): k : int The k for the argkmin reduction. - metric : str, default='fast_sqeuclidean' + metric : str, default='fast_euclidean' The distance metric to use for argkmin. The default metric is a fast implementation of the standard Euclidean metric. For a list of available metrics, see the documentation of @@ -507,8 +507,11 @@ cdef class ArgKmin(PairwiseDistancesReduction): The suited ArgKmin implementation. """ # This factory comes to handle specialisations. 
- if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y): - return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size) + if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y): + use_squared_distances = metric == "fast_sqeuclidean" + return FastEuclideanArgKmin(X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size) return ArgKmin( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), @@ -669,8 +672,6 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) return - # TODO: annotating with 'final' here makes the compilation fails but it should not - # @final cdef void compute_exact_distances(self) nogil: cdef: ITYPE_t i, j @@ -748,7 +749,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): return np.asarray(self.argkmin_indices) -cdef class FastSquaredEuclideanArgKmin(ArgKmin): +cdef class FastEuclideanArgKmin(ArgKmin): """Fast specialized alternative for ArgKmin on EuclideanDistance. Notes @@ -769,6 +770,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -779,7 +781,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): X, Y, ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, + bint use_squared_distances=False, + ITYPE_t chunk_size=CHUNK_SIZE, ): ArgKmin.__init__(self, # The datasets pair here is used for exact distances computations @@ -792,6 +795,7 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): self.X, self.Y = datasets_pair.X, datasets_pair.Y self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + self.use_squared_distances = use_squared_distances # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( @@ -801,6 +805,11 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin): if self.dist_middle_terms_chunks is not NULL: free(self.dist_middle_terms_chunks) + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + ArgKmin.compute_exact_distances(self) + @final cdef void _on_X_parallel_init(self, ITYPE_t thread_num, @@ -960,7 +969,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): X, Y, DTYPE_t radius, - str metric="fast_sqeuclidean", + str metric="fast_euclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), ) -> RadiusNeighborhood: @@ -977,7 +986,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): radius : float The radius defining the neighborhood. - metric : str, default='fast_sqeuclidean' + metric : str, default='fast_euclidean' The distance metric to use for argkmin. The default metric is a fast implementation of the standard Euclidean metric. For a list of available metrics, see the documentation of @@ -995,10 +1004,13 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): The suited RadiusNeighborhood implementation. """ # This factory comes to handle specialisations. 
-        if metric == "fast_sqeuclidean" and not issparse(X) and not issparse(Y):
-            return FastSquaredEuclideanRadiusNeighborhood(X=X, Y=Y,
-                                                          radius=radius,
-                                                          chunk_size=chunk_size)
+        if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y):
+            use_squared_distances = metric == "fast_sqeuclidean"
+            return FastEuclideanRadiusNeighborhood(X=X, Y=Y,
+                                                   radius=radius,
+                                                   use_squared_distances=use_squared_distances,
+                                                   chunk_size=chunk_size)
+
         return RadiusNeighborhood(
             datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
             radius=radius,
@@ -1160,8 +1172,6 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 
         return
 
-    # TODO: annotating with 'final' here makes the compilation fails but it should not
-    # @final
     cdef void compute_exact_distances(self) nogil:
         """Convert proxy distances to pairwise distances in parallel."""
         cdef:
@@ -1229,7 +1239,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction):
 
         return res
 
-cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
+cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood):
     """Fast specialized alternative for RadiusNeighborhood on EuclideanDistance.
 
     Notes
@@ -1250,6 +1260,7 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
 
     # Buffers for GEMM
     DTYPE_t ** dist_middle_terms_chunks
+    bint use_squared_distances
 
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
@@ -1260,7 +1271,8 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
         X,
         Y,
         DTYPE_t radius,
-        ITYPE_t chunk_size = CHUNK_SIZE,
+        bint use_squared_distances=False,
+        ITYPE_t chunk_size=CHUNK_SIZE,
     ):
         RadiusNeighborhood.__init__(self,
             # The datasets pair here is used for exact distances computations
@@ -1273,6 +1285,12 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
         self.X, self.Y = datasets_pair.X, datasets_pair.Y
         self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X)
         self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y)
+        self.use_squared_distances = use_squared_distances
+
+        if use_squared_distances:
+            # In this specialisation and this setup, the value passed as the radius is
+            # already considered to be the proxy radius, so we overwrite it.
+            self.proxy_radius = radius
 
         # Temporary datastructures used in threads
         self.dist_middle_terms_chunks = malloc(
@@ -1282,6 +1300,11 @@ cdef class FastSquaredEuclideanRadiusNeighborhood(RadiusNeighborhood):
         if self.dist_middle_terms_chunks is not NULL:
             free(self.dist_middle_terms_chunks)
 
+    @final
+    cdef void compute_exact_distances(self) nogil:
+        if not self.use_squared_distances:
+            RadiusNeighborhood.compute_exact_distances(self)
+
     @final
     cdef void _on_X_parallel_init(self,
             ITYPE_t thread_num,
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index 0b20942965afa..d2eb43d7bcf2a 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -1470,7 +1470,7 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x):
 @pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]])
 @pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]])
 @pytest.mark.parametrize("sign", [1, -1])
-def test_fast_sqeuclidean_correctness(
+def test_fast_euclidean_correctness(
     X_translation, Y_translation, sign, n_samples=10000, n_features=10
 ):
     # This is the only failing test case, so we prefer xfailing.
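A note on why the translations parametrized above are the stress test here: the
fast specialisation expands ||x - y||² as ||x||² - 2 x.y + ||y||², and once the
points sit far from the origin the three terms are huge while their difference is
tiny, so float64 rounding can destroy the result. A small self-contained NumPy
illustration of the effect (hypothetical, not part of the test suite):

    import numpy as np

    x = np.array([1e8 + 1.0])  # two points far from the origin, 1.0 apart
    y = np.array([1e8])

    exact = np.sum((x - y) ** 2)            # exactly 1.0
    expanded = x @ x - 2 * (x @ y) + y @ y  # catastrophic cancellation

    # Each dot product is ~1e16, where consecutive float64 values are 2.0
    # apart, so `expanded` comes out as 0.0 or 2.0 instead of 1.0.

This is the trade-off the 'fast_' prefix advertises: a large speed-up from GEMM at
the cost of reduced numerical precision on data lying far from the origin.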
@@ -1491,7 +1491,7 @@ def test_fast_euclidean_correctness(
     argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean")
 
     fsq_argmins, fsq_distances = pairwise_distances_argmin_min(
-        X, Y, metric="fast_sqeuclidean"
+        X, Y, metric="fast_euclidean"
     )
 
     np.testing.assert_array_equal(argmins, fsq_argmins)
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index af05805f2a9fd..c7a6d20b48699 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -24,8 +24,8 @@
     PairwiseDistancesReduction,
     ArgKmin,
     RadiusNeighborhood,
-    FastSquaredEuclideanArgKmin,
-    FastSquaredEuclideanRadiusNeighborhood,
+    FastEuclideanArgKmin,
+    FastEuclideanRadiusNeighborhood,
 )
 
 from sklearn.utils import _in_unstable_openblas_configuration
@@ -60,7 +60,7 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices):
     assert_allclose(
         ref_dist,
         dist,
-        err_msg="Query vectors havehas different neighbors' distances",
+        err_msg="Query vectors have different neighbors' distances",
         rtol=1e-7,
     )
 
@@ -172,14 +172,14 @@ def test_radius_neighborhood_factory_method_wrong_usages():
 @fails_if_unstable_openblas
 @pytest.mark.filterwarnings("ignore:Constructing a DIA matrix")
 @pytest.mark.parametrize(
-    "PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction",
+    "PairwiseDistancesReduction, FastPairwiseDistancesReduction",
     [
-        (ArgKmin, FastSquaredEuclideanArgKmin),
-        (RadiusNeighborhood, FastSquaredEuclideanRadiusNeighborhood),
+        (ArgKmin, FastEuclideanArgKmin),
+        (RadiusNeighborhood, FastEuclideanRadiusNeighborhood),
     ],
 )
 def test_pairwise_distances_reduction_factory_method(
-    PairwiseDistancesReduction, FastSquaredPairwiseDistancesReduction
+    PairwiseDistancesReduction, FastPairwiseDistancesReduction
 ):
     # Test all the combinations of DatasetsPair for creation
     rng = np.random.RandomState(1)
@@ -225,11 +225,11 @@ def test_pairwise_distances_reduction_factory_method(
     assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair)
 
     # Test specialisations creation
-    fast_sqeuclidean_instance = PairwiseDistancesReduction.get_for(
-        X, Y, dummy_arg, metric="fast_sqeuclidean"
+    fast_euclidean_instance = PairwiseDistancesReduction.get_for(
+        X, Y, dummy_arg, metric="fast_euclidean"
     )
-    assert isinstance(fast_sqeuclidean_instance, PairwiseDistancesReduction)
-    assert isinstance(fast_sqeuclidean_instance, FastSquaredPairwiseDistancesReduction)
+    assert isinstance(fast_euclidean_instance, PairwiseDistancesReduction)
+    assert isinstance(fast_euclidean_instance, FastPairwiseDistancesReduction)
 
 
 @fails_if_unstable_openblas
@@ -241,7 +241,7 @@ def test_argkmin_chunk_size_agnosticism(
     n_samples,
     chunk_size,
     k=10,
-    metric="fast_sqeuclidean",
+    metric="fast_euclidean",
     n_features=100,
     dtype=np.float64,
 ):
@@ -271,7 +271,7 @@ def test_radius_neighborhood_chunk_size_agnosticism(
     n_samples,
     chunk_size,
     radius=10.0,
-    metric="fast_sqeuclidean",
+    metric="fast_euclidean",
     n_features=100,
     dtype=np.float64,
 ):
@@ -308,9 +308,12 @@ def test_argkmin_strategies_consistency(
 ):
     # ArgKmin results obtained using both parallelization strategies
     # must be identical
-    if _in_unstable_openblas_configuration() and metric == "fast_sqeuclidean":
+    if _in_unstable_openblas_configuration() and metric in {
+        "fast_sqeuclidean",
+        "fast_euclidean",
+    }:
         pytest.xfail(
-            "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration"
+            "OpenBLAS (used for 'fast_(sq)euclidean') is unstable in this configuration"
         )
 
     rng = np.random.RandomState(seed)
@@ -359,9 +362,12 @@ def test_radius_neighborhood_strategies_consistency(
 ):
     # RadiusNeighborhood results obtained using both parallelization strategies
     # must be identical
-    if _in_unstable_openblas_configuration() and metric == "fast_sqeuclidean":
+    if _in_unstable_openblas_configuration() and metric in {
+        "fast_sqeuclidean",
+        "fast_euclidean",
+    }:
         pytest.xfail(
-            "OpenBLAS (used for 'fast_sqeuclidean') is unstable in this configuration"
+            "OpenBLAS (used for 'fast_(sq)euclidean') is unstable in this configuration"
         )
 
     rng = np.random.RandomState(seed)
@@ -427,7 +433,7 @@ def test_fast_sqeuclidean_correctness(
     eucl_dist, eucl_indices = ArgKmin.get_for(X, Y, k, metric="euclidean").compute(
         return_distance=True
    )
-    fse_dist, fse_indices = ArgKmin.get_for(X, Y, k, metric="fast_sqeuclidean").compute(
+    fse_dist, fse_indices = ArgKmin.get_for(X, Y, k, metric="fast_euclidean").compute(
         return_distance=True
     )
 
@@ -437,7 +443,7 @@ def test_fast_sqeuclidean_correctness(
         X, Y, radius, metric="euclidean"
     ).compute(return_distance=True)
     fse_dist, fse_indices = RadiusNeighborhood.get_for(
-        X, Y, radius, metric="fast_sqeuclidean"
+        X, Y, radius, metric="fast_euclidean"
     ).compute(return_distance=True)
 
     assert_radius_neighborhood_results_equality(
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index c606deb3c839d..6da50ac4fa4aa 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -51,6 +51,7 @@
     "correlation",
     "cosine",
     "dice",
+    "fast_euclidean",
     "fast_sqeuclidean",
     "hamming",
     "jaccard",
@@ -334,7 +335,7 @@ def __init__(
         radius=None,
         algorithm="auto",
         leaf_size=30,
-        metric="fast_sqeuclidean",
+        metric="minkowski",
         p=2,
         metric_params=None,
         n_jobs=None,
@@ -363,9 +364,15 @@ def _check_algorithm_metric(self):
         else:
             alg_check = self.algorithm
 
-        if alg_check != "brute" and self.metric == "fast_sqeuclidean":
-            # The fast euclidean alternative is only available for 'brute'.
-            self.metric = "euclidean"
+        if alg_check != "brute" and self.metric in {"fast_sqeuclidean", "fast_euclidean"}:
+            alternative = self.metric.replace("fast_", "")
+            warnings.warn(
+                f"'{self.metric}' is only available for algorithm='brute', falling "
+                f"back on metric='{alternative}'.",
+                UserWarning,
+                stacklevel=3,
+            )
+            self.metric = alternative
 
         if callable(self.metric):
             if self.algorithm == "kd_tree":
                 # callable metric is only valid for brute force and ball_tree
@@ -505,10 +512,11 @@ def _fit(self, X, y=None):
         if issparse(X):
             if self.algorithm not in ("auto", "brute"):
                 warnings.warn("cannot use tree with sparse input: using brute force")
-            if self.metric == "fast_sqeuclidean":
-                # TODO: support sparse datasets
-                # The fast euclidean alternative is only available for dense datasets.
-                self.effective_metric_ = "euclidean"
+
+            if self.metric in {"fast_sqeuclidean", "fast_euclidean"}:
+                # The fast alternatives are only available for dense datasets.
+ self.effective_metric_ = self.effective_metric_.replace("fast_", "") + if self.effective_metric_ not in VALID_METRICS_SPARSE[ "brute" ] and not callable(self.effective_metric_): @@ -552,6 +560,8 @@ def _fit(self, X, y=None): else: self._fit_method = "brute" + specialised_metrics = {"euclidean", "sqeuclidean"} + if self._fit_method == "ball_tree": self._tree = BallTree( X, @@ -567,6 +577,13 @@ def _fit(self, X, y=None): **self.effective_metric_params_, ) elif self._fit_method == "brute": + if ( + self.effective_metric_ in specialised_metrics + and self.metric not in specialised_metrics + ): + # In that case, the standard stabler metric has not been explicitly + # specified by the user, so we prefer its fast alternative. + self.effective_metric_ = f"fast_{self.effective_metric_}" self._tree = None else: raise ValueError("algorithm = '%s' not recognized" % self.algorithm) @@ -756,11 +773,11 @@ class from an array representing our data set and ask who's elif self._fit_method == "brute": # TODO: support sparse matrices - # When ArgKmin is not supported and when the - # user asked for "fast_sqeuclidean", we need to - # revert to "euclidean" - if self.effective_metric_ == "fast_sqeuclidean": - self.effective_metric_ = "euclidean" + # When ArgKmin is not supported and when the user ask for a + # fast alternative, we need to revert to the standard. + if self.effective_metric_ in {"fast_sqeuclidean", "fast_euclidean"}: + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") reduce_func = partial( self._kneighbors_reduce_func, @@ -1083,6 +1100,11 @@ class from an array representing our data set and ask who's ) elif self._fit_method == "brute": + # When RadiusNeighborhood is not supported and when the user ask for a + # fast alternative, we need to revert to the standard. + if self.effective_metric_ in {"fast_sqeuclidean", "fast_euclidean"}: + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") reduce_func = partial( self._radius_neighbors_reduce_func, diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 8b9ba5b5bdf68..a3044e1bf8118 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -64,11 +64,10 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='fast_sqeuclidean' - The distance metric to use for the tree. The default distance is - 'fast_sqeuclidean' as fast alternative for the Euclidean distance - metric. If exact results are needed, prefer 'euclidean'. - For a list of available metrics, see the documentation of + metric : str or callable, default='minkowski' + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. 
X may be a :term:`sparse graph`, @@ -161,7 +160,7 @@ def __init__( algorithm="auto", leaf_size=30, p=2, - metric="fast_sqeuclidean", + metric="minkowski", metric_params=None, n_jobs=None, ): @@ -215,6 +214,7 @@ def predict(self, X): X = self._validate_data(X, accept_sparse="csr", reset=False) neigh_dist, neigh_ind = self.kneighbors(X) + classes_ = self.classes_ _y = self._y if not self.outputs_2d_: @@ -346,11 +346,10 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='fast_sqeuclidean' - the distance metric to use for the tree. The default distance is - 'fast_sqeuclidean' as fast alternative for the Euclidean distance - metric. If exact results are needed, prefer 'euclidean'. - For a list of available metrics, see the documentation of + metric : str or callable, default='minkowski' + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, @@ -447,7 +446,7 @@ def __init__( algorithm="auto", leaf_size=30, p=2, - metric="fast_sqeuclidean", + metric="minkowski", outlier_label=None, metric_params=None, n_jobs=None, @@ -601,6 +600,7 @@ def predict_proba(self, X): n_queries = _num_samples(X) neigh_dist, neigh_ind = self.radius_neighbors(X) + outlier_mask = np.zeros(n_queries, dtype=bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] outliers = np.flatnonzero(outlier_mask) diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 07e0f63ba20f0..93a385fb30ad7 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -40,7 +40,7 @@ def kneighbors_graph( n_neighbors, *, mode="connectivity", - metric="fast_sqeuclidean", + metric="minkowski", p=2, metric_params=None, include_self=False, @@ -64,11 +64,10 @@ def kneighbors_graph( matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - metric : str, default='fast_sqeuclidean' - The distance metric used to calculate the k nearest neighbors for - each sample point. The default distance is - 'fast_sqeuclidean' as fast alternative for the Euclidean distance - metric. If exact results are needed, prefer 'euclidean'. + metric : str, default='minkowski' + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. @@ -131,7 +130,7 @@ def radius_neighbors_graph( radius, *, mode="connectivity", - metric="fast_sqeuclidean", + metric="minkowski", p=2, metric_params=None, include_self=False, @@ -158,11 +157,10 @@ def radius_neighbors_graph( matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - metric : str, default='fast_sqeuclidean' - The distance metric used to calculate the neighbors within a - given radius for each sample point. The default distance is - 'fast_sqeuclidean' as fast alternative for the Euclidean distance - metric. If exact results are needed, prefer 'euclidean'. 
+ metric : str, default='minkowski' + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 9c340a5dfbb16..b7040d969769a 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -160,7 +160,7 @@ def __init__( algorithm="auto", leaf_size=30, p=2, - metric="fast_sqeuclidean", + metric="minkowski", metric_params=None, n_jobs=None, ): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index e55b585aeb758..f5a89e7253ed7 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -38,10 +38,10 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): required to store the tree. The optimal value depends on the nature of the problem. - metric : str or callable, default='fast_sqeuclidean' - The distance metric to use for the tree. The default distance is - 'fast_sqeuclidean' as fast alternative for the Euclidean distance - metric. If exact results are needed, prefer 'euclidean'. + metric : str or callable, default='minkowski' + the distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. For a list of available metrics, see the documentation of For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and @@ -131,7 +131,7 @@ def __init__( radius=1.0, algorithm="auto", leaf_size=30, - metric="fast_sqeuclidean", + metric="minkowski", p=2, metric_params=None, n_jobs=None, From 6427883b8ed87191762861dab6dca751c397169b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Sep 2021 16:32:01 +0200 Subject: [PATCH 176/290] [WIP] Use 'fast_sqeuclidean' instead when possible in KNeighborsMixins --- sklearn/neighbors/_classification.py | 43 ++++++++++++++++++++--- sklearn/neighbors/_regression.py | 21 +++++++++-- sklearn/neighbors/tests/test_neighbors.py | 4 ++- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index a3044e1bf8118..664cff85aa16d 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -11,7 +11,7 @@ import numpy as np from scipy import stats from ..utils.extmath import weighted_mode -from ..utils.validation import _is_arraylike, _num_samples +from ..utils.validation import _is_arraylike, _num_samples, check_is_fitted import warnings from ._base import _check_weights, _get_weights @@ -211,9 +211,20 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + X = self._validate_data(X, accept_sparse="csr", reset=False) - neigh_dist, neigh_ind = self.kneighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. 
+ self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) classes_ = self.classes_ _y = self._y @@ -256,9 +267,20 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + X = self._validate_data(X, accept_sparse="csr", reset=False) - neigh_dist, neigh_ind = self.kneighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) classes_ = self.classes_ _y = self._y @@ -595,11 +617,24 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) n_queries = _num_samples(X) - neigh_dist, neigh_ind = self.radius_neighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + original_radius = self.radius + self.effective_metric_ = "fast_sqeuclidean" + self.radius = original_radius * original_radius + neigh_dist, neigh_ind = self.radius_neighbors(X) + self.radius = original_radius + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.zeros(n_queries, dtype=bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index b7040d969769a..a908437a5dd8e 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -228,7 +228,14 @@ def predict(self, X): """ X = self._validate_data(X, accept_sparse="csr", reset=False) - neigh_dist, neigh_ind = self.kneighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) weights = _get_weights(neigh_dist, self.weights) @@ -438,7 +445,17 @@ def predict(self, X): """ X = self._validate_data(X, accept_sparse="csr", reset=False) - neigh_dist, neigh_ind = self.radius_neighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. 
+ original_radius = self.radius + self.effective_metric_ = "fast_sqeuclidean" + self.radius = original_radius * original_radius + neigh_dist, neigh_ind = self.radius_neighbors(X) + self.radius = original_radius + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.radius_neighbors(X) weights = _get_weights(neigh_dist, self.weights) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 90d508d80420d..d2d4cf2e5bb59 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1267,8 +1267,10 @@ def test_neighbors_badargs(): with pytest.raises(ValueError): est.fit(X, y) + # Raise an error if predicting on an unfitted estimator nbrs = cls(algorithm="ball_tree", metric="haversine") - with pytest.raises(ValueError): + msg = "has no attribute 'effective_metric_'" + with pytest.raises(AttributeError, match=msg): nbrs.predict(X) with pytest.raises(ValueError): ignore_warnings(nbrs.fit(Xsparse, y)) From c865dc69ff53fdf209f426f6a481d36389eec3cc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 2 Sep 2021 09:01:29 +0200 Subject: [PATCH 177/290] fixup! Introduce 'fast_euclidean' and adapt KNeighborsMixins accordingly --- sklearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index d2eb43d7bcf2a..e330c9380e433 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -459,7 +459,7 @@ def test_pairwise_distances_argmin_min(): idx, vals = pairwise_distances_argmin_min( X, Y, - metric="fast_sqeuclidean", + metric="fast_euclidean", ) assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals) From be6741bda2b14e493486024df30c0d49e1b72cfe Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 2 Sep 2021 09:02:44 +0200 Subject: [PATCH 178/290] fixup! [WIP] Use 'fast_sqeuclidean' instead when possible in KNeighborsMixins --- sklearn/neighbors/_regression.py | 9 +++++++++ sklearn/neighbors/tests/test_neighbors.py | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index a908437a5dd8e..5ea2db7ce4d21 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,6 +18,7 @@ from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin from ..utils.deprecation import deprecated +from ..utils.validation import check_is_fitted class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): @@ -226,6 +227,10 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. """ + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + X = self._validate_data(X, accept_sparse="csr", reset=False) if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": @@ -443,6 +448,10 @@ def predict(self, X): dtype=double Target values. 
""" + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + X = self._validate_data(X, accept_sparse="csr", reset=False) if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index d2d4cf2e5bb59..90d508d80420d 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1267,10 +1267,8 @@ def test_neighbors_badargs(): with pytest.raises(ValueError): est.fit(X, y) - # Raise an error if predicting on an unfitted estimator nbrs = cls(algorithm="ball_tree", metric="haversine") - msg = "has no attribute 'effective_metric_'" - with pytest.raises(AttributeError, match=msg): + with pytest.raises(ValueError): nbrs.predict(X) with pytest.raises(ValueError): ignore_warnings(nbrs.fit(Xsparse, y)) From e8664dfc183015640c886e8fb3ae8de17364481e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 2 Sep 2021 12:05:06 +0200 Subject: [PATCH 179/290] Fix NearestNeighbors docstring for Numpydoc --- sklearn/neighbors/_binary_tree.pxi | 2 +- sklearn/neighbors/_unsupervised.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 3a55219cf97c7..32a907d1c6dea 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -225,7 +225,7 @@ leaf_size : positive int, default=40 the case that ``n_samples < leaf_size``. metric : str or DistanceMetric object - the distance metric to use for the tree. Default='minkowski' + The distance metric to use for the tree. Default='minkowski' with p=2 (that is, a euclidean metric). See the documentation of the DistanceMetric class for a list of available metrics. {binary_tree}.valid_metrics gives a list of the metrics which diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index f5a89e7253ed7..440ac41eb71d5 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -39,10 +39,9 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): nature of the problem. metric : str or callable, default='minkowski' - the distance metric to use for the tree. The default metric is + The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. For a list of available metrics, see the documentation of - For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. 
X may be a :term:`sparse graph`, From 339ab30af2cd476bc851819519e3e2d11d851617 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 2 Sep 2021 17:03:02 +0200 Subject: [PATCH 180/290] Use metric="fast_sqeuclidean" for pairwise_distances_argmin internal calls --- sklearn/cluster/_affinity_propagation.py | 4 +++- sklearn/cluster/_mean_shift.py | 4 +++- sklearn/metrics/pairwise.py | 7 +++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 85f86b4d5e497..cbabe5da4995b 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -523,7 +523,9 @@ def predict(self, X): if self.cluster_centers_.shape[0] > 0: with config_context(assume_finite=True): - return pairwise_distances_argmin(X, self.cluster_centers_) + return pairwise_distances_argmin( + X, self.cluster_centers_, metric="fast_sqeuclidean" + ) else: warnings.warn( "This model does not have any cluster centers " diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 9fc260485600b..cd49ee075e95b 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -504,4 +504,6 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, reset=False) with config_context(assume_finite=True): - return pairwise_distances_argmin(X, self.cluster_centers_) + return pairwise_distances_argmin( + X, self.cluster_centers_, metric="fast_sqeuclidean" + ) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a767280ea4a35..5a207d73cee0b 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -662,10 +662,9 @@ def pairwise_distances_argmin_min( else: # TODO: once ArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. - # When ArgKmin is not supported and when the - # user asked for "fast_sqeuclidean", we need to - # revert to "euclidean" - if metric == "fast_sqeuclidean": + # When ArgKmin is not supported and when the user asked for + # a fast alternative, we need to revert to the standard one. + if metric in {"fast_sqeuclidean", "fast_euclidean"}: metric = "euclidean" indices, values = zip( From f9e337c3850f23ebde229b0e5104f53e77d4b00c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 3 Sep 2021 11:27:46 +0200 Subject: [PATCH 181/290] Add n_threads on PairwiseDistancesReduction --- .../metrics/_pairwise_distances_reduction.pyx | 81 +++++++++++++++---- 1 file changed, 67 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index e8ba9cf7cc338..2bdd61c25c958 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -167,7 +167,7 @@ cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ##################### cdef class PairwiseDistancesReduction: - """Abstract class computing a reduction on pairwise + f"""Abstract class computing a reduction on pairwise distances between a set of vectors (rows) X and another set of vectors (rows) of Y. @@ -179,13 +179,21 @@ cdef class PairwiseDistancesReduction: datasets_pair: DatasetsPair The pair of dataset to use. - chunk_size: int + chunk_size: int, default={CHUNK_SIZE} The number of vectors per chunk. + + n_threads: int, default=-1 + The number of OpenMP threads to use for the reduction. 
+ Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. + + -1 means using all processors. """ cdef: DatasetsPair _datasets_pair + ITYPE_t n_threads ITYPE_t effective_omp_n_thread ITYPE_t n_samples_chunk, chunk_size @@ -222,15 +230,28 @@ cdef class PairwiseDistancesReduction: def __init__(self, DatasetsPair datasets_pair, - ITYPE_t chunk_size = CHUNK_SIZE, - ): + ITYPE_t chunk_size=CHUNK_SIZE, + n_threads=-1, + ): cdef: ITYPE_t X_n_full_chunks, Y_n_full_chunks - self.effective_omp_n_thread = _openmp_effective_n_threads() - check_scalar(chunk_size, "chunk_size", Integral, min_val=1) self.chunk_size = chunk_size + + if n_threads is None: + # By convention. + n_threads = -1 + + self.n_threads = n_threads + + if self.n_threads == -1: + # Using all possible cores + self.effective_omp_n_thread = _openmp_effective_n_threads() + else: + check_scalar(self.n_threads, "n_threads", Integral, min_val=1) + self.effective_omp_n_thread = self.n_threads + self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) self._datasets_pair = datasets_pair @@ -455,6 +476,13 @@ cdef class ArgKmin(PairwiseDistancesReduction): chunk_size: int The number of vectors per chunk. + + n_threads: int, default=-1 + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~ArgKmin.compute`. + + -1 means using all processors. """ cdef: @@ -475,8 +503,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): str metric="fast_euclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), + n_threads=-1, ) -> ArgKmin: - """Return the ArgKmin implementation for the given arguments. + f"""Return the ArgKmin implementation for the given arguments. Parameters ---------- @@ -495,12 +524,19 @@ cdef class ArgKmin(PairwiseDistancesReduction): For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size: int, default=256, + chunk_size: int, default={CHUNK_SIZE}, The number of vectors per chunk. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. + n_threads: int, default=-1 + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~ArgKmin.compute`. + + -1 means using all processors. + Returns ------- argkmin: ArgKmin @@ -522,9 +558,10 @@ cdef class ArgKmin(PairwiseDistancesReduction): def __init__(self, DatasetsPair datasets_pair, ITYPE_t k, - ITYPE_t chunk_size = CHUNK_SIZE, + ITYPE_t chunk_size=CHUNK_SIZE, + n_threads=-1, ): - PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) + PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size, n_threads) check_scalar(k, "k", Integral, min_val=1) self.k = k @@ -930,6 +967,13 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): chunk_size: int The number of vectors per chunk. + + n_threads: int, default=-1 + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~RadiusNeighborhood.compute`. + + -1 means using all processors. 
""" cdef: @@ -972,8 +1016,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): str metric="fast_euclidean", ITYPE_t chunk_size=CHUNK_SIZE, dict metric_kwargs=dict(), + n_threads=-1, ) -> RadiusNeighborhood: - """Return the RadiusNeighborhood implementation for the given arguments. + f"""Return the RadiusNeighborhood implementation for the given arguments. Parameters ---------- @@ -992,12 +1037,19 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size: int, default=256, + chunk_size: int, default={CHUNK_SIZE}, The number of vectors per chunk. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. + n_threads: int, default=-1 + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~RadiusNeighborhood.compute`. + + -1 means using all processors. + Returns ------- radius_neighborhood: RadiusNeighborhood @@ -1020,9 +1072,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): def __init__(self, DatasetsPair datasets_pair, DTYPE_t radius, - ITYPE_t chunk_size = CHUNK_SIZE, + ITYPE_t chunk_size=CHUNK_SIZE, + n_threads=-1, ): - PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size) + PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size, n_threads) check_scalar(radius, "radius", Real, min_val=0) self.radius = radius From d14af8e33ce93f1a0e40f65db007b5aa7ec5b6de Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 3 Sep 2021 11:31:26 +0200 Subject: [PATCH 182/290] Pass n_threads on PairwiseDistancesReduction calls --- sklearn/neighbors/_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 6da50ac4fa4aa..e5dd4555cfd9c 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -766,6 +766,7 @@ class from an array representing our data set and ask who's k=n_neighbors, metric=self.effective_metric_, metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, ).compute( strategy="auto", return_distance=return_distance, @@ -1093,6 +1094,7 @@ class from an array representing our data set and ask who's radius=radius, metric=self.effective_metric_, metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, ).compute( strategy="auto", return_distance=return_distance, From e8f04680d677fe8b9e2066ab8f9b9ee4e0826c88 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 3 Sep 2021 11:48:50 +0200 Subject: [PATCH 183/290] Factorise tests and add another for n_threads agnosticism --- .../test_pairwise_distances_reduction.py | 135 +++++++----------- 1 file changed, 53 insertions(+), 82 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c7a6d20b48699..6dd752865effa 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -65,6 +65,12 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): ) +ASSERT_RESULT = { + ArgKmin: assert_argkmin_results_equality, + RadiusNeighborhood: assert_radius_neighborhood_results_equality, +} + + def test_pairwise_distances_reduction_is_usable_for(): rng = np.random.RandomState(1) X = rng.rand(100, 10) @@ -236,78 +242,91 @@ def 
test_pairwise_distances_reduction_factory_method( @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) -def test_argkmin_chunk_size_agnosticism( +@pytest.mark.parametrize("PairwiseDistancesReduction", [ArgKmin, RadiusNeighborhood]) +def test_chunk_size_agnosticism( + PairwiseDistancesReduction, seed, n_samples, chunk_size, - k=10, metric="fast_euclidean", n_features=100, dtype=np.float64, ): - # ArgKmin results should not depend on the chunk size + # Results should not depend on the chunk size rng = np.random.RandomState(seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread - ref_dist, ref_indices = ArgKmin.get_for(X, Y, k=k, metric="euclidean").compute( - return_distance=True + parameter = ( + 10 + if PairwiseDistancesReduction is ArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) ) - dist, indices = ArgKmin.get_for( - X, Y, k=k, metric=metric, chunk_size=chunk_size + ref_dist, ref_indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric="euclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric=metric, chunk_size=chunk_size ).compute(return_distance=True) - assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices) + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) -def test_radius_neighborhood_chunk_size_agnosticism( +@pytest.mark.parametrize("PairwiseDistancesReduction", [ArgKmin, RadiusNeighborhood]) +def test_n_threads_agnosticism( + PairwiseDistancesReduction, seed, n_samples, chunk_size, - radius=10.0, metric="fast_euclidean", n_features=100, dtype=np.float64, ): - # RadiusNeighborhood results should not depend on the chunk size + # Results should not depend on the number of threads rng = np.random.RandomState(seed) spread = 100 - - # Scaling the radius with the dimensions - scaled_radius = radius * np.log(n_features) X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread - ref_dist, ref_indices = RadiusNeighborhood.get_for( - X, Y, radius=scaled_radius, metric="euclidean" + parameter = ( + 10 + if PairwiseDistancesReduction is ArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric="euclidean" ).compute(return_distance=True) - dist, indices = RadiusNeighborhood.get_for( - X, Y, radius=scaled_radius, metric=metric, chunk_size=chunk_size + dist, indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric=metric, n_threads=1 ).compute(return_distance=True) - assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices) + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) -def test_argkmin_strategies_consistency( +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) 
+@pytest.mark.parametrize("PairwiseDistancesReduction", [ArgKmin, RadiusNeighborhood]) +def test_strategies_consistency( + PairwiseDistancesReduction, metric, n_samples, seed, n_features=10, - k=10, dtype=np.float64, ): - # ArgKmin results obtained using both parallelization strategies - # must be identical + # Results obtained using both parallelization strategies must be identical if _in_unstable_openblas_configuration() and metric == { "fast_sqeuclidean", "fast_euclidean", @@ -326,80 +345,32 @@ def test_argkmin_strategies_consistency( X = X[:, :2] Y = Y[:, :2] - argkmin_reduction = ArgKmin.get_for( - X, - Y, - k=k, - metric=metric, - metric_kwargs=get_dummy_metric_kwargs(metric, n_features), - # To be sure to use parallelization - chunk_size=n_samples // 4, - ) - - dist_par_X, indices_par_X = argkmin_reduction.compute( - strategy="parallel_on_X", return_distance=True - ) - - dist_par_Y, indices_par_Y = argkmin_reduction.compute( - strategy="parallel_on_Y", return_distance=True - ) - - assert_argkmin_results_equality( - dist_par_X, dist_par_Y, indices_par_X, indices_par_Y + parameter = ( + 10 + if PairwiseDistancesReduction is ArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) ) - -@pytest.mark.parametrize("seed", range(5)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("metric", RadiusNeighborhood.valid_metrics()) -def test_radius_neighborhood_strategies_consistency( - seed, - n_samples, - metric, - n_features=10, - radius=10.0, - dtype=np.float64, -): - # RadiusNeighborhood results obtained using both parallelization strategies - # must be identical - if _in_unstable_openblas_configuration() and metric == { - "fast_sqeuclidean", - "fast_euclidean", - }: - pytest.xfail( - "OpenBLAS (used for 'fast_(sq)euclidean') is unstable in this configuration" - ) - - rng = np.random.RandomState(seed) - spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread - Y = rng.rand(n_samples, n_features).astype(dtype) * spread - - # Haversine distance only accepts 2D data - if metric == "haversine": - X = X[:, :2] - Y = Y[:, :2] - - radius_neigh_reduction = RadiusNeighborhood.get_for( + pairwise_distances_reduction = PairwiseDistancesReduction.get_for( X, Y, - # Scaling the radius with the dimensions - radius=radius ** np.log(n_features), + parameter, metric=metric, metric_kwargs=get_dummy_metric_kwargs(metric, n_features), # To be sure to use parallelization chunk_size=n_samples // 4, ) - dist_par_X, indices_par_X = radius_neigh_reduction.compute( + dist_par_X, indices_par_X = pairwise_distances_reduction.compute( strategy="parallel_on_X", return_distance=True ) - dist_par_Y, indices_par_Y = radius_neigh_reduction.compute( + dist_par_Y, indices_par_Y = pairwise_distances_reduction.compute( strategy="parallel_on_Y", return_distance=True ) - assert_radius_neighborhood_results_equality( + ASSERT_RESULT[PairwiseDistancesReduction]( dist_par_X, dist_par_Y, indices_par_X, indices_par_Y ) From 7e5775c8575a3f2543729fad279ea31c9a452bda Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 3 Sep 2021 11:52:11 +0200 Subject: [PATCH 184/290] Add docstring for RadiusNeighborhood.compute and improve others --- .../metrics/_pairwise_distances_reduction.pyx | 91 +++++++++++++++---- 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 2bdd61c25c958..21ce1447524cd 100644 --- 
a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -212,7 +212,8 @@ cdef class PairwiseDistancesReduction: "hamming", *BOOL_METRICS, } - return sorted({"fast_euclidean", "fast_sqeuclidean", *METRIC_MAPPING.keys()}.difference(excluded)) + return sorted({"fast_euclidean", "fast_sqeuclidean", + *METRIC_MAPPING.keys()}.difference(excluded)) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -280,6 +281,11 @@ cdef class PairwiseDistancesReduction: """Computes the reduction of each vector (row) of X on Y by parallelizing computation on chunks of X. + This strategy dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation + but is less used in practice (because X is smaller than Y generally). + Private datastructures are modified internally by threads. Private template methods can be implemented on subclasses to @@ -337,6 +343,12 @@ cdef class PairwiseDistancesReduction: """Computes the reduction of each vector (row) of X on Y by parallelizing computation on chunks of Y. + This strategy dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. However it is more useful in practice (because Y is + larger than X generally). + Private datastructures are modified internally by threads. Private template methods can be implemented on subclasses to @@ -724,25 +736,30 @@ cdef class ArgKmin(PairwiseDistancesReduction): @final def compute(self, - str strategy = "auto", - bint return_distance = False + str strategy="auto", + bint return_distance=False, ): """Computes the reduction of vectors (rows) of X on Y. Parameters ---------- - strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'} - The chunking strategy defining which dataset - parallelization are made on. - - - 'parallel_on_X' is embarassingly parallel but - is less used in practice. - - 'parallel_on_Y' comes with synchronisation but - is more useful in practice. - -'auto' relies on a simple heuristic to choose - between 'parallel_on_X' and 'parallel_on_Y'. - - return_distance: boolean + strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default='auto' + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation + but is less used in practice (because X is smaller than Y generally). + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. However it is more useful in practice (because Y is + larger than X generally). + -'auto' relies on a simple heuristic to choose between 'parallel_on_X' + and 'parallel_on_Y'. + + return_distance: boolean, default=False Return distances between each X vector and its argkmin if set to True. @@ -753,7 +770,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): in Y. Only returned if ``return_distance=True``. indices: ndarray of shape (n, k) - Indices of each X vector argkmin in Y. + Indices of argkmin of vectors of X in Y. 
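A hedged usage sketch of the `get_for`/`compute` API documented above; the import path is assumed from the test module layout, and `fast_euclidean` follows the `get_for` default:

    import numpy as np
    from sklearn.metrics._pairwise_distances_reduction import ArgKmin  # assumed path

    rng = np.random.RandomState(0)
    X = rng.rand(1000, 50)  # float64 queries
    Y = rng.rand(2000, 50)  # float64 index set

    argkmin = ArgKmin.get_for(X, Y, k=10, metric="fast_euclidean")
    distances, indices = argkmin.compute(strategy="parallel_on_Y", return_distance=True)
    assert distances.shape == indices.shape == (1000, 10)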
""" # Results returned by ArgKmin.compute used as the main heaps @@ -1058,8 +1075,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): # This factory comes to handle specialisations. if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y): use_squared_distances = metric == "fast_sqeuclidean" - return FastEuclideanRadiusNeighborhood(X=X, Y=Y, - radius=radius, + return FastEuclideanRadiusNeighborhood(X=X, Y=Y, radius=radius, use_squared_distances=use_squared_distances, chunk_size=chunk_size) @@ -1248,6 +1264,45 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint return_distance = False, bint sort_results = False ): + """Computes the reduction of vectors (rows) of X on Y. + + Parameters + ---------- + strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default='auto' + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation + but is less used in practice (because X is smaller than Y generally). + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. However it is more useful in practice (because Y is + larger than X generally). + -'auto' relies on a simple heuristic to choose between 'parallel_on_X' + and 'parallel_on_Y'. + + return_distance: boolean, default=False + Return distances between each X vector and its + neighbors if set to True. + + sort_results: boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + return_distance must be True if sort_results is set to True. + + Returns + ------- + distances: ndarray of shape (n, k) + Distances between each X vector and its neighbors + in Y. Only returned if ``return_distance=True``. + + indices: ndarray of shape (n, k) + Indices of each neighbor of vectors of X in Y. + """ if sort_results and not return_distance: raise ValueError("return_distance must be True " "if sort_results is True.") From e9803de82498063ca3d582134cfc8d354609ddd2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 6 Sep 2021 13:50:43 +0200 Subject: [PATCH 185/290] Fix conjugation Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 21ce1447524cd..b4f5c97a591ac 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -124,7 +124,7 @@ cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): """Create a numpy ndarray given a C++ vector. The numpy array buffer is the one of the C++ vector. - A StdVectorSentinel is registers as the base object for the numpy array, + A StdVectorSentinel is registered as the base object for the numpy array, freeing the C++ vector it encapsulates when the numpy array is freed. 
""" typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE From fb05746bad79995085f54387585de5ddfe5f48fb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 6 Sep 2021 13:52:42 +0200 Subject: [PATCH 186/290] Use correct wording Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index b4f5c97a591ac..ee00d708ba587 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -141,7 +141,7 @@ cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): else: sentinel = StdVectorSentinelITYPE.create_for(vect_ptr) - # Makes the numpy array responsible to the life-cycle of its buffer. + # Makes the numpy array responsible of the life-cycle of its buffer. # A reference to the StdVectorSentinel will be stolen by the call bellow, # so we increase its reference counter. # See: https://docs.python.org/3/c-api/intro.html#reference-count-details From 5f14488aabcc88ad9b6d750c0c4bdf6b5032321b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 6 Sep 2021 14:51:48 +0200 Subject: [PATCH 187/290] =?UTF-8?q?Format=20code=20=C3=A0=20la=20black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pxd | 26 ++- sklearn/metrics/_dist_metrics.pyx | 73 ++++--- .../metrics/_pairwise_distances_reduction.pyx | 193 ++++++++++++------ 3 files changed, 185 insertions(+), 107 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 146629bb0311b..ed9ad982274d2 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -67,17 +67,21 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t sparse_dist(self, const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1 - - cdef DTYPE_t sparse_rdist(self, const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1 + cdef DTYPE_t sparse_dist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1 + + cdef DTYPE_t sparse_rdist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1 cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index cfe17b6ee599e..88db523c03aed 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -321,11 +321,13 @@ cdef class DistanceMetric: """ return self.dist(x1, x2, size) - cdef DTYPE_t sparse_dist(self, const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1: + cdef DTYPE_t sparse_dist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1: """Compute the reduced distance between vectors x1 and x2 given non null coordinates and their corresponding indices. 
@@ -333,11 +335,13 @@ cdef class DistanceMetric: """ return -999 - cdef DTYPE_t sparse_rdist(self, const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1: + cdef DTYPE_t sparse_rdist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1: """Compute the reduced distance between vectors x1 and x2 given non null coordinates and their corresponding indices. @@ -1220,7 +1224,8 @@ cdef class DatasetsPair: # The `distance_metric` attribute is defined in _dist_metrics.pxd @classmethod - def get_for(cls, + def get_for( + cls, X, Y, str metric="euclidean", @@ -1251,8 +1256,10 @@ cdef class DatasetsPair: The suited DatasetsPair implementation. """ cdef: - DistanceMetric distance_metric = DistanceMetric.get_metric(metric, - **metric_kwargs) + DistanceMetric distance_metric = DistanceMetric.get_metric( + metric, + **metric_kwargs + ) # check_array can be expensive, and we prefer to simply coerce from lists # to ndarrays eventually to get their dtype itemsize @@ -1418,10 +1425,12 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): ITYPE_t yj_start = self.Y_indptr[j] ITYPE_t yj_end = self.Y_indptr[j + 1] - return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y_data[yj_start:yj_end], - self.Y_indices[yj_start:yj_end]) + return self.distance_metric.sparse_rdist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y_data[yj_start:yj_end], + self.Y_indices[yj_start:yj_end], + ) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: @@ -1431,10 +1440,12 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): ITYPE_t yj_start = self.Y_indptr[j] ITYPE_t yj_end = self.Y_indptr[j + 1] - return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y_data[yj_start:yj_end], - self.Y_indices[yj_start:yj_end]) + return self.distance_metric.sparse_dist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y_data[yj_start:yj_end], + self.Y_indices[yj_start:yj_end] + ) @final cdef class SparseDenseDatasetsPair(DatasetsPair): @@ -1498,10 +1509,12 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): # https://github.com/scikit-learn/scikit-learn/issues/17299 # Ideally, we could pass pointers and indices and access elements # then in distance_metric.dist - return self.distance_metric.sparse_rdist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y[j, :], - self.Y_indices) + return self.distance_metric.sparse_rdist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices + ) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: @@ -1510,10 +1523,12 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): ITYPE_t xi_end = self.X_indptr[i + 1] # TODO: same as previous comment - return self.distance_metric.sparse_dist(self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y[j, :], - self.Y_indices) + return self.distance_metric.sparse_dist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices + ) @final cdef class DenseSparseDatasetsPair(DatasetsPair): diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index ee00d708ba587..1bad4787fe942 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ 
b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -152,7 +152,7 @@ cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( vector_vector_DITYPE_t* vecs - ): +): """Coerce a std::vector of std::vector to a ndarray of ndarray.""" cdef: ITYPE_t n = deref(vecs).size() @@ -229,7 +229,8 @@ cdef class PairwiseDistancesReduction: def datasets_pair(self) ->DatasetsPair: return self._datasets_pair - def __init__(self, + def __init__( + self, DatasetsPair datasets_pair, ITYPE_t chunk_size=CHUNK_SIZE, n_threads=-1, @@ -401,7 +402,8 @@ cdef class PairwiseDistancesReduction: # Placeholder methods which have to be implemented - cdef void _reduce_on_chunks(self, + cdef void _reduce_on_chunks( + self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -417,13 +419,15 @@ cdef class PairwiseDistancesReduction: """Convert proxy distances to exact distances or recompute them.""" return - cdef void _on_X_parallel_init(self, + cdef void _on_X_parallel_init( + self, ITYPE_t thread_num, ) nogil: """Allocate datastructures used in a thread given its number.""" return - cdef void _on_X_prange_iter_init(self, + cdef void _on_X_prange_iter_init( + self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, @@ -431,7 +435,8 @@ cdef class PairwiseDistancesReduction: """Initialise datastructures used in a thread given its number.""" return - cdef void _on_X_prange_iter_finalize(self, + cdef void _on_X_prange_iter_finalize( + self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, @@ -439,25 +444,29 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after a reduction on chunks.""" return - cdef void _on_X_parallel_finalize(self, + cdef void _on_X_parallel_finalize( + self, ITYPE_t thread_num ) nogil: """Interact with datastructures after executing all the reductions.""" return - cdef void _on_Y_init(self, + cdef void _on_Y_init( + self, ITYPE_t num_threads, ) nogil: """Allocate datastructures used in threads.""" return - cdef void _on_Y_parallel_init(self, + cdef void _on_Y_parallel_init( + self, ITYPE_t thread_num, ) nogil: """Initialise datastructures used in a thread given its number.""" return - cdef void _on_Y_after_parallel(self, + cdef void _on_Y_after_parallel( + self, ITYPE_t num_threads, ITYPE_t X_start, ITYPE_t X_end, @@ -465,7 +474,8 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after a threads parallel region.""" return - cdef void _on_Y_finalize(self, + cdef void _on_Y_finalize( + self, ITYPE_t num_threads, ) nogil: """Interact with datastructures after executing all the reductions.""" @@ -508,7 +518,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): ITYPE_t ** heaps_indices_chunks @classmethod - def get_for(cls, + def get_for( + cls, X, Y, ITYPE_t k, @@ -567,7 +578,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): chunk_size=chunk_size, ) - def __init__(self, + def __init__( + self, DatasetsPair datasets_pair, ITYPE_t k, ITYPE_t chunk_size=CHUNK_SIZE, @@ -584,9 +596,11 @@ cdef class ArgKmin(PairwiseDistancesReduction): # threads used for the reduction but there won't be allocated but unused # datastructures. 
self.heaps_proxy_distances_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_omp_n_thread) + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) self.heaps_indices_chunks = malloc( - sizeof(ITYPE_t *) * self.effective_omp_n_thread) + sizeof(ITYPE_t *) * self.effective_omp_n_thread + ) def __dealloc__(self): if self.heaps_indices_chunks is not NULL: @@ -595,7 +609,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): if self.heaps_proxy_distances_chunks is not NULL: free(self.heaps_proxy_distances_chunks) - cdef void _reduce_on_chunks(self, + cdef void _reduce_on_chunks( + self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -623,7 +638,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) @final - cdef void _on_X_prange_iter_init(self, + cdef void _on_X_prange_iter_init( + self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, @@ -634,7 +650,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] @final - cdef void _on_X_prange_iter_finalize(self, + cdef void _on_X_prange_iter_finalize( + self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, @@ -650,7 +667,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.k ) - cdef void _on_Y_init(self, + cdef void _on_Y_init( + self, ITYPE_t num_threads, ) nogil: cdef: @@ -664,12 +682,15 @@ cdef class ArgKmin(PairwiseDistancesReduction): # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. self.heaps_proxy_distances_chunks[thread_num] = malloc( - heaps_size * sizeof(DTYPE_t)) + heaps_size * sizeof(DTYPE_t) + ) self.heaps_indices_chunks[thread_num] = malloc( - heaps_size * sizeof(ITYPE_t)) + heaps_size * sizeof(ITYPE_t) + ) @final - cdef void _on_Y_parallel_init(self, + cdef void _on_Y_parallel_init( + self, ITYPE_t thread_num, ) nogil: # Initialising heaps (memset can't be used here) @@ -678,7 +699,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num][idx] = -1 @final - cdef void _on_Y_after_parallel(self, + cdef void _on_Y_after_parallel( + self, ITYPE_t num_threads, ITYPE_t X_start, ITYPE_t X_end, @@ -699,7 +721,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) - cdef void _on_Y_finalize(self, + cdef void _on_Y_finalize( + self, ITYPE_t num_threads, ) nogil: cdef: @@ -735,9 +758,10 @@ cdef class ArgKmin(PairwiseDistancesReduction): ) @final - def compute(self, - str strategy="auto", - bint return_distance=False, + def compute( + self, + str strategy="auto", + bint return_distance=False, ): """Computes the reduction of vectors (rows) of X on Y. 
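The `FastEuclidean*` hunks that follow reuse the chunked GEMM expansion of squared distances; a NumPy sketch of the same decomposition with a rough, unsorted top-k selection standing in for the heap-based reduction:

    import numpy as np

    def gemm_argkmin(X, Y, k):
        # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2; the ||x||^2 term is
        # constant per row of the result, so it can be dropped for ranking.
        Y_sq_norms = np.einsum("ij,ij->i", Y, Y)
        surrogate = -2 * (X @ Y.T) + Y_sq_norms            # shape (n_X, n_Y)
        indices = np.argpartition(surrogate, kth=k - 1, axis=1)[:, :k]
        surrogate_topk = np.take_along_axis(surrogate, indices, axis=1)
        # Recover exact distances: add ||x||^2 back, clip rounding negatives.
        sq_dist = surrogate_topk + np.einsum("ij,ij->i", X, X)[:, None]
        return np.sqrt(np.maximum(sq_dist, 0)), indices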
@@ -831,18 +855,21 @@ cdef class FastEuclideanArgKmin(ArgKmin): return (ArgKmin.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) - def __init__(self, + def __init__( + self, X, Y, ITYPE_t k, bint use_squared_distances=False, ITYPE_t chunk_size=CHUNK_SIZE, ): - ArgKmin.__init__(self, + ArgKmin.__init__( + self, # The datasets pair here is used for exact distances computations datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), k=k, - chunk_size=chunk_size) + chunk_size=chunk_size, + ) # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair @@ -853,7 +880,8 @@ cdef class FastEuclideanArgKmin(ArgKmin): # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_omp_n_thread) + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) def __dealloc__(self): if self.dist_middle_terms_chunks is not NULL: @@ -865,25 +893,29 @@ cdef class FastEuclideanArgKmin(ArgKmin): ArgKmin.compute_exact_distances(self) @final - cdef void _on_X_parallel_init(self, - ITYPE_t thread_num, + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, ) nogil: ArgKmin._on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) @final - cdef void _on_X_parallel_finalize(self, - ITYPE_t thread_num + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num ) nogil: ArgKmin._on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _on_Y_init(self, - ITYPE_t num_threads, + cdef void _on_Y_init( + self, + ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num ArgKmin._on_Y_init(self, num_threads) @@ -891,11 +923,13 @@ cdef class FastEuclideanArgKmin(ArgKmin): for thread_num in range(num_threads): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) @final - cdef void _on_Y_finalize(self, - ITYPE_t num_threads, + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num ArgKmin._on_Y_finalize(self, num_threads) @@ -904,7 +938,8 @@ cdef class FastEuclideanArgKmin(ArgKmin): free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _reduce_on_chunks(self, + cdef void _reduce_on_chunks( + self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -1026,7 +1061,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): bint sort_results @classmethod - def get_for(cls, + def get_for( + cls, X, Y, DTYPE_t radius, @@ -1085,7 +1121,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): chunk_size=chunk_size, ) - def __init__(self, + def __init__( + self, DatasetsPair datasets_pair, DTYPE_t radius, ITYPE_t chunk_size=CHUNK_SIZE, @@ -1104,9 +1141,11 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): # threads used for the reduction but there won't be allocated but unused # datastructures. 
self.neigh_distances_chunks = malloc( - sizeof(self.neigh_distances) * self.effective_omp_n_thread) + sizeof(self.neigh_distances) * self.effective_omp_n_thread + ) self.neigh_indices_chunks = malloc( - sizeof(self.neigh_indices) * self.effective_omp_n_thread) + sizeof(self.neigh_indices) * self.effective_omp_n_thread + ) def __dealloc__(self): if self.neigh_distances_chunks is not NULL: @@ -1115,7 +1154,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): if self.neigh_indices_chunks is not NULL: free(self.neigh_indices_chunks) - cdef void _reduce_on_chunks(self, + cdef void _reduce_on_chunks( + self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -1134,7 +1174,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) @final - cdef void _on_X_prange_iter_init(self, + cdef void _on_X_prange_iter_init( + self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, @@ -1146,7 +1187,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.neigh_indices_chunks[thread_num] = self.neigh_indices @final - cdef void _on_X_prange_iter_finalize(self, + cdef void _on_X_prange_iter_finalize( + self, ITYPE_t thread_num, ITYPE_t X_start, ITYPE_t X_end, @@ -1163,7 +1205,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_indices)[idx].size() ) - cdef void _on_Y_init(self, + cdef void _on_Y_init( + self, ITYPE_t num_threads, ) nogil: cdef: @@ -1177,7 +1220,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X) @final - cdef void _merge_vectors(self, + cdef void _merge_vectors( + self, ITYPE_t idx, ITYPE_t num_threads, ) nogil: @@ -1209,7 +1253,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - cdef void _on_Y_finalize(self, + cdef void _on_Y_finalize( + self, ITYPE_t num_threads, ) nogil: cdef: @@ -1259,7 +1304,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): ) @final - def compute(self, + def compute( + self, str strategy = "auto", bint return_distance = False, bint sort_results = False @@ -1375,18 +1421,21 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): return (RadiusNeighborhood.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) - def __init__(self, + def __init__( + self, X, Y, DTYPE_t radius, bint use_squared_distances=False, ITYPE_t chunk_size=CHUNK_SIZE, ): - RadiusNeighborhood.__init__(self, + RadiusNeighborhood.__init__( + self, # The datasets pair here is used for exact distances computations datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), radius=radius, - chunk_size=chunk_size) + chunk_size=chunk_size, + ) # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair @@ -1402,7 +1451,8 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_omp_n_thread) + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) def __dealloc__(self): if self.dist_middle_terms_chunks is not NULL: @@ -1414,24 +1464,28 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): RadiusNeighborhood.compute_exact_distances(self) @final - cdef void _on_X_parallel_init(self, + cdef void _on_X_parallel_init( + self, ITYPE_t thread_num, ) 
nogil: RadiusNeighborhood._on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) @final - cdef void _on_X_parallel_finalize(self, + cdef void _on_X_parallel_finalize( + self, ITYPE_t thread_num ) nogil: RadiusNeighborhood._on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _on_Y_init(self, + cdef void _on_Y_init( + self, ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num @@ -1440,10 +1494,12 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): for thread_num in range(num_threads): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)) + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) @final - cdef void _on_Y_finalize(self, + cdef void _on_Y_finalize( + self, ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num @@ -1453,7 +1509,8 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _reduce_on_chunks(self, + cdef void _reduce_on_chunks( + self, ITYPE_t X_start, ITYPE_t X_end, ITYPE_t Y_start, @@ -1504,9 +1561,11 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - squared_dist_i_j = (self.X_sq_norms[i + X_start] - + dist_middle_terms[i * Y_c.shape[0] + j] - + self.Y_sq_norms[j + Y_start]) + squared_dist_i_j = ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ) if squared_dist_i_j <= self.proxy_radius: deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) From b71811cbbbbffca58b86d1852e3e121fabb42e3a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 6 Sep 2021 14:56:54 +0200 Subject: [PATCH 188/290] Format docstring for 'auto' strategy Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 1bad4787fe942..381ca4013341d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -780,8 +780,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): embarrassingly parallel but uses intermediate datastructures synchronisation. However it is more useful in practice (because Y is larger than X generally). - -'auto' relies on a simple heuristic to choose between 'parallel_on_X' - and 'parallel_on_Y'. + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. return_distance: boolean, default=False Return distances between each X vector and its @@ -1327,8 +1327,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): embarrassingly parallel but uses intermediate datastructures synchronisation. However it is more useful in practice (because Y is larger than X generally). - -'auto' relies on a simple heuristic to choose between 'parallel_on_X' - and 'parallel_on_Y'. 
+ - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. return_distance: boolean, default=False Return distances between each X vector and its From 9c819c89b8ac7b097eccee95441d61091834bc84 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 6 Sep 2021 15:00:35 +0200 Subject: [PATCH 189/290] Reword 'reduced distance' for 'rank-preserving surrogate distance' Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx | 50 +++++++++---------- .../metrics/_pairwise_distances_reduction.pyx | 6 +-- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 88db523c03aed..507fe14a6d9b9 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -310,14 +310,14 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - """Compute the reduced distance between vectors x1 and x2. + """Compute the rank-preserving surrogate distance between vectors x1 and x2. This can optionally be overridden in a base class. - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. """ return self.dist(x1, x2, size) @@ -328,7 +328,7 @@ cdef class DistanceMetric: const DTYPE_t[:] x2_data, const ITYPE_t[:] x2_indices, ) nogil except -1: - """Compute the reduced distance between vectors x1 and x2 + """Compute the rank-preserving surrogate distance between vectors x1 and x2 given non null coordinates and their corresponding indices. This should be overridden in a base class. @@ -342,15 +342,15 @@ cdef class DistanceMetric: const DTYPE_t[:] x2_data, const ITYPE_t[:] x2_indices, ) nogil except -1: - """Compute the reduced distance between vectors x1 and x2 + """Compute the rank-preserving surrogate distance between vectors x1 and x2 given non null coordinates and their corresponding indices. This can optionally be overridden in a base class. - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. """ return self.sparse_dist(x1_data, x1_indices, x2_data, x2_indices) @@ -375,25 +375,25 @@ cdef class DistanceMetric: return 0 cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1: - """Convert the reduced distance to the distance""" + """Convert the rank-preserving surrogate distance to the distance""" return rdist cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the reduced distance""" + """Convert the distance to the rank-preserving surrogate distance""" return dist def rdist_to_dist(self, rdist): - """Convert the Reduced distance to the true distance. + """Convert the rank-preserving surrogate distance to the true distance. 
- The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- rdist : double - Reduced distance. + Rank-preserving surrogate distance. Returns ------- @@ -403,12 +403,12 @@ cdef class DistanceMetric: return rdist def dist_to_rdist(self, dist): - """Convert the true distance to the reduced distance. + """Convert the true distance to the rank-preserving surrogate distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- @@ -418,7 +418,7 @@ cdef class DistanceMetric: Returns ------- double - Reduced distance. + Rank-preserving surrogate distance. """ return dist diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 381ca4013341d..c9ca029ec43cd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -1031,10 +1031,10 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): cdef: DTYPE_t radius - # DistanceMetric compute rank preserving distance via rdist - # ("reduced distance" in the original wording), + # DistanceMetric compute rank-preserving surrogate distance via rdist # which are proxies necessitating less computations. - # We get the proxy for the radius to be able to compare + # We get the proxy for the radius to be able to compare it against + # vectors' rank-preserving surrogate distances. DTYPE_t proxy_radius # Neighbors informations are returned as np.ndarray or np.ndarray. From 1ff2433dfa4e6bd368d63860b6c0bc17e2386410 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 6 Sep 2021 15:15:37 +0200 Subject: [PATCH 190/290] Look-up for the strategy in scikit-learn's configuration if not specified Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index c9ca029ec43cd..2321a974d8cef 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -23,6 +23,8 @@ import numpy as np cimport numpy as np +from .. import get_config + np.import_array() from libc.stdlib cimport free, malloc @@ -760,14 +762,14 @@ cdef class ArgKmin(PairwiseDistancesReduction): @final def compute( self, - str strategy="auto", + str strategy=None, bint return_distance=False, ): """Computes the reduction of vectors (rows) of X on Y. 
Parameters ---------- - strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default='auto' + strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None The chunking strategy defining which dataset parallelization are made on. Strategies differs on the dispatching they use for chunks on threads: @@ -782,6 +784,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): larger than X generally). - 'auto' relies on a simple heuristic to choose between 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. return_distance: boolean, default=False Return distances between each X vector and its @@ -801,6 +805,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. @@ -1306,7 +1313,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): @final def compute( self, - str strategy = "auto", + str strategy=None, bint return_distance = False, bint sort_results = False ): @@ -1314,7 +1321,7 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): Parameters ---------- - strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default='auto' + strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None The chunking strategy defining which dataset parallelization are made on. Strategies differs on the dispatching they use for chunks on threads: @@ -1329,6 +1336,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): larger than X generally). - 'auto' relies on a simple heuristic to choose between 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. return_distance: boolean, default=False Return distances between each X vector and its @@ -1360,6 +1369,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self.sort_results = sort_results + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. From 8530267ae750092d5a9c5ace816a028d7f476fbc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 8 Sep 2021 05:06:41 -0400 Subject: [PATCH 191/290] Clarify wording in comment regarding n_threads --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 2321a974d8cef..e9087ad9ef5a2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -250,7 +250,7 @@ cdef class PairwiseDistancesReduction: self.n_threads = n_threads if self.n_threads == -1: - # Using all possible cores + # By default use all available threads. 
self.effective_omp_n_thread = _openmp_effective_n_threads() else: check_scalar(self.n_threads, "n_threads", Integral, min_val=1) From 0365c348bd2cd0d490f89e62096cd1ee419ddc19 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 9 Sep 2021 11:31:59 +0200 Subject: [PATCH 192/290] Apply some small reviews suggestions Co-authored-by: Olivier Grisel Co-authored-by: jeremie du boisberranger --- sklearn/metrics/_dist_metrics.pxd | 9 +- sklearn/metrics/_dist_metrics.pyx | 28 +--- .../metrics/_pairwise_distances_reduction.pyx | 133 +++++++++--------- .../test_pairwise_distances_reduction.py | 10 +- sklearn/utils/_heap.pxd | 1 - sklearn/utils/_heap.pyx | 9 +- 6 files changed, 83 insertions(+), 107 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index ed9ad982274d2..0c143e37bd7fe 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -1,13 +1,8 @@ #!python -# cython: annotate=False -# cython: cdivision=True # cython: boundscheck=False -# cython: wraparound=False -# cython: profile=False -# cython: linetrace=False +# cython: cdivision=True # cython: initializedcheck=False -# cython: binding=False -# distutils: define_macros=CYTHON_TRACE_NOGIL=0 +# cython: wraparound=False cimport numpy as np from libc.math cimport sqrt, exp diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 507fe14a6d9b9..3e7b343ae7bba 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1,13 +1,7 @@ -# cython: language_level=3 -# cython: annotate=False -# cython: cdivision=True # cython: boundscheck=False -# cython: wraparound=False -# cython: profile=False -# cython: linetrace=False +# cython: cdivision=True # cython: initializedcheck=False -# cython: binding=False -# distutils: define_macros=CYTHON_TRACE_NOGIL=0 +# cython: wraparound=False # By Jake Vanderplas (2013) @@ -1221,8 +1215,6 @@ cdef class DatasetsPair: between two vectors of (X, Y). """ - # The `distance_metric` attribute is defined in _dist_metrics.pxd - @classmethod def get_for( cls, @@ -1235,10 +1227,10 @@ cdef class DatasetsPair: Parameters ---------- - X : array-like of shape (n_X, d) + X : {ndarray, sparse matrix} of shape (n_X, d) Input data. - Y : array-like of shape (n_Y, d) + Y : {ndarray, sparse matrix} of shape (n_Y, d) Input data. metric : str, default='euclidean' @@ -1261,13 +1253,8 @@ cdef class DatasetsPair: **metric_kwargs ) - # check_array can be expensive, and we prefer to simply coerce from lists - # to ndarrays eventually to get their dtype itemsize - X = np.asarray(X) if isinstance(X, (tuple, list)) else X - Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y - - if X.dtype.itemsize != 8 or Y.dtype.itemsize != 8: - raise ValueError("32bits datasets aren't supported for X and Y yet.") + if X.dtype != np.float64 or Y.dtype != np.float64: + raise ValueError("Only 64bit float datasets are supported for X and Y.") X = check_array(X, dtype=DTYPE, accept_sparse='csr') Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') @@ -1278,6 +1265,7 @@ cdef class DatasetsPair: f"respectively {X.shape[1]}-dimensional " f"and {Y.shape[1]}-dimensional.") + # Metric-specific checks that do not replace nor duplicate `check_array`. distance_metric._validate_data(X) distance_metric._validate_data(Y) @@ -1330,8 +1318,6 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): between two vectors of (X, Y). 
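Patch 190 above makes `compute` consult `get_config()` for `pairwise_dist_parallel_strategy`; a speculative caller-side sketch, assuming the matching `set_config`/`config_context` support lands elsewhere in the series:

    import numpy as np
    from sklearn import config_context
    from sklearn.metrics._pairwise_distances_reduction import ArgKmin  # assumed path

    X = np.random.RandomState(0).rand(100, 10)
    Y = np.random.RandomState(1).rand(100, 10)

    # Assumed valid once config_context knows this key; compute() called
    # with strategy=None then picks the value up via get_config().
    with config_context(pairwise_dist_parallel_strategy="parallel_on_X"):
        dist, idx = ArgKmin.get_for(X, Y, k=3).compute(return_distance=True)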
""" - # The `X`, `Y` and `d` attributes are defined in _dist_metrics.pxd - def __cinit__(self): # Initializing memory view to prevent memory errors and seg-faults # in rare cases where __init__ is not called diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index e9087ad9ef5a2..0655ee11f6d02 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -1,14 +1,8 @@ -# cython: language_level=3 -# cython: annotate=False -# cython: cdivision=True # cython: boundscheck=False -# cython: wraparound=False -# cython: profile=False -# cython: linetrace=False +# cython: cdivision=True # cython: initializedcheck=False -# cython: binding=False +# cython: wraparound=False # distutils: language=c++ -# distutils: define_macros=CYTHON_TRACE_NOGIL=0 # Pairwise Distances Reductions # ============================= @@ -169,27 +163,29 @@ cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ##################### cdef class PairwiseDistancesReduction: - f"""Abstract class computing a reduction on pairwise - distances between a set of vectors (rows) X and another - set of vectors (rows) of Y. + f"""Abstract class which compute pairwise distances between + a set of vectors (rows) X and another set of vectors (rows) of Y + and apply a reduction on top. - The implementation of the reduction is done parallelized - on chunks whose size can be set using ``chunk_size``. + The computations of the distances and the reduction is parallelized + on chunks of vectors of X and Y. Parameters ---------- datasets_pair: DatasetsPair The pair of dataset to use. - chunk_size: int, default={CHUNK_SIZE} - The number of vectors per chunk. + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use {CHUNK_SIZE} if it is not set. - n_threads: int, default=-1 + n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. - -1 means using all processors. + None and -1 means using all processors. """ cdef: @@ -209,7 +205,7 @@ cdef class PairwiseDistancesReduction: "mahalanobis", # is numerically unstable # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::sort and a Comparator whic might need + # The best might be using a std::sort and a Comparator which might need # AoS instead of SoA (currently used). "hamming", *BOOL_METRICS, @@ -219,42 +215,38 @@ cdef class PairwiseDistancesReduction: @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - # TODO: what's the best coercion for lists? + # Coercing to np.array to get the dtype + # TODO: what is the best way to get lists' dtype? 
X = np.asarray(X) if isinstance(X, (tuple, list)) else X Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y # TODO: support sparse arrays and 32 bits - return (not issparse(X) and X.dtype.itemsize == 8 and X.ndim == 2 and - not issparse(Y) and Y.dtype.itemsize == 8 and Y.ndim == 2 and + return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and + not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and metric in cls.valid_metrics()) @property - def datasets_pair(self) ->DatasetsPair: + def datasets_pair(self) -> DatasetsPair: return self._datasets_pair def __init__( self, DatasetsPair datasets_pair, - ITYPE_t chunk_size=CHUNK_SIZE, - n_threads=-1, + chunk_size=None, + n_threads=None, ): cdef: ITYPE_t X_n_full_chunks, Y_n_full_chunks + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", CHUNK_SIZE) + check_scalar(chunk_size, "chunk_size", Integral, min_val=1) self.chunk_size = chunk_size - if n_threads is None: - # By convention. - n_threads = -1 - - self.n_threads = n_threads + # By convention, -1 and None means using all cores. + n_threads = -1 if n_threads is None else n_threads - if self.n_threads == -1: - # By default use all available threads. - self.effective_omp_n_thread = _openmp_effective_n_threads() - else: - check_scalar(self.n_threads, "n_threads", Integral, min_val=1) - self.effective_omp_n_thread = self.n_threads + self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) @@ -286,8 +278,7 @@ cdef class PairwiseDistancesReduction: This strategy dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation - but is less used in practice (because X is smaller than Y generally). + embarrassingly parallel and comes with no datastructures synchronisation. Private datastructures are modified internally by threads. @@ -349,8 +340,7 @@ cdef class PairwiseDistancesReduction: This strategy dispatches chunks of Y uniformly on threads. Each thread then iterates on all the chunks of X. This strategy is embarrassingly parallel but uses intermediate datastructures - synchronisation. However it is more useful in practice (because Y is - larger than X generally). + synchronisation. Private datastructures are modified internally by threads. @@ -484,12 +474,9 @@ cdef class PairwiseDistancesReduction: return cdef class ArgKmin(PairwiseDistancesReduction): - """Computes the argkmin of vectors (rows) of a set of + f"""Computes the argkmin of vectors (rows) of a set of vectors (rows) of X on another set of vectors (rows) of Y. - The implementation is parallelized on chunks whose size can - be set using ``chunk_size``. - Parameters ---------- datasets_pair: DatasetsPair @@ -498,15 +485,17 @@ cdef class ArgKmin(PairwiseDistancesReduction): k: int The k for the argkmin reduction. - chunk_size: int - The number of vectors per chunk. + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use {CHUNK_SIZE} if it is not set. - n_threads: int, default=-1 + n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~ArgKmin.compute`. - -1 means using all processors. + None and -1 means using all processors. 
""" cdef: @@ -526,9 +515,9 @@ cdef class ArgKmin(PairwiseDistancesReduction): Y, ITYPE_t k, str metric="fast_euclidean", - ITYPE_t chunk_size=CHUNK_SIZE, + chunk_size=None, dict metric_kwargs=dict(), - n_threads=-1, + n_threads=None, ) -> ArgKmin: f"""Return the ArgKmin implementation for the given arguments. @@ -549,18 +538,20 @@ cdef class ArgKmin(PairwiseDistancesReduction): For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size: int, default={CHUNK_SIZE}, - The number of vectors per chunk. + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use {CHUNK_SIZE} if it is not set. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. - n_threads: int, default=-1 + n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~ArgKmin.compute`. - -1 means using all processors. + None and -1 means using all processors. Returns ------- @@ -584,8 +575,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): self, DatasetsPair datasets_pair, ITYPE_t k, - ITYPE_t chunk_size=CHUNK_SIZE, - n_threads=-1, + chunk_size=None, + n_threads=None, ): PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size, n_threads) @@ -868,7 +859,7 @@ cdef class FastEuclideanArgKmin(ArgKmin): Y, ITYPE_t k, bint use_squared_distances=False, - ITYPE_t chunk_size=CHUNK_SIZE, + chunk_size=None, ): ArgKmin.__init__( self, @@ -1024,15 +1015,17 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): radius: float The radius defining the neighborhood. - chunk_size: int - The number of vectors per chunk. + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use {CHUNK_SIZE} if it is not set. - n_threads: int, default=-1 + n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~RadiusNeighborhood.compute`. - -1 means using all processors. + None and -1 means using all processors. """ cdef: @@ -1074,9 +1067,9 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): Y, DTYPE_t radius, str metric="fast_euclidean", - ITYPE_t chunk_size=CHUNK_SIZE, + chunk_size=None, dict metric_kwargs=dict(), - n_threads=-1, + n_threads=None, ) -> RadiusNeighborhood: f"""Return the RadiusNeighborhood implementation for the given arguments. @@ -1097,18 +1090,20 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size: int, default={CHUNK_SIZE}, - The number of vectors per chunk. + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use {CHUNK_SIZE} if it is not set. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. - n_threads: int, default=-1 + n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~RadiusNeighborhood.compute`. - -1 means using all processors. 
+ None and -1 means using all processors. Returns ------- @@ -1132,8 +1127,8 @@ cdef class RadiusNeighborhood(PairwiseDistancesReduction): self, DatasetsPair datasets_pair, DTYPE_t radius, - ITYPE_t chunk_size=CHUNK_SIZE, - n_threads=-1, + chunk_size=None, + n_threads=None, ): PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size, n_threads) @@ -1439,7 +1434,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): Y, DTYPE_t radius, bint use_squared_distances=False, - ITYPE_t chunk_size=CHUNK_SIZE, + chunk_size=None, ): RadiusNeighborhood.__init__( self, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 6dd752865effa..a85c3eced8400 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -77,7 +77,7 @@ def test_pairwise_distances_reduction_is_usable_for(): Y = rng.rand(100, 10) metric = "euclidean" assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) - assert PairwiseDistancesReduction.is_usable_for( + assert not PairwiseDistancesReduction.is_usable_for( X.astype(np.int64), Y.astype(np.int64), metric ) @@ -102,12 +102,12 @@ def test_argkmin_factory_method_wrong_usages(): metric = "euclidean" with pytest.raises( - ValueError, match="32bits datasets aren't supported for X and Y yet." + ValueError, match="Only 64bit float datasets are supported for X and Y." ): ArgKmin.get_for(X=X.astype(np.float32), Y=Y, k=k, metric=metric) with pytest.raises( - ValueError, match="32bits datasets aren't supported for X and Y yet." + ValueError, match="Only 64bit float datasets are supported for X and Y." ): ArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) @@ -140,14 +140,14 @@ def test_radius_neighborhood_factory_method_wrong_usages(): metric = "euclidean" with pytest.raises( - ValueError, match="32bits datasets aren't supported for X and Y yet." + ValueError, match="Only 64bit float datasets are supported for X and Y." ): RadiusNeighborhood.get_for( X=X.astype(np.float32), Y=Y, radius=radius, metric=metric ) with pytest.raises( - ValueError, match="32bits datasets aren't supported for X and Y yet." + ValueError, match="Only 64bit float datasets are supported for X and Y." ): RadiusNeighborhood.get_for( X=X, Y=Y.astype(np.int32), radius=radius, metric=metric diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index 65227c0d30b70..0b65a5a32e393 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -1,4 +1,3 @@ -# cython: language_level=3 # Heap routines, used in various Cython implementation. from cython cimport floating diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index b8878ad402dbb..d6133eab7c658 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -1,7 +1,8 @@ #!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False from cython cimport floating, integral, numeric @@ -30,7 +31,7 @@ cdef int simultaneous_sort( """ # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::sort and a Comparator whic might need + # The best might be using a std::sort and a Comparator which might need # AoS instead of SoA (currently used). 
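The TODO above concerns tie handling. The intended semantics, distances as the primary sort key and indices as the tie-breaker, can be illustrated with NumPy (illustration only, not the proposed std::sort-based implementation):

    import numpy as np

    dist = np.array([0.5, 0.2, 0.5, 0.1])
    idx = np.array([3, 1, 0, 2])

    # np.lexsort treats the last key as primary: sort by distance,
    # break ties deterministically on the index.
    order = np.lexsort((idx, dist))
    dist, idx = dist[order], idx[order]
    # dist -> [0.1, 0.2, 0.5, 0.5], idx -> [2, 1, 0, 3]
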
cdef: ITYPE_t pivot_idx, i, store_idx From e754b678951305c7a8b08173971fd94aea957477 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 9 Sep 2021 08:18:24 -0400 Subject: [PATCH 193/290] Remove redundant statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 0655ee11f6d02..674b4be8da022 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -243,8 +243,6 @@ cdef class PairwiseDistancesReduction: check_scalar(chunk_size, "chunk_size", Integral, min_val=1) self.chunk_size = chunk_size - # By convention, -1 and None means using all cores. - n_threads = -1 if n_threads is None else n_threads self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) From fe2f8deb459253e30477e18091cf26c0e846205e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 08:35:03 +0200 Subject: [PATCH 194/290] Do not validate X and Y for same number of dimensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/metrics/_dist_metrics.pyx | 6 ------ .../metrics/tests/test_pairwise_distances_reduction.py | 10 ---------- 2 files changed, 16 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 3e7b343ae7bba..1697be9ee7409 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1259,12 +1259,6 @@ cdef class DatasetsPair: X = check_array(X, dtype=DTYPE, accept_sparse='csr') Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') - if X.shape[1] != Y.shape[1]: - raise ValueError("Vectors of X and Y must have the same " - "number of dimensions but currently are " - f"respectively {X.shape[1]}-dimensional " - f"and {Y.shape[1]}-dimensional.") - # Metric-specific checks that do not replace nor duplicate `check_array`. 
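For reference, the same-dimension guard removed above amounted to the following check, which callers can still apply themselves (the helper name is ours):

    def check_same_n_features(X, Y):
        # Formerly raised by DatasetsPair.get_for; validation is now left
        # to check_array and to the metric-specific checks.
        if X.shape[1] != Y.shape[1]:
            raise ValueError(
                "Vectors of X and Y must have the same number of dimensions "
                f"but are respectively {X.shape[1]}-dimensional "
                f"and {Y.shape[1]}-dimensional."
            )
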
distance_metric._validate_data(X) distance_metric._validate_data(Y) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index a85c3eced8400..981a4f9239afd 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -126,11 +126,6 @@ def test_argkmin_factory_method_wrong_usages(): with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): ArgKmin.get_for(X=X, Y=np.array([1.0, 2.0]), k=k, metric=metric) - with pytest.raises( - ValueError, match="Vectors of X and Y must have the same number of dimensions" - ): - ArgKmin.get_for(X=X[:, ::2], Y=Y, k=k, metric=metric) - def test_radius_neighborhood_factory_method_wrong_usages(): rng = np.random.RandomState(1) @@ -169,11 +164,6 @@ def test_radius_neighborhood_factory_method_wrong_usages(): X=X, Y=np.array([1.0, 2.0]), radius=radius, metric=metric ) - with pytest.raises( - ValueError, match="Vectors of X and Y must have the same number of dimensions" - ): - RadiusNeighborhood.get_for(X=X[:, ::2], Y=Y, radius=radius, metric=metric) - @fails_if_unstable_openblas @pytest.mark.filterwarnings("ignore:Constructing a DIA matrix") From 4c6253e51e9faa1bd396b0723b4b0aa3e591269e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 08:35:03 +0200 Subject: [PATCH 195/290] Do not validate X and Y for same number of dimensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- .../metrics/_pairwise_distances_reduction.pyx | 96 ++++++++++--------- sklearn/metrics/pairwise.py | 6 +- .../test_pairwise_distances_reduction.py | 94 +++++++++++------- 3 files changed, 111 insertions(+), 85 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 674b4be8da022..998c47b46c01f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -471,7 +471,7 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after executing all the reductions.""" return -cdef class ArgKmin(PairwiseDistancesReduction): +cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): f"""Computes the argkmin of vectors (rows) of a set of vectors (rows) of X on another set of vectors (rows) of Y. @@ -516,8 +516,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): chunk_size=None, dict metric_kwargs=dict(), n_threads=None, - ) -> ArgKmin: - f"""Return the ArgKmin implementation for the given arguments. + ) -> PairwiseDistancesArgKmin: + f"""Return the PairwiseDistancesArgKmin implementation for the given arguments. Parameters ---------- @@ -547,23 +547,26 @@ cdef class ArgKmin(PairwiseDistancesReduction): n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks - depends on the `strategy` set on :method:`~ArgKmin.compute`. + depends on the `strategy` set on + :method:`~PairwiseDistancesArgKmin.compute`. None and -1 means using all processors. Returns ------- - argkmin: ArgKmin - The suited ArgKmin implementation. + argkmin: PairwiseDistancesArgKmin + The suited PairwiseDistancesArgKmin implementation. """ # This factory comes to handle specialisations. 
if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y): use_squared_distances = metric == "fast_sqeuclidean" - return FastEuclideanArgKmin(X=X, Y=Y, k=k, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size) + return FastEuclideanPairwiseDistancesArgKmin( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size + ) - return ArgKmin( + return PairwiseDistancesArgKmin( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), k=k, chunk_size=chunk_size, @@ -790,7 +793,7 @@ cdef class ArgKmin(PairwiseDistancesReduction): Indices of argkmin of vectors of X in Y. """ - # Results returned by ArgKmin.compute used as the main heaps + # Results returned by PairwiseDistancesArgKmin.compute used as the main heaps. self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) @@ -823,8 +826,8 @@ cdef class ArgKmin(PairwiseDistancesReduction): return np.asarray(self.argkmin_indices) -cdef class FastEuclideanArgKmin(ArgKmin): - """Fast specialized alternative for ArgKmin on EuclideanDistance. +cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): + """Fast specialized alternative for PairwiseDistancesArgKmin on EuclideanDistance. Notes ----- @@ -832,8 +835,8 @@ cdef class FastEuclideanArgKmin(ArgKmin): better running time when the alternative is IO bound, but it can suffer from numerical instability. - ArgKmin with EuclideanDistance must be used when higher numerical precision - is needed. + PairwiseDistancesArgKmin with EuclideanDistance must be used when higher + numerical precision is needed. """ cdef: @@ -848,7 +851,7 @@ cdef class FastEuclideanArgKmin(ArgKmin): @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return (ArgKmin.is_usable_for(X, Y, metric) and + return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) def __init__( @@ -859,7 +862,7 @@ cdef class FastEuclideanArgKmin(ArgKmin): bint use_squared_distances=False, chunk_size=None, ): - ArgKmin.__init__( + PairwiseDistancesArgKmin.__init__( self, # The datasets pair here is used for exact distances computations datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), @@ -886,14 +889,14 @@ cdef class FastEuclideanArgKmin(ArgKmin): @final cdef void compute_exact_distances(self) nogil: if not self.use_squared_distances: - ArgKmin.compute_exact_distances(self) + PairwiseDistancesArgKmin.compute_exact_distances(self) @final cdef void _on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - ArgKmin._on_X_parallel_init(self, thread_num) + PairwiseDistancesArgKmin._on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( @@ -905,7 +908,7 @@ cdef class FastEuclideanArgKmin(ArgKmin): self, ITYPE_t thread_num ) nogil: - ArgKmin._on_X_parallel_finalize(self, thread_num) + PairwiseDistancesArgKmin._on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final @@ -914,7 +917,7 @@ cdef class FastEuclideanArgKmin(ArgKmin): ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - ArgKmin._on_Y_init(self, num_threads) + PairwiseDistancesArgKmin._on_Y_init(self, num_threads) for thread_num in range(num_threads): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term @@ -928,7 +931,7 @@ cdef class FastEuclideanArgKmin(ArgKmin): ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - 
ArgKmin._on_Y_finalize(self, num_threads)
+        PairwiseDistancesArgKmin._on_Y_finalize(self, num_threads)
 
         for thread_num in range(num_threads):
             free(self.dist_middle_terms_chunks[thread_num])
@@ -1001,7 +1004,7 @@ cdef class FastEuclideanArgKmin(ArgKmin):
         )
 
 
-cdef class RadiusNeighborhood(PairwiseDistancesReduction):
+cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
     """Returns radius-based neighbors vectors' indices in a dataset
     Y of vectors in a dataset X.
@@ -1021,7 +1024,8 @@
     n_threads: int, default=None
         The number of OpenMP threads to use for the reduction.
         Parallelism is done on chunks and the sharding of chunks
-        depends on the `strategy` set on :method:`~RadiusNeighborhood.compute`.
+        depends on the `strategy` set on
+        :method:`~PairwiseDistancesRadiusNeighborhood.compute`.
         None and -1 means using all processors.
     """
 
     cdef:
@@ -1035,7 +1039,7 @@
     # vectors' rank-preserving surrogate distances.
     DTYPE_t proxy_radius
 
-    # Neighbors informations are returned as np.ndarray or np.ndarray.
+    # Neighbors indices and distances are returned as np.ndarray of np.ndarray.
    #
    # We want resizable buffers which we will wrap in numpy
    # arrays at the end. std::vector comes as a handy interface for
@@ -1068,8 +1072,8 @@
         chunk_size=None,
         dict metric_kwargs=dict(),
         n_threads=None,
-    ) -> RadiusNeighborhood:
-        f"""Return the RadiusNeighborhood implementation for the given arguments.
+    ) -> PairwiseDistancesRadiusNeighborhood:
+        f"""Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments.
 
         Parameters
         ----------
@@ -1099,23 +1103,26 @@
         n_threads: int, default=None
             The number of OpenMP threads to use for the reduction.
             Parallelism is done on chunks and the sharding of chunks
-            depends on the `strategy` set on :method:`~RadiusNeighborhood.compute`.
+            depends on the `strategy` set on
+            :method:`~PairwiseDistancesRadiusNeighborhood.compute`.
             None and -1 means using all processors.
 
         Returns
         -------
-        radius_neighborhood: RadiusNeighborhood
-            The suited RadiusNeighborhood implementation.
+        radius_neighborhood: PairwiseDistancesRadiusNeighborhood
+            The suited PairwiseDistancesRadiusNeighborhood implementation.
         """
         # This factory comes to handle specialisations.
        if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y):
            use_squared_distances = metric == "fast_sqeuclidean"
-            return FastEuclideanRadiusNeighborhood(X=X, Y=Y, radius=radius,
-                                                   use_squared_distances=use_squared_distances,
-                                                   chunk_size=chunk_size)
+            return FastEuclideanPairwiseDistancesRadiusNeighborhood(
+                X=X, Y=Y, radius=radius,
+                use_squared_distances=use_squared_distances,
+                chunk_size=chunk_size
+            )
 
-        return RadiusNeighborhood(
+        return PairwiseDistancesRadiusNeighborhood(
            datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
            radius=radius,
            chunk_size=chunk_size,
@@ -1398,8 +1405,8 @@
         return res
 
 
-cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood):
-    """Fast specialized alternative for RadiusNeighborhood on EuclideanDistance.
+cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood):
+    """Fast specialized alternative for PairwiseDistancesRadiusNeighborhood on EuclideanDistance.
Notes ----- @@ -1423,7 +1430,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return (RadiusNeighborhood.is_usable_for(X, Y, metric) + return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) def __init__( @@ -1434,7 +1441,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): bint use_squared_distances=False, chunk_size=None, ): - RadiusNeighborhood.__init__( + PairwiseDistancesRadiusNeighborhood.__init__( self, # The datasets pair here is used for exact distances computations datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), @@ -1466,14 +1473,14 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): @final cdef void compute_exact_distances(self) nogil: if not self.use_squared_distances: - RadiusNeighborhood.compute_exact_distances(self) + PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) @final cdef void _on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - RadiusNeighborhood._on_X_parallel_init(self, thread_num) + PairwiseDistancesRadiusNeighborhood._on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( @@ -1485,7 +1492,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): self, ITYPE_t thread_num ) nogil: - RadiusNeighborhood._on_X_parallel_finalize(self, thread_num) + PairwiseDistancesRadiusNeighborhood._on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final @@ -1494,7 +1501,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - RadiusNeighborhood._on_Y_init(self, num_threads) + PairwiseDistancesRadiusNeighborhood._on_Y_init(self, num_threads) for thread_num in range(num_threads): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term @@ -1508,7 +1515,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - RadiusNeighborhood._on_Y_finalize(self, num_threads) + PairwiseDistancesRadiusNeighborhood._on_Y_finalize(self, num_threads) for thread_num in range(num_threads): free(self.dist_middle_terms_chunks[thread_num]) @@ -1561,8 +1568,7 @@ cdef class FastEuclideanRadiusNeighborhood(RadiusNeighborhood): # dist_middle_terms = -2 * X_c.dot(Y_c.T) _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) - # Pushing the distance and their associated indices on heaps - # which keep tracks of the argkmin. + # Pushing the distance and their associated indices in vectors. 
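A NumPy rendering of the chunk-wise kernel around this hunk may help: the `_gemm` call produces the -2 X_c.Y_c^T middle term, and squared distances are assembled from it plus the precomputed row norms. This is a sketch only; the Cython code pushes each entry into per-row vectors instead of materialising the full matrix:

    import numpy as np

    def squared_euclidean_chunk(X_c, Y_c, X_sq_norms, Y_sq_norms):
        # dist_middle_terms = -2 * X_c.dot(Y_c.T), i.e. the GEMM output.
        middle = -2.0 * (X_c @ Y_c.T)
        # ||X_c_i||^2 - 2 X_c_i.Y_c_j^T + ||Y_c_j||^2 for every pair (i, j).
        return X_sq_norms[:, np.newaxis] + middle + Y_sq_norms[np.newaxis, :]
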
for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5a207d73cee0b..27af360e4256a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,7 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version -from ._pairwise_distances_reduction import ArgKmin +from ._pairwise_distances_reduction import PairwiseDistancesArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -653,8 +653,8 @@ def pairwise_distances_argmin_min( if metric_kwargs is None: metric_kwargs = {} - if ArgKmin.is_usable_for(X, Y, metric): - values, indices = ArgKmin.get_for( + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + values, indices = PairwiseDistancesArgKmin.get_for( X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs ).compute(strategy="auto", return_distance=True) values = values.flatten() diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 981a4f9239afd..cc9abfeb3d6d0 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -22,10 +22,10 @@ from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, - ArgKmin, - RadiusNeighborhood, - FastEuclideanArgKmin, - FastEuclideanRadiusNeighborhood, + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, + FastEuclideanPairwiseDistancesArgKmin, + FastEuclideanPairwiseDistancesRadiusNeighborhood, ) from sklearn.utils import _in_unstable_openblas_configuration @@ -66,8 +66,8 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): ASSERT_RESULT = { - ArgKmin: assert_argkmin_results_equality, - RadiusNeighborhood: assert_radius_neighborhood_results_equality, + PairwiseDistancesArgKmin: assert_argkmin_results_equality, + PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality, } @@ -104,27 +104,33 @@ def test_argkmin_factory_method_wrong_usages(): with pytest.raises( ValueError, match="Only 64bit float datasets are supported for X and Y." ): - ArgKmin.get_for(X=X.astype(np.float32), Y=Y, k=k, metric=metric) + PairwiseDistancesArgKmin.get_for( + X=X.astype(np.float32), Y=Y, k=k, metric=metric + ) with pytest.raises( ValueError, match="Only 64bit float datasets are supported for X and Y." 
): - ArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + PairwiseDistancesArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) with pytest.raises(ValueError, match="k == -1, must be >= 1."): - ArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) with pytest.raises(ValueError, match="k == 0, must be >= 1."): - ArgKmin.get_for(X=X, Y=Y, k=0.1, metric=metric) + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=0.1, metric=metric) with pytest.raises(ValueError, match="Unrecognized metric"): - ArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): - ArgKmin.get_for(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric) + PairwiseDistancesArgKmin.get_for( + X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric + ) with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): - ArgKmin.get_for(X=X, Y=np.array([1.0, 2.0]), k=k, metric=metric) + PairwiseDistancesArgKmin.get_for( + X=X, Y=np.array([1.0, 2.0]), k=k, metric=metric + ) def test_radius_neighborhood_factory_method_wrong_usages(): @@ -137,30 +143,32 @@ def test_radius_neighborhood_factory_method_wrong_usages(): with pytest.raises( ValueError, match="Only 64bit float datasets are supported for X and Y." ): - RadiusNeighborhood.get_for( + PairwiseDistancesRadiusNeighborhood.get_for( X=X.astype(np.float32), Y=Y, radius=radius, metric=metric ) with pytest.raises( ValueError, match="Only 64bit float datasets are supported for X and Y." ): - RadiusNeighborhood.get_for( + PairwiseDistancesRadiusNeighborhood.get_for( X=X, Y=Y.astype(np.int32), radius=radius, metric=metric ) with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): - RadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) + PairwiseDistancesRadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) with pytest.raises(ValueError, match="Unrecognized metric"): - RadiusNeighborhood.get_for(X=X, Y=Y, radius=radius, metric="wrong metric") + PairwiseDistancesRadiusNeighborhood.get_for( + X=X, Y=Y, radius=radius, metric="wrong metric" + ) with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): - RadiusNeighborhood.get_for( + PairwiseDistancesRadiusNeighborhood.get_for( X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric ) with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): - RadiusNeighborhood.get_for( + PairwiseDistancesRadiusNeighborhood.get_for( X=X, Y=np.array([1.0, 2.0]), radius=radius, metric=metric ) @@ -170,8 +178,11 @@ def test_radius_neighborhood_factory_method_wrong_usages(): @pytest.mark.parametrize( "PairwiseDistancesReduction, FastPairwiseDistancesReduction", [ - (ArgKmin, FastEuclideanArgKmin), - (RadiusNeighborhood, FastEuclideanRadiusNeighborhood), + (PairwiseDistancesArgKmin, FastEuclideanPairwiseDistancesArgKmin), + ( + PairwiseDistancesRadiusNeighborhood, + FastEuclideanPairwiseDistancesRadiusNeighborhood, + ), ], ) def test_pairwise_distances_reduction_factory_method( @@ -232,7 +243,10 @@ def test_pairwise_distances_reduction_factory_method( @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) -@pytest.mark.parametrize("PairwiseDistancesReduction", [ArgKmin, RadiusNeighborhood]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + 
[PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) def test_chunk_size_agnosticism( PairwiseDistancesReduction, seed, @@ -250,7 +264,7 @@ def test_chunk_size_agnosticism( parameter = ( 10 - if PairwiseDistancesReduction is ArgKmin + if PairwiseDistancesReduction is PairwiseDistancesArgKmin # Scaling the radius with the dimensions else 10 ** np.log(n_features) ) @@ -270,7 +284,10 @@ def test_chunk_size_agnosticism( @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) -@pytest.mark.parametrize("PairwiseDistancesReduction", [ArgKmin, RadiusNeighborhood]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) def test_n_threads_agnosticism( PairwiseDistancesReduction, seed, @@ -288,7 +305,7 @@ def test_n_threads_agnosticism( parameter = ( 10 - if PairwiseDistancesReduction is ArgKmin + if PairwiseDistancesReduction is PairwiseDistancesArgKmin # Scaling the radius with the dimensions else 10 ** np.log(n_features) ) @@ -307,7 +324,10 @@ def test_n_threads_agnosticism( @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) -@pytest.mark.parametrize("PairwiseDistancesReduction", [ArgKmin, RadiusNeighborhood]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) def test_strategies_consistency( PairwiseDistancesReduction, metric, @@ -337,7 +357,7 @@ def test_strategies_consistency( parameter = ( 10 - if PairwiseDistancesReduction is ArgKmin + if PairwiseDistancesReduction is PairwiseDistancesArgKmin # Scaling the radius with the dimensions else 10 ** np.log(n_features) ) @@ -391,19 +411,19 @@ def test_fast_sqeuclidean_correctness( X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread - eucl_dist, eucl_indices = ArgKmin.get_for(X, Y, k, metric="euclidean").compute( - return_distance=True - ) - fse_dist, fse_indices = ArgKmin.get_for(X, Y, k, metric="fast_euclidean").compute( - return_distance=True - ) + eucl_dist, eucl_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="euclidean" + ).compute(return_distance=True) + fse_dist, fse_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="fast_euclidean" + ).compute(return_distance=True) assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) - eucl_dist, eucl_indices = RadiusNeighborhood.get_for( + eucl_dist, eucl_indices = PairwiseDistancesRadiusNeighborhood.get_for( X, Y, radius, metric="euclidean" ).compute(return_distance=True) - fse_dist, fse_indices = RadiusNeighborhood.get_for( + fse_dist, fse_indices = PairwiseDistancesRadiusNeighborhood.get_for( X, Y, radius, metric="fast_euclidean" ).compute(return_distance=True) @@ -438,11 +458,11 @@ def test_fast_sqeuclidean_translation_invariance( X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread - reference_dist, reference_indices = ArgKmin.get_for( + reference_dist, reference_indices = PairwiseDistancesArgKmin.get_for( X, Y, k, metric="fast_sqeuclidean" ).compute(return_distance=True) - dist, indices = ArgKmin.get_for( + dist, indices = PairwiseDistancesArgKmin.get_for( X + translation, Y + translation, k, metric="fast_sqeuclidean" 
).compute(return_distance=True) From 7c86a39f365c7b169fe580002b2391ebf6d34c7c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 09:00:35 +0200 Subject: [PATCH 196/290] Remove checks for CSR matrices in DatasetsPair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/metrics/_dist_metrics.pyx | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 1697be9ee7409..ce7772237e490 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1228,10 +1228,10 @@ cdef class DatasetsPair: Parameters ---------- X : {ndarray, sparse matrix} of shape (n_X, d) - Input data. + Input data. If provided as a sparse matrix, it must be in CSR format. Y : {ndarray, sparse matrix} of shape (n_Y, d) - Input data. + Input data. If provided as a sparse matrix, it must be in CSR format. metric : str, default='euclidean' The distance metric to use for argkmin. The default metric is @@ -1256,9 +1256,6 @@ cdef class DatasetsPair: if X.dtype != np.float64 or Y.dtype != np.float64: raise ValueError("Only 64bit float datasets are supported for X and Y.") - X = check_array(X, dtype=DTYPE, accept_sparse='csr') - Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') - # Metric-specific checks that do not replace nor duplicate `check_array`. distance_metric._validate_data(X) distance_metric._validate_data(Y) @@ -1351,10 +1348,10 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): Parameters ---------- X: sparse matrix of shape (n_X, d) - Rows represent vectors. + Rows represent vectors. Must be in CSR format. Y: sparse matrix of shape (n_X, d) - Rows represent vectors. + Rows represent vectors. Must be in CSR format. distance_metric: DistanceMetric The distance metric responsible for computing distances @@ -1434,7 +1431,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): Parameters ---------- X: sparse matrix of shape (n_X, d) - Rows represent vectors. + Rows represent vectors. Must be in CSR format. Y: ndarray of shape (n_Y, d) Rows represent vectors. @@ -1520,7 +1517,7 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): Rows represent vectors. Y: sparse matrix of shape (n_Y, d) - Rows represent vectors. + Rows represent vectors. Must be in CSR format. 
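What "must be in CSR format" buys these pairs is direct access to three flat arrays whose per-row slices are exactly what the dist/rdist implementations consume. A small self-contained illustration:

    import numpy as np
    from scipy.sparse import csr_matrix

    X = csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0], [3.0, 4.0]]))
    data, indices, indptr = X.data, X.indices, X.indptr

    i = 2  # non-zero values and column indices of row i
    row_values = data[indptr[i]:indptr[i + 1]]      # array([3., 4.])
    row_columns = indices[indptr[i]:indptr[i + 1]]  # array([0, 1])
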
distance_metric: DistanceMetric The distance metric responsible for computing distances From b3efe85cb8646bba7bdc2752b6720b348207a82a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 09:00:35 +0200 Subject: [PATCH 197/290] Rename sparse_{dist,rdist} to csr_{dist,rdist} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/metrics/_dist_metrics.pxd | 4 ++-- sklearn/metrics/_dist_metrics.pyx | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 0c143e37bd7fe..183e6851345f0 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -62,7 +62,7 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t sparse_dist( + cdef DTYPE_t csr_dist( self, const DTYPE_t[:] x1_data, const ITYPE_t[:] x1_indices, @@ -70,7 +70,7 @@ cdef class DistanceMetric: const ITYPE_t[:] x2_indices, ) nogil except -1 - cdef DTYPE_t sparse_rdist( + cdef DTYPE_t csr_rdist( self, const DTYPE_t[:] x1_data, const ITYPE_t[:] x1_indices, diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index ce7772237e490..9af02c2ae1018 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -315,7 +315,7 @@ cdef class DistanceMetric: """ return self.dist(x1, x2, size) - cdef DTYPE_t sparse_dist( + cdef DTYPE_t csr_dist( self, const DTYPE_t[:] x1_data, const ITYPE_t[:] x1_indices, @@ -329,7 +329,7 @@ cdef class DistanceMetric: """ return -999 - cdef DTYPE_t sparse_rdist( + cdef DTYPE_t csr_rdist( self, const DTYPE_t[:] x1_data, const ITYPE_t[:] x1_indices, @@ -346,7 +346,7 @@ cdef class DistanceMetric: Euclidean metric, the rank-preserving surrogate distance is the squared-euclidean distance. 
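The property this docstring relies on fits in two lines of NumPy: squaring is monotone on non-negative values, so ranks, and therefore argkmin results, are unchanged, and exact distances only need to be recovered once at the end:

    import numpy as np

    d = np.array([3.0, 1.0, 2.0])
    assert (np.argsort(d ** 2) == np.argsort(d)).all()
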
""" - return self.sparse_dist(x1_data, x1_indices, x2_data, x2_indices) + return self.csr_dist(x1_data, x1_indices, x2_data, x2_indices) cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: """compute the pairwise distances between points in X""" @@ -1402,7 +1402,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): ITYPE_t yj_start = self.Y_indptr[j] ITYPE_t yj_end = self.Y_indptr[j + 1] - return self.distance_metric.sparse_rdist( + return self.distance_metric.csr_rdist( self.X_data[xi_start:xi_end], self.X_indices[xi_start:xi_end], self.Y_data[yj_start:yj_end], @@ -1417,7 +1417,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): ITYPE_t yj_start = self.Y_indptr[j] ITYPE_t yj_end = self.Y_indptr[j + 1] - return self.distance_metric.sparse_dist( + return self.distance_metric.csr_dist( self.X_data[xi_start:xi_end], self.X_indices[xi_start:xi_end], self.Y_data[yj_start:yj_end], @@ -1486,7 +1486,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): # https://github.com/scikit-learn/scikit-learn/issues/17299 # Ideally, we could pass pointers and indices and access elements # then in distance_metric.dist - return self.distance_metric.sparse_rdist( + return self.distance_metric.csr_rdist( self.X_data[xi_start:xi_end], self.X_indices[xi_start:xi_end], self.Y[j, :], @@ -1500,7 +1500,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): ITYPE_t xi_end = self.X_indptr[i + 1] # TODO: same as previous comment - return self.distance_metric.sparse_dist( + return self.distance_metric.csr_dist( self.X_data[xi_start:xi_end], self.X_indices[xi_start:xi_end], self.Y[j, :], From a6f0e4a9d38603454c44297fa679ed9f668ed889 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 09:19:34 +0200 Subject: [PATCH 198/290] fixup! Do not validate X and Y for same number of dimensions --- sklearn/neighbors/_base.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e5dd4555cfd9c..cb134ddd2089d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,10 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics._pairwise_distances_reduction import ArgKmin, RadiusNeighborhood +from ..metrics._pairwise_distances_reduction import ( + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, +) from ..utils import ( check_array, gen_even_slices, @@ -757,10 +760,10 @@ class from an array representing our data set and ask who's X, n_neighbors=n_neighbors, return_distance=return_distance ) - elif self._fit_method == "brute" and ArgKmin.is_usable_for( + elif self._fit_method == "brute" and PairwiseDistancesArgKmin.is_usable_for( X, self._fit_X, self.effective_metric_ ): - results = ArgKmin.get_for( + results = PairwiseDistancesArgKmin.get_for( X=X, Y=self._fit_X, k=n_neighbors, @@ -1085,10 +1088,13 @@ class from an array representing our data set and ask who's X, radius=radius, return_distance=return_distance ) - elif self._fit_method == "brute" and RadiusNeighborhood.is_usable_for( - X, self._fit_X, self.effective_metric_ + elif ( + self._fit_method == "brute" + and PairwiseDistancesRadiusNeighborhood.is_usable_for( + X, self._fit_X, self.effective_metric_ + ) ): - results = RadiusNeighborhood.get_for( + results = PairwiseDistancesRadiusNeighborhood.get_for( X=X, Y=self._fit_X, radius=radius, From 7231535260609ca326a782dd502211ce03af372d Mon Sep 17 
00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 09:00:35 +0200 Subject: [PATCH 199/290] Don't use f-strings for docstrings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- .../metrics/_pairwise_distances_reduction.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 998c47b46c01f..812779f8bc747 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -163,7 +163,7 @@ cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ##################### cdef class PairwiseDistancesReduction: - f"""Abstract class which compute pairwise distances between + """Abstract class which compute pairwise distances between a set of vectors (rows) X and another set of vectors (rows) of Y and apply a reduction on top. @@ -178,7 +178,7 @@ cdef class PairwiseDistancesReduction: chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, - and use {CHUNK_SIZE} if it is not set. + and use 256 if it is not set. n_threads: int, default=None The number of OpenMP threads to use for the reduction. @@ -472,7 +472,7 @@ cdef class PairwiseDistancesReduction: return cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - f"""Computes the argkmin of vectors (rows) of a set of + """Computes the argkmin of vectors (rows) of a set of vectors (rows) of X on another set of vectors (rows) of Y. Parameters @@ -486,7 +486,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, - and use {CHUNK_SIZE} if it is not set. + and use 256 if it is not set. n_threads: int, default=None The number of OpenMP threads to use for the reduction. @@ -517,7 +517,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): dict metric_kwargs=dict(), n_threads=None, ) -> PairwiseDistancesArgKmin: - f"""Return the PairwiseDistancesArgKmin implementation for the given arguments. + """Return the PairwiseDistancesArgKmin implementation for the given arguments. Parameters ---------- @@ -539,7 +539,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, - and use {CHUNK_SIZE} if it is not set. + and use 256 if it is not set. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. @@ -1019,7 +1019,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, - and use {CHUNK_SIZE} if it is not set. + and use 256 if it is not set. n_threads: int, default=None The number of OpenMP threads to use for the reduction. 
@@ -1073,7 +1073,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): dict metric_kwargs=dict(), n_threads=None, ) -> PairwiseDistancesRadiusNeighborhood: - f"""Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. + """Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. Parameters ---------- @@ -1095,7 +1095,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, - and use {CHUNK_SIZE} if it is not set. + and use 256 if it is not set. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. From faed7cc8a2fafd321f7ab9703fa61dc2099ebe0b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Sep 2021 17:31:31 +0200 Subject: [PATCH 200/290] Remove some checks on arrays --- sklearn/metrics/_dist_metrics.pyx | 28 +++---- .../metrics/_pairwise_distances_reduction.pyx | 2 +- .../test_pairwise_distances_reduction.py | 76 +++++++------------ 3 files changed, 43 insertions(+), 63 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 9af02c2ae1018..2622239d1552d 100755 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1228,10 +1228,14 @@ cdef class DatasetsPair: Parameters ---------- X : {ndarray, sparse matrix} of shape (n_X, d) - Input data. If provided as a sparse matrix, it must be in CSR format. + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. Y : {ndarray, sparse matrix} of shape (n_Y, d) - Input data. If provided as a sparse matrix, it must be in CSR format. + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. metric : str, default='euclidean' The distance metric to use for argkmin. The default metric is @@ -1299,10 +1303,10 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): Parameters ---------- X: ndarray of shape (n_X, d) - Rows represent vectors. + Rows represent vectors. Must be C-contiguous. Y: ndarray of shape (n_Y, d) - Rows represent vectors. + Rows represent vectors. Must be C-contiguous. distance_metric: DistanceMetric The distance metric responsible for computing distances @@ -1317,8 +1321,9 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): def __init__(self, X, Y, DistanceMetric distance_metric): DatasetsPair.__init__(self, distance_metric) - self.X = check_array(X, dtype=DTYPE, order='C') - self.Y = check_array(Y, dtype=DTYPE, order='C') + # Arrays have already been checked + self.X = X + self.Y = Y self.d = X.shape[1] @final @@ -1380,9 +1385,6 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): def __init__(self, X, Y, DistanceMetric distance_metric): DatasetsPair.__init__(self, distance_metric) - X = check_array(X, dtype=DTYPE, accept_sparse='csr') - Y = check_array(Y, dtype=DTYPE, accept_sparse='csr') - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) @@ -1434,7 +1436,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): Rows represent vectors. Must be in CSR format. Y: ndarray of shape (n_Y, d) - Rows represent vectors. + Rows represent vectors. Must be C-contiguous. 
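C-contiguity is now a documented precondition instead of something the pair classes enforce. Converting is a one-liner, and NumPy only copies when the memory layout actually differs:

    import numpy as np

    X = np.asfortranarray(np.random.rand(5, 3))  # F-ordered on purpose
    assert not X.flags["C_CONTIGUOUS"]
    X = np.ascontiguousarray(X)                  # copies into C order
    assert X.flags["C_CONTIGUOUS"]
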
distance_metric: DistanceMetric The distance metric responsible for computing distances @@ -1462,10 +1464,10 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): def __init__(self, X, Y, DistanceMetric distance_metric): DatasetsPair.__init__(self, distance_metric) - X = check_array(X, dtype=DTYPE, accept_sparse='csr') self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y = check_array(Y, dtype=DTYPE) + # This array already has been checked here + self.Y = Y self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) @final @@ -1514,7 +1516,7 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): Parameters ---------- X: ndarray of shape (n_X, d) - Rows represent vectors. + Rows represent vectors. Must be C-contiguous. Y: sparse matrix of shape (n_Y, d) Rows represent vectors. Must be in CSR format. diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 812779f8bc747..e1b9727b311c7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -50,7 +50,7 @@ from typing import List from scipy.sparse import issparse from threadpoolctl import threadpool_limits from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ..utils import check_array, check_scalar, _in_unstable_openblas_configuration +from ..utils import check_scalar, _in_unstable_openblas_configuration from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index cc9abfeb3d6d0..332c146362ced 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,17 +1,7 @@ -import itertools - import numpy as np import pytest from numpy.testing import assert_array_equal, assert_allclose -from scipy.sparse import ( - bsr_matrix, - coo_matrix, - csc_matrix, - csr_matrix, - dia_matrix, - dok_matrix, - lil_matrix, -) +from scipy.sparse import csr_matrix from sklearn.metrics._dist_metrics import ( DenseDenseDatasetsPair, @@ -91,7 +81,7 @@ def test_pairwise_distances_reduction_is_usable_for(): # TODO: remove once sparse matrices are supported assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) - assert not PairwiseDistancesReduction.is_usable_for(X, csc_matrix(Y), metric) + assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) def test_argkmin_factory_method_wrong_usages(): @@ -117,19 +107,21 @@ def test_argkmin_factory_method_wrong_usages(): PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) with pytest.raises(ValueError, match="k == 0, must be >= 1."): - PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=0.1, metric=metric) + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=0, metric=metric) with pytest.raises(ValueError, match="Unrecognized metric"): PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") - with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): PairwiseDistancesArgKmin.get_for( X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric ) - with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"): + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): PairwiseDistancesArgKmin.get_for( - X=X, 
Y=np.array([1.0, 2.0]), k=k, metric=metric
+            X=np.asfortranarray(X), Y=Y, k=k, metric=metric
         )
 
 
@@ -162,14 +154,16 @@ def test_radius_neighborhood_factory_method_wrong_usages():
             X=X, Y=Y, radius=radius, metric="wrong metric"
         )
 
-    with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
+    with pytest.raises(
+        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
+    ):
         PairwiseDistancesRadiusNeighborhood.get_for(
             X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
         )
 
-    with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
+    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
         PairwiseDistancesRadiusNeighborhood.get_for(
-            X=X, Y=np.array([1.0, 2.0]), radius=radius, metric=metric
+            X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
         )
 
 
@@ -200,36 +194,20 @@ def test_pairwise_distances_reduction_factory_method(
     dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric)
     assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair)
 
-    sparse_matrix_constructors = [
-        lil_matrix,
-        csc_matrix,
-        csr_matrix,
-        bsr_matrix,
-        coo_matrix,
-        dia_matrix,
-        dok_matrix,
-    ]
-
-    for c_X, c_Y in itertools.combinations_with_replacement(
-        sparse_matrix_constructors, r=2
-    ):
-        sparse_sparse_instance = PairwiseDistancesReduction.get_for(
-            c_X(X), c_Y(Y), dummy_arg, metric
-        )
-        assert isinstance(
-            sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair
-        )
+    sparse_sparse_instance = PairwiseDistancesReduction.get_for(
+        csr_matrix(X), csr_matrix(Y), dummy_arg, metric
+    )
+    assert isinstance(sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair)
 
-    for constructor in sparse_matrix_constructors:
-        dense_sparse_instance = PairwiseDistancesReduction.get_for(
-            X, constructor(Y), dummy_arg, metric=metric
-        )
-        assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair)
+    dense_sparse_instance = PairwiseDistancesReduction.get_for(
+        X, csr_matrix(Y), dummy_arg, metric=metric
+    )
+    assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair)
 
-        sparse_dense_instance = PairwiseDistancesReduction.get_for(
-            constructor(X), Y, dummy_arg, metric=metric
-        )
-        assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair)
+    sparse_dense_instance = PairwiseDistancesReduction.get_for(
+        csr_matrix(X), Y, dummy_arg, metric=metric
+    )
+    assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair)
 
     # Test specialisations creation
     fast_euclidean_instance = PairwiseDistancesReduction.get_for(
@@ -352,8 +330,8 @@ def test_strategies_consistency(
     # Haversine distance only accepts 2D data
     if metric == "haversine":
-        X = X[:, :2]
-        Y = Y[:, :2]
+        X = np.ascontiguousarray(X[:, :2])
+        Y = np.ascontiguousarray(Y[:, :2])
 
     parameter = (
         10

From 66b60b862a9aa27af6540bd51feb9698efa0b635 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Fri, 24 Sep 2021 13:37:37 +0200
Subject: [PATCH 201/290] Compute squared euclidean norm for rows in parallel

np.einsum('ij,ij->i') is handy but is single-threaded.

This new interface makes use of OpenMP and BLAS dot for parallelized
and vectorized computations.
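Reference semantics for the new helper: it must agree with the einsum it replaces; only the execution strategy changes (one BLAS dot per row, spread across OpenMP threads). The invariant below is the one the accompanying test checks:

    import numpy as np

    X = np.random.rand(1000, 50)
    expected = np.einsum("ij,ij->i", X, X)  # single-threaded reference
    assert np.allclose(np.linalg.norm(X, axis=1) ** 2, expected)
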
--- .../metrics/_pairwise_distances_reduction.pyx | 33 ++++++++++++++++--- .../test_pairwise_distances_reduction.py | 22 +++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index e1b9727b311c7..7edfc80009ef8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -38,6 +38,7 @@ from ..utils._cython_blas cimport ( NoTrans, RowMajor, Trans, + _dot, _gemm, ) from ..utils._heap cimport simultaneous_sort, heap_push @@ -116,6 +117,29 @@ cdef class StdVectorSentinelITYPE(StdVectorSentinel): return sentinel +cpdef DTYPE_t[::1] _sqeuclidean_row_norms( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + DTYPE_t * X_ptr = &X[0, 0] + ITYPE_t idx = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return row_norms + cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): """Create a numpy ndarray given a C++ vector. @@ -243,7 +267,6 @@ cdef class PairwiseDistancesReduction: check_scalar(chunk_size, "chunk_size", Integral, min_val=1) self.chunk_size = chunk_size - self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) @@ -873,8 +896,8 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair self.X, self.Y = datasets_pair.X, datasets_pair.Y - self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) - self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) self.use_squared_distances = use_squared_distances # Temporary datastructures used in threads @@ -1452,8 +1475,8 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair self.X, self.Y = datasets_pair.X, datasets_pair.Y - self.X_sq_norms = np.einsum('ij,ij->i', self.X, self.X) - self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y) + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) self.use_squared_distances = use_squared_distances if use_squared_distances: diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 332c146362ced..e68f66fe5a40c 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -16,6 +16,7 @@ PairwiseDistancesRadiusNeighborhood, FastEuclideanPairwiseDistancesArgKmin, FastEuclideanPairwiseDistancesRadiusNeighborhood, + _sqeuclidean_row_norms, ) from sklearn.utils import _in_unstable_openblas_configuration @@ -445,3 +446,24 @@ def 
test_fast_sqeuclidean_translation_invariance( ).compute(return_distance=True) assert_argkmin_results_equality(reference_dist, dist, reference_indices, indices) + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("num_threads", [1, 2, 8]) +def test_sqeuclidean_row_norms( + seed, + n_samples, + n_features, + num_threads, + dtype=np.float64, +): + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + + assert_allclose(sq_row_norm_reference, sq_row_norm) From 90cb9fd5ebcf9bd80c7a501ac41f38da30eb4ec1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 24 Sep 2021 16:38:29 +0200 Subject: [PATCH 202/290] Validate arrays for C-contiguity where needed --- sklearn/neighbors/_base.py | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index cb134ddd2089d..8c69d0aceefee 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -413,7 +413,9 @@ def _check_algorithm_metric(self): def _fit(self, X, y=None): if self._get_tags()["requires_y"]: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csr", multi_output=True, order="C" + ) if is_classifier(self): # Classification targets require a specific format @@ -448,7 +450,7 @@ def _fit(self, X, y=None): else: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X = self._validate_data(X, accept_sparse="csr") + X = self._validate_data(X, accept_sparse="csr", order="C") self._check_algorithm_metric() if self.metric_params is None: @@ -733,10 +735,21 @@ class from an array representing our data set and ask who's % type(n_neighbors) ) + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if X is not None: query_is_train = False if self.metric == "precomputed": X = _check_precomputed(X) + elif use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because this implementation is more efficient. 
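+                # (The Cython kernels underneath operate on C-contiguous
+                # memoryviews, so a single copy at validation time is cheaper
+                # than chunk-wise conversions during the reduction.)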
+ X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") else: X = self._validate_data(X, accept_sparse="csr", reset=False) else: @@ -755,14 +768,7 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): - results = _kneighbors_from_graph( - X, n_neighbors=n_neighbors, return_distance=return_distance - ) - - elif self._fit_method == "brute" and PairwiseDistancesArgKmin.is_usable_for( - X, self._fit_X, self.effective_metric_ - ): + if use_pairwise_distances_reductions: results = PairwiseDistancesArgKmin.get_for( X=X, Y=self._fit_X, @@ -775,6 +781,13 @@ class from an array representing our data set and ask who's return_distance=return_distance, ) + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, return_distance=return_distance + ) + elif self._fit_method == "brute": # TODO: support sparse matrices # When ArgKmin is not supported and when the user ask for a @@ -1070,10 +1083,21 @@ class from an array representing our data set and ask who's """ check_is_fitted(self) + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesRadiusNeighborhood.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if X is not None: query_is_train = False if self.metric == "precomputed": X = _check_precomputed(X) + elif use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because this implementation is more efficient. + X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") else: X = self._validate_data(X, accept_sparse="csr", reset=False) else: @@ -1083,17 +1107,7 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): - results = _radius_neighbors_from_graph( - X, radius=radius, return_distance=return_distance - ) - - elif ( - self._fit_method == "brute" - and PairwiseDistancesRadiusNeighborhood.is_usable_for( - X, self._fit_X, self.effective_metric_ - ) - ): + if use_pairwise_distances_reductions: results = PairwiseDistancesRadiusNeighborhood.get_for( X=X, Y=self._fit_X, @@ -1107,6 +1121,13 @@ class from an array representing our data set and ask who's sort_results=sort_results, ) + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance + ) + elif self._fit_method == "brute": # When RadiusNeighborhood is not supported and when the user ask for a # fast alternative, we need to revert to the standard. 
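Note on the `order="C"` validation introduced above: the copy is only made for inputs that are not already C-contiguous. A minimal NumPy sketch of the property being relied upon (illustrative only; `_validate_data` performs a superset of these checks):

    import numpy as np

    X_f = np.asarray(np.random.rand(100, 10), order="F")  # column-major input
    X_c = np.ascontiguousarray(X_f)  # copies into row-major (C) layout

    assert not X_f.flags["C_CONTIGUOUS"]
    assert X_c.flags["C_CONTIGUOUS"]
    # An already C-contiguous array is passed through without a copy:
    assert np.ascontiguousarray(X_c) is X_c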
From 1ae16d7da3363c101eae00df62442288afa10719 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 27 Sep 2021 14:02:03 +0200 Subject: [PATCH 203/290] Add tests for Neighbors-mixins subclasses --- sklearn/neighbors/tests/test_neighbors.py | 124 +++++++++++++++++++++- 1 file changed, 120 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 8e6f3e34219e5..a086e64a3dea6 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -25,7 +25,11 @@ from sklearn.neighbors import ( VALID_METRICS_SPARSE, ) -from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed +from sklearn.neighbors._base import ( + _is_sorted_by_data, + _check_precomputed, + KNeighborsMixin, +) from sklearn.pipeline import make_pipeline from sklearn.utils._testing import ( assert_allclose, @@ -146,15 +150,127 @@ def test_unsupervised_kneighbors( ) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( - "NearestNeighbors", + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neigh_predictions_algorithm_agnosticity( + n_samples, + n_features, + n_query_pts, + metric, + n_neighbors, + radius, + NeighborsMixinSubclass, +): + # The different algorithms must return identical predictions results + # on their common metrics. + + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) + y = local_rng.randint(3, size=n_samples) + + query = local_rng.rand(n_query_pts, n_features) + + predict_results = [] + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + for algorithm in ALGORITHMS: + neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) + neigh.fit(X, y) + + predict_results.append(neigh.predict(query)) + + for i in range(len(predict_results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + predictions, next_predictions = predict_results[i], predict_results[i + 1] + + assert_allclose( + predictions, + next_predictions, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different predictions." 
+ ), + ) + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) +@pytest.mark.parametrize( + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neighs_predictions_fast_euclidean_correctness( + seed, + n_samples, + n_features, + n_neighbors, + radius, + NeighborsMixinSubclass, + dtype=np.float64, +): + # The fast euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + if n_samples < n_neighbors: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + fast_euclidean_clf = NeighborsMixinSubclass( + parameter, algorithm="brute", metric="euclidean" + ).fit(X, y) + euclidean_pred = fast_euclidean_clf.predict(X) + + fast_euclidean_clf = NeighborsMixinSubclass( + parameter, algorithm="brute", metric="fast_euclidean" + ).fit(X, y) + fast_euclidean_pred = fast_euclidean_clf.predict(X) + + assert_allclose(euclidean_pred, fast_euclidean_pred) + + +@pytest.mark.parametrize( + "KNeighborsMixinSubclass", [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors, ], ) -def test_unsupervised_inputs(NearestNeighbors): +def test_unsupervised_inputs(KNeighborsMixinSubclass): # Test unsupervised inputs for neighbors estimators X = rng.random_sample((10, 3)) @@ -164,7 +280,7 @@ def test_unsupervised_inputs(NearestNeighbors): dist1, ind1 = nbrs_fid.kneighbors(X) - nbrs = NearestNeighbors(n_neighbors=1) + nbrs = KNeighborsMixinSubclass(n_neighbors=1) for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): nbrs.fit(data, y) From e9dfc953905fb41dbccefe43569bcfc30a4d4a6d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 27 Sep 2021 18:09:03 +0200 Subject: [PATCH 204/290] Xfail test on another numerical edge-case This edge case was observed on Windows 32bit: > np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5) E AssertionError: E Not equal to tolerance rtol=1e-05, atol=0 E E Mismatched elements: 1 / 10000 (0.01%) E Max absolute difference: 0.00020249 E Max relative difference: 1.05109993e-05 E x: array([40.123604, 30.522007, 49.364288, ..., 41.741158, 41.340405, 36.132567]) E y: array([40.123588, 30.522021, 49.364299, ..., 41.741134, 41.340408, 36.132622]) --- sklearn/metrics/tests/test_pairwise.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e330c9380e433..90b8db305b83b 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1474,11 +1474,11 @@ def test_fast_euclidean_correctness( X_translation, Y_translation, sign, n_samples=10000, n_features=10 ): # This is the only failing test case, so we prefer xfailing. 
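    # (Back-of-the-envelope explanation: with translations of order 1e7 and
    # ~10 features, ||x||² and ||y||² are of order 1e15 while the squared
    # distances themselves are of order 1e3, so evaluating
    # ||x||² - 2 x.y + ||y||² cancels roughly 12 of float64's ~16 significant
    # digits, which is roughly consistent with the ~1e-5 relative error
    # reported above.)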
- numerical_edge_case = (1e7, 1e7, 1) - if (X_translation, Y_translation, sign) == numerical_edge_case: + numerical_edge_cases = {(1e6, 1e6, 1), (1e7, 1e7, 1)} + if (X_translation, Y_translation, sign) in numerical_edge_cases: pytest.xfail( - "Numerical edge-case for (X_translation, Y_translation," - f" sign)={numerical_edge_case}" + "Numerical edge-case: (X_translation, Y_translation," + f" sign)={(X_translation, Y_translation, sign)}" ) # The fast squared euclidean strategy must return results From 2dcac3f518fc82c05daa34033a6918641d41c7d0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 27 Sep 2021 18:12:36 +0200 Subject: [PATCH 205/290] Use PyArray_SetBaseObject via NumPy Cython API Also remove irrelevant TODO. --- sklearn/metrics/_pairwise_distances_reduction.pyx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 7edfc80009ef8..5b89a85f9ab64 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -80,10 +80,6 @@ ctypedef fused vector_vector_DITYPE_t: vector[vector[DTYPE_t]] -cdef extern from "numpy/arrayobject.h": - int PyArray_SetBaseObject(np.ndarray arr, PyObject *obj) nogil except -1 - - cdef class StdVectorSentinel: """Wraps a reference to a vector which will be deallocated with this object. @@ -93,6 +89,7 @@ cdef class StdVectorSentinel: """ pass + # We necessarily need to define two extension types extending StdVectorSentinel # because we need to provide the dtype of the vector but can't use numeric fused types. cdef class StdVectorSentinelDTYPE(StdVectorSentinel): @@ -150,8 +147,6 @@ cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE cdef: np.npy_intp size = deref(vect_ptr).size() - # TODO: use PyArray_SimpleNewFromData from the Numpy C API directly - # I've tried, but Cython fails when parsing the C API np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, deref(vect_ptr).data()) StdVectorSentinel sentinel @@ -166,7 +161,7 @@ cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): # so we increase its reference counter. # See: https://docs.python.org/3/c-api/intro.html#reference-count-details Py_INCREF(sentinel) - PyArray_SetBaseObject(arr, sentinel) + np.PyArray_SetBaseObject(arr, sentinel) return arr From c5524f371708f71f53c250a86c1372e3784be0fb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 28 Sep 2021 08:56:38 +0200 Subject: [PATCH 206/290] Apply review suggestions Co-authored-by: Thomas J. 
Fan
---
 sklearn/metrics/_dist_metrics.pxd             |  2 +-
 sklearn/metrics/_dist_metrics.pyx             | 64 +++++----------
 .../metrics/_pairwise_distances_reduction.pyx | 78 +++++++++----------
 3 files changed, 59 insertions(+), 85 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
index 183e6851345f0..e87f442019a9d 100644
--- a/sklearn/metrics/_dist_metrics.pxd
+++ b/sklearn/metrics/_dist_metrics.pxd
@@ -99,7 +99,7 @@ cdef class DatasetsPair:

     cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil

-    cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil
+    cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil

 cdef class DenseDenseDatasetsPair(DatasetsPair):
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index ee61aa59ff5e6..f75a3a2a75fcb 100755
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -1221,7 +1221,7 @@ cdef class DatasetsPair:
         X,
         Y,
         str metric="euclidean",
-        dict metric_kwargs=dict(),
+        dict metric_kwargs=None,
     ) -> DatasetsPair:
         """Return the DatasetsPair implementation for the given arguments.

@@ -1254,7 +1254,7 @@ cdef class DatasetsPair:
         cdef:
             DistanceMetric distance_metric = DistanceMetric.get_metric(
                 metric,
-                **metric_kwargs
+                **(metric_kwargs or {})
             )

         if X.dtype != np.float64 or Y.dtype != np.float64:
@@ -1275,22 +1275,23 @@ cdef class DatasetsPair:
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure getting ITYPE instead of the int internally used for CSR matrices."""
-        # TODO: this adds another level of checks and conversion, could we remove it?
-        X_data = check_array(X.data, dtype=DTYPE, ensure_2d=False)
-        X_indices = check_array(X.indices, dtype=ITYPE, ensure_2d=False)
-        X_indptr = check_array(X.indptr, dtype=ITYPE, ensure_2d=False)
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=ITYPE)
+        X_indptr = np.asarray(X.indptr, dtype=ITYPE)
         return X_data, X_indices, X_indptr

     def __init__(self, DistanceMetric distance_metric):
         self.distance_metric = distance_metric

     cdef ITYPE_t n_X(self) nogil:
+        """Number of samples in X."""
         return -999

     cdef ITYPE_t n_Y(self) nogil:
+        """Number of samples in Y."""
         return -999

-    cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.dist(i, j)

     cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
@@ -1313,14 +1314,8 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
     between two vectors of (X, Y).
     """

-    def __cinit__(self):
-        # Initializing memory view to prevent memory errors and seg-faults
-        # in rare cases where __init__ is not called
-        self.X = np.empty((1, 1), dtype=DTYPE, order='C')
-        self.Y = np.empty((1, 1), dtype=DTYPE, order='C')
-
     def __init__(self, X, Y, DistanceMetric distance_metric):
-        DatasetsPair.__init__(self, distance_metric)
+        super().__init__(distance_metric)
         # Arrays have already been checked
         self.X = X
         self.Y = Y
@@ -1335,7 +1330,7 @@
         return self.Y.shape[0]

     @final
-    cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d)
@@ -1348,7 +1343,7 @@

 @final
 cdef class SparseSparseDatasetsPair(DatasetsPair):
-    """Compute distances between vectors of two sparse matrices.
+    """Compute distances between vectors of two CSR matrices.
Parameters ---------- @@ -1371,16 +1366,6 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): const ITYPE_t[:] Y_indices const ITYPE_t[:] Y_indptr - def __cinit__(self): - # Initializing memory view to prevent memory errors and seg-faults - # in rare cases where __init__ is not called - self.X_data = np.empty((1), dtype=DTYPE, order='C') - self.X_indices = np.empty((1), dtype=ITYPE, order='C') - self.X_indptr = np.empty((1), dtype=ITYPE, order='C') - - self.Y_data = np.empty((1), dtype=DTYPE, order='C') - self.Y_indices = np.empty((1), dtype=ITYPE, order='C') - self.Y_indptr = np.empty((1), dtype=ITYPE, order='C') def __init__(self, X, Y, DistanceMetric distance_metric): DatasetsPair.__init__(self, distance_metric) @@ -1397,7 +1382,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): return self.Y_indptr.shape[0] -1 @final - cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1428,7 +1413,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): @final cdef class SparseDenseDatasetsPair(DatasetsPair): - """Compute distances between vectors of a sparse matrix and a dense array. + """Compute distances between vectors of a CSR matrix and a dense array. Parameters ---------- @@ -1450,19 +1435,8 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): const DTYPE_t[:, ::1] Y # shape: (n_Y, d) const ITYPE_t[:] Y_indices - def __cinit__(self): - # Initializing memory view to prevent memory errors and seg-faults - # in rare cases where __init__ is not called - self.X_data = np.empty((1), dtype=DTYPE, order='C') - self.X_indices = np.empty((1), dtype=ITYPE, order='C') - self.X_indptr = np.empty((1), dtype=ITYPE, order='C') - - self.Y = np.empty((1, 1), dtype=DTYPE, order='C') - self.Y_indices = np.empty((1), dtype=ITYPE, order='C') - - def __init__(self, X, Y, DistanceMetric distance_metric): - DatasetsPair.__init__(self, distance_metric) + super().__init__(distance_metric) self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) @@ -1479,7 +1453,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): return self.Y.shape[0] @final - cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1511,7 +1485,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): @final cdef class DenseSparseDatasetsPair(DatasetsPair): - """Compute distances between vectors of a dense array and a sparse matrix. + """Compute distances between vectors of a dense array and a CSR matrix. 
Parameters ---------- @@ -1531,7 +1505,7 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): DatasetsPair datasets_pair def __init__(self, X, Y, DistanceMetric distance_metric): - DatasetsPair.__init__(self, distance_metric) + super().__init__(distance_metric) # Swapping arguments on the constructor self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) @@ -1546,9 +1520,9 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): return self.datasets_pair.n_X() @final - cdef DTYPE_t proxy_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: # Swapping arguments on the same interface - return self.datasets_pair.proxy_dist(j, i) + return self.datasets_pair.ranking_preserving_dist(j, i) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 5b89a85f9ab64..3a061058d0159 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -424,7 +424,7 @@ cdef class PairwiseDistancesReduction: # Placeholder methods which can be implemented cdef void compute_exact_distances(self) nogil: - """Convert proxy distances to exact distances or recompute them.""" + """Convert ranking-preserving distances to exact distances or recompute them.""" return cdef void _on_X_parallel_init( @@ -521,7 +521,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): DTYPE_t[:, ::1] argkmin_distances # Used as array of pointers to private datastructures used in threads. - DTYPE_t ** heaps_proxy_distances_chunks + DTYPE_t ** heaps_r_distances_chunks ITYPE_t ** heaps_indices_chunks @classmethod @@ -597,7 +597,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): chunk_size=None, n_threads=None, ): - PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size, n_threads) + super().__init__(datasets_pair, chunk_size, n_threads) check_scalar(k, "k", Integral, min_val=1) self.k = k @@ -607,7 +607,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # When reducing on small datasets, there can be more pointers than actual # threads used for the reduction but there won't be allocated but unused # datastructures. 
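        # (The surplus pointers are never dereferenced: only the threads
        # actually used by the reduction allocate, use and free the
        # datastructures they point to.)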
- self.heaps_proxy_distances_chunks = malloc( + self.heaps_r_distances_chunks = malloc( sizeof(DTYPE_t *) * self.effective_omp_n_thread ) self.heaps_indices_chunks = malloc( @@ -618,8 +618,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) - if self.heaps_proxy_distances_chunks is not NULL: - free(self.heaps_proxy_distances_chunks) + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) cdef void _reduce_on_chunks( self, @@ -634,7 +634,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start ITYPE_t k = self.k - DTYPE_t *heaps_proxy_distances = self.heaps_proxy_distances_chunks[thread_num] + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] # Pushing the distance and their associated indices on heaps @@ -642,10 +642,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): for i in range(n_X): for j in range(n_Y): heap_push( - heaps_proxy_distances + i * self.k, + heaps_r_distances + i * self.k, heaps_indices + i * self.k, k, - self._datasets_pair.proxy_dist(X_start + i, Y_start + j), + self._datasets_pair.ranking_preserving_dist(X_start + i, Y_start + j), Y_start + j, ) @@ -658,7 +658,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) nogil: # As this strategy is embarrassingly parallel, we can set the # thread heaps pointers to the proper position on the main heaps - self.heaps_proxy_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] @final @@ -674,7 +674,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # Sorting indices of the argkmin for each query vector of X for idx in range(X_end - X_start): simultaneous_sort( - self.heaps_proxy_distances_chunks[thread_num] + idx * self.k, + self.heaps_r_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k ) @@ -693,7 +693,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. 
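        # (This merging step is what distinguishes 'parallel_on_Y' from
        # 'parallel_on_X': in the latter, each thread already writes into a
        # disjoint slice of the main heaps, so no synchronisation is needed.)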
- self.heaps_proxy_distances_chunks[thread_num] = malloc( + self.heaps_r_distances_chunks[thread_num] = malloc( heaps_size * sizeof(DTYPE_t) ) self.heaps_indices_chunks[thread_num] = malloc( @@ -707,7 +707,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_proxy_distances_chunks[thread_num][idx] = DBL_MAX + self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX self.heaps_indices_chunks[thread_num][idx] = -1 @final @@ -729,7 +729,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): &self.argkmin_distances[X_start + idx, 0], &self.argkmin_indices[X_start + idx, 0], self.k, - self.heaps_proxy_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) @@ -743,7 +743,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_omp_n_thread): # Deallocating temporary datastructures for thread_num in prange(num_threads, schedule='static'): - free(self.heaps_proxy_distances_chunks[thread_num]) + free(self.heaps_r_distances_chunks[thread_num]) free(self.heaps_indices_chunks[thread_num]) # Sort the main heaps into arrays in parallel @@ -880,8 +880,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): bint use_squared_distances=False, chunk_size=None, ): - PairwiseDistancesArgKmin.__init__( - self, + super().__init__( # The datasets pair here is used for exact distances computations datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), k=k, @@ -970,7 +969,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] - DTYPE_t *heaps_proxy_distances = self.heaps_proxy_distances_chunks[thread_num] + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] # We compute the full pairwise squared distances matrix as follows @@ -1009,10 +1008,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): heap_push( - heaps_proxy_distances + i * k, + heaps_r_distances + i * k, heaps_indices + i * k, k, - # proxy distance: ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² ( self.X_sq_norms[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + @@ -1051,13 +1051,13 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): cdef: DTYPE_t radius - # DistanceMetric compute rank-preserving surrogate distance via rdist + # DistanceMetric compute ranking-preserving surrogate distance via rdist # which are proxies necessitating less computations. - # We get the proxy for the radius to be able to compare it against - # vectors' rank-preserving surrogate distances. - DTYPE_t proxy_radius + # We get the equivalent for the radius to be able to compare it against + # vectors' ranking-preserving surrogate distances. + DTYPE_t r_radius - # Neighbors indices and distances are returned as np.ndarray or np.ndarray. 
+ # Neighbors indices and distances are returned as np.ndarray of np.ndarray. # # We want resizable buffers which we will to wrapped within numpy # arrays at the end. std::vector comes as a handy interface for @@ -1088,7 +1088,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): DTYPE_t radius, str metric="fast_euclidean", chunk_size=None, - dict metric_kwargs=dict(), + dict metric_kwargs=None, n_threads=None, ) -> PairwiseDistancesRadiusNeighborhood: """Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. @@ -1153,11 +1153,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): chunk_size=None, n_threads=None, ): - PairwiseDistancesReduction.__init__(self, datasets_pair, chunk_size, n_threads) + super().__init__(datasets_pair, chunk_size, n_threads) check_scalar(radius, "radius", Real, min_val=0) self.radius = radius - self.proxy_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) + self.r_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) self.sort_results = False # Allocating pointers to datastructures but not the datastructures themselves. @@ -1189,13 +1189,13 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): ) nogil: cdef: ITYPE_t i, j - DTYPE_t proxy_dist_i_j + DTYPE_t r_dist_i_j for i in range(X_start, X_end): for j in range(Y_start, Y_end): - proxy_dist_i_j = self._datasets_pair.proxy_dist(i, j) - if proxy_dist_i_j <= self.proxy_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(proxy_dist_i_j) + r_dist_i_j = self._datasets_pair.ranking_preserving_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) @final @@ -1312,7 +1312,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): return cdef void compute_exact_distances(self) nogil: - """Convert proxy distances to pairwise distances in parallel.""" + """Convert ranking-preserving distances to pairwise distances in parallel.""" cdef: ITYPE_t i, j @@ -1459,8 +1459,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad bint use_squared_distances=False, chunk_size=None, ): - PairwiseDistancesRadiusNeighborhood.__init__( - self, + super().__init__( # The datasets pair here is used for exact distances computations datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), radius=radius, @@ -1476,8 +1475,8 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad if use_squared_distances: # In this specialisation and this setup, the value passed to the radius is - # already considered to be the the proxy radius, so we overwrite it. - self.proxy_radius = radius + # already considered to be the adapted radius, so we overwrite it. + self.r_radius = radius # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( @@ -1589,12 +1588,13 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad # Pushing the distance and their associated indices in vectors. 
for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² squared_dist_i_j = ( self.X_sq_norms[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start] ) - if squared_dist_i_j <= self.proxy_radius: + if squared_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) From eda2b260b75fe31298ed431f3a1eb5b328b138f3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Sep 2021 09:00:35 +0200 Subject: [PATCH 207/290] Revert whats_new entry. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 0050948d7c16b..d8776653cd9e8 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -740,7 +740,7 @@ Changelog :pr:`19473` by :user:`jiefangxuanyan ` and :user:`Julien Jerphanion `. -- |FIX| :class:`metrics.DistanceMetric` subclasses now support readonly +- |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly memory-mapped datasets. :pr:`19883` by :user:`Julien Jerphanion `. - |FIX| :class:`neighbors.NearestNeighbors`, :class:`neighbors.KNeighborsClassifier`, From 952d41ab2c63f47098498a268958835576310776 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 29 Sep 2021 15:35:11 +0200 Subject: [PATCH 208/290] Test the fast euclidean overriding --- sklearn/neighbors/tests/test_neighbors.py | 54 ++++++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a086e64a3dea6..3b51d597e76a0 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -249,10 +249,10 @@ def test_neighs_predictions_fast_euclidean_correctness( n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius ) - fast_euclidean_clf = NeighborsMixinSubclass( + euclidean_clf = NeighborsMixinSubclass( parameter, algorithm="brute", metric="euclidean" ).fit(X, y) - euclidean_pred = fast_euclidean_clf.predict(X) + euclidean_pred = euclidean_clf.predict(X) fast_euclidean_clf = NeighborsMixinSubclass( parameter, algorithm="brute", metric="fast_euclidean" @@ -262,6 +262,56 @@ def test_neighs_predictions_fast_euclidean_correctness( assert_allclose(euclidean_pred, fast_euclidean_pred) +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +@pytest.mark.parametrize( + "weights, expected_kneighbors_metric", + [ + ("uniform", "fast_sqeuclidean"), + ("distance", "fast_euclidean"), + (lambda x: x, "fast_euclidean"), + ], +) +def test_knn_prediction_fast_euclidean_overriding( + KNeighborsEstimator, + weights, + expected_kneighbors_metric, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast squared euclidean metric must be used over the fast euclidean + # metric solely when using the uniform sample-weighting. 
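+    # (Rationale: with uniform weights only the ranking of the neighbors is
+    # used, and squared euclidean distances preserve that ranking while
+    # skipping the final square root; 'distance' and callable weights need
+    # the actual euclidean distances, hence "fast_euclidean" is kept.)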
+ class MockedKNeighborsEstimator(KNeighborsEstimator): + def kneighbors(self, *args, **kwargs): + self.kneighbors_metric_ = self.effective_metric_ + return super().kneighbors(*args, **kwargs) + + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + parameter = 10 + + fast_euclidean_clf = MockedKNeighborsEstimator( + parameter, + algorithm="brute", + metric="fast_euclidean", + weights=weights, + ).fit(X, y) + + # effective_metric_ must not be changed + assert fast_euclidean_clf.effective_metric_ == "fast_euclidean" + fast_euclidean_clf.predict(X) + assert fast_euclidean_clf.kneighbors_metric_ == expected_kneighbors_metric + assert fast_euclidean_clf.effective_metric_ == "fast_euclidean" + + @pytest.mark.parametrize( "KNeighborsMixinSubclass", [ From 86c0d6fc38bb6ae2add82d1e5eecbc8011b7f6d0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 29 Sep 2021 16:23:14 +0200 Subject: [PATCH 209/290] Mention distances computations and their reduction in dedicated method --- .../metrics/_pairwise_distances_reduction.pyx | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 3a061058d0159..5b43b2cd13830 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -331,7 +331,7 @@ cdef class PairwiseDistancesReduction: else: Y_end = Y_start + self.Y_n_samples_chunk - self._reduce_on_chunks( + self._compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num, @@ -392,7 +392,7 @@ cdef class PairwiseDistancesReduction: else: Y_end = Y_start + self.Y_n_samples_chunk - self._reduce_on_chunks( + self._compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num, @@ -410,7 +410,7 @@ cdef class PairwiseDistancesReduction: # Placeholder methods which have to be implemented - cdef void _reduce_on_chunks( + cdef void _compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, ITYPE_t X_end, @@ -418,7 +418,10 @@ cdef class PairwiseDistancesReduction: ITYPE_t Y_end, ITYPE_t thread_num, ) nogil: - """Implemented the reduction on a pair of chunks.""" + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is the core critical region of PairwiseDistanceReductions' computations. 
+ """ return # Placeholder methods which can be implemented @@ -621,7 +624,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): if self.heaps_r_distances_chunks is not NULL: free(self.heaps_r_distances_chunks) - cdef void _reduce_on_chunks( + cdef void _compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, ITYPE_t X_end, @@ -954,7 +957,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _reduce_on_chunks( + cdef void _compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, ITYPE_t X_end, @@ -1179,7 +1182,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): if self.neigh_indices_chunks is not NULL: free(self.neigh_indices_chunks) - cdef void _reduce_on_chunks( + cdef void _compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, ITYPE_t X_end, @@ -1538,7 +1541,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _reduce_on_chunks( + cdef void _compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, ITYPE_t X_end, From bd5a1db2000d860e330bc7785cf93a3c7abd9767 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 29 Sep 2021 16:25:29 +0200 Subject: [PATCH 210/290] fixup! Apply review suggestions --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 5b43b2cd13830..cc4739cfb2fc0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -535,7 +535,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t k, str metric="fast_euclidean", chunk_size=None, - dict metric_kwargs=dict(), + dict metric_kwargs=None, n_threads=None, ) -> PairwiseDistancesArgKmin: """Return the PairwiseDistancesArgKmin implementation for the given arguments. From 1e2181da42104631e78b94b55e280019b6a24f94 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 29 Sep 2021 16:35:53 +0200 Subject: [PATCH 211/290] Tight self.neigh_{indices,distances}'s lifetime to their composite Co-authored-by: Thomas J. Fan --- .../metrics/_pairwise_distances_reduction.pyx | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index cc4739cfb2fc0..881fa78605055 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -1175,6 +1175,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): sizeof(self.neigh_indices) * self.effective_omp_n_thread ) + # Temporary datastructures which will be coerced to numpy arrays on before + # PairwiseDistancesRadiusNeighborhood.compute "return" and will be then freed. 
+        self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X)
+        self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X)
+
     def __dealloc__(self):
         if self.neigh_distances_chunks is not NULL:
             free(self.neigh_distances_chunks)
@@ -1182,6 +1187,13 @@
         if self.neigh_indices_chunks is not NULL:
             free(self.neigh_indices_chunks)

+        if self.neigh_indices is not NULL:
+            del self.neigh_indices
+
+        if self.neigh_distances is not NULL:
+            del self.neigh_distances
+
     cdef void _compute_and_reduce_distances_on_chunks(
         self,
         ITYPE_t X_start,

From a758fc11e60811c9dceb65cc98f5977a64658388 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 29 Sep 2021 17:31:14 +0200
Subject: [PATCH 212/290] Adapt docstrings

---
 .../metrics/_pairwise_distances_reduction.pyx | 29 +++++++++++++++----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 881fa78605055..af3cca753e01a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -234,6 +234,25 @@ cdef class PairwiseDistancesReduction:

     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
+        """Return True if the PairwiseDistancesReduction can be used for the given parameters.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_X, d)
+            Input data.
+
+        Y : {ndarray, sparse matrix} of shape (n_Y, d)
+            Input data.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        Returns
+        -------
+        True if the PairwiseDistancesReduction can be used, else False.
+        """
         # Coercing to np.array to get the dtype
         # TODO: what is the best way to get lists' dtype?
         X = np.asarray(X) if isinstance(X, (tuple, list)) else X
@@ -420,7 +439,8 @@
         """Compute the pairwise distances on two chunks of X and Y and reduce them.

-        This is the core critical region of PairwiseDistanceReductions' computations.
+        This is the core critical region of PairwiseDistanceReductions' computations
+        which must be implemented in subclasses.
         """
         return
@@ -557,7 +577,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         For a list of available metrics, see the documentation of
         :class:`~sklearn.metrics.DistanceMetric`.

-    chunk_size: int, default=None,
+    chunk_size : int, default=None,
         The number of vectors per chunk.
         If None (default) looks-up in scikit-learn configuration for
         `pairwise_dist_chunk_size`, and use 256 if it is not set.
@@ -565,7 +585,7 @@
     metric_kwargs : dict, default=None
         Keyword arguments to pass to specified metric function.
- n_threads: int, default=None + n_threads : int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on @@ -1113,7 +1133,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size: int, default=None, + chunk_size : int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, and use 256 if it is not set. @@ -1193,7 +1213,6 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): if self.neigh_distances is not NULL: del self.neigh_distances - cdef void _compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, From a758fc11e60811c9dceb65cc98f5977a64658388 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 29 Sep 2021 17:33:08 +0200 Subject: [PATCH 213/290] Factor compute in base class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- .../metrics/_pairwise_distances_reduction.pyx | 246 +++++++----------- sklearn/neighbors/_base.py | 2 +- 2 files changed, 99 insertions(+), 149 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index af3cca753e01a..e3e4b7e618bc3 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -306,6 +306,66 @@ cdef class PairwiseDistancesReduction: self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) ) + def compute( + self, + str strategy=None, + bint return_distance=False, + ): + """Computes the reduction of vectors (rows) of X on Y. + + Parameters + ---------- + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation + but is less used in practice (because X is smaller than Y generally). + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. However it is more useful in practice (because Y is + larger than X generally). + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + + Returns + ------- + Results for the PairwiseDistancesReduction, usually an array of indices + and optionally an array of associated distances if return_distance is True. + """ + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. 
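+            # (Reading of the heuristic: 'parallel_on_X' is preferred as soon
+            # as X spans more than 4 chunks per available thread, since that
+            # strategy keeps all threads busy without the synchronised
+            # per-thread datastructures that 'parallel_on_Y' requires.)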
+ if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + + return self._finalize_results(return_distance) + @final cdef void _parallel_on_X(self) nogil: """Computes the reduction of each vector (row) of X on Y @@ -444,6 +504,13 @@ cdef class PairwiseDistancesReduction: """ return + def _finalize_results(self, bint return_distance): + """Call-back adapting datastructures before returning results. + + This must be implemented in subclasses. + """ + return None + # Placeholder methods which can be implemented cdef void compute_exact_distances(self) nogil: @@ -637,6 +704,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.effective_omp_n_thread ) + # Main heaps used by PairwiseDistancesArgKmin.compute to return results. + self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) + def __dealloc__(self): if self.heaps_indices_chunks is not NULL: free(self.heaps_indices_chunks) @@ -792,73 +863,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): distances[i, j] if distances[i, j] > 0. else 0. ) - @final - def compute( - self, - str strategy=None, - bint return_distance=False, - ): - """Computes the reduction of vectors (rows) of X on Y. - - Parameters - ---------- - strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - Strategies differs on the dispatching they use for chunks on threads: - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation - but is less used in practice (because X is smaller than Y generally). - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. However it is more useful in practice (because Y is - larger than X generally). - - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y'. - - None (default) looks-up in scikit-learn configuration for - `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. - - return_distance: boolean, default=False - Return distances between each X vector and its - argkmin if set to True. - - Returns - ------- - distances: ndarray of shape (n, k) - Distances between each X vector and its argkmin - in Y. Only returned if ``return_distance=True``. - - indices: ndarray of shape (n, k) - Indices of argkmin of vectors of X in Y. - """ - - # Results returned by PairwiseDistancesArgKmin.compute used as the main heaps. 
- self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) - self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) - - if strategy is None: - strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') - - if strategy == 'auto': - # This is a simple heuristic whose constant for the - # comparison has been chosen based on experiments. - if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: - strategy = 'parallel_on_X' - else: - strategy = 'parallel_on_Y' - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if strategy == 'parallel_on_Y': - self._parallel_on_Y() - elif strategy == 'parallel_on_X': - self._parallel_on_X() - else: - raise RuntimeError(f"strategy '{strategy}' not supported.") - + def _finalize_results(self, bint return_distance=False): if return_distance: # We eventually need to recompute distances because we relied on proxies. self.compute_exact_distances() @@ -1113,6 +1118,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): chunk_size=None, dict metric_kwargs=None, n_threads=None, + bint sort_results=False, ) -> PairwiseDistancesRadiusNeighborhood: """Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. @@ -1149,6 +1155,10 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): None and -1 means using all processors. + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + Returns ------- radius_neighborhood: PairwiseDistancesRadiusNeighborhood @@ -1160,13 +1170,15 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): return FastEuclideanPairwiseDistancesRadiusNeighborhood( X=X, Y=Y, radius=radius, use_squared_distances=use_squared_distances, - chunk_size=chunk_size + chunk_size=chunk_size, + sort_results=sort_results, ) return PairwiseDistancesRadiusNeighborhood( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size, + sort_results=sort_results, ) def __init__( @@ -1175,13 +1187,14 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): DTYPE_t radius, chunk_size=None, n_threads=None, + sort_results=False ): super().__init__(datasets_pair, chunk_size, n_threads) check_scalar(radius, "radius", Real, min_val=0) self.radius = radius self.r_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) - self.sort_results = False + self.sort_results = sort_results # Allocating pointers to datastructures but not the datastructures themselves. # There as many pointers as available threads. 
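The two methods factored here follow a template-method pattern; a minimal Python sketch of the resulting control flow (class and method names taken from this diff, all bodies elided):

    class PairwiseDistancesReduction:
        def compute(self, strategy=None, return_distance=False):
            # Shared logic: resolve 'strategy', limit BLAS threads, run the
            # chunked _parallel_on_X / _parallel_on_Y loops.
            ...
            return self._finalize_results(return_distance)

        def _finalize_results(self, return_distance):
            # Overridden by each reduction: ArgKmin returns its heaps as
            # arrays, RadiusNeighborhood coerces its C++ vectors to ndarrays.
            return None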
@@ -1232,6 +1245,16 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + def _finalize_results(self, bint return_distance=False): + if return_distance: + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + @final cdef void _on_X_prange_iter_init( self, @@ -1366,87 +1389,12 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): def compute( self, str strategy=None, - bint return_distance = False, - bint sort_results = False + bint return_distance=False, ): - """Computes the reduction of vectors (rows) of X on Y. - - Parameters - ---------- - strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - Strategies differs on the dispatching they use for chunks on threads: - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation - but is less used in practice (because X is smaller than Y generally). - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. However it is more useful in practice (because Y is - larger than X generally). - - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y'. - - None (default) looks-up in scikit-learn configuration for - `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. - - return_distance: boolean, default=False - Return distances between each X vector and its - neighbors if set to True. - - sort_results: boolean, default=False - Sort results with respect to distances between each X vector and its - neighbors if set to True. - - return_distance must be True if sort_results is set to True. - - Returns - ------- - distances: ndarray of shape (n, k) - Distances between each X vector and its neighbors - in Y. Only returned if ``return_distance=True``. - - indices: ndarray of shape (n, k) - Indices of each neighbor of vectors of X in Y. - """ - if sort_results and not return_distance: - raise ValueError("return_distance must be True " - "if sort_results is True.") - - self.sort_results = sort_results - - if strategy is None: - strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') - - if strategy == 'auto': - # This is a simple heuristic whose constant for the - # comparison has been chosen based on experiments. - if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: - strategy = 'parallel_on_X' - else: - strategy = 'parallel_on_Y' - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). 
-        with threadpool_limits(limits=1, user_api="blas"):
-            if strategy == 'parallel_on_Y':
-                self._parallel_on_Y()
-            elif strategy == 'parallel_on_X':
-                self._parallel_on_X()
-            else:
-                raise RuntimeError(f"strategy '{strategy}' not supported.")
-
-        if return_distance:
-            self.compute_exact_distances()
-            res = (
-                coerce_vectors_to_nd_arrays(self.neigh_distances),
-                coerce_vectors_to_nd_arrays(self.neigh_indices),
-            )
-        else:
-            res = coerce_vectors_to_nd_arrays(self.neigh_indices)
+        if self.sort_results and not return_distance:
+            raise ValueError("return_distance must be True if sort_results is True.")

-        return res
+        return super().compute(strategy=strategy, return_distance=return_distance)


 cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood):
@@ -1484,12 +1432,14 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
         DTYPE_t radius,
         bint use_squared_distances=False,
         chunk_size=None,
+        sort_results=False,
     ):
         super().__init__(
             # The datasets pair here is used for exact distances computations
             datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),
             radius=radius,
             chunk_size=chunk_size,
+            sort_results=sort_results,
         )
         # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair
         cdef:
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index 8c69d0aceefee..981c4dc1a0c64 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -1115,10 +1115,10 @@ class from an array representing our data set and ask who's
                 metric=self.effective_metric_,
                 metric_kwargs=self.effective_metric_params_,
                 n_threads=self.n_jobs,
+                sort_results=sort_results,
             ).compute(
                 strategy="auto",
                 return_distance=return_distance,
-                sort_results=sort_results,
             )

         elif (

From 3dbe038dcb109b1bad8092e79f9f315dd87b99e8 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 29 Sep 2021 17:55:31 +0200
Subject: [PATCH 214/290] Do not use frenchism
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🥖
---
 sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index e3e4b7e618bc3..c9740e98554da 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -10,7 +10,7 @@
 # Author: Julien Jerphanion
 #
 #
-# The routines defined here are used in various algorithms realising
+# The routines defined here are used in various algorithms performing
 # the same structure of operations on distances between vectors
 # of a datasets pair (X, Y).

From e8deb0fecbcdfd0c37e54854887dbcf3d9f89b48 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 30 Sep 2021 09:26:37 +0200
Subject: [PATCH 215/290] Test fast metric alternatives fallbacks

This should increase code coverage.
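The threadpool_limits guard that used to be duplicated in both compute() bodies above exists to prevent nested parallelism: each OpenMP thread issues BLAS GEMM calls, and letting BLAS spawn its own threads on top of that would oversubscribe the cores. A minimal standalone sketch of the pattern, using the actual threadpoolctl API:

    import numpy as np
    from threadpoolctl import threadpool_limits

    X = np.random.rand(1000, 50)
    Y = np.random.rand(2000, 50)

    # Pin BLAS to a single thread while an outer level (prange in the
    # Cython code) parallelizes over chunks; each chunk still gets the
    # benefit of a vectorized GEMM.
    with threadpool_limits(limits=1, user_api="blas"):
        middle_term = -2.0 * X @ Y.T  # the "-2 X_c.Y_c^T" term for one chunk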
--- .../metrics/_pairwise_distances_reduction.pyx | 4 +- sklearn/metrics/pairwise.py | 4 +- sklearn/neighbors/_base.py | 16 ++-- sklearn/neighbors/tests/test_neighbors.py | 87 +++++++++++++++++-- 4 files changed, 94 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index c9740e98554da..6aa3eb9fc2a93 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -666,7 +666,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): The suited PairwiseDistancesArgKmin implementation. """ # This factory comes to handle specialisations. - if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y): + if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): use_squared_distances = metric == "fast_sqeuclidean" return FastEuclideanPairwiseDistancesArgKmin( X=X, Y=Y, k=k, @@ -1165,7 +1165,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): The suited PairwiseDistancesRadiusNeighborhood implementation. """ # This factory comes to handle specialisations. - if metric in {"fast_euclidean", "fast_sqeuclidean"} and not issparse(X) and not issparse(Y): + if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): use_squared_distances = metric == "fast_sqeuclidean" return FastEuclideanPairwiseDistancesRadiusNeighborhood( X=X, Y=Y, radius=radius, diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 27af360e4256a..8d95b705d7640 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -664,8 +664,8 @@ def pairwise_distances_argmin_min( # we won't need to fallback to pairwise_distances_chunked anymore. # When ArgKmin is not supported and when the user asked for # a fast alternative, we need to revert to the standard one. - if metric in {"fast_sqeuclidean", "fast_euclidean"}: - metric = "euclidean" + if metric in ("fast_sqeuclidean", "fast_euclidean"): + metric = metric.replace("fast_", "") indices, values = zip( *pairwise_distances_chunked( diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 981c4dc1a0c64..8c8552c608083 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -367,11 +367,15 @@ def _check_algorithm_metric(self): else: alg_check = self.algorithm - if alg_check != "brute" and self.metric in {"fast_sqeuclidean", "sqeuclidean"}: + if alg_check != "brute" and self.metric in ( + "fast_sqeuclidean", + "fast_euclidean", + ): alternative = self.metric.replace("fast_", "") warnings.warn( - f"'{self.metric}' is only available for algorithm='brute', falling" - f"back on metric='{alternative}'.", + f"'{self.metric}' is only available for algorithm='brute' (currently" + f" algorithm='{self.algorithm}'). Falling back on" + f" metric='{alternative}'.", UserWarning, stacklevel=3, ) @@ -518,7 +522,7 @@ def _fit(self, X, y=None): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") - if self.metric in {"fast_sqeuclidean", "fast_euclidean"}: + if self.metric in ("fast_sqeuclidean", "fast_euclidean"): # The fast alternatives are only available for dense datasets. 
self.effective_metric_ = self.effective_metric_.replace("fast_", "") @@ -792,7 +796,7 @@ class from an array representing our data set and ask who's # TODO: support sparse matrices # When ArgKmin is not supported and when the user ask for a # fast alternative, we need to revert to the standard. - if self.effective_metric_ in {"fast_sqeuclidean", "fast_euclidean"}: + if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): # The fast alternatives are only available for dense datasets. self.effective_metric_ = self.effective_metric_.replace("fast_", "") @@ -1131,7 +1135,7 @@ class from an array representing our data set and ask who's elif self._fit_method == "brute": # When RadiusNeighborhood is not supported and when the user ask for a # fast alternative, we need to revert to the standard. - if self.effective_metric_ in {"fast_sqeuclidean", "fast_euclidean"}: + if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): # The fast alternatives are only available for dense datasets. self.effective_metric_ = self.effective_metric_.replace("fast_", "") diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 3b51d597e76a0..ce3e3ceef7efb 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -3,6 +3,7 @@ import pytest import re import numpy as np +import scipy from scipy.sparse import ( bsr_matrix, coo_matrix, @@ -249,10 +250,10 @@ def test_neighs_predictions_fast_euclidean_correctness( n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius ) - euclidean_clf = NeighborsMixinSubclass( + euclidean_est = NeighborsMixinSubclass( parameter, algorithm="brute", metric="euclidean" ).fit(X, y) - euclidean_pred = euclidean_clf.predict(X) + euclidean_pred = euclidean_est.predict(X) fast_euclidean_clf = NeighborsMixinSubclass( parameter, algorithm="brute", metric="fast_euclidean" @@ -298,7 +299,7 @@ def kneighbors(self, *args, **kwargs): parameter = 10 - fast_euclidean_clf = MockedKNeighborsEstimator( + fast_euclidean_est = MockedKNeighborsEstimator( parameter, algorithm="brute", metric="fast_euclidean", @@ -306,10 +307,82 @@ def kneighbors(self, *args, **kwargs): ).fit(X, y) # effective_metric_ must not be changed - assert fast_euclidean_clf.effective_metric_ == "fast_euclidean" - fast_euclidean_clf.predict(X) - assert fast_euclidean_clf.kneighbors_metric_ == expected_kneighbors_metric - assert fast_euclidean_clf.effective_metric_ == "fast_euclidean" + assert fast_euclidean_est.effective_metric_ == "fast_euclidean" + fast_euclidean_est.predict(X) + assert fast_euclidean_est.kneighbors_metric_ == expected_kneighbors_metric + assert fast_euclidean_est.effective_metric_ == "fast_euclidean" + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) +def test_knn_prediction_fast_alternatives_fall_back_on_tree( + KNeighborsEstimator, + algorithm, + specified_metric="fast_euclidean", + fall_back_metric="euclidean", + parameter=10, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast euclidean metric can't be used on "kd_tree", "ball_tree". 
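From the user's side, the fallback exercised by this test behaves as follows (a sketch assuming this branch's 'fast_euclidean' metric: fitting with a tree algorithm warns and silently degrades to the standard metric):

    import warnings
    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    rng = np.random.RandomState(0)
    X, y = rng.rand(100, 5), rng.randint(3, size=100)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        clf = KNeighborsClassifier(
            n_neighbors=10, algorithm="kd_tree", metric="fast_euclidean"
        ).fit(X, y)

    # The warning is raised at fit time and the metric is rewritten.
    assert "only available for algorithm='brute'" in str(caught[0].message)
    assert clf.effective_metric_ == "euclidean"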
+ rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + est = KNeighborsEstimator( + parameter, + algorithm=algorithm, + metric=specified_metric, + ) + with pytest.warns( + UserWarning, + match=( + f"'{specified_metric}' is only available for algorithm='brute' \(currently " # noqa + f"algorithm='{algorithm}'\). Falling " # noqa + f"back on metric='{fall_back_metric}'." + ), + ): + est.fit(X, y) + + assert est.metric == fall_back_metric + assert est.effective_metric_ == fall_back_metric + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +def test_knn_prediction_fast_alternatives_fall_back_on_sparse( + KNeighborsEstimator, + specified_metric="fast_euclidean", + fall_back_metric="euclidean", + parameter=10, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast euclidean metric can't be used on sparse datasets. + rng = np.random.RandomState(0) + X = scipy.sparse.random(n_samples, n_features, density=0.25, random_state=rng) + y = rng.randint(3, size=n_samples) + + est = KNeighborsEstimator( + parameter, + algorithm="brute", + metric=specified_metric, + ) + est.fit(X, y) + assert est.effective_metric_ == fall_back_metric @pytest.mark.parametrize( From 5911f1cee0dfa29aa22d337d39bd5a3b922c6627 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 30 Sep 2021 11:43:35 +0200 Subject: [PATCH 216/290] fixup! Test fast metric alternatives fallbacks --- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/cluster/_birch.py | 2 +- sklearn/cluster/_mean_shift.py | 2 +- sklearn/metrics/pairwise.py | 47 ++++++++++++++++++++---- 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 6467b8ea6a3fd..91322dba632d6 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -524,7 +524,7 @@ def predict(self, X): if self.cluster_centers_.shape[0] > 0: with config_context(assume_finite=True): return pairwise_distances_argmin( - X, self.cluster_centers_, metric="fast_sqeuclidean" + X, self.cluster_centers_, metric="fast_euclidean" ) else: warnings.warn( diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index d9c48edcb8d5d..d2dd3f937a27d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -656,7 +656,7 @@ def predict(self, X): with config_context(assume_finite=True): argmin = pairwise_distances_argmin( - X, self.subcluster_centers_, metric="fast_sqeuclidean" + X, self.subcluster_centers_, metric="fast_euclidean" ) return self.subcluster_labels_[argmin] diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index f1030dd147130..542ed0dbc97aa 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -513,5 +513,5 @@ def predict(self, X): X = self._validate_data(X, reset=False) with config_context(assume_finite=True): return pairwise_distances_argmin( - X, self.cluster_centers_, metric="fast_sqeuclidean" + X, self.cluster_centers_, metric="fast_euclidean" ) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 8d95b705d7640..7a60bbb0b4ef1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -575,6 +575,10 @@ def _argmin_min_reduce(dist, start): return indices, values +def _argmin_reduce(dist, start): + return dist.argmin(axis=1) + + def pairwise_distances_argmin_min( 
X, Y, *, axis=1, metric="euclidean", metric_kwargs=None ): @@ -662,10 +666,10 @@ def pairwise_distances_argmin_min( else: # TODO: once ArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. - # When ArgKmin is not supported and when the user asked for - # a fast alternative, we need to revert to the standard one. - if metric in ("fast_sqeuclidean", "fast_euclidean"): - metric = metric.replace("fast_", "") + # When PairwiseDistancesArgKmin is not supported and when the user + # asked for a fast alternative, we need to revert to the standard one. + if metric == "fast_euclidean": + metric = "euclidean" indices, values = zip( *pairwise_distances_chunked( @@ -745,9 +749,38 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min( - X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs - )[0] + X, Y = check_pairwise_arrays(X, Y) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + indices = PairwiseDistancesArgKmin.get_for( + X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs + ).compute(strategy="auto", return_distance=False) + indices = indices.flatten() + else: + # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + # When PairwiseDistancesArgKmin is not supported and when the user + # asked for a fast alternative, we need to revert to the standard one. + if metric == "fast_euclidean": + metric = "euclidean" + + indices = np.concatenate( + list( + # This returns a np.ndarray generator whose arrays we need + # to flatten into one. + pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) + ) + ) + + return indices def haversine_distances(X, Y=None): From 9b9fb7c04fd3605cde752a1f7b5f9850714986ab Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 30 Sep 2021 13:02:08 +0200 Subject: [PATCH 217/290] Change warning message --- sklearn/neighbors/_base.py | 4 ++-- sklearn/neighbors/tests/test_neighbors.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 8c8552c608083..e21a0ffb36a28 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -373,8 +373,8 @@ def _check_algorithm_metric(self): ): alternative = self.metric.replace("fast_", "") warnings.warn( - f"'{self.metric}' is only available for algorithm='brute' (currently" - f" algorithm='{self.algorithm}'). Falling back on" + f"'{self.metric}' is only available for algorithm='brute' but" + f" algorithm='{self.algorithm}' is used. Falling back on" f" metric='{alternative}'.", UserWarning, stacklevel=3, diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ce3e3ceef7efb..e2718d6b3a86c 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -344,8 +344,8 @@ def test_knn_prediction_fast_alternatives_fall_back_on_tree( with pytest.warns( UserWarning, match=( - f"'{specified_metric}' is only available for algorithm='brute' \(currently " # noqa - f"algorithm='{algorithm}'\). Falling " # noqa + f"'{specified_metric}' is only available for algorithm='brute' but " + f"algorithm='{algorithm}' is used. Falling " f"back on metric='{fall_back_metric}'." 
            ),
        ):

From 5fc91e18979fd0dd73d9b5bbd92b062aa8136803 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 5 Oct 2021 13:49:48 +0200
Subject: [PATCH 218/290] Document and reword interfaces

Co-authored-by: Olivier Grisel
---
 sklearn/metrics/_dist_metrics.pxd             |   6 +-
 sklearn/metrics/_dist_metrics.pyx             |  71 +++---
 .../metrics/_pairwise_distances_reduction.pyx | 222 ++++++++++--------
 3 files changed, 165 insertions(+), 134 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
index e87f442019a9d..271b4d9e07da8 100644
--- a/sklearn/metrics/_dist_metrics.pxd
+++ b/sklearn/metrics/_dist_metrics.pxd
@@ -93,13 +93,13 @@ cdef class DistanceMetric:
 cdef class DatasetsPair:
     cdef DistanceMetric distance_metric

-    cdef ITYPE_t n_X(self) nogil
+    cdef ITYPE_t n_samples_X(self) nogil

-    cdef ITYPE_t n_Y(self) nogil
+    cdef ITYPE_t n_samples_Y(self) nogil

     cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil

-    cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil
+    cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil


 cdef class DenseDenseDatasetsPair(DatasetsPair):
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index f75a3a2a75fcb..8bf49c3275756 100755
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -1198,8 +1198,17 @@ cdef inline double fmax(double a, double b) nogil:
 cdef class DatasetsPair:
     """Abstract class which wraps a pair of datasets (X, Y).

-    This class allows computing distances between two vectors (X_i, Y_j)
-    (rows of X and Y) at a time given the pair of their indices (i, j).
+    This class allows computing distances between a single pair of rows
+    of X and Y at a time given the pair of their indices (i, j). This class is
+    specialized for each metric thanks to the :func:`get_for` factory classmethod.
+
+    The chunking and parallelization used to compute distances and their
+    aggregation for several rows at a time are handled in dedicated
+    subclasses of PairwiseDistancesReduction that in turn rely on
+    subclasses of DatasetsPair for each pair of rows in the data. The goal
+    is to make it possible to decouple the generic parallelization and
+    aggregation logic from metric-specific computation as much as
+    possible.

     X and Y can be stored as np.ndarrays or CSR matrices in subclasses.

@@ -1227,12 +1236,12 @@ cdef class DatasetsPair:

         Parameters
         ----------
-        X : {ndarray, sparse matrix} of shape (n_X, d)
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
             Input data.
             If provided as a ndarray, it must be C-contiguous.
             If provided as a sparse matrix, it must be in CSR format.

-        Y : {ndarray, sparse matrix} of shape (n_Y, d)
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
             Input data.
             If provided as a ndarray, it must be C-contiguous.
             If provided as a sparse matrix, it must be in CSR format.
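The rank_preserving_dist naming adopted here matches the contract of DistanceMetric.rdist: the surrogate only has to preserve the ordering of neighbors, not the actual metric values. A small self-contained check of that property for the Euclidean case, where the squared distance skips the costly sqrt without ever reordering neighbors:

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.rand(10)
    Y = rng.rand(5, 10)

    exact = np.sqrt(((Y - x) ** 2).sum(axis=1))  # true Euclidean distances
    surrogate = ((Y - x) ** 2).sum(axis=1)       # rank-preserving rdist

    # Same ordering of candidates, hence identical argkmin and radius
    # results once the radius itself is converted with _dist_to_rdist.
    assert (np.argsort(exact) == np.argsort(surrogate)).all()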
@@ -1283,15 +1292,15 @@ cdef class DatasetsPair: def __init__(self, DistanceMetric distance_metric): self.distance_metric = distance_metric - cdef ITYPE_t n_X(self) nogil: + cdef ITYPE_t n_samples_X(self) nogil: """Number of samples in X.""" return -999 - cdef ITYPE_t n_Y(self) nogil: + cdef ITYPE_t n_samples_Y(self) nogil: """Number of samples in Y.""" return -999 - cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.dist(i, j) cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: @@ -1303,10 +1312,10 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): Parameters ---------- - X: ndarray of shape (n_X, d) + X: ndarray of shape (n_samples_X, n_features) Rows represent vectors. Must be C-contiguous. - Y: ndarray of shape (n_Y, d) + Y: ndarray of shape (n_samples_Y, n_features) Rows represent vectors. Must be C-contiguous. distance_metric: DistanceMetric @@ -1322,15 +1331,15 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): self.d = X.shape[1] @final - cdef ITYPE_t n_X(self) nogil: + cdef ITYPE_t n_samples_X(self) nogil: return self.X.shape[0] @final - cdef ITYPE_t n_Y(self) nogil: + cdef ITYPE_t n_samples_Y(self) nogil: return self.Y.shape[0] @final - cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d) @@ -1347,10 +1356,10 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): Parameters ---------- - X: sparse matrix of shape (n_X, d) + X: sparse matrix of shape (n_samples_X, n_features) Rows represent vectors. Must be in CSR format. - Y: sparse matrix of shape (n_X, d) + Y: sparse matrix of shape (n_samples_Y, n_features) Rows represent vectors. Must be in CSR format. distance_metric: DistanceMetric @@ -1374,15 +1383,15 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) @final - cdef ITYPE_t n_X(self) nogil: + cdef ITYPE_t n_samples_X(self) nogil: return self.X_indptr.shape[0] - 1 @final - cdef ITYPE_t n_Y(self) nogil: + cdef ITYPE_t n_samples_Y(self) nogil: return self.Y_indptr.shape[0] -1 @final - cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1417,10 +1426,10 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): Parameters ---------- - X: sparse matrix of shape (n_X, d) + X: sparse matrix of shape (n_samples_X, n_features) Rows represent vectors. Must be in CSR format. - Y: ndarray of shape (n_Y, d) + Y: ndarray of shape (n_samples_Y, n_features) Rows represent vectors. Must be C-contiguous. 
distance_metric: DistanceMetric @@ -1432,7 +1441,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): const ITYPE_t[:] X_indices, const ITYPE_t[:] X_indptr, - const DTYPE_t[:, ::1] Y # shape: (n_Y, d) + const DTYPE_t[:, ::1] Y const ITYPE_t[:] Y_indices def __init__(self, X, Y, DistanceMetric distance_metric): @@ -1445,15 +1454,15 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) @final - cdef ITYPE_t n_X(self) nogil: + cdef ITYPE_t n_samples_X(self) nogil: return self.X_indptr.shape[0] - 1 @final - cdef ITYPE_t n_Y(self) nogil: + cdef ITYPE_t n_samples_Y(self) nogil: return self.Y.shape[0] @final - cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1489,10 +1498,10 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): Parameters ---------- - X: ndarray of shape (n_X, d) + X: ndarray of shape (n_samples_X, n_features) Rows represent vectors. Must be C-contiguous. - Y: sparse matrix of shape (n_Y, d) + Y: sparse matrix of shape (n_samples_Y, n_features) Rows represent vectors. Must be in CSR format. distance_metric: DistanceMetric @@ -1510,19 +1519,19 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) @final - cdef ITYPE_t n_X(self) nogil: + cdef ITYPE_t n_samples_X(self) nogil: # Swapping interface - return self.datasets_pair.n_Y() + return self.datasets_pair.n_samples_Y() @final - cdef ITYPE_t n_Y(self) nogil: + cdef ITYPE_t n_samples_Y(self) nogil: # Swapping interface - return self.datasets_pair.n_X() + return self.datasets_pair.n_samples_X() @final - cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: # Swapping arguments on the same interface - return self.datasets_pair.ranking_preserving_dist(j, i) + return self.datasets_pair.rank_preserving_dist(j, i) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 6aa3eb9fc2a93..54325353b04f0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -25,7 +25,6 @@ from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX from libcpp.vector cimport vector from cython cimport final -from cpython.object cimport PyObject from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange from cpython.ref cimport Py_INCREF @@ -182,12 +181,30 @@ cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( ##################### cdef class PairwiseDistancesReduction: - """Abstract class which compute pairwise distances between - a set of vectors (rows) X and another set of vectors (rows) of Y - and apply a reduction on top. + """Abstract base class for pairwise distance computation & reduction - The computations of the distances and the reduction is parallelized - on chunks of vectors of X and Y. + Subclasses of this class compute pairwise distances between a set of + vectors (rows) X and another set of vectors (rows) of Y and apply a + reduction on top. The reduction takes a matrix of pairwise distances + between rows of X and Y as input and outputs an aggregate data-structure + for each row of X. 
The aggregate values are typically smaller than the number
+    of rows in Y, hence the term reduction.
+
+    For computational reasons, it is interesting to perform the reduction on
+    the fly on chunks of rows of X and Y so as to keep intermediate
+    data-structures in CPU cache and avoid unnecessary round trips of large
+    distance arrays with the RAM that would otherwise severely degrade the
+    speed by making the overall processing memory-bound.
+
+    The base class provides the generic chunked parallelization template using
+    OpenMP loops (Cython prange), either on rows of X or rows of Y depending on
+    their respective sizes.
+
+    Each subclass is specialized for a given reduction.
+
+    The actual distance computation for a given pair of rows of X and Y is
+    delegated to metric-specific subclasses of the DatasetsPair companion base
+    class.

     Parameters
     ----------
@@ -214,8 +231,8 @@ cdef class PairwiseDistancesReduction:
         ITYPE_t effective_omp_n_thread
         ITYPE_t n_samples_chunk, chunk_size

-        ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder
-        ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder
+        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder
+        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder

     @classmethod
     def valid_metrics(cls) -> List[str]:
@@ -238,10 +255,10 @@ cdef class PairwiseDistancesReduction:

         Parameters
         ----------
-        X : {ndarray, sparse matrix} of shape (n_X, d)
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
            Input data.

-        Y : {ndarray, sparse matrix} of shape (n_Y, d)
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
            Input data.

         metric : str, default='euclidean'
@@ -287,23 +304,23 @@ cdef class PairwiseDistancesReduction:

         self._datasets_pair = datasets_pair

-        self.n_Y = datasets_pair.n_Y()
-        self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk)
-        Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk
-        self.Y_n_samples_remainder = self.n_Y % self.Y_n_samples_chunk
+        self.n_samples_Y = datasets_pair.n_samples_Y()
+        self.Y_n_samples_chunk = min(self.n_samples_Y, self.n_samples_chunk)
+        Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk
+        self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk

-        self.n_X = datasets_pair.n_X()
-        self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk)
-        X_n_full_chunks = self.n_X // self.X_n_samples_chunk
-        self.X_n_samples_remainder = self.n_X % self.X_n_samples_chunk
+        self.n_samples_X = datasets_pair.n_samples_X()
+        self.X_n_samples_chunk = min(self.n_samples_X, self.n_samples_chunk)
+        X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk
+        self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk

         # Counting remainder chunk in total number of chunks
         self.Y_n_chunks = Y_n_full_chunks + (
-            self.n_Y != (Y_n_full_chunks * self.Y_n_samples_chunk)
+            self.n_samples_Y != (Y_n_full_chunks * self.Y_n_samples_chunk)
         )

         self.X_n_chunks = X_n_full_chunks + (
-            self.n_X != (X_n_full_chunks * self.X_n_samples_chunk)
+            self.n_samples_X != (X_n_full_chunks * self.X_n_samples_chunk)
         )

     def compute(
@@ -319,7 +336,7 @@ cdef class PairwiseDistancesReduction:
         The chunking strategy defining which dataset parallelization are made on.

         Strategies differs on the dispatching they use for chunks on threads:
          - 'parallel_on_X' dispatches chunks of X uniformly on threads.
          Each thread then iterates on all the chunks of Y.
This strategy is
          embarrassingly parallel and comes with no datastructures
          synchronisation but is less used in practice (because X is smaller
          than Y generally).
@@ -329,7 +346,7 @@ cdef class PairwiseDistancesReduction:
           synchronisation. However it is more useful in practice (because Y is
           larger than X generally).
         - 'auto' relies on a simple heuristic to choose between
           'parallel_on_X' and 'parallel_on_Y'.
         - None (default) looks-up in scikit-learn configuration for
           `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
@@ -349,7 +366,7 @@ cdef class PairwiseDistancesReduction:
         if strategy == 'auto':
             # This is a simple heuristic whose constant for the
             # comparison has been chosen based on experiments.
-            if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X:
+            if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_samples_X:
                 strategy = 'parallel_on_X'
             else:
                 strategy = 'parallel_on_Y'
@@ -389,7 +406,7 @@ cdef class PairwiseDistancesReduction:
             thread_num = _openmp_thread_num()

             # Allocating thread datastructures
-            self._on_X_parallel_init(thread_num)
+            self._parallel_on_X_parallel_init(thread_num)

             for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
                 X_start = X_chunk_idx * self.X_n_samples_chunk
@@ -400,7 +417,7 @@ cdef class PairwiseDistancesReduction:
                     X_end = X_start + self.X_n_samples_chunk

                 # Reinitializing thread datastructures for the new X chunk
-                self._on_X_prange_iter_init(thread_num, X_start, X_end)
+                self._parallel_on_X_threadwise_init_chunk(thread_num, X_start, X_end)

                 for Y_chunk_idx in range(self.Y_n_chunks):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -417,12 +434,12 @@ cdef class PairwiseDistancesReduction:
                     )

                 # Adjusting thread datastructures on the full pass on Y
-                self._on_X_prange_iter_finalize(thread_num, X_start, X_end)
+                self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end)

             # end: for X_chunk_idx

             # Deallocating thread datastructures
-            self._on_X_parallel_finalize(thread_num)
+            self._parallel_on_X_threadwise_finalize(thread_num)

         # end: with nogil, parallel
         return
@@ -448,7 +465,7 @@ cdef class PairwiseDistancesReduction:
             ITYPE_t thread_num

         # Allocating datastructures
-        self._on_Y_init(num_threads)
+        self._parallel_on_Y_init(num_threads)

         for X_chunk_idx in range(self.X_n_chunks):
             X_start = X_chunk_idx * self.X_n_samples_chunk
@@ -461,7 +478,7 @@ cdef class PairwiseDistancesReduction:
                 thread_num = _openmp_thread_num()

                 # Initializing datastructures used in this thread
-                self._on_Y_parallel_init(thread_num)
+                self._parallel_on_Y_threadwise_init(thread_num)

                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -477,14 +494,18 @@ cdef class PairwiseDistancesReduction:
                         thread_num,
                     )
                 # end: prange
+
+            # Note: no symmetric _parallel_on_Y_threadwise_finalize is needed here.
+            # It can be introduced if needed.
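The renamed _parallel_on_Y_synchronize hook (together with _merge_vectors further down) exists because, under the parallel_on_Y strategy, several threads accumulate neighbors for the same rows of X and their thread-local buffers must be folded back into the main ones. A purely illustrative Python analogue of that merge (the real code moves C++ vectors without copying):

    def merge_thread_results(main, per_thread):
        # main: one list per row of X; per_thread: one such structure
        # per thread, each holding the neighbors that thread found.
        for local in per_thread:
            for i, neighbors in enumerate(local):
                main[i].extend(neighbors)
        return main

    main = [[] for _ in range(3)]
    thread_0 = [[0], [], [2]]
    thread_1 = [[5], [4], []]
    assert merge_thread_results(main, [thread_0, thread_1]) == [[0, 5], [4], [2]]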
+ # end: with nogil, parallel # Synchronizing the thread datastructures with the main ones - self._on_Y_after_parallel(num_threads, X_start, X_end) + self._parallel_on_Y_synchronize(num_threads, X_start, X_end) # end: for X_chunk_idx # Deallocating temporary datastructures and adjusting main datastructures - self._on_Y_finalize(num_threads) + self._parallel_on_Y_finalize(num_threads) return # Placeholder methods which have to be implemented @@ -514,17 +535,17 @@ cdef class PairwiseDistancesReduction: # Placeholder methods which can be implemented cdef void compute_exact_distances(self) nogil: - """Convert ranking-preserving distances to exact distances or recompute them.""" + """Convert rank-preserving distances to exact distances or recompute them.""" return - cdef void _on_X_parallel_init( + cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: """Allocate datastructures used in a thread given its number.""" return - cdef void _on_X_prange_iter_init( + cdef void _parallel_on_X_threadwise_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -533,7 +554,7 @@ cdef class PairwiseDistancesReduction: """Initialise datastructures used in a thread given its number.""" return - cdef void _on_X_prange_iter_finalize( + cdef void _parallel_on_X_prange_iter_finalize( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -542,41 +563,41 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after a reduction on chunks.""" return - cdef void _on_X_parallel_finalize( + cdef void _parallel_on_X_threadwise_finalize( self, ITYPE_t thread_num ) nogil: """Interact with datastructures after executing all the reductions.""" return - cdef void _on_Y_init( + cdef void _parallel_on_Y_init( self, ITYPE_t num_threads, ) nogil: - """Allocate datastructures used in threads.""" + """Allocate datastructures used in all threads.""" return - cdef void _on_Y_parallel_init( + cdef void _parallel_on_Y_threadwise_init( self, ITYPE_t thread_num, ) nogil: """Initialise datastructures used in a thread given its number.""" return - cdef void _on_Y_after_parallel( + cdef void _parallel_on_Y_synchronize( self, ITYPE_t num_threads, ITYPE_t X_start, ITYPE_t X_end, ) nogil: - """Interact with datastructures after a threads parallel region.""" + """Update thread datastructures before leaving a parallel region.""" return - cdef void _on_Y_finalize( + cdef void _parallel_on_Y_finalize( self, ITYPE_t num_threads, ) nogil: - """Interact with datastructures after executing all the reductions.""" + """Update datastructures after executing all the reductions.""" return cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): @@ -629,10 +650,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Parameters ---------- - X : array-like of shape (n_X, d) + X : array-like of shape (n_samples_X, n_features) Input data. - Y : array-like of shape (n_Y, d) + Y : array-like of shape (n_samples_Y, n_features) Input data. k : int @@ -705,8 +726,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) # Main heaps used by PairwiseDistancesArgKmin.compute to return results. 
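The per-row heaps mentioned in this comment follow the classic bounded max-heap recipe for keeping the k smallest distances seen so far. A minimal Python analogue using heapq (a min-heap, hence the negated distances):

    import heapq

    def argkmin(dists, k):
        heap = []  # root holds the worst of the current k best
        for idx, d in enumerate(dists):
            if len(heap) < k:
                heapq.heappush(heap, (-d, idx))
            elif -heap[0][0] > d:
                # A closer candidate evicts the current worst one.
                heapq.heapreplace(heap, (-d, idx))
        # Ascending by distance, like the simultaneous sort performed
        # at the end of the reduction.
        return [idx for _, idx in sorted(heap, reverse=True)]

    assert argkmin([0.5, 0.1, 0.9, 0.3], k=2) == [1, 3]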
- self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) - self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) def __dealloc__(self): if self.heaps_indices_chunks is not NULL: @@ -725,26 +746,26 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) nogil: cdef: ITYPE_t i, j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start + ITYPE_t n_samples_X = X_end - X_start + ITYPE_t n_samples_Y = Y_end - Y_start ITYPE_t k = self.k DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] # Pushing the distance and their associated indices on heaps # which keep tracks of the argkmin. - for i in range(n_X): - for j in range(n_Y): + for i in range(n_samples_X): + for j in range(n_samples_Y): heap_push( heaps_r_distances + i * self.k, heaps_indices + i * self.k, k, - self._datasets_pair.ranking_preserving_dist(X_start + i, Y_start + j), + self._datasets_pair.rank_preserving_dist(X_start + i, Y_start + j), Y_start + j, ) @final - cdef void _on_X_prange_iter_init( + cdef void _parallel_on_X_threadwise_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -756,7 +777,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] @final - cdef void _on_X_prange_iter_finalize( + cdef void _parallel_on_X_prange_iter_finalize( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -773,7 +794,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.k ) - cdef void _on_Y_init( + cdef void _parallel_on_Y_init( self, ITYPE_t num_threads, ) nogil: @@ -795,7 +816,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) @final - cdef void _on_Y_parallel_init( + cdef void _parallel_on_Y_threadwise_init( self, ITYPE_t thread_num, ) nogil: @@ -805,7 +826,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num][idx] = -1 @final - cdef void _on_Y_after_parallel( + cdef void _parallel_on_Y_synchronize( self, ITYPE_t num_threads, ITYPE_t X_start, @@ -827,7 +848,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.heaps_indices_chunks[thread_num][idx * self.k + jdx], ) - cdef void _on_Y_finalize( + cdef void _parallel_on_Y_finalize( self, ITYPE_t num_threads, ) nogil: @@ -842,7 +863,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # Sort the main heaps into arrays in parallel # in ascending order w.r.t the distances - for idx in prange(self.n_X, schedule='static'): + for idx in prange(self.n_samples_X, schedule='static'): simultaneous_sort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], @@ -855,7 +876,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t i, j ITYPE_t[:, ::1] Y_indices = self.argkmin_indices DTYPE_t[:, ::1] distances = self.argkmin_distances - for i in prange(self.n_X, schedule='static', nogil=True, + for i in prange(self.n_samples_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): for j in range(self.k): distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist( @@ -865,7 +886,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): def _finalize_results(self, bint return_distance=False): if return_distance: - # We 
eventually need to recompute distances because we relied on proxies. + # We eventually need to recompute distances because we relied on + # rank-preserving distances. self.compute_exact_distances() return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) @@ -937,11 +959,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): PairwiseDistancesArgKmin.compute_exact_distances(self) @final - cdef void _on_X_parallel_init( + cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - PairwiseDistancesArgKmin._on_X_parallel_init(self, thread_num) + PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( @@ -949,20 +971,20 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) @final - cdef void _on_X_parallel_finalize( + cdef void _parallel_on_X_threadwise_finalize( self, ITYPE_t thread_num ) nogil: - PairwiseDistancesArgKmin._on_X_parallel_finalize(self, thread_num) + PairwiseDistancesArgKmin._parallel_on_X_threadwise_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _on_Y_init( + cdef void _parallel_on_Y_init( self, ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesArgKmin._on_Y_init(self, num_threads) + PairwiseDistancesArgKmin._parallel_on_Y_init(self, num_threads) for thread_num in range(num_threads): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term @@ -971,12 +993,12 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) @final - cdef void _on_Y_finalize( + cdef void _parallel_on_Y_finalize( self, ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesArgKmin._on_Y_finalize(self, num_threads) + PairwiseDistancesArgKmin._parallel_on_Y_finalize(self, num_threads) for thread_num in range(num_threads): free(self.dist_middle_terms_chunks[thread_num]) @@ -1039,8 +1061,8 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): heaps_r_distances + i * k, heaps_indices + i * k, k, - # Using the squared euclidean distance as the ranking-preserving distance: - # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # Using the squared euclidean distance as the rank-preserving distance: + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² ( self.X_sq_norms[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + @@ -1079,10 +1101,10 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): cdef: DTYPE_t radius - # DistanceMetric compute ranking-preserving surrogate distance via rdist + # DistanceMetric compute rank-preserving surrogate distance via rdist # which are proxies necessitating less computations. # We get the equivalent for the radius to be able to compare it against - # vectors' ranking-preserving surrogate distances. + # vectors' rank-preserving surrogate distances. DTYPE_t r_radius # Neighbors indices and distances are returned as np.ndarray of np.ndarray. @@ -1124,10 +1146,10 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): Parameters ---------- - X : array-like of shape (n_X, d) + X : array-like of shape (n_samples_X, n_features) Input data. - Y : array-like of shape (n_Y, d) + Y : array-like of shape (n_samples_Y, n_features) Input data. 
radius : float @@ -1210,8 +1232,8 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # Temporary datastructures which will be coerced to numpy arrays on before # PairwiseDistancesRadiusNeighborhood.compute "return" and will be then freed. - self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) - self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) + self.neigh_indices = new vector[vector[ITYPE_t]](self.n_samples_X) + self.neigh_distances = new vector[vector[DTYPE_t]](self.n_samples_X) def __dealloc__(self): if self.neigh_distances_chunks is not NULL: @@ -1240,7 +1262,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): for i in range(X_start, X_end): for j in range(Y_start, Y_end): - r_dist_i_j = self._datasets_pair.ranking_preserving_dist(i, j) + r_dist_i_j = self._datasets_pair.rank_preserving_dist(i, j) if r_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) @@ -1256,7 +1278,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): return coerce_vectors_to_nd_arrays(self.neigh_indices) @final - cdef void _on_X_prange_iter_init( + cdef void _parallel_on_X_threadwise_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -1269,7 +1291,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): self.neigh_indices_chunks[thread_num] = self.neigh_indices @final - cdef void _on_X_prange_iter_finalize( + cdef void _parallel_on_X_prange_iter_finalize( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -1287,7 +1309,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): deref(self.neigh_indices)[idx].size() ) - cdef void _on_Y_init( + cdef void _parallel_on_Y_init( self, ITYPE_t num_threads, ) nogil: @@ -1295,11 +1317,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): ITYPE_t thread_num # As chunks of X are shared across threads, so must datastructures # to avoid race conditions. - # Each thread has its own vectors of n_X vectors which are then merged - # back in the main n_X vectors. + # Each thread has its own vectors of n_samples_X vectors which are then merged + # back in the main n_samples_X vectors. for thread_num in range(num_threads): - self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X) - self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X) + self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_samples_X) + self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_samples_X) @final cdef void _merge_vectors( @@ -1335,7 +1357,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - cdef void _on_Y_finalize( + cdef void _parallel_on_Y_finalize( self, ITYPE_t num_threads, ) nogil: @@ -1348,7 +1370,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # using dynamic scheduling because we generally do not have # the same number of neighbors for each query vectors. 
# TODO: compare 'dynamic' vs 'static' vs 'guided' - for idx in prange(self.n_X, schedule='dynamic'): + for idx in prange(self.n_samples_X, schedule='dynamic'): self._merge_vectors(idx, num_threads) # The content of the vector have been std::moved, @@ -1359,7 +1381,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # Sort in parallel in ascending order w.r.t the distances if needed if self.sort_results: - for idx in prange(self.n_X, schedule='static'): + for idx in prange(self.n_samples_X, schedule='static'): simultaneous_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), @@ -1369,11 +1391,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): return cdef void compute_exact_distances(self) nogil: - """Convert ranking-preserving distances to pairwise distances in parallel.""" + """Convert rank-preserving distances to pairwise distances in parallel.""" cdef: ITYPE_t i, j - for i in prange(self.n_X, nogil=True, schedule='static', + for i in prange(self.n_samples_X, nogil=True, schedule='static', num_threads=self.effective_omp_n_thread): for j in range(deref(self.neigh_indices)[i].size()): deref(self.neigh_distances)[i][j] = ( @@ -1469,11 +1491,11 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) @final - cdef void _on_X_parallel_init( + cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - PairwiseDistancesRadiusNeighborhood._on_X_parallel_init(self, thread_num) + PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num) # Temporary buffer for the -2 * X_c.dot(Y_c.T) term self.dist_middle_terms_chunks[thread_num] = malloc( @@ -1481,20 +1503,20 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad ) @final - cdef void _on_X_parallel_finalize( + cdef void _parallel_on_X_threadwise_finalize( self, ITYPE_t thread_num ) nogil: - PairwiseDistancesRadiusNeighborhood._on_X_parallel_finalize(self, thread_num) + PairwiseDistancesRadiusNeighborhood._parallel_on_X_threadwise_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _on_Y_init( + cdef void _parallel_on_Y_init( self, ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood._on_Y_init(self, num_threads) + PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self, num_threads) for thread_num in range(num_threads): # Temporary buffer for the -2 * X_c.dot(Y_c.T) term @@ -1503,12 +1525,12 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad ) @final - cdef void _on_Y_finalize( + cdef void _parallel_on_Y_finalize( self, ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood._on_Y_finalize(self, num_threads) + PairwiseDistancesRadiusNeighborhood._parallel_on_Y_finalize(self, num_threads) for thread_num in range(num_threads): free(self.dist_middle_terms_chunks[thread_num]) @@ -1564,8 +1586,8 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad # Pushing the distance and their associated indices in vectors. 
for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): - # Using the squared euclidean distance as the ranking-preserving distance: - # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # Using the squared euclidean distance as the rank-preserving distance: + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² squared_dist_i_j = ( self.X_sq_norms[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] From f75f08e6565a0be1a4aafbc3423f9174b057331f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 5 Oct 2021 13:58:21 +0200 Subject: [PATCH 219/290] Better adapt the strategy for uniform weighting --- sklearn/neighbors/_classification.py | 33 ++++++--------- sklearn/neighbors/_regression.py | 23 ++++------- sklearn/neighbors/tests/test_neighbors.py | 50 ----------------------- 3 files changed, 20 insertions(+), 86 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index c8a7aa11675ff..4343de7241d09 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -217,12 +217,10 @@ def predict(self, X): X = self._validate_data(X, accept_sparse="csr", reset=False) - if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": - # In that case, it is safe to use the fast alternative which - # does not use sqrt on distances as this can be costly. - self.effective_metric_ = "fast_sqeuclidean" - neigh_dist, neigh_ind = self.kneighbors(X) - self.effective_metric_ = "fast_euclidean" + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None else: neigh_dist, neigh_ind = self.kneighbors(X) @@ -273,12 +271,10 @@ def predict_proba(self, X): X = self._validate_data(X, accept_sparse="csr", reset=False) - if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": - # In that case, it is safe to use the fast alternative which - # does not use sqrt on distances as this can be costly. - self.effective_metric_ = "fast_sqeuclidean" - neigh_dist, neigh_ind = self.kneighbors(X) - self.effective_metric_ = "fast_euclidean" + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None else: neigh_dist, neigh_ind = self.kneighbors(X) @@ -643,15 +639,10 @@ def predict_proba(self, X): X = self._validate_data(X, accept_sparse="csr", reset=False) n_queries = _num_samples(X) - if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": - # In that case, it is safe to use the fast alternative which - # does not use sqrt on distances as this can be costly. - original_radius = self.radius - self.effective_metric_ = "fast_sqeuclidean" - self.radius = original_radius * original_radius - neigh_dist, neigh_ind = self.radius_neighbors(X) - self.radius = original_radius - self.effective_metric_ = "fast_euclidean" + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. 
+ neigh_ind = self.radius_neighbors(X, return_distance=False) + neigh_dist = None else: neigh_dist, neigh_ind = self.radius_neighbors(X) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 5ea2db7ce4d21..3db60eadffc0d 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -233,12 +233,10 @@ def predict(self, X): X = self._validate_data(X, accept_sparse="csr", reset=False) - if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": - # In that case, it is safe to use the fast alternative which - # does not use sqrt on distances as this can be costly. - self.effective_metric_ = "fast_sqeuclidean" - neigh_dist, neigh_ind = self.kneighbors(X) - self.effective_metric_ = "fast_euclidean" + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None else: neigh_dist, neigh_ind = self.kneighbors(X) @@ -454,15 +452,10 @@ def predict(self, X): X = self._validate_data(X, accept_sparse="csr", reset=False) - if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": - # In that case, it is safe to use the fast alternative which - # does not use sqrt on distances as this can be costly. - original_radius = self.radius - self.effective_metric_ = "fast_sqeuclidean" - self.radius = original_radius * original_radius - neigh_dist, neigh_ind = self.radius_neighbors(X) - self.radius = original_radius - self.effective_metric_ = "fast_euclidean" + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.radius_neighbors(X, return_distance=False) + neigh_dist = None else: neigh_dist, neigh_ind = self.radius_neighbors(X) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index e2718d6b3a86c..e5aa9afba3b53 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -263,56 +263,6 @@ def test_neighs_predictions_fast_euclidean_correctness( assert_allclose(euclidean_pred, fast_euclidean_pred) -@pytest.mark.parametrize( - "KNeighborsEstimator", - [ - neighbors.KNeighborsClassifier, - neighbors.KNeighborsRegressor, - ], -) -@pytest.mark.parametrize( - "weights, expected_kneighbors_metric", - [ - ("uniform", "fast_sqeuclidean"), - ("distance", "fast_euclidean"), - (lambda x: x, "fast_euclidean"), - ], -) -def test_knn_prediction_fast_euclidean_overriding( - KNeighborsEstimator, - weights, - expected_kneighbors_metric, - n_samples=1000, - n_features=100, - dtype=np.float64, -): - # The fast squared euclidean metric must be used over the fast euclidean - # metric solely when using the uniform sample-weighting. 
- class MockedKNeighborsEstimator(KNeighborsEstimator): - def kneighbors(self, *args, **kwargs): - self.kneighbors_metric_ = self.effective_metric_ - return super().kneighbors(*args, **kwargs) - - rng = np.random.RandomState(0) - X = rng.rand(n_samples, n_features).astype(dtype) - y = rng.randint(3, size=n_samples) - - parameter = 10 - - fast_euclidean_est = MockedKNeighborsEstimator( - parameter, - algorithm="brute", - metric="fast_euclidean", - weights=weights, - ).fit(X, y) - - # effective_metric_ must not be changed - assert fast_euclidean_est.effective_metric_ == "fast_euclidean" - fast_euclidean_est.predict(X) - assert fast_euclidean_est.kneighbors_metric_ == expected_kneighbors_metric - assert fast_euclidean_est.effective_metric_ == "fast_euclidean" - - @pytest.mark.parametrize( "KNeighborsEstimator", [ From b484320f2d66170a571c87928e96570ce5af09d9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 5 Oct 2021 14:10:56 +0200 Subject: [PATCH 220/290] fixup! Better adapt the strategy for uniform weighting --- sklearn/neighbors/_classification.py | 14 +------------- sklearn/neighbors/_regression.py | 9 --------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 4343de7241d09..b12e79d8ca005 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -11,7 +11,7 @@ import numpy as np from scipy import stats from ..utils.extmath import weighted_mode -from ..utils.validation import _is_arraylike, _num_samples, check_is_fitted +from ..utils.validation import _is_arraylike, _num_samples import warnings from ._base import _check_weights, _get_weights @@ -211,10 +211,6 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ - # Duplicated because of the check on self.effective_metric_'s value - # TODO: remove check_is_fitted duplication - check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False) if self.weights == "uniform": @@ -265,10 +261,6 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - # Duplicated because of the check on self.effective_metric_'s value - # TODO: remove check_is_fitted duplication - check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False) if self.weights == "uniform": @@ -632,10 +624,6 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - # Duplicated because of the check on self.effective_metric_'s value - # TODO: remove check_is_fitted duplication - check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False) n_queries = _num_samples(X) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 3db60eadffc0d..926bab4e1fc9a 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,7 +18,6 @@ from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin from ..utils.deprecation import deprecated -from ..utils.validation import check_is_fitted class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): @@ -227,10 +226,6 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. 
""" - # Duplicated because of the check on self.effective_metric_'s value - # TODO: remove check_is_fitted duplication - check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False) if self.weights == "uniform": @@ -446,10 +441,6 @@ def predict(self, X): dtype=double Target values. """ - # Duplicated because of the check on self.effective_metric_'s value - # TODO: remove check_is_fitted duplication - check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False) if self.weights == "uniform": From 79786d01e081dd4fa35db6fb33916c1bed636585 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 5 Oct 2021 17:00:26 +0200 Subject: [PATCH 221/290] Make pytest happy with proper checks on array-likes --- sklearn/metrics/_pairwise_distances_reduction.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 54325353b04f0..484db1b58bcd5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -16,6 +16,7 @@ import numpy as np cimport numpy as np +import scipy.sparse from .. import get_config @@ -272,8 +273,8 @@ cdef class PairwiseDistancesReduction: """ # Coercing to np.array to get the dtype # TODO: what is the best way to get lists' dtype? - X = np.asarray(X) if isinstance(X, (tuple, list)) else X - Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y + X = np.asarray(X) if not isinstance(X, (np.ndarray, scipy.sparse.spmatrix)) else X + Y = np.asarray(Y) if not isinstance(Y, (np.ndarray, scipy.sparse.spmatrix)) else Y # TODO: support sparse arrays and 32 bits return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and From cbf40ea3cae5b4caf5e93b3e82e663d5d2fb6e6c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 7 Oct 2021 15:29:57 +0200 Subject: [PATCH 222/290] Optimize squared norms' computations Don't recompute norms if X is Y. Allow passing Y vectors' squared norms via metric keyword arguments. Rename the associated attributes. Revert changes as to use this optimisation for BIRCH again. Co-authored-by: Olivier Grisel --- sklearn/cluster/_birch.py | 13 +++++- .../metrics/_pairwise_distances_reduction.pyx | 46 +++++++++++++------ 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index d2dd3f937a27d..a4d2d89971a7f 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -12,6 +12,7 @@ from ..metrics import pairwise_distances_argmin from ..metrics.pairwise import euclidean_distances from ..base import TransformerMixin, ClusterMixin, BaseEstimator +from ..utils.extmath import row_norms from ..utils import deprecated from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -654,9 +655,15 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) + # This allow not recomputing Y vectors' squared euclidean norms. 
+ fast_euclidean_kwargs = {"Y_norm_squared": self._subcluster_norms} + with config_context(assume_finite=True): argmin = pairwise_distances_argmin( - X, self.subcluster_centers_, metric="fast_euclidean" + X, + self.subcluster_centers_, + metric="fast_euclidean", + metric_kwargs=fast_euclidean_kwargs, ) return self.subcluster_labels_[argmin] @@ -702,6 +709,10 @@ def _global_clustering(self, X=None): "n_clusters should be an instance of ClusterMixin or an int" ) + # We compute it once here, so that we won't need to compute it again at + # each call of `Birch.predict`. + self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) + if clusterer is None or not_enough_centroids: self.subcluster_labels_ = np.arange(len(centroids)) if not_enough_centroids: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 484db1b58bcd5..d3cacb3a4e29b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -693,7 +693,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): return FastEuclideanPairwiseDistancesArgKmin( X=X, Y=Y, k=k, use_squared_distances=use_squared_distances, - chunk_size=chunk_size + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, ) return PairwiseDistancesArgKmin( @@ -911,8 +912,8 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): cdef: const DTYPE_t[:, ::1] X const DTYPE_t[:, ::1] Y - const DTYPE_t[::1] X_sq_norms - const DTYPE_t[::1] Y_sq_norms + const DTYPE_t[::1] X_norm_squared + const DTYPE_t[::1] Y_norm_squared # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks @@ -930,6 +931,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ITYPE_t k, bint use_squared_distances=False, chunk_size=None, + metric_kwargs=None, ): super().__init__( # The datasets pair here is used for exact distances computations @@ -941,8 +943,15 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair self.X, self.Y = datasets_pair.X, datasets_pair.Y - self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) - self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None) + else: + self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + # Do not recompute norms if datasets are identical. 
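# Note (illustrative): `X is Y` below is an object-identity test, not an
# equality test, so the shared-norms shortcut only triggers when the very
# same array object is passed twice.
import numpy as np

X = np.ones((3, 2))
assert X is X              # same object: norms are computed once and reused
assert X.copy() is not X   # an equal copy still pays a second computation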
+ self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + ) self.use_squared_distances = use_squared_distances # Temporary datastructures used in threads @@ -1065,9 +1074,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): # Using the squared euclidean distance as the rank-preserving distance: # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² ( - self.X_sq_norms[i + X_start] + + self.X_norm_squared[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + - self.Y_sq_norms[j + Y_start] + self.Y_norm_squared[j + Y_start] ), j + Y_start, ) @@ -1195,6 +1204,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): use_squared_distances=use_squared_distances, chunk_size=chunk_size, sort_results=sort_results, + metric_kwargs=metric_kwargs, ) return PairwiseDistancesRadiusNeighborhood( @@ -1436,8 +1446,8 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad cdef: const DTYPE_t[:, ::1] X const DTYPE_t[:, ::1] Y - const DTYPE_t[::1] X_sq_norms - const DTYPE_t[::1] Y_sq_norms + const DTYPE_t[::1] X_norm_squared + const DTYPE_t[::1] Y_norm_squared # Buffers for GEMM DTYPE_t ** dist_middle_terms_chunks @@ -1456,6 +1466,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad bint use_squared_distances=False, chunk_size=None, sort_results=False, + metric_kwargs=None, ): super().__init__( # The datasets pair here is used for exact distances computations @@ -1468,8 +1479,17 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair self.X, self.Y = datasets_pair.X, datasets_pair.Y - self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) - self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None) + else: + self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + + # Do not recompute norms if datasets are identical. 
+ self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + ) self.use_squared_distances = use_squared_distances if use_squared_distances: @@ -1590,9 +1610,9 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad # Using the squared euclidean distance as the rank-preserving distance: # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² squared_dist_i_j = ( - self.X_sq_norms[i + X_start] + self.X_norm_squared[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] - + self.Y_sq_norms[j + Y_start] + + self.Y_norm_squared[j + Y_start] ) if squared_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) From bf37c3d00d97e88002adbfb4bc39b5873e48b280 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 12 Oct 2021 11:29:07 +0200 Subject: [PATCH 223/290] Import backport to avoid runtime introspection Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- sklearn/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index d3cacb3a4e29b..0e41178b40434 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -49,9 +49,9 @@ from ..utils._typedefs cimport ITYPECODE, DTYPECODE from numbers import Integral, Real from typing import List from scipy.sparse import issparse -from threadpoolctl import threadpool_limits from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index d87b5da52339c..3745200b44dc2 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -20,7 +20,6 @@ import warnings import numpy as np from scipy.sparse import issparse -from threadpoolctl import threadpool_info from .murmurhash import murmurhash3_32 from .class_weight import compute_class_weight, compute_sample_weight @@ -41,6 +40,7 @@ check_symmetric, check_scalar, ) +from ..utils.fixes import threadpool_info from .. import get_config From 5660c0e74896c80461539cca86339324627a13cf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 12 Oct 2021 11:35:16 +0200 Subject: [PATCH 224/290] Clarify comments for sorting arrays Co-authored-by: Olivier Grisel --- sklearn/utils/_heap.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index d6133eab7c658..b52bbef8ce55e 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -31,8 +31,9 @@ cdef int simultaneous_sort( """ # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::sort and a Comparator which might need - # AoS instead of SoA (currently used). + # The best might be using a std::stable_sort and a Comparator which might need + # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) + # currently used. 
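# A NumPy sketch of the tie-breaking behaviour the comment above is after
# (illustrative only): `np.lexsort` treats its last key as the primary one,
# so equal distances end up ordered by index, as a stable sort would leave them.
import numpy as np

distances = np.array([0.5, 0.1, 0.5, 0.3])
indices = np.array([7, 2, 3, 9])
order = np.lexsort((indices, distances))
print(distances[order])  # [0.1 0.3 0.5 0.5]
print(indices[order])    # [2 9 3 7] -> the two 0.5 ties are broken by index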
cdef: ITYPE_t pivot_idx, i, store_idx floating pivot_val From 072de9ef5e0954e2ea2988a81f4a6b7dc5c6a4cb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 12 Oct 2021 11:37:37 +0200 Subject: [PATCH 225/290] Correct indentation Co-authored-by: Olivier Grisel --- sklearn/utils/_heap.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index b52bbef8ce55e..643b8bbf14e6c 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -81,8 +81,8 @@ cdef int simultaneous_sort( simultaneous_sort(values, indices, pivot_idx) if pivot_idx + 2 < size: simultaneous_sort(values + pivot_idx + 1, - indices + pivot_idx + 1, - size - pivot_idx - 1) + indices + pivot_idx + 1, + size - pivot_idx - 1) return 0 From e191be251f84ee40e0d2779d85265f201a097542 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 12 Oct 2021 11:49:24 +0200 Subject: [PATCH 226/290] Prefer "surrogate distance" as a naming Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pxd | 2 +- sklearn/metrics/_dist_metrics.pyx | 12 ++++++------ sklearn/metrics/_pairwise_distances_reduction.pyx | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index f20397ae3929b..b76bb41f89c02 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -98,7 +98,7 @@ cdef class DatasetsPair: cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil - cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil cdef class DenseDenseDatasetsPair(DatasetsPair): diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 761253cc6e0cf..20812d19e368c 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1296,7 +1296,7 @@ cdef class DatasetsPair: """Number of samples in Y.""" return -999 - cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.dist(i, j) cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: @@ -1335,7 +1335,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): return self.Y.shape[0] @final - cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d) @@ -1387,7 +1387,7 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): return self.Y_indptr.shape[0] -1 @final - cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1458,7 +1458,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): return self.Y.shape[0] @final - cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t xi_start = self.X_indptr[i] ITYPE_t xi_end = self.X_indptr[i + 1] @@ -1525,9 +1525,9 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): return self.datasets_pair.n_samples_X() @final - cdef DTYPE_t rank_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: # Swapping arguments on the same interface - return self.datasets_pair.rank_preserving_dist(j, i) + return 
self.datasets_pair.surrogate_dist(j, i) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 0e41178b40434..40be2da84ebe8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -762,7 +762,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): heaps_r_distances + i * self.k, heaps_indices + i * self.k, k, - self._datasets_pair.rank_preserving_dist(X_start + i, Y_start + j), + self._datasets_pair.surrogate_dist(X_start + i, Y_start + j), Y_start + j, ) @@ -888,8 +888,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): def _finalize_results(self, bint return_distance=False): if return_distance: - # We eventually need to recompute distances because we relied on - # rank-preserving distances. + # We need to recompute distances because we relied on + # surrogate distances for the reduction. self.compute_exact_distances() return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) @@ -1273,7 +1273,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): for i in range(X_start, X_end): for j in range(Y_start, Y_end): - r_dist_i_j = self._datasets_pair.rank_preserving_dist(i, j) + r_dist_i_j = self._datasets_pair.surrogate_dist(i, j) if r_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) From 775a10db2b905136dc4795a479bbdabeb1430c93 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 12 Oct 2021 14:29:11 +0200 Subject: [PATCH 227/290] Use better notations for maths Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 40be2da84ebe8..6c8455178efc7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -975,7 +975,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) nogil: PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num) - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + # Temporary buffer for the `-2 * X_c @ Y_c.T` term self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) ) @@ -997,7 +997,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): PairwiseDistancesArgKmin._parallel_on_Y_init(self, num_threads) for thread_num in range(num_threads): - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + # Temporary buffer for the `-2 * X_c @ Y_c.T` term self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) ) @@ -1060,7 +1060,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): DTYPE_t * C = dist_middle_terms ITYPE_t ldc = Y_c.shape[0] - # dist_middle_terms = -2 * X_c.dot(Y_c.T) + # dist_middle_terms = `-2 * X_c @ Y_c.T` _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) # Pushing the distance and their associated indices on heaps @@ -1518,7 +1518,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad ) nogil: 
PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num) - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + # Temporary buffer for the `-2 * X_c @ Y_c.T` term self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) ) @@ -1540,7 +1540,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self, num_threads) for thread_num in range(num_threads): - # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + # Temporary buffer for the `-2 * X_c @ Y_c.T` term self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) ) @@ -1601,7 +1601,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad DTYPE_t * C = dist_middle_terms ITYPE_t ldc = Y_c.shape[0] - # dist_middle_terms = -2 * X_c.dot(Y_c.T) + # dist_middle_terms = `-2 * X_c @ Y_c.T` _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) # Pushing the distance and their associated indices in vectors. From dab0d4cff61f8ce6ce946a2abfd8b24a6cc69874 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 12 Oct 2021 14:37:43 +0200 Subject: [PATCH 228/290] Clarify the fast specialized alternative internals in the docstring Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 40 +++++++++++-------- sklearn/metrics/pairwise.py | 5 ++- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 6c8455178efc7..1307ec21330b0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -899,11 +899,18 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): """Fast specialized alternative for PairwiseDistancesArgKmin on EuclideanDistance. + The full pairwise squared distances matrix is computed as follows: + + ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||² + + The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + Notes ----- This implementation has a superior arithmetic intensity and hence better running time when the alternative is IO bound, but it can suffer - from numerical instability. + from numerical instability caused by catastrophic cancellation potentially + introduced by the subtraction in the arithmetic expression above. PairwiseDistancesArgKmin with EuclideanDistance must be used when higher numerical precision is needed. @@ -1032,12 +1039,6 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - # We compute the full pairwise squared distances matrix as follows - # - # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², - # - # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. 
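# A tiny numerical sketch of the catastrophic cancellation described in the
# Notes above (illustrative only): far from the origin, the expanded form
# loses the small difference that the direct form keeps.
import numpy as np

x = np.array([1e8])
y = np.array([1e8 + 1e-4])
direct = ((x - y) ** 2).sum()                             # ~1e-08
expanded = (x ** 2).sum() - 2 * (x @ y) + (y ** 2).sum()  # typically 0.0
print(direct, expanded)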
- # # Careful: LDA, LDB and LDC are given for F-ordered arrays # in BLAS documentations, for instance: # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa @@ -1072,7 +1073,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): heaps_indices + i * k, k, # Using the squared euclidean distance as the rank-preserving distance: - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # ( self.X_norm_squared[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] + @@ -1433,13 +1436,20 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): """Fast specialized alternative for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. + The full pairwise squared distances matrix is computed as follows: + + ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||² + + The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + Notes ----- This implementation has a superior arithmetic intensity and hence better running time when the alternative is IO bound, but it can suffer - from numerical instability. + from numerical instability caused by catastrophic cancellation potentially + introduced by the subtraction in the arithmetic expression above. - RadiusNeighborhood with EuclideanDistance must be used when higher + PairwiseDistancesRadiusNeighborhood with EuclideanDistance must be used when higher numerical precision is needed. """ @@ -1573,12 +1583,6 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] - # We compute the full pairwise squared distances matrix as follows - # - # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², - # - # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. - # # Careful: LDA, LDB and LDC are given for F-ordered arrays # in BLAS documentations, for instance: # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa @@ -1608,7 +1612,9 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): # Using the squared euclidean distance as the rank-preserving distance: - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # squared_dist_i_j = ( self.X_norm_squared[i + X_start] + dist_middle_terms[i * Y_c.shape[0] + j] diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 7a60bbb0b4ef1..f68b305206427 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -666,8 +666,11 @@ def pairwise_distances_argmin_min( else: # TODO: once ArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. + # # When PairwiseDistancesArgKmin is not supported and when the user - # asked for a fast alternative, we need to revert to the standard one. + # asked for a fast alternative, we need to revert to the standard + # "euclidean" strategy to match the API. + # Internally, the "euclidean" strategy still uses the GEMM trick. 
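# A NumPy sketch of the GEMM trick mentioned above (illustrative only): the
# middle term of the expansion is a single matrix product, which is what the
# chunked BLAS Level 3 GEMM call computes.
import numpy as np

rng = np.random.RandomState(0)
X, Y = rng.rand(5, 3), rng.rand(4, 3)

middle = -2 * X @ Y.T
sq_dists = (X ** 2).sum(axis=1)[:, None] + middle + (Y ** 2).sum(axis=1)[None, :]
exact = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(axis=-1)
np.testing.assert_allclose(sq_dists, exact)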
if metric == "fast_euclidean": metric = "euclidean" From b423d6abdf2d8941cba2d9ff3e79ee4dbd5cb902 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 15 Oct 2021 16:19:20 +0200 Subject: [PATCH 229/290] Some more doc-string --- sklearn/metrics/pairwise.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f68b305206427..e77d78d81eebd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -664,7 +664,7 @@ def pairwise_distances_argmin_min( values = values.flatten() indices = indices.flatten() else: - # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. # # When PairwiseDistancesArgKmin is not supported and when the user @@ -766,10 +766,13 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs ).compute(strategy="auto", return_distance=False) indices = indices.flatten() else: - # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. + # # When PairwiseDistancesArgKmin is not supported and when the user # asked for a fast alternative, we need to revert to the standard one. + # "euclidean" strategy to match the API. + # Internally, the "euclidean" strategy still uses the GEMM trick. if metric == "fast_euclidean": metric = "euclidean" From f6f76ceb28659add5f96206a73c417608ef958ab Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 15 Oct 2021 16:25:09 +0200 Subject: [PATCH 230/290] Clean post-merge Circle CI script --- build_tools/circle/build_test_arm.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index c4109d08869ea..67beaae5dba31 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -64,7 +64,6 @@ if [[ "$TEST_DOCSTRINGS" == "true" ]]; then fi python --version -conda list # Set parallelism to $N_CORES + 1 to overlap IO bound tasks with CPU bound tasks on CI # workers with $N_CORES cores when building the compiled extensions of scikit-learn. @@ -83,7 +82,6 @@ mamba list # Changing directory not to have module resolution use scikit-learn source # directory but to the installed package. cd /tmp - python -c "import sklearn; sklearn.show_versions()" python -m threadpoolctl --import sklearn # Test using as many workers as available cores From 4078b0d3d5ef7c3e73c3a1ff0e5081d109e30aee Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 15 Oct 2021 17:15:45 +0200 Subject: [PATCH 231/290] Doc Add whats_new entry for #20254 --- doc/whats_new/v1.1.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 1a9f773ce08df..fdd6909c93f4d 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -23,6 +23,34 @@ Put the changes in their relevant module. Changed models -------------- +- |Efficiency| Low-level routines for reductions on pairwise distances + for dense float64 datasets have been refactored. 
The following functions + and estimators now benefit from improved performances, in particular on + multi-cores machines: + - :func:`sklearn.metrics.pairwise_distances_argmin` + - :func:`sklearn.metrics.pairwise_distances_argmin_min` + - :class:`sklearn.cluster.Birch` + - :class:`sklearn.cluster.DBSCAN` + - :class:`sklearn.cluster.MeanShift` + - :class:`sklearn.cluster.OPTICS` + - :class:`sklearn.cluster.SpectralClustering` + - :func:`sklearn.feature_selection.mutual_info_regression` + - :class:`sklearn.neighbors.AffinityPropagation` + - :class:`sklearn.neighbors.KNeighborsClassifier` + - :class:`sklearn.neighbors.KNeighborsRegressor` + - :class:`sklearn.neighbors.LocalOutlierFactor` + - :class:`sklearn.neighbors.NearestNeighbors` + - :class:`sklearn.manifold.Isomap` + - :class:`sklearn.manifold.LocallyLinearEmbedding` + - :class:`sklearn.manifold.TSNE` + - :func:`sklearn.manifold.trustworthiness` + - :class:`sklearn.semi_supervised.LabelPropagation` + - :class:`sklearn.semi_supervised.LabelSpreading` + + For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` + known up to 20× speed-up. + + :pr:`20254` by :user:`Julien Jerphanion `. Changelog --------- From ceff92309be95546f312258a0acb7c5d08f2f5ea Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 18 Oct 2021 09:45:30 +0200 Subject: [PATCH 232/290] Better explain PairwiseDistancesArgKmin datastructures usage Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 1307ec21330b0..6086a874c3dc4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -717,9 +717,17 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # Allocating pointers to datastructures but not the datastructures themselves. # There as many pointers as available threads. - # When reducing on small datasets, there can be more pointers than actual - # threads used for the reduction but there won't be allocated but unused - # datastructures. + # However, when reducing on small datasets, there can be more pointers than + # actual threads. + # In this case, some pointers will be dynamically allocated but there won't + # be allocated yet unused data-structures referenced by them. + # + # For the sake of explicitness: + # - when parallelizing on X, those heaps pointers are referencing + # (with proper offsets) addresses of the two main heaps (see bellow) + # - when parallelizing on Y, those heaps pointer heaps are referencing + # small heaps which are thread-wise-allocated and whose content will be + # merged with the main heaps'. self.heaps_r_distances_chunks = malloc( sizeof(DTYPE_t *) * self.effective_omp_n_thread ) From 51caae26c2e036b2f1a7819585c3bd0c6e42634f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 18 Oct 2021 09:40:08 +0200 Subject: [PATCH 233/290] fixup! Doc Add whats_new entry for #20254 Co-authored-by: Olivier Grisel --- doc/whats_new/v1.1.rst | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index fdd6909c93f4d..a79aa65d9f973 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -27,28 +27,28 @@ Changed models for dense float64 datasets have been refactored. 
The following functions and estimators now benefit from improved performances, in particular on multi-cores machines: - - :func:`sklearn.metrics.pairwise_distances_argmin` - - :func:`sklearn.metrics.pairwise_distances_argmin_min` - - :class:`sklearn.cluster.Birch` - - :class:`sklearn.cluster.DBSCAN` - - :class:`sklearn.cluster.MeanShift` - - :class:`sklearn.cluster.OPTICS` - - :class:`sklearn.cluster.SpectralClustering` - - :func:`sklearn.feature_selection.mutual_info_regression` - - :class:`sklearn.neighbors.AffinityPropagation` - - :class:`sklearn.neighbors.KNeighborsClassifier` - - :class:`sklearn.neighbors.KNeighborsRegressor` - - :class:`sklearn.neighbors.LocalOutlierFactor` - - :class:`sklearn.neighbors.NearestNeighbors` - - :class:`sklearn.manifold.Isomap` - - :class:`sklearn.manifold.LocallyLinearEmbedding` - - :class:`sklearn.manifold.TSNE` - - :func:`sklearn.manifold.trustworthiness` - - :class:`sklearn.semi_supervised.LabelPropagation` - - :class:`sklearn.semi_supervised.LabelSpreading` + - :func:`sklearn.metrics.pairwise_distances_argmin` + - :func:`sklearn.metrics.pairwise_distances_argmin_min` + - :class:`sklearn.cluster.AffinityPropagation` + - :class:`sklearn.cluster.Birch` + - :class:`sklearn.cluster.DBSCAN` + - :class:`sklearn.cluster.MeanShift` + - :class:`sklearn.cluster.OPTICS` + - :class:`sklearn.cluster.SpectralClustering` + - :func:`sklearn.feature_selection.mutual_info_regression` + - :class:`sklearn.neighbors.KNeighborsClassifier` + - :class:`sklearn.neighbors.KNeighborsRegressor` + - :class:`sklearn.neighbors.LocalOutlierFactor` + - :class:`sklearn.neighbors.NearestNeighbors` + - :class:`sklearn.manifold.Isomap` + - :class:`sklearn.manifold.LocallyLinearEmbedding` + - :class:`sklearn.manifold.TSNE` + - :func:`sklearn.manifold.trustworthiness` + - :class:`sklearn.semi_supervised.LabelPropagation` + - :class:`sklearn.semi_supervised.LabelSpreading` For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` - known up to 20× speed-up. + can be up to 20× faster than in the previous versions'. :pr:`20254` by :user:`Julien Jerphanion `. From 8613fd66d89133d29232b2c0d64ad6e5a380c5d6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 18 Oct 2021 09:54:41 +0200 Subject: [PATCH 234/290] Better explain PairwiseDistancesRadiusNeighborhood behavior Also fix some formatting. Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 6086a874c3dc4..a71cb5e124d5e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -724,7 +724,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # # For the sake of explicitness: # - when parallelizing on X, those heaps pointers are referencing - # (with proper offsets) addresses of the two main heaps (see bellow) + # (with proper offsets) addresses of the two main heaps (see bellow) # - when parallelizing on Y, those heaps pointer heaps are referencing # small heaps which are thread-wise-allocated and whose content will be # merged with the main heaps'. @@ -1094,8 +1094,18 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): - """Returns radius-based neighbors vectors' indices in a dataset Y of - of vectors in a dataset X. 
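# A NumPy sketch of the radius-neighborhood semantics documented in the new
# docstring below (illustrative only): for each X row, keep every Y index
# whose distance satisfies the stated condition.
import numpy as np

rng = np.random.RandomState(0)
X, Y, radius = rng.rand(3, 2), rng.rand(10, 2), 0.5
dists = np.sqrt(((X[:, None, :] - Y[None, :, :]) ** 2).sum(axis=-1))
neigh_ind = [np.flatnonzero(row < radius) for row in dists]  # ragged result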
+ """Compute radius-based neighbors for two sets of vectors. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) < radius + + The distance function `dist` depends on the values of the `metric` + and `metric_kwargs` parameters. + + When this reduction is used within scikit-learn estimators + (X, Y) would generally be (X_test, X_train). Parameters ---------- From 4bf7eee1e60abc929c8183308ab747895e9dbc6a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 18 Oct 2021 16:39:34 +0200 Subject: [PATCH 235/290] Move changelog entry under the Miscellaneous section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- doc/whats_new/v1.1.rst | 61 +++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 85b67d64a1476..7910c83ccc0cd 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -23,34 +23,6 @@ Put the changes in their relevant module. Changed models -------------- -- |Efficiency| Low-level routines for reductions on pairwise distances - for dense float64 datasets have been refactored. The following functions - and estimators now benefit from improved performances, in particular on - multi-cores machines: - - :func:`sklearn.metrics.pairwise_distances_argmin` - - :func:`sklearn.metrics.pairwise_distances_argmin_min` - - :class:`sklearn.cluster.AffinityPropagation` - - :class:`sklearn.cluster.Birch` - - :class:`sklearn.cluster.DBSCAN` - - :class:`sklearn.cluster.MeanShift` - - :class:`sklearn.cluster.OPTICS` - - :class:`sklearn.cluster.SpectralClustering` - - :func:`sklearn.feature_selection.mutual_info_regression` - - :class:`sklearn.neighbors.KNeighborsClassifier` - - :class:`sklearn.neighbors.KNeighborsRegressor` - - :class:`sklearn.neighbors.LocalOutlierFactor` - - :class:`sklearn.neighbors.NearestNeighbors` - - :class:`sklearn.manifold.Isomap` - - :class:`sklearn.manifold.LocallyLinearEmbedding` - - :class:`sklearn.manifold.TSNE` - - :func:`sklearn.manifold.trustworthiness` - - :class:`sklearn.semi_supervised.LabelPropagation` - - :class:`sklearn.semi_supervised.LabelSpreading` - - For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` - can be up to 20× faster than in the previous versions'. - - :pr:`20254` by :user:`Julien Jerphanion `. Changelog --------- @@ -141,6 +113,39 @@ Changelog :pr:`20880` by :user:`Guillaume Lemaitre ` and :user:`András Simon `. +Miscellaneous +............. + +- |Efficiency| Low-level routines for reductions on pairwise distances + for dense float64 datasets have been refactored. 
The following functions + and estimators now benefit from improved performances, in particular on + multi-cores machines: + - :func:`sklearn.metrics.pairwise_distances_argmin` + - :func:`sklearn.metrics.pairwise_distances_argmin_min` + - :class:`sklearn.cluster.AffinityPropagation` + - :class:`sklearn.cluster.Birch` + - :class:`sklearn.cluster.DBSCAN` + - :class:`sklearn.cluster.MeanShift` + - :class:`sklearn.cluster.OPTICS` + - :class:`sklearn.cluster.SpectralClustering` + - :func:`sklearn.feature_selection.mutual_info_regression` + - :class:`sklearn.neighbors.KNeighborsClassifier` + - :class:`sklearn.neighbors.KNeighborsRegressor` + - :class:`sklearn.neighbors.LocalOutlierFactor` + - :class:`sklearn.neighbors.NearestNeighbors` + - :class:`sklearn.manifold.Isomap` + - :class:`sklearn.manifold.LocallyLinearEmbedding` + - :class:`sklearn.manifold.TSNE` + - :func:`sklearn.manifold.trustworthiness` + - :class:`sklearn.semi_supervised.LabelPropagation` + - :class:`sklearn.semi_supervised.LabelSpreading` + + For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` + can be up to 20× faster than in the previous versions'. + + :pr:`20254` by :user:`Julien Jerphanion `. + + Code and Documentation Contributors ----------------------------------- From 7fa4a40aa0d84906159978b69dc629a9cef944f3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 18 Oct 2021 16:39:34 +0200 Subject: [PATCH 236/290] Address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_dist_metrics.pyx | 32 +++++++----- .../metrics/_pairwise_distances_reduction.pyx | 51 ++++++++++--------- sklearn/metrics/tests/test_dist_metrics.py | 2 +- sklearn/metrics/tests/test_pairwise.py | 31 ----------- .../test_pairwise_distances_reduction.py | 2 +- sklearn/neighbors/__init__.py | 2 +- sklearn/neighbors/tests/test_neighbors.py | 18 +++---- sklearn/utils/__init__.py | 3 +- sklearn/utils/_heap.pyx | 1 - 9 files changed, 57 insertions(+), 85 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 20812d19e368c..eae3b4bd1791a 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -307,8 +307,8 @@ cdef class DistanceMetric: This can optionally be overridden in a base class. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the rank-preserving surrogate distance is the + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the squared-euclidean distance. """ return self.dist(x1, x2, size) @@ -320,8 +320,10 @@ cdef class DistanceMetric: const DTYPE_t[:] x2_data, const ITYPE_t[:] x2_indices, ) nogil except -1: - """Compute the rank-preserving surrogate distance between vectors x1 and x2 - given non null coordinates and their corresponding indices. + """Compute the distance between vectors x1 and x2 of a CSR matrix. + + The computations is made given non null coordinates and + corresponding indices of the vectors CSR matrix. This should be overridden in a base class. """ @@ -334,14 +336,16 @@ cdef class DistanceMetric: const DTYPE_t[:] x2_data, const ITYPE_t[:] x2_indices, ) nogil except -1: - """Compute the rank-preserving surrogate distance between vectors x1 and x2 - given non null coordinates and their corresponding indices. 
+ """Compute the rank-preserving surrogate distance between vectors x1 and x2 of a CSR matrix. + + The computations is made given non null coordinates and + corresponding indices of the vectors CSR matrix. This can optionally be overridden in a base class. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the rank-preserving surrogate distance is the + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the squared-euclidean distance. """ return self.csr_dist(x1_data, x1_indices, x2_data, x2_indices) @@ -378,8 +382,9 @@ cdef class DistanceMetric: """Convert the rank-preserving surrogate distance to the distance. The surrogate distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. Parameters ---------- @@ -397,8 +402,9 @@ cdef class DistanceMetric: """Convert the true distance to the rank-preserving surrogate distance. The surrogate distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. Parameters ---------- @@ -1506,7 +1512,7 @@ cdef class DenseSparseDatasetsPair(DatasetsPair): """ cdef: # As distance metrics are symmetric functions, we can - # simply rely on the other DatasetsPair and swap arguments. + # simply rely on the SparseDenseDatasetsPair and swap arguments. DatasetsPair datasets_pair def __init__(self, X, Y, DistanceMetric distance_metric): diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index a71cb5e124d5e..c94415ca63480 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -185,7 +185,7 @@ cdef class PairwiseDistancesReduction: """Abstract base class for pairwise distance computation & reduction Subclasses of this class compute pairwise distances between a set of - vectors (rows) X and another set of vectors (rows) of Y and apply a + vectors (rows) X and another set of vectors (rows) Y and apply a reduction on top. The reduction takes a matrix of pairwise distances between rows of X and Y as input and outputs an aggregate data-structure for each row of X. The aggregate values are typically smaller than the number @@ -204,7 +204,7 @@ cdef class PairwiseDistancesReduction: The subclasses are specialized for reduction. The actual distance computation for a given pair of rows of X and Y are - delegated to metric-specific subclasses of the DatasetsPair companion base + delegated to format-specific subclasses of the DatasetsPair companion base class. Parameters @@ -252,7 +252,7 @@ cdef class PairwiseDistancesReduction: @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - """Return True if the PairwiseDistancesReduction for the given parameters. + """Return True if the PairwiseDistancesReduction can be used for the given parameters. 
Parameters ---------- @@ -329,7 +329,7 @@ cdef class PairwiseDistancesReduction: str strategy=None, bint return_distance=False, ): - """Computes the reduction of vectors (rows) of X on Y. + """Compute the pairwise distances and the reduction of vectors (rows) of X on Y. Parameters ---------- @@ -337,17 +337,19 @@ cdef class PairwiseDistancesReduction: The chunking strategy defining which dataset parallelization are made on. Strategies differs on the dispatching they use for chunks on threads: - - 'parallel_on_samples_X' dispatches chunks of X uniformly on threads. + + - 'parallel_on__X' dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation - but is less used in practice (because X is smaller than Y generally). + embarrassingly parallel and comes with no datastructures synchronisation. + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. Each thread then iterates on all the chunks of X. This strategy is embarrassingly parallel but uses intermediate datastructures - synchronisation. However it is more useful in practice (because Y is - larger than X generally). + synchronisation. + - 'auto' relies on a simple heuristic to choose between - 'parallel_on_samples_X' and 'parallel_on_Y'. + 'parallel_on__X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. @@ -357,8 +359,8 @@ cdef class PairwiseDistancesReduction: Returns ------- - Results for the PairwiseDistancesReduction, usually an array of indices - and optionally an array of associated distances if return_distance is True. + If True, return the distances between each sample of X and + the samples of Y selected by the reduction function. """ if strategy is None: @@ -386,8 +388,8 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_X(self) nogil: - """Computes the reduction of each vector (row) of X on Y - by parallelizing computation on chunks of X. + """Compute the pairwise distances of each vector (row) of X on Y + by parallelizing computation on chunks of X and reduce them. This strategy dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is @@ -447,8 +449,8 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_Y(self) nogil: - """Computes the reduction of each vector (row) of X on Y - by parallelizing computation on chunks of Y. + """Compute the pairwise distances of each vector (row) of X on Y + by parallelizing computation on chunks of Y and reduce them. This strategy dispatches chunks of Y uniformly on threads. Each thread then iterates on all the chunks of X. This strategy is @@ -602,8 +604,7 @@ cdef class PairwiseDistancesReduction: return cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - """Computes the argkmin of vectors (rows) of a set of - vectors (rows) of X on another set of vectors (rows) of Y. + """Compute the argkmin of vectors (rows) of X on the ones of Y. Parameters ---------- @@ -651,10 +652,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Parameters ---------- - X : array-like of shape (n_samples_X, n_features) + X : ndarray or CSR matrix of shape (n_samples_X, n_features) Input data. - Y : array-like of shape (n_samples_Y, n_features) + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) Input data. 
k : int @@ -716,7 +717,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.k = k # Allocating pointers to datastructures but not the datastructures themselves. - # There as many pointers as available threads. + # There are as many pointers as available threads. # However, when reducing on small datasets, there can be more pointers than # actual threads. # In this case, some pointers will be dynamically allocated but there won't @@ -909,7 +910,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): The full pairwise squared distances matrix is computed as follows: - ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||² + ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||² The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. @@ -1140,7 +1141,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # Neighbors indices and distances are returned as np.ndarray of np.ndarray. # - # We want resizable buffers which we will to wrapped within numpy + # We want resizable buffers which we will wrap into numpy # arrays at the end. std::vector comes as a handy interface for # interacting efficiently with resizable buffers. # @@ -1251,7 +1252,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): self.sort_results = sort_results # Allocating pointers to datastructures but not the datastructures themselves. - # There as many pointers as available threads. + # There are as many pointers as available threads. # When reducing on small datasets, there can be more pointers than actual # threads used for the reduction but there won't be allocated but unused # datastructures. @@ -1456,7 +1457,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad The full pairwise squared distances matrix is computed as follows: - ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||² + ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||² The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 9f0750fd75669..9ef3b3c4af4f1 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -52,7 +52,7 @@ def dist_func(x1, x2, p): } -# TODO: remove this test in 1.2 +# TODO: remove this test in 1.3 def test_neighbors_distance_metric_deprecation(): from sklearn.neighbors import DistanceMetric as DeprecatedDistanceMetric diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 90b8db305b83b..c0de29296613b 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1465,34 +1465,3 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): # and fails due to rounding errors rtol = 1e-5 if dtype is np.float32 else 1e-7 assert_allclose(dist, expected_dist, rtol=rtol) - - -@pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) -@pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) -@pytest.mark.parametrize("sign", [1, -1]) -def test_fast_euclidean_correctness( - X_translation, Y_translation, sign, n_samples=10000, n_features=10 -): - # This is the only failing test case, so we prefer xfailing. 
- numerical_edge_cases = {(1e6, 1e6, 1), (1e7, 1e7, 1)} - if (X_translation, Y_translation, sign) in numerical_edge_cases: - pytest.xfail( - "Numerical edge-case: (X_translation, Y_translation," - f" sign)={(X_translation, Y_translation, sign)}" - ) - - # The fast squared euclidean strategy must return results - # that are close to the ones obtained with the euclidean distance - rng = np.random.RandomState(1) - - spread = 100 - X = X_translation + rng.rand(n_samples, n_features) * spread - Y = (Y_translation + rng.rand(n_samples, n_features) * spread) * sign - - argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean") - fsq_argmins, fsq_distances = pairwise_distances_argmin_min( - X, Y, metric="fast_euclidean" - ) - - np.testing.assert_array_equal(argmins, fsq_argmins) - np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index e68f66fe5a40c..29f340eadb76a 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -63,7 +63,7 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): def test_pairwise_distances_reduction_is_usable_for(): - rng = np.random.RandomState(1) + rng = np.random.RandomState(0) X = rng.rand(100, 10) Y = rng.rand(100, 10) metric = "euclidean" diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index ff5ad4875d77d..340910008f75c 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -4,8 +4,8 @@ """ from ._ball_tree import BallTree -from ._distance_metric import DistanceMetric from ._kd_tree import KDTree +from ._distance_metric import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 10b52738977c8..41ab5cc461c7b 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -84,9 +84,9 @@ def _weight_func(dist): return retval ** 2 -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("n_features", [5, 100]) +@pytest.mark.parametrize("n_query_pts", [10, 100]) @pytest.mark.parametrize("n_neighbors", [1, 10, 100]) @pytest.mark.parametrize("metric", COMMON_VALID_METRICS) def test_unsupervised_kneighbors( @@ -151,9 +151,9 @@ def test_unsupervised_kneighbors( ) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("n_features", [5, 100]) +@pytest.mark.parametrize("n_query_pts", [10, 100]) @pytest.mark.parametrize("metric", COMMON_VALID_METRICS) @pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( @@ -212,8 +212,7 @@ def test_neigh_predictions_algorithm_agnosticity( ) -@pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_samples", [100, 1000]) 
@pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( @@ -226,7 +225,6 @@ def test_neigh_predictions_algorithm_agnosticity( ], ) def test_neighs_predictions_fast_euclidean_correctness( - seed, n_samples, n_features, n_neighbors, @@ -242,7 +240,7 @@ def test_neighs_predictions_fast_euclidean_correctness( allow_module_level=True, ) - rng = np.random.RandomState(seed) + rng = np.random.RandomState(0) X = rng.rand(n_samples, n_features).astype(dtype) y = rng.randint(3, size=n_samples) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3745200b44dc2..488e96bf47829 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,7 +26,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .fixes import np_version, parse_version +from .fixes import np_version, parse_version, threadpool_info from ._estimator_html_repr import estimator_html_repr from .validation import ( as_float_array, @@ -40,7 +40,6 @@ check_symmetric, check_scalar, ) -from ..utils.fixes import threadpool_info from .. import get_config diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 643b8bbf14e6c..32e39ff37ff13 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -1,4 +1,3 @@ -#!python # cython: boundscheck=False # cython: cdivision=True # cython: initializedcheck=False From 4a89d7fc9e107a446ec1a17605bb7f705b9652c5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 18 Oct 2021 16:39:34 +0200 Subject: [PATCH 237/290] Address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/_config.py | 23 +++- sklearn/cluster/_birch.py | 5 +- sklearn/metrics/_dist_metrics.pyx | 4 +- .../metrics/_pairwise_distances_reduction.pyx | 60 ++++------ .../test_pairwise_distances_reduction.py | 108 ++++++++++++------ sklearn/neighbors/_base.py | 50 ++++---- sklearn/neighbors/tests/test_neighbors.py | 11 +- sklearn/utils/_testing.py | 21 ---- 8 files changed, 155 insertions(+), 127 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index fe2d27f64857c..5d8bc566c9e9a 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -9,6 +9,9 @@ "working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)), "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": int( + os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) + ), } _threadlocal = threading.local() @@ -40,7 +43,11 @@ def get_config(): def set_config( - assume_finite=None, working_memory=None, print_changed_only=None, display=None + assume_finite=None, + working_memory=None, + print_changed_only=None, + display=None, + pairwise_dist_chunk_size=None, ): """Set global scikit-learn configuration @@ -80,6 +87,12 @@ def set_config( .. versionadded:: 0.23 + pairwise_dist_chunk_size : int, default=None + The number of vectors per chunk for PairwiseDistancesReduction. + Default is 256 (optimal for most of modern laptops' caches and architectures). + + .. versionadded:: 1.1 + See Also -------- config_context : Context manager for global scikit-learn configuration. 
@@ -95,6 +108,8 @@ def set_config( local_config["print_changed_only"] = print_changed_only if display is not None: local_config["display"] = display + if pairwise_dist_chunk_size is not None: + local_config["display"] = pairwise_dist_chunk_size @@ -132,6 +147,12 @@ def config_context(**new_config): .. versionadded:: 0.23 + pairwise_dist_chunk_size : int, default=None + The number of vectors per chunk for PairwiseDistancesReduction. + Default is 256 (optimal for most modern laptops' caches and architectures). + + .. versionadded:: 1.1 + Notes ----- All settings, not just those presently modified, will be returned to diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 716f1aa9278af..5c7c3d2a6d729 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -677,7 +677,6 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) - # This allow not recomputing Y vectors' squared euclidean norms. fast_euclidean_kwargs = {"Y_norm_squared": self._subcluster_norms} with config_context(assume_finite=True): @@ -731,8 +730,8 @@ def _global_clustering(self, X=None): "n_clusters should be an instance of ClusterMixin or an int" ) - # We compute it once here, so that we won't need to compute it again at - # each call of `Birch.predict`. + # We compute subcluster norms once here, so that we won't need to compute them + # again at each call of `Birch.predict`. self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) if clusterer is None or not_enough_centroids: diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index eae3b4bd1791a..b9bc877f681a5 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1471,8 +1471,8 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): # TODO: the 2D to 1D memory-view conversion might make computation slower, see: # https://github.com/scikit-learn/scikit-learn/issues/17299 - # Ideally, we could pass pointers and indices and access elements - # then in distance_metric.dist + # Alternatively, we could pass pointers and indices and access elements + # then in distance_metric.dist. This works but would complicate this API. return self.distance_metric.csr_rdist( self.X_data[xi_start:xi_end], self.X_indices[xi_start:xi_end], diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index c94415ca63480..f7748d52afd4b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -14,14 +14,11 @@ # the same structure of operations on distances between vectors # of a datasets pair (X, Y). -import numpy as np cimport numpy as np +import numpy as np import scipy.sparse from .. import get_config - -np.import_array() - from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX from libcpp.vector cimport vector @@ -55,10 +52,7 @@ from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE -# Those constants have been chosen for modern laptops' caches and architecture.
-DEF CHUNK_SIZE = 256 # number of vectors -DEF MIN_CHUNK_SAMPLES = 20 - +np.import_array() # TODO: change for `libcpp.algorithm.move` once Cython 3 is used # Introduction in Cython: @@ -226,7 +220,7 @@ cdef class PairwiseDistancesReduction: """ cdef: - DatasetsPair _datasets_pair + readonly DatasetsPair datasets_pair ITYPE_t n_threads ITYPE_t effective_omp_n_thread @@ -280,10 +274,6 @@ cdef class PairwiseDistancesReduction: not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and metric in cls.valid_metrics()) - @property - def datasets_pair(self) -> DatasetsPair: - return self._datasets_pair - def __init__( self, DatasetsPair datasets_pair, @@ -291,27 +281,26 @@ cdef class PairwiseDistancesReduction: n_threads=None, ): cdef: - ITYPE_t X_n_full_chunks, Y_n_full_chunks + ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks if chunk_size is None: - chunk_size = get_config().get("pairwise_dist_chunk_size", CHUNK_SIZE) + chunk_size = get_config().get("pairwise_dist_chunk_size", 256) - check_scalar(chunk_size, "chunk_size", Integral, min_val=1) - self.chunk_size = chunk_size + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=1) self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) - self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + n_samples_chunk = max(20, chunk_size) - self._datasets_pair = datasets_pair + self.datasets_pair = datasets_pair self.n_samples_Y = datasets_pair.n_samples_Y() - self.Y_n_samples_chunk = min(self.n_samples_Y, self.n_samples_chunk) + self.Y_n_samples_chunk = min(self.n_samples_Y, n_samples_chunk) Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk self.n_samples_X = datasets_pair.n_samples_X() - self.X_n_samples_chunk = min(self.n_samples_X, self.n_samples_chunk) + self.X_n_samples_chunk = min(self.n_samples_X, n_samples_chunk) X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk @@ -420,7 +409,7 @@ cdef class PairwiseDistancesReduction: X_end = X_start + self.X_n_samples_chunk # Reinitializing thread datastructures for the new X chunk - self._parallel_on_X_threadwise_init_chunk(thread_num, X_start, X_end) + self._parallel_on_X_threadwise_init_chunk(thread_num, X_start) for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -552,7 +541,6 @@ cdef class PairwiseDistancesReduction: self, ITYPE_t thread_num, ITYPE_t X_start, - ITYPE_t X_end, ) nogil: """Initialise datastructures used in a thread given its number.""" return @@ -713,8 +701,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ): super().__init__(datasets_pair, chunk_size, n_threads) - check_scalar(k, "k", Integral, min_val=1) - self.k = k + self.k = check_scalar(k, "k", Integral, min_val=1) # Allocating pointers to datastructures but not the datastructures themselves. # There are as many pointers as available threads. 
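Context for reviewers rather than part of the patch: a single-threaded, pure-Python sketch of the chunked argkmin pattern that PairwiseDistancesArgKmin implements. All names and the chunk size are illustrative, heapq stands in for the project's Cython max-heaps, and Y is assumed to have at least k rows:

import heapq

import numpy as np


def chunked_argkmin(X, Y, k, chunk_size=256):
    # One fixed-size "max-heap" per query row, emulated with heapq on
    # negated distances so that the worst kept neighbour sits at heap[0].
    n_x = X.shape[0]
    heaps = [[] for _ in range(n_x)]
    for X_start in range(0, n_x, chunk_size):
        X_c = X[X_start:X_start + chunk_size]
        for Y_start in range(0, Y.shape[0], chunk_size):
            Y_c = Y[Y_start:Y_start + chunk_size]
            # Squared euclidean distances for this pair of chunks.
            sq_dist = ((X_c[:, None, :] - Y_c[None, :, :]) ** 2).sum(axis=-1)
            for i in range(X_c.shape[0]):
                heap = heaps[X_start + i]
                for j in range(Y_c.shape[0]):
                    item = (-sq_dist[i, j], Y_start + j)
                    if len(heap) < k:
                        heapq.heappush(heap, item)
                    elif item > heap[0]:
                        # Closer than the current k-th neighbour: replace it.
                        heapq.heapreplace(heap, item)
    distances = np.empty((n_x, k))
    indices = np.empty((n_x, k), dtype=np.intp)
    for i, heap in enumerate(heaps):
        # Sort each heap in ascending order of distance.
        for col, (neg_sq, j) in enumerate(sorted(heap, reverse=True)):
            distances[i, col], indices[i, col] = np.sqrt(-neg_sq), j
    return distances, indices

The Cython class parallelises either the outer loop over X chunks ("parallel_on_X") or the inner loop over Y chunks ("parallel_on_Y"), which is why the per-thread heap pointers set up in the hunks above are needed.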
@@ -771,7 +758,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): heaps_r_distances + i * self.k, heaps_indices + i * self.k, k, - self._datasets_pair.surrogate_dist(X_start + i, Y_start + j), + self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), Y_start + j, ) @@ -780,7 +767,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self, ITYPE_t thread_num, ITYPE_t X_start, - ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the # thread heaps pointers to the proper position on the main heaps @@ -890,9 +876,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): for i in prange(self.n_samples_X, schedule='static', nogil=True, num_threads=self.effective_omp_n_thread): for j in range(self.k): - distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist( + distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( # Guard against eventual -0., causing nan production. - distances[i, j] if distances[i, j] > 0. else 0. + max(distances[i, j], 0.) ) def _finalize_results(self, bint return_distance=False): @@ -959,10 +945,12 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair self.X, self.Y = datasets_pair.X, datasets_pair.Y + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None) else: self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else @@ -1246,9 +1234,8 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): ): super().__init__(datasets_pair, chunk_size, n_threads) - check_scalar(radius, "radius", Real, min_val=0) - self.radius = radius - self.r_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) + self.radius = check_scalar(radius, "radius", Real, min_val=0) + self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) self.sort_results = sort_results # Allocating pointers to datastructures but not the datastructures themselves. @@ -1295,7 +1282,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): for i in range(X_start, X_end): for j in range(Y_start, Y_end): - r_dist_i_j = self._datasets_pair.surrogate_dist(i, j) + r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) if r_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) @@ -1315,7 +1302,6 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): self, ITYPE_t thread_num, ITYPE_t X_start, - ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the @@ -1432,11 +1418,9 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): num_threads=self.effective_omp_n_thread): for j in range(deref(self.neigh_indices)[i].size()): deref(self.neigh_distances)[i][j] = ( - self._datasets_pair.distance_metric._rdist_to_dist( + self.datasets_pair.distance_metric._rdist_to_dist( # Guard against eventual -0., causing nan production. - deref(self.neigh_distances)[i][j] - if deref(self.neigh_distances)[i][j] > 0. - else 0 + max(deref(self.neigh_distances)[i][j], 0.) 
) ) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 29f340eadb76a..02b485a5694a6 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -20,10 +20,29 @@ ) from sklearn.utils import _in_unstable_openblas_configuration -from sklearn.utils._testing import ( - fails_if_unstable_openblas, - get_dummy_metric_kwargs, -) + +from sklearn.utils._testing import fails_if_unstable_openblas + + +def _get_dummy_metric_kwargs(metric: str, n_features: int): + """Return dummy DistanceMetric kwargs for tests.""" + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + kwargs = { + "minkowski": dict(p=1.5), + "seuclidean": dict(V=weights), + "wminkowski": dict(p=1.5, w=weights), + "mahalanobis": dict(VI=VI), + } + + return kwargs.get(metric, {}) def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): @@ -220,7 +239,7 @@ def test_pairwise_distances_reduction_factory_method( @fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( "PairwiseDistancesReduction", @@ -244,7 +263,7 @@ def test_chunk_size_agnosticism( parameter = ( 10 if PairwiseDistancesReduction is PairwiseDistancesArgKmin - # Scaling the radius with the dimensions + # Scaling the radius slightly with the number of dimensions else 10 ** np.log(n_features) ) @@ -261,7 +280,7 @@ def test_chunk_size_agnosticism( @fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( "PairwiseDistancesReduction", @@ -272,7 +291,6 @@ def test_n_threads_agnosticism( seed, n_samples, chunk_size, - metric="fast_euclidean", n_features=100, dtype=np.float64, ): @@ -285,23 +303,23 @@ def test_n_threads_agnosticism( parameter = ( 10 if PairwiseDistancesReduction is PairwiseDistancesArgKmin - # Scaling the radius with the dimensions + # Scaling the radius slightly with the number of dimensions else 10 ** np.log(n_features) ) ref_dist, ref_indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="euclidean" + X, Y, parameter, metric="fast_euclidean" ).compute(return_distance=True) dist, indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric=metric, n_threads=1 + X, Y, parameter, metric="fast_euclidean", n_threads=1 ).compute(return_distance=True) ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @pytest.mark.parametrize("seed", range(5)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) @pytest.mark.parametrize( "PairwiseDistancesReduction", @@ -337,7 +355,7 @@ def test_strategies_consistency( parameter = ( 10 if PairwiseDistancesReduction is PairwiseDistancesArgKmin - # Scaling the radius with the dimensions + # Scaling the radius
slightly with the number of dimensions else 10 ** np.log(n_features) ) @@ -346,7 +364,7 @@ Y, parameter, metric=metric, - metric_kwargs=get_dummy_metric_kwargs(metric, n_features), + metric_kwargs=_get_dummy_metric_kwargs(metric, n_features), # To be sure to use parallelization chunk_size=n_samples // 4, ) @@ -366,7 +384,7 @@ @fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("k, radius", [(50, 100)]) def test_fast_sqeuclidean_correctness( @@ -412,44 +430,62 @@ @fails_if_unstable_openblas -@pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("k", [1, 10, 100]) -@pytest.mark.parametrize("translation", [10 ** i for i in [4]]) +@pytest.mark.parametrize("n_features", [50, 500]) +@pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) def test_fast_sqeuclidean_translation_invariance( - seed, - n_samples, n_features, - k, translation, + metric, + PairwiseDistancesReduction, + n_samples=1000, dtype=np.float64, ): - # The fast squared euclidean strategy should be translation invariant. - if n_samples < k: - pytest.skip( - f"Skipping as n_samples (={n_samples}) < n_neighbors (={k})", - allow_module_level=True, - ) + # The reduction must be translation invariant.
+ parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the number of dimensions + else 10 ** np.log(n_features) + ) - rng = np.random.RandomState(seed) + rng = np.random.RandomState(0) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread - reference_dist, reference_indices = PairwiseDistancesArgKmin.get_for( - X, Y, k, metric="fast_sqeuclidean" + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + reference_dist, reference_indices = PairwiseDistancesReduction.get_for( + X, + Y, + parameter, + metric=metric, + metric_kwargs=_get_dummy_metric_kwargs(metric, n_features), ).compute(return_distance=True) - dist, indices = PairwiseDistancesArgKmin.get_for( - X + translation, Y + translation, k, metric="fast_sqeuclidean" + dist, indices = PairwiseDistancesReduction.get_for( + X + translation, + Y + translation, + parameter, + metric=metric, + metric_kwargs=_get_dummy_metric_kwargs(metric, n_features), ).compute(return_distance=True) - assert_argkmin_results_equality(reference_dist, dist, reference_indices, indices) + ASSERT_RESULT[PairwiseDistancesReduction]( + reference_dist, dist, reference_indices, indices ) @pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("num_threads", [1, 2, 8]) def test_sqeuclidean_row_norms( diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e21a0ffb36a28..3aae7a11b7871 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -357,45 +357,47 @@ def _check_algorithm_metric(self): if self.algorithm not in ["auto", "brute", "kd_tree", "ball_tree"]: raise ValueError("unrecognized algorithm: '%s'" % self.algorithm) + self._metric = self.metric + if self.algorithm == "auto": - if self.metric == "precomputed": + if self._metric == "precomputed": alg_check = "brute" - elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + elif callable(self._metric) or self._metric in VALID_METRICS["ball_tree"]: alg_check = "ball_tree" else: alg_check = "brute" else: alg_check = self.algorithm - if alg_check != "brute" and self.metric in ( + if alg_check != "brute" and self._metric in ( "fast_sqeuclidean", "fast_euclidean", ): - alternative = self.metric.replace("fast_", "") + alternative = self._metric.replace("fast_", "") warnings.warn( - f"'{self.metric}' is only available for algorithm='brute' but" + f"'{self._metric}' is only available for algorithm='brute' but" f" algorithm='{self.algorithm}' is used. Falling back on" f" metric='{alternative}'.", UserWarning, stacklevel=3, ) - self.metric = alternative + self._metric = alternative - if callable(self.metric): + if callable(self._metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree raise ValueError( "kd_tree does not support callable metric '%s'" "Function call overhead will result" "in very poor performance." - % self.metric + % self._metric ) - elif self.metric not in VALID_METRICS[alg_check]: + elif self._metric not in VALID_METRICS[alg_check]: raise ValueError( "Metric '%s' not valid. Use " "sorted(sklearn.neighbors.VALID_METRICS['%s']) " "to get valid options. " - "Metric can also be a callable function."
% (self.metric, alg_check) + "Metric can also be a callable function." % (self._metric, alg_check) ) if self.metric_params is not None and "p" in self.metric_params: @@ -411,7 +413,7 @@ def _check_algorithm_metric(self): else: effective_p = self.p - if self.metric in ["wminkowski", "minkowski"] and effective_p < 1: + if self._metric in ["wminkowski", "minkowski"] and effective_p < 1: raise ValueError("p must be greater or equal to one for minkowski metric") def _fit(self, X, y=None): @@ -463,12 +465,12 @@ def _fit(self, X, y=None): self.effective_metric_params_ = self.metric_params.copy() effective_p = self.effective_metric_params_.get("p", self.p) - if self.metric in ["wminkowski", "minkowski"]: + if self._metric in ["wminkowski", "minkowski"]: self.effective_metric_params_["p"] = effective_p - self.effective_metric_ = self.metric + self.effective_metric_ = self._metric # For minkowski distance, use more efficient methods where available - if self.metric == "minkowski": + if self._metric == "minkowski": p = self.effective_metric_params_.pop("p", 2) if p < 1: raise ValueError( @@ -504,7 +506,7 @@ def _fit(self, X, y=None): self.n_samples_fit_ = X.data.shape[0] return self - if self.metric == "precomputed": + if self._metric == "precomputed": X = _check_precomputed(X) # Precomputed matrix X must be squared if X.shape[0] != X.shape[1]: @@ -522,7 +524,7 @@ def _fit(self, X, y=None): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") - if self.metric in ("fast_sqeuclidean", "fast_euclidean"): + if self._metric in ("fast_sqeuclidean", "fast_euclidean"): # The fast alternatives are only available for dense datasets. self.effective_metric_ = self.effective_metric_.replace("fast_", "") @@ -550,7 +552,7 @@ def _fit(self, X, y=None): # A tree approach is better for small number of neighbors or small # number of features, with KDTree generally faster when available if ( - self.metric == "precomputed" + self._metric == "precomputed" or self._fit_X.shape[1] > 15 or ( self.n_neighbors is not None @@ -588,7 +590,7 @@ def _fit(self, X, y=None): elif self._fit_method == "brute": if ( self.effective_metric_ in specialised_metrics - and self.metric not in specialised_metrics + and self._metric not in specialised_metrics ): # In that case, the standard stabler metric has not been explicitly # specified by the user, so we prefer its fast alternative. 
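The `_metric` refactor in the hunks above means a fallback no longer mutates the user-facing `metric` parameter; it is recorded on the private `_metric` and the fitted `effective_metric_` instead. A sketch of the resulting behaviour on this branch (illustrative data; `fast_euclidean` is the metric string introduced earlier in this series and does not exist in released scikit-learn):

import numpy as np

from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(20, 3)

# 'fast_euclidean' is only available for algorithm='brute'; with a tree
# algorithm the estimator warns at fit time and falls back on 'euclidean'.
est = NearestNeighbors(algorithm="kd_tree", metric="fast_euclidean").fit(X)

assert est.metric == "fast_euclidean"        # public parameter preserved
assert est._metric == "euclidean"            # private fallback
assert est.effective_metric_ == "euclidean"  # fitted attribute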
@@ -748,7 +750,7 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - if self.metric == "precomputed": + if self._metric == "precomputed": X = _check_precomputed(X) elif use_pairwise_distances_reductions: # We force the C-contiguity even if it creates a copy for F-ordered @@ -786,7 +788,9 @@ class from an array representing our data set and ask who's ) elif ( - self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + self._fit_method == "brute" + and self._metric == "precomputed" + and issparse(X) ): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance @@ -1096,7 +1100,7 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - if self.metric == "precomputed": + if self._metric == "precomputed": X = _check_precomputed(X) elif use_pairwise_distances_reductions: # We force the C-contiguity even if it creates a copy for F-ordered @@ -1126,7 +1130,9 @@ class from an array representing our data set and ask who's ) elif ( - self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + self._fit_method == "brute" + and self._metric == "precomputed" + and issparse(X) ): results = _radius_neighbors_from_graph( X, radius=radius, return_distance=return_distance diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 41ab5cc461c7b..c3690b721db9f 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -21,6 +21,9 @@ from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances +from sklearn.metrics.tests.test_pairwise_distances_reduction import ( + _get_dummy_metric_kwargs, +) from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors import ( @@ -36,7 +39,6 @@ assert_allclose, assert_array_almost_equal, assert_array_equal, - get_dummy_metric_kwargs, ) from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state @@ -299,7 +301,8 @@ def test_knn_prediction_fast_alternatives_fall_back_on_tree( ): est.fit(X, y) - assert est.metric == fall_back_metric + assert est.metric == specified_metric + assert est._metric == fall_back_metric assert est.effective_metric_ == fall_back_metric @@ -1522,7 +1525,7 @@ def test_neighbors_metrics( test = rng.rand(n_query_pts, n_features) algorithms = ["brute", "ball_tree", "kd_tree"] - metric_params = get_dummy_metric_kwargs(metric, n_features) + metric_params = _get_dummy_metric_kwargs(metric, n_features) # Haversine distance only accepts 2D data if metric == "haversine": @@ -1581,7 +1584,7 @@ def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features= X = rng.rand(n_samples, n_features) Xcsr = csr_matrix(X) - metric_params = get_dummy_metric_kwargs(metric, n_features) + metric_params = _get_dummy_metric_kwargs(metric, n_features) if metric == "precomputed": X_precomputed = rng.random_sample((10, 4)) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 18f45d2680b13..644cea2f2be25 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -1050,24 +1050,3 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) - - -def get_dummy_metric_kwargs(metric: str, n_features: int): - """Return dummy 
DistanceMetric kwargs for tests.""" - rng = np.random.RandomState(1) - weights = rng.random_sample(n_features) - weights /= weights.sum() - - V = rng.random_sample((n_features, n_features)) - - # VI is positive-semidefinite, preferred for precision matrix - VI = np.dot(V, V.T) + 3 * np.eye(n_features) - - kwargs = { - "minkowski": dict(p=1.5), - "seuclidean": dict(V=weights), - "wminkowski": dict(p=1.5, w=weights), - "mahalanobis": dict(VI=VI), - } - - return kwargs.get(metric, {}) From 8f63e013248203e16943716a1ab05abd81887f0b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 20 Oct 2021 17:08:59 +0200 Subject: [PATCH 238/290] Fix config for 'pairwise_dist_chunk_size' --- sklearn/_config.py | 2 +- sklearn/tests/test_config.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index 5d8bc566c9e9a..c412f75a236e2 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -109,7 +109,7 @@ def set_config( if display is not None: local_config["display"] = display if pairwise_dist_chunk_size is not None: - local_config["display"] = pairwise_dist_chunk_size + local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size @contextmanager diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index f78a9ff30b10a..e99eb5fc9db82 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -16,6 +16,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } # Not using as a context manager affects nothing @@ -28,6 +29,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } assert get_config()["assume_finite"] is False @@ -57,6 +59,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } # No positional arguments From 2ad33ece7ee87f813bbaaf590350897bf6513a15 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 20 Oct 2021 17:10:03 +0200 Subject: [PATCH 239/290] Delay and better scope arrays C ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/neighbors/_base.py | 59 ++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3aae7a11b7871..044de6bbdb52d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -419,9 +419,7 @@ def _check_algorithm_metric(self): def _fit(self, X, y=None): if self._get_tags()["requires_y"]: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X, y = self._validate_data( - X, y, accept_sparse="csr", multi_output=True, order="C" - ) + X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True) if is_classifier(self): # Classification targets require a specific format @@ -456,7 +454,7 @@ def _fit(self, X, y=None): else: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X = self._validate_data(X, accept_sparse="csr", order="C") + X = self._validate_data(X, accept_sparse="csr") self._check_algorithm_metric() if self.metric_params is None: @@ -748,22 +746,30 @@ class from an array representing our data set and ask who's ) ) - if X is not None: - query_is_train = False - if self._metric == "precomputed": - X = _check_precomputed(X) - elif use_pairwise_distances_reductions: 
+ query_is_train = X is None + if query_is_train: + if use_pairwise_distances_reductions: # We force the C-contiguity even if it creates a copy for F-ordered - # arrays because this implementation is more efficient. - X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") - else: - X = self._validate_data(X, accept_sparse="csr", reset=False) - else: - query_is_train = True + # arrays because PairwiseDistancesArgKmin is more efficient. + self._fit_X = self._validate_data( + self._fit_X, accept_sparse="csr", reset=False, order="C" + ) X = self._fit_X # Include an extra neighbor to account for the sample itself being # returned, which is removed later n_neighbors += 1 + else: + if use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because PairwiseDistancesArgKmin is more efficient. + X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") + self._fit_X = self._validate_data( + self._fit_X, accept_sparse="csr", reset=False, order="C" + ) + elif self._metric == "precomputed": + X = _check_precomputed(X) + else: + X = self._validate_data(X, accept_sparse="csr", reset=False) n_samples_fit = self.n_samples_fit_ if n_neighbors > n_samples_fit: @@ -1098,19 +1104,24 @@ class from an array representing our data set and ask who's ) ) - if X is not None: - query_is_train = False - if self._metric == "precomputed": - X = _check_precomputed(X) - elif use_pairwise_distances_reductions: + query_is_train = X is None + if query_is_train: + if use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because PairwiseDistancesRadiusNeighborhood is more efficient. + self._fit_X = self._validate_data( + self._fit_X, accept_sparse="csr", reset=False, order="C" + ) + X = self._fit_X + else: + if use_pairwise_distances_reductions: # We force the C-contiguity even if it creates a copy for F-ordered - # arrays because this implementation is more efficient. + # arrays because PairwiseDistancesRadiusNeighborhood is more efficient. X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") + elif self._metric == "precomputed": + X = _check_precomputed(X) else: X = self._validate_data(X, accept_sparse="csr", reset=False) - else: - query_is_train = True - X = self._fit_X if radius is None: radius = self.radius From eba6f031302e5d7885d518f9015e5c505f48a97e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sat, 23 Oct 2021 11:35:41 +0200 Subject: [PATCH 240/290] Simplify counting for remainder chunks Also reorder instructions to have X's before Y's. Co-authored-by: Thomas J. 
Fan --- .../metrics/_pairwise_distances_reduction.pyx | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index f7748d52afd4b..8949404cd6bdd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -294,24 +294,19 @@ cdef class PairwiseDistancesReduction: self.datasets_pair = datasets_pair - self.n_samples_Y = datasets_pair.n_samples_Y() - self.Y_n_samples_chunk = min(self.n_samples_Y, n_samples_chunk) - Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk - self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk - self.n_samples_X = datasets_pair.n_samples_X() self.X_n_samples_chunk = min(self.n_samples_X, n_samples_chunk) X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk - # Counting remainder chunk in total number of chunks - self.Y_n_chunks = Y_n_full_chunks + ( - self.n_samples_Y != (Y_n_full_chunks * self.Y_n_samples_chunk) - ) + self.n_samples_Y = datasets_pair.n_samples_Y() + self.Y_n_samples_chunk = min(self.n_samples_Y, n_samples_chunk) + Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk + self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk - self.X_n_chunks = X_n_full_chunks + ( - self.n_samples_X != (X_n_full_chunks * self.X_n_samples_chunk) - ) + # Counting remainder chunk in total number of chunks + self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) + self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) def compute( self, From 34468ad098b49c218306e06ecb1ecedf40b6a570 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 25 Oct 2021 14:47:38 +0200 Subject: [PATCH 241/290] Better motivate heaps' parallel allocation Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 8949404cd6bdd..2c764bd847be1 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -795,6 +795,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t heaps_size = self.X_n_samples_chunk * self.k ITYPE_t thread_num + # The allocation is done in parallel for data locality purposes: this way + # the heaps used in each thread are allocated in pages which are closer + # to the processor core used by the thread. for thread_num in prange(num_threads, schedule='static', nogil=True, num_threads=num_threads): # As chunks of X are shared across threads, so must their From fa424a48eb39413e703de9d56ed17e7c5c431736 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 25 Oct 2021 14:23:36 +0200 Subject: [PATCH 242/290] Remove PairwiseDistancesRadiusNeighborhood To make #20254 smaller. The removed hunks will be re-introduced in a subsequent PR.
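Since the whole radius-neighbourhood engine is removed below, a NumPy reference sketch of the reduction being taken out may help when reading the deleted hunks. It mirrors the semantics only (for each X[i], the indices j with dist(X[i], Y[j]) <= radius, optionally sorted), not the chunked, multithreaded implementation; all names are illustrative:

import numpy as np
from scipy.spatial.distance import cdist


def radius_neighborhood_reference(X, Y, radius, sort_results=False):
    # Dense (n_samples_X, n_samples_Y) distance matrix; the removed Cython
    # class never materialises it, working chunk by chunk instead.
    distances = cdist(X, Y)
    neigh_dist, neigh_ind = [], []
    for row in distances:
        ind = np.flatnonzero(row <= radius)
        dist = row[ind]
        if sort_results:
            order = np.argsort(dist, kind="stable")
            ind, dist = ind[order], dist[order]
        neigh_dist.append(dist)
        neigh_ind.append(ind)
    return neigh_dist, neigh_ind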
--- .../metrics/_pairwise_distances_reduction.pyx | 648 ------------------ .../test_pairwise_distances_reduction.py | 68 +- sklearn/neighbors/_base.py | 73 +- sklearn/neighbors/tests/test_neighbors.py | 6 - 4 files changed, 30 insertions(+), 765 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 2c764bd847be1..924fd88a4f216 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -21,11 +21,8 @@ import scipy.sparse from .. import get_config from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX -from libcpp.vector cimport vector from cython cimport final -from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from cpython.ref cimport Py_INCREF from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair from ..utils._cython_blas cimport ( @@ -41,7 +38,6 @@ from ..utils._cython_blas cimport ( from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t -from ..utils._typedefs cimport ITYPECODE, DTYPECODE from numbers import Integral, Real from typing import List @@ -54,60 +50,6 @@ from ..utils._typedefs import ITYPE, DTYPE np.import_array() -# TODO: change for `libcpp.algorithm.move` once Cython 3 is used -# Introduction in Cython: -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa -cdef extern from "" namespace "std" nogil: - OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa - -###################### -## std::vector to np.ndarray coercion -# As type covariance is not supported for C++ containers via Cython, -# we need to redefine fused types. -ctypedef fused vector_DITYPE_t: - vector[ITYPE_t] - vector[DTYPE_t] - - -ctypedef fused vector_vector_DITYPE_t: - vector[vector[ITYPE_t]] - vector[vector[DTYPE_t]] - - -cdef class StdVectorSentinel: - """Wraps a reference to a vector which will be deallocated with this object. - - When created, the StdVectorSentinel swaps the reference of its internal - vectors with the provided one (vec_ptr), thus making the StdVectorSentinel - manage the provided one's lifetime. - """ - pass - - -# We necessarily need to define two extension types extending StdVectorSentinel -# because we need to provide the dtype of the vector but can't use numeric fused types. 
-cdef class StdVectorSentinelDTYPE(StdVectorSentinel): - cdef vector[DTYPE_t] vec - - @staticmethod - cdef StdVectorSentinel create_for(vector[DTYPE_t] * vec_ptr): - # This initializes the object directly without calling __init__ - cdef StdVectorSentinelDTYPE sentinel = StdVectorSentinelDTYPE.__new__(StdVectorSentinelDTYPE) - sentinel.vec.swap(deref(vec_ptr)) - return sentinel - - -cdef class StdVectorSentinelITYPE(StdVectorSentinel): - cdef vector[ITYPE_t] vec - - @staticmethod - cdef StdVectorSentinel create_for(vector[ITYPE_t] * vec_ptr): - # This initializes the object directly without calling __init__ - cdef StdVectorSentinelITYPE sentinel = StdVectorSentinelITYPE.__new__(StdVectorSentinelITYPE) - sentinel.vec.swap(deref(vec_ptr)) - return sentinel - - cpdef DTYPE_t[::1] _sqeuclidean_row_norms( const DTYPE_t[:, ::1] X, ITYPE_t num_threads, @@ -131,50 +73,6 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms( return row_norms -cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): - """Create a numpy ndarray given a C++ vector. - - The numpy array buffer is the one of the C++ vector. - A StdVectorSentinel is registered as the base object for the numpy array, - freeing the C++ vector it encapsulates when the numpy array is freed. - """ - typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE - cdef: - np.npy_intp size = deref(vect_ptr).size() - np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, - deref(vect_ptr).data()) - StdVectorSentinel sentinel - - if vector_DITYPE_t is vector[DTYPE_t]: - sentinel = StdVectorSentinelDTYPE.create_for(vect_ptr) - else: - sentinel = StdVectorSentinelITYPE.create_for(vect_ptr) - - # Makes the numpy array responsible of the life-cycle of its buffer. - # A reference to the StdVectorSentinel will be stolen by the call bellow, - # so we increase its reference counter. - # See: https://docs.python.org/3/c-api/intro.html#reference-count-details - Py_INCREF(sentinel) - np.PyArray_SetBaseObject(arr, sentinel) - return arr - - -cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( - vector_vector_DITYPE_t* vecs -): - """Coerce a std::vector of std::vector to a ndarray of ndarray.""" - cdef: - ITYPE_t n = deref(vecs).size() - np.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, - dtype=np.ndarray) - - for i in range(n): - nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) - - return nd_arrays_of_nd_arrays - -##################### - cdef class PairwiseDistancesReduction: """Abstract base class for pairwise distance computation & reduction @@ -1078,549 +976,3 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ), j + Y_start, ) - - -cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): - """Compute radius-based neighbors for two sets of vectors. - - For each row-vector X[i] of the queries X, find all the indices j of - row-vectors in Y such that: - - dist(X[i], Y[j]) < radius - - The distance function `dist` depends on the values of the `metric` - and `metric_kwargs` parameters. - - When this reduction is used within scikit-learn estimators - (X, Y) would generally be (X_test, X_train). - - Parameters - ---------- - datasets_pair: DatasetsPair - The dataset pairs (X, Y) for the reduction. - - radius: float - The radius defining the neighborhood. - - chunk_size: int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. 
- - n_threads: int, default=None - The number of OpenMP threads to use for the reduction. - Parallelism is done on chunks and the sharding of chunks - depends on the `strategy` set on - :method:`~PairwiseDistancesRadiusNeighborhood.compute`. - - None and -1 means using all processors. - """ - - cdef: - DTYPE_t radius - - # DistanceMetric compute rank-preserving surrogate distance via rdist - # which are proxies necessitating less computations. - # We get the equivalent for the radius to be able to compare it against - # vectors' rank-preserving surrogate distances. - DTYPE_t r_radius - - # Neighbors indices and distances are returned as np.ndarray of np.ndarray. - # - # We want resizable buffers which we will wrap into numpy - # arrays at the end. std::vector comes as a handy interface for - # interacting efficiently with resizable buffers. - # - # Though it is possible to access their buffer address with - # std::vector::data, they can't be stolen: buffers lifetime - # is tight to their std::vector and are deallocated when - # std::vectors are. - # - # To solve this, we dynamically allocate std::vectors and then - # encapsulate them in a StdVectorSentinel responsible for - # freeing them when the associated np.ndarray is freed. - vector[vector[ITYPE_t]] * neigh_indices - vector[vector[DTYPE_t]] * neigh_distances - - # Used as array of pointers to private datastructures used in threads. - vector[vector[ITYPE_t]] ** neigh_indices_chunks - vector[vector[DTYPE_t]] ** neigh_distances_chunks - - bint sort_results - - @classmethod - def get_for( - cls, - X, - Y, - DTYPE_t radius, - str metric="fast_euclidean", - chunk_size=None, - dict metric_kwargs=None, - n_threads=None, - bint sort_results=False, - ) -> PairwiseDistancesRadiusNeighborhood: - """Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. - - Parameters - ---------- - X : array-like of shape (n_samples_X, n_features) - Input data. - - Y : array-like of shape (n_samples_Y, n_features) - Input data. - - radius : float - The radius defining the neighborhood. - - metric : str, default='fast_euclidean' - The distance metric to use for argkmin. The default metric is - a fast implementation of the standard Euclidean metric. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. - - chunk_size : int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - metric_kwargs : dict, default=None - Keyword arguments to pass to specified metric function. - - n_threads: int, default=None - The number of OpenMP threads to use for the reduction. - Parallelism is done on chunks and the sharding of chunks - depends on the `strategy` set on - :method:`~PairwiseDistancesRadiusNeighborhood.compute`. - - None and -1 means using all processors. - - sort_results : boolean, default=False - Sort results with respect to distances between each X vector and its - neighbors if set to True. - - Returns - ------- - radius_neighborhood: PairwiseDistancesRadiusNeighborhood - The suited PairwiseDistancesRadiusNeighborhood implementation. - """ - # This factory comes to handle specialisations. 
- if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): - use_squared_distances = metric == "fast_sqeuclidean" - return FastEuclideanPairwiseDistancesRadiusNeighborhood( - X=X, Y=Y, radius=radius, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - sort_results=sort_results, - metric_kwargs=metric_kwargs, - ) - - return PairwiseDistancesRadiusNeighborhood( - datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), - radius=radius, - chunk_size=chunk_size, - sort_results=sort_results, - ) - - def __init__( - self, - DatasetsPair datasets_pair, - DTYPE_t radius, - chunk_size=None, - n_threads=None, - sort_results=False - ): - super().__init__(datasets_pair, chunk_size, n_threads) - - self.radius = check_scalar(radius, "radius", Real, min_val=0) - self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) - self.sort_results = sort_results - - # Allocating pointers to datastructures but not the datastructures themselves. - # There are as many pointers as available threads. - # When reducing on small datasets, there can be more pointers than actual - # threads used for the reduction but there won't be allocated but unused - # datastructures. - self.neigh_distances_chunks = malloc( - sizeof(self.neigh_distances) * self.effective_omp_n_thread - ) - self.neigh_indices_chunks = malloc( - sizeof(self.neigh_indices) * self.effective_omp_n_thread - ) - - # Temporary datastructures which will be coerced to numpy arrays on before - # PairwiseDistancesRadiusNeighborhood.compute "return" and will be then freed. - self.neigh_indices = new vector[vector[ITYPE_t]](self.n_samples_X) - self.neigh_distances = new vector[vector[DTYPE_t]](self.n_samples_X) - - def __dealloc__(self): - if self.neigh_distances_chunks is not NULL: - free(self.neigh_distances_chunks) - - if self.neigh_indices_chunks is not NULL: - free(self.neigh_indices_chunks) - - if self.neigh_indices is not NULL: - del self.neigh_indices - - if self.neigh_distances is not NULL: - del self.neigh_distances - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t r_dist_i_j - - for i in range(X_start, X_end): - for j in range(Y_start, Y_end): - r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) - if r_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) - - def _finalize_results(self, bint return_distance=False): - if return_distance: - self.compute_exact_distances() - return ( - coerce_vectors_to_nd_arrays(self.neigh_distances), - coerce_vectors_to_nd_arrays(self.neigh_indices), - ) - - return coerce_vectors_to_nd_arrays(self.neigh_indices) - - @final - cdef void _parallel_on_X_threadwise_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ) nogil: - - # As this strategy is embarrassingly parallel, we can set the - # thread vectors' pointers to the main vectors'. 
- self.neigh_distances_chunks[thread_num] = self.neigh_distances - self.neigh_indices_chunks[thread_num] = self.neigh_indices - - @final - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, jdx - - # Sorting neighbors for each query vector of X - if self.sort_results: - for idx in range(X_start, X_end): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - cdef void _parallel_on_Y_init( - self, - ITYPE_t num_threads, - ) nogil: - cdef: - ITYPE_t thread_num - # As chunks of X are shared across threads, so must datastructures - # to avoid race conditions. - # Each thread has its own vectors of n_samples_X vectors which are then merged - # back in the main n_samples_X vectors. - for thread_num in range(num_threads): - self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_samples_X) - self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_samples_X) - - @final - cdef void _merge_vectors( - self, - ITYPE_t idx, - ITYPE_t num_threads, - ) nogil: - cdef: - ITYPE_t thread_num - ITYPE_t idx_n_elements = 0 - ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() - - # Resizing buffers only once for the given - for thread_num in range(num_threads): - idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) - deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) - - # Moving the elements by range using the range first element - # as the reference for the insertion - for thread_num in range(num_threads): - move( - deref(self.neigh_distances_chunks[thread_num])[idx].begin(), - deref(self.neigh_distances_chunks[thread_num])[idx].end(), - deref(self.neigh_distances)[idx].begin() + last_element_idx - ) - move( - deref(self.neigh_indices_chunks[thread_num])[idx].begin(), - deref(self.neigh_indices_chunks[thread_num])[idx].end(), - deref(self.neigh_indices)[idx].begin() + last_element_idx - ) - last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - - cdef void _parallel_on_Y_finalize( - self, - ITYPE_t num_threads, - ) nogil: - cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current - - with nogil, parallel(num_threads=self.effective_omp_n_thread): - # Merge vectors used in threads into the main ones. - # This is done in parallel sample-wise (no need for locks) - # using dynamic scheduling because we generally do not have - # the same number of neighbors for each query vectors. - # TODO: compare 'dynamic' vs 'static' vs 'guided' - for idx in prange(self.n_samples_X, schedule='dynamic'): - self._merge_vectors(idx, num_threads) - - # The content of the vector have been std::moved, - # Hence they can't be used anymore and can only be deleted. 
- for thread_num in prange(num_threads, schedule='static'): - del self.neigh_distances_chunks[thread_num] - del self.neigh_indices_chunks[thread_num] - - # Sort in parallel in ascending order w.r.t the distances if needed - if self.sort_results: - for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - return - - cdef void compute_exact_distances(self) nogil: - """Convert rank-preserving distances to pairwise distances in parallel.""" - cdef: - ITYPE_t i, j - - for i in prange(self.n_samples_X, nogil=True, schedule='static', - num_threads=self.effective_omp_n_thread): - for j in range(deref(self.neigh_indices)[i].size()): - deref(self.neigh_distances)[i][j] = ( - self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. - max(deref(self.neigh_distances)[i][j], 0.) - ) - ) - - @final - def compute( - self, - str strategy=None, - bint return_distance=False, - ): - if self.sort_results and not return_distance: - raise ValueError("return_distance must be True if sort_results is True.") - - return super().compute(strategy=strategy, return_distance=return_distance) - - -cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): - """Fast specialized alternative for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. - - The full pairwise squared distances matrix is computed as follows: - - ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||² - - The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. - - Notes - ----- - This implementation has a superior arithmetic intensity and hence - better running time when the alternative is IO bound, but it can suffer - from numerical instability caused by catastrophic cancellation potentially - introduced by the subtraction in the arithmetic expression above. - - PairwiseDistancesRadiusNeighborhood with EuclideanDistance must be used when higher - numerical precision is needed. - """ - - cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared - - # Buffers for GEMM - DTYPE_t ** dist_middle_terms_chunks - bint use_squared_distances - - @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) - and not _in_unstable_openblas_configuration()) - - def __init__( - self, - X, - Y, - DTYPE_t radius, - bint use_squared_distances=False, - chunk_size=None, - sort_results=False, - metric_kwargs=None, - ): - super().__init__( - # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), - radius=radius, - chunk_size=chunk_size, - sort_results=sort_results, - ) - # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair - cdef: - DenseDenseDatasetsPair datasets_pair = self.datasets_pair - self.X, self.Y = datasets_pair.X, datasets_pair.Y - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None) - else: - self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) - - # Do not recompute norms if datasets are identical. 
- self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) - ) - self.use_squared_distances = use_squared_distances - - if use_squared_distances: - # In this specialisation and this setup, the value passed to the radius is - # already considered to be the adapted radius, so we overwrite it. - self.r_radius = radius - - # Temporary datastructures used in threads - self.dist_middle_terms_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_omp_n_thread - ) - - def __dealloc__(self): - if self.dist_middle_terms_chunks is not NULL: - free(self.dist_middle_terms_chunks) - - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) - - @final - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num) - - # Temporary buffer for the `-2 * X_c @ Y_c.T` term - self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) - ) - - @final - cdef void _parallel_on_X_threadwise_finalize( - self, - ITYPE_t thread_num - ) nogil: - PairwiseDistancesRadiusNeighborhood._parallel_on_X_threadwise_finalize(self, thread_num) - free(self.dist_middle_terms_chunks[thread_num]) - - @final - cdef void _parallel_on_Y_init( - self, - ITYPE_t num_threads, - ) nogil: - cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self, num_threads) - - for thread_num in range(num_threads): - # Temporary buffer for the `-2 * X_c @ Y_c.T` term - self.dist_middle_terms_chunks[thread_num] = malloc( - self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) - ) - - @final - cdef void _parallel_on_Y_finalize( - self, - ITYPE_t num_threads, - ) nogil: - cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood._parallel_on_Y_finalize(self, num_threads) - - for thread_num in range(num_threads): - free(self.dist_middle_terms_chunks[thread_num]) - - @final - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - - const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] - const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] - - # Careful: LDA, LDB and LDC are given for F-ordered arrays - # in BLAS documentations, for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa - # - # Here, we use their counterpart values to work with C-ordered arrays. - BLAS_Order order = RowMajor - BLAS_Trans ta = NoTrans - BLAS_Trans tb = Trans - ITYPE_t m = X_c.shape[0] - ITYPE_t n = Y_c.shape[0] - ITYPE_t K = X_c.shape[1] - DTYPE_t alpha = - 2. - # Casting for A and B to remove the const is needed because APIs exposed via - # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. - DTYPE_t * A = & X_c[0, 0] - ITYPE_t lda = X_c.shape[1] - DTYPE_t * B = & Y_c[0, 0] - ITYPE_t ldb = X_c.shape[1] - DTYPE_t beta = 0. - DTYPE_t * C = dist_middle_terms - ITYPE_t ldc = Y_c.shape[0] - - # dist_middle_terms = `-2 * X_c @ Y_c.T` - _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) - - # Pushing the distance and their associated indices in vectors. 
- for i in range(X_c.shape[0]): - for j in range(Y_c.shape[0]): - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # - squared_dist_i_j = ( - self.X_norm_squared[i + X_start] - + dist_middle_terms[i * Y_c.shape[0] + j] - + self.Y_norm_squared[j + Y_start] - ) - if squared_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 02b485a5694a6..7d8312cdd0a74 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -13,9 +13,7 @@ from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, PairwiseDistancesArgKmin, - PairwiseDistancesRadiusNeighborhood, FastEuclideanPairwiseDistancesArgKmin, - FastEuclideanPairwiseDistancesRadiusNeighborhood, _sqeuclidean_row_norms, ) @@ -77,7 +75,6 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): ASSERT_RESULT = { PairwiseDistancesArgKmin: assert_argkmin_results_equality, - PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality, } @@ -145,58 +142,12 @@ def test_argkmin_factory_method_wrong_usages(): ) -def test_radius_neighborhood_factory_method_wrong_usages(): - rng = np.random.RandomState(1) - X = rng.rand(100, 10) - Y = rng.rand(100, 10) - radius = 5 - metric = "euclidean" - - with pytest.raises( - ValueError, match="Only 64bit float datasets are supported for X and Y." - ): - PairwiseDistancesRadiusNeighborhood.get_for( - X=X.astype(np.float32), Y=Y, radius=radius, metric=metric - ) - - with pytest.raises( - ValueError, match="Only 64bit float datasets are supported for X and Y." 
- ): - PairwiseDistancesRadiusNeighborhood.get_for( - X=X, Y=Y.astype(np.int32), radius=radius, metric=metric - ) - - with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): - PairwiseDistancesRadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) - - with pytest.raises(ValueError, match="Unrecognized metric"): - PairwiseDistancesRadiusNeighborhood.get_for( - X=X, Y=Y, radius=radius, metric="wrong metric" - ) - - with pytest.raises( - ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" - ): - PairwiseDistancesRadiusNeighborhood.get_for( - X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric - ) - - with pytest.raises(ValueError, match="ndarray is not C-contiguous"): - PairwiseDistancesRadiusNeighborhood.get_for( - X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric - ) - - @fails_if_unstable_openblas @pytest.mark.filterwarnings("ignore:Constructing a DIA matrix") @pytest.mark.parametrize( "PairwiseDistancesReduction, FastPairwiseDistancesReduction", [ (PairwiseDistancesArgKmin, FastEuclideanPairwiseDistancesArgKmin), - ( - PairwiseDistancesRadiusNeighborhood, - FastEuclideanPairwiseDistancesRadiusNeighborhood, - ), ], ) def test_pairwise_distances_reduction_factory_method( @@ -243,7 +194,7 @@ def test_pairwise_distances_reduction_factory_method( @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], + [PairwiseDistancesArgKmin], ) def test_chunk_size_agnosticism( PairwiseDistancesReduction, @@ -284,7 +235,7 @@ def test_chunk_size_agnosticism( @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], + [PairwiseDistancesArgKmin], ) def test_n_threads_agnosticism( PairwiseDistancesReduction, @@ -323,7 +274,7 @@ def test_n_threads_agnosticism( @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], + [PairwiseDistancesArgKmin], ) def test_strategies_consistency( PairwiseDistancesReduction, @@ -417,17 +368,6 @@ def test_fast_sqeuclidean_correctness( assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) - eucl_dist, eucl_indices = PairwiseDistancesRadiusNeighborhood.get_for( - X, Y, radius, metric="euclidean" - ).compute(return_distance=True) - fse_dist, fse_indices = PairwiseDistancesRadiusNeighborhood.get_for( - X, Y, radius, metric="fast_euclidean" - ).compute(return_distance=True) - - assert_radius_neighborhood_results_equality( - eucl_dist, fse_dist, eucl_indices, fse_indices - ) - @fails_if_unstable_openblas @pytest.mark.parametrize("n_features", [50, 500]) @@ -435,7 +375,7 @@ def test_fast_sqeuclidean_correctness( @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], + [PairwiseDistancesArgKmin], ) def test_fast_sqeuclidean_translation_invariance( n_features, diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 044de6bbdb52d..69a6caa2344d8 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -25,7 +25,6 @@ from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..metrics._pairwise_distances_reduction import ( 
 PairwiseDistancesArgKmin,
-    PairwiseDistancesRadiusNeighborhood,
 )
 from ..utils import (
     check_array,
@@ -589,6 +588,9 @@ def _fit(self, X, y=None):
         if (
             self.effective_metric_ in specialised_metrics
             and self._metric not in specialised_metrics
+            # TODO: remove this condition once PairwiseDistancesRadiusNeighborhood
+            # has been introduced.
+            and isinstance(self, KNeighborsMixin)
         ):
             # In that case, the standard stabler metric has not been explicitly
             # specified by the user, so we prefer its fast alternative.
@@ -804,7 +806,7 @@ class from an array representing our data set and ask who's

         elif self._fit_method == "brute":
             # TODO: support sparse matrices
-            # When ArgKmin is not supported and when the user ask for a
+            # When PairwiseDistancesArgKmin is not supported and when the user asks for a
             # fast alternative, we need to revert to the standard.
             if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"):
                 # The fast alternatives are only available for dense datasets.
@@ -1013,7 +1015,10 @@ def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance):
         neigh_ind = [np.where(d <= radius)[0] for d in dist]

         if return_distance:
-            dist = [d[neigh_ind[i]] for i, d in enumerate(dist)]
+            if self.effective_metric_ == "euclidean":
+                dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)]
+            else:
+                dist = [d[neigh_ind[i]] for i, d in enumerate(dist)]
             results = dist, neigh_ind
         else:
             results = neigh_ind
@@ -1097,65 +1102,39 @@ class from an array representing our data set and ask who's
         """
         check_is_fitted(self)

-        use_pairwise_distances_reductions = (
-            self._fit_method == "brute"
-            and PairwiseDistancesRadiusNeighborhood.is_usable_for(
-                X if X is not None else self._fit_X, self._fit_X, self.effective_metric_
-            )
-        )
-
-        query_is_train = X is None
-        if query_is_train:
-            if use_pairwise_distances_reductions:
-                # We force the C-contiguity even if it creates a copy for F-ordered
-                # arrays because PairwiseDistancesRadiusNeighborhood is more efficient.
-                self._fit_X = self._validate_data(
-                    self._fit_X, accept_sparse="csr", reset=False, order="C"
-                )
-            X = self._fit_X
-        else:
-            if use_pairwise_distances_reductions:
-                # We force the C-contiguity even if it creates a copy for F-ordered
-                # arrays because PairwiseDistancesRadiusNeighborhood is more efficient.
-                X = self._validate_data(X, accept_sparse="csr", reset=False, order="C")
-            elif self._metric == "precomputed":
+        if X is not None:
+            query_is_train = False
+            if self.metric == "precomputed":
                 X = _check_precomputed(X)
             else:
                 X = self._validate_data(X, accept_sparse="csr", reset=False)
+        else:
+            query_is_train = True
+            X = self._fit_X

         if radius is None:
             radius = self.radius

-        if use_pairwise_distances_reductions:
-            results = PairwiseDistancesRadiusNeighborhood.get_for(
-                X=X,
-                Y=self._fit_X,
-                radius=radius,
-                metric=self.effective_metric_,
-                metric_kwargs=self.effective_metric_params_,
-                n_threads=self.n_jobs,
-                sort_results=sort_results,
-            ).compute(
-                strategy="auto",
-                return_distance=return_distance,
-            )
-
-        elif (
-            self._fit_method == "brute"
-            and self._metric == "precomputed"
-            and issparse(X)
-        ):
+        if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X):
             results = _radius_neighbors_from_graph(
                 X, radius=radius, return_distance=return_distance
             )

         elif self._fit_method == "brute":
-            # When RadiusNeighborhood is not supported and when the user ask for a
-            # fast alternative, we need to revert to the standard.
+            # TODO: support sparse matrices
+            # When PairwiseDistancesRadiusNeighborhood is not supported and when
+            # the user asks for a fast alternative, we need to revert to the standard.
             if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"):
                 # The fast alternatives are only available for dense datasets.
                 self.effective_metric_ = self.effective_metric_.replace("fast_", "")

+            # for efficiency, use squared euclidean distances
+            if self.effective_metric_ == "euclidean":
+                radius *= radius
+                kwds = {"squared": True}
+            else:
+                kwds = self.effective_metric_params_
+
             reduce_func = partial(
                 self._radius_neighbors_reduce_func,
                 radius=radius,
@@ -1168,7 +1147,7 @@ class from an array representing our data set and ask who's
                 reduce_func=reduce_func,
                 metric=self.effective_metric_,
                 n_jobs=self.n_jobs,
-                **self.effective_metric_params_,
+                **kwds,
             )
             if return_distance:
                 neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results)
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index c3690b721db9f..7702fcb39287d 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -163,8 +163,6 @@ def test_unsupervised_kneighbors(
     [
         neighbors.KNeighborsClassifier,
         neighbors.KNeighborsRegressor,
-        neighbors.RadiusNeighborsClassifier,
-        neighbors.RadiusNeighborsRegressor,
     ],
 )
 def test_neigh_predictions_algorithm_agnosticity(
@@ -222,8 +220,6 @@ def test_neigh_predictions_algorithm_agnosticity(
     [
         neighbors.KNeighborsClassifier,
         neighbors.KNeighborsRegressor,
-        neighbors.RadiusNeighborsClassifier,
-        neighbors.RadiusNeighborsRegressor,
     ],
 )
 def test_neighs_predictions_fast_euclidean_correctness(
@@ -455,8 +451,6 @@ def make_train_test(X_train, X_test):
     estimators = [
         neighbors.KNeighborsClassifier,
         neighbors.KNeighborsRegressor,
-        neighbors.RadiusNeighborsClassifier,
-        neighbors.RadiusNeighborsRegressor,
     ]
     check_precomputed(make_train_test, estimators)

From 567866653d2c925901ea9ce3c615388f65e262bc Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Mon, 25 Oct 2021 14:23:36 +0200
Subject: [PATCH 243/290] Remove DatasetsPair used for sparse datasets

So as to make #20254 smaller. The removed hunks will be re-introduced in
a subsequent PR.
---
 sklearn/metrics/_dist_metrics.pyx              | 199 +-----------------
 .../test_pairwise_distances_reduction.py       |  29 ++-
 2 files changed, 18 insertions(+), 210 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index b9bc877f681a5..985f86a1e696b 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -1275,13 +1275,10 @@ cdef class DatasetsPair:
         distance_metric._validate_data(X)
         distance_metric._validate_data(Y)

-        if not issparse(X) and not issparse(Y):
-            return DenseDenseDatasetsPair(X, Y, distance_metric)
-        if issparse(X) and not issparse(Y):
-            return SparseDenseDatasetsPair(X, Y, distance_metric)
-        if not issparse(X) and issparse(Y):
-            return DenseSparseDatasetsPair(X, Y, distance_metric)
-        return SparseSparseDatasetsPair(X, Y, distance_metric)
+        if issparse(X) or issparse(Y):
+            raise ValueError("Only dense datasets are supported for X and Y.")
+
+        return DenseDenseDatasetsPair(X, Y, distance_metric)

     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
@@ -1351,191 +1348,3 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):

         return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.d)

-
-@final
-cdef class SparseSparseDatasetsPair(DatasetsPair):
-    """Compute distances between vectors of two CSR matrices.
- - Parameters - ---------- - X: sparse matrix of shape (n_samples_X, n_features) - Rows represent vectors. Must be in CSR format. - - Y: sparse matrix of shape (n_samples_Y, n_features) - Rows represent vectors. Must be in CSR format. - - distance_metric: DistanceMetric - The distance metric responsible for computing distances - between two vectors of (X, Y). - """ - cdef: - const DTYPE_t[:] X_data - const ITYPE_t[:] X_indices, - const ITYPE_t[:] X_indptr, - - const DTYPE_t[:] Y_data - const ITYPE_t[:] Y_indices - const ITYPE_t[:] Y_indptr - - - def __init__(self, X, Y, DistanceMetric distance_metric): - DatasetsPair.__init__(self, distance_metric) - - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) - - @final - cdef ITYPE_t n_samples_X(self) nogil: - return self.X_indptr.shape[0] - 1 - - @final - cdef ITYPE_t n_samples_Y(self) nogil: - return self.Y_indptr.shape[0] -1 - - @final - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - ITYPE_t yj_start = self.Y_indptr[j] - ITYPE_t yj_end = self.Y_indptr[j + 1] - - return self.distance_metric.csr_rdist( - self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y_data[yj_start:yj_end], - self.Y_indices[yj_start:yj_end], - ) - - @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - ITYPE_t yj_start = self.Y_indptr[j] - ITYPE_t yj_end = self.Y_indptr[j + 1] - - return self.distance_metric.csr_dist( - self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y_data[yj_start:yj_end], - self.Y_indices[yj_start:yj_end] - ) - -@final -cdef class SparseDenseDatasetsPair(DatasetsPair): - """Compute distances between vectors of a CSR matrix and a dense array. - - Parameters - ---------- - X: sparse matrix of shape (n_samples_X, n_features) - Rows represent vectors. Must be in CSR format. - - Y: ndarray of shape (n_samples_Y, n_features) - Rows represent vectors. Must be C-contiguous. - - distance_metric: DistanceMetric - The distance metric responsible for computing distances - between two vectors of (X, Y). - """ - cdef: - const DTYPE_t[:] X_data - const ITYPE_t[:] X_indices, - const ITYPE_t[:] X_indptr, - - const DTYPE_t[:, ::1] Y - const ITYPE_t[:] Y_indices - - def __init__(self, X, Y, DistanceMetric distance_metric): - super().__init__(distance_metric) - - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - - # This array already has been checked here - self.Y = Y - self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) - - @final - cdef ITYPE_t n_samples_X(self) nogil: - return self.X_indptr.shape[0] - 1 - - @final - cdef ITYPE_t n_samples_Y(self) nogil: - return self.Y.shape[0] - - @final - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - - # TODO: the 2D to 1D memory-view conversion might make computation slower, see: - # https://github.com/scikit-learn/scikit-learn/issues/17299 - # Alternatively, we could pass pointers and indices and access elements - # then in distance_metric.dist. This works but would complexify this API. 
- return self.distance_metric.csr_rdist( - self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y[j, :], - self.Y_indices - ) - - @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - cdef: - ITYPE_t xi_start = self.X_indptr[i] - ITYPE_t xi_end = self.X_indptr[i + 1] - - # TODO: same as previous comment - return self.distance_metric.csr_dist( - self.X_data[xi_start:xi_end], - self.X_indices[xi_start:xi_end], - self.Y[j, :], - self.Y_indices - ) - -@final -cdef class DenseSparseDatasetsPair(DatasetsPair): - """Compute distances between vectors of a dense array and a CSR matrix. - - Parameters - ---------- - X: ndarray of shape (n_samples_X, n_features) - Rows represent vectors. Must be C-contiguous. - - Y: sparse matrix of shape (n_samples_Y, n_features) - Rows represent vectors. Must be in CSR format. - - distance_metric: DistanceMetric - The distance metric responsible for computing distances - between two vectors of (X, Y). - """ - cdef: - # As distance metrics are symmetric functions, we can - # simply rely on the SparseDenseDatasetsPair and swap arguments. - DatasetsPair datasets_pair - - def __init__(self, X, Y, DistanceMetric distance_metric): - super().__init__(distance_metric) - # Swapping arguments on the constructor - self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) - - @final - cdef ITYPE_t n_samples_X(self) nogil: - # Swapping interface - return self.datasets_pair.n_samples_Y() - - @final - cdef ITYPE_t n_samples_Y(self) nogil: - # Swapping interface - return self.datasets_pair.n_samples_X() - - @final - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - # Swapping arguments on the same interface - return self.datasets_pair.surrogate_dist(j, i) - - @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - # Swapping arguments on the same interface - return self.datasets_pair.dist(j, i) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 7d8312cdd0a74..263c85aac065d 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -5,9 +5,6 @@ from sklearn.metrics._dist_metrics import ( DenseDenseDatasetsPair, - DenseSparseDatasetsPair, - SparseDenseDatasetsPair, - SparseSparseDatasetsPair, ) from sklearn.metrics._pairwise_distances_reduction import ( @@ -165,20 +162,22 @@ def test_pairwise_distances_reduction_factory_method( dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric) assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair) - sparse_sparse_instance = PairwiseDistancesReduction.get_for( - csr_matrix(X), csr_matrix(Y), dummy_arg, metric - ) - assert isinstance(sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair) + with pytest.raises( + ValueError, match="Only dense datasets are supported for X and Y." + ): + PairwiseDistancesReduction.get_for( + csr_matrix(X), csr_matrix(Y), dummy_arg, metric + ) - dense_sparse_instance = PairwiseDistancesReduction.get_for( - X, csr_matrix(Y), dummy_arg, metric=metric - ) - assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair) + with pytest.raises( + ValueError, match="Only dense datasets are supported for X and Y." 
+ ): + PairwiseDistancesReduction.get_for(X, csr_matrix(Y), dummy_arg, metric=metric) - sparse_dense_instance = PairwiseDistancesReduction.get_for( - csr_matrix(X), Y, dummy_arg, metric=metric - ) - assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair) + with pytest.raises( + ValueError, match="Only dense datasets are supported for X and Y." + ): + PairwiseDistancesReduction.get_for(csr_matrix(X), Y, dummy_arg, metric=metric) # Test specialisations creation fast_euclidean_instance = PairwiseDistancesReduction.get_for( From 45c7f6ed2b5279ccc271fd6c973bf966441f17d2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 26 Oct 2021 09:45:03 +0200 Subject: [PATCH 244/290] Add some general notes about the implementations Taken and adapted from the description of #20254 written by Olivier. Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 924fd88a4f216..9147c97e41de5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -13,6 +13,12 @@ # The routines defined here are used in various algorithms performing # the same structure of operations on distances between vectors # of a datasets pair (X, Y). +# +# Importantly, the core of the computation is chunked to make sure that the pairwise +# distance chunk matrices stay in CPU cache before applying the final reduction step. +# Furthermore, the chunking strategy is also used to leverage OpenMP-based parallelism +# (using Cython prange loops) which gives another multiplicative speed-up in +# favorable cases on many-core machines. cimport numpy as np import numpy as np From 0b8516749b2dd7211204fc8a7cb916bcac129762 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 26 Oct 2021 11:25:11 +0200 Subject: [PATCH 245/290] fixup! Remove PairwiseDistancesRadiusNeighborhood --- sklearn/neighbors/_classification.py | 7 +------ sklearn/neighbors/_regression.py | 11 +++-------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 251e0ddb3ad2b..f03d68622de35 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -623,12 +623,7 @@ def predict_proba(self, X): n_queries = _num_samples(X) - if self.weights == "uniform": - # In that case, we do not need the distance so we do not compute them. - neigh_ind = self.radius_neighbors(X, return_distance=False) - neigh_dist = None - else: - neigh_dist, neigh_ind = self.radius_neighbors(X) + neigh_dist, neigh_ind = self.radius_neighbors(X) outlier_mask = np.zeros(n_queries, dtype=bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 68afdfc41e9ea..6993f5fea6ee3 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -311,8 +311,8 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBa metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. + metric. 
See the documentation of :class:`DistanceMetric` for a
+        list of available metrics.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
@@ -439,12 +439,7 @@ def predict(self, X):
             dtype=double
             Target values.
         """
-        if self.weights == "uniform":
-            # In that case, we do not need the distance so we do not compute them.
-            neigh_ind = self.radius_neighbors(X, return_distance=False)
-            neigh_dist = None
-        else:
-            neigh_dist, neigh_ind = self.radius_neighbors(X)
+        neigh_dist, neigh_ind = self.radius_neighbors(X)

         weights = _get_weights(neigh_dist, self.weights)

From e7b0689115390cfcf40d7013377fa0935b566e95 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 26 Oct 2021 11:00:50 +0200
Subject: [PATCH 246/290] Turn off finiteness checks in pairwise_distances_argmin{,_min}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jérémie du Boisberranger
---
 sklearn/metrics/pairwise.py | 29 ++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index e77d78d81eebd..37bea92a70d12 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -19,6 +19,7 @@
 from scipy.sparse import issparse
 from joblib import Parallel, effective_n_jobs

+from .. import config_context
 from ..utils.validation import _num_samples
 from ..utils.validation import check_non_negative
 from ..utils import check_array
@@ -674,11 +675,14 @@ def pairwise_distances_argmin_min(
     if metric == "fast_euclidean":
         metric = "euclidean"

-    indices, values = zip(
-        *pairwise_distances_chunked(
-            X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs
+    # Turn off the check for finiteness because it is costly and because arrays
+    # have already been validated.
+    with config_context(assume_finite=True):
+        indices, values = zip(
+            *pairwise_distances_chunked(
+                X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs
+            )
         )
-    )
     indices = np.concatenate(indices)
     values = np.concatenate(values)

@@ -776,15 +780,18 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs
     if metric == "fast_euclidean":
         metric = "euclidean"

-    indices = np.concatenate(
-        list(
-            # This returns a np.ndarray generator whose arrays we need
-            # to flatten into one.
-            pairwise_distances_chunked(
-                X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs
+    # Turn off the check for finiteness because it is costly and because arrays
+    # have already been validated.
+    with config_context(assume_finite=True):
+        indices = np.concatenate(
+            list(
+                # This returns a np.ndarray generator whose arrays we need
+                # to flatten into one.
+ pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) ) ) - ) return indices From 8d2a3d20b98f18d5b35f0da415e4cee343fec649 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 26 Oct 2021 11:21:59 +0200 Subject: [PATCH 247/290] Improve test for pairwise_distances_argmin{,_min} --- sklearn/metrics/tests/test_pairwise.py | 28 +++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index c0de29296613b..3f65b6d6ea09d 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -440,13 +440,15 @@ def test_pairwise_distances_argmin_min(): expected_idx = [0, 1] expected_vals = [2, 2] + expected_vals_sq = [4, 4] - # euclidean metric + # Euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") idx2 = pairwise_distances_argmin(X, Y, metric="euclidean") assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(idx2, expected_idx) assert_array_almost_equal(vals, expected_vals) + # sparse matrix case idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") assert_array_almost_equal(idxsp, expected_idx) @@ -455,14 +457,26 @@ def test_pairwise_distances_argmin_min(): assert type(idxsp) == np.ndarray assert type(valssp) == np.ndarray - # euclidean metric squared - idx, vals = pairwise_distances_argmin_min( - X, - Y, - metric="fast_euclidean", - ) + # Squared Euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="sqeuclidean") + idx2 = pairwise_distances_argmin(X, Y, metric="sqeuclidean") + assert_array_almost_equal(idx, expected_idx) + assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(idx2, expected_idx) + + # Fast Euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="fast_euclidean") + idx2 = pairwise_distances_argmin(X, Y, metric="fast_euclidean") assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals) + assert_array_almost_equal(idx2, expected_idx) + + # Fast Squared Euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="fast_sqeuclidean") + idx2 = pairwise_distances_argmin(X, Y, metric="fast_sqeuclidean") + assert_array_almost_equal(idx, expected_idx) + assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(idx2, expected_idx) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") From 843a894dbb0be63f9b086734f6fd6a160eacc7bd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 26 Oct 2021 16:47:08 +0200 Subject: [PATCH 248/290] Update whats_new entry --- doc/whats_new/v1.1.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 7910c83ccc0cd..31273813fff2e 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -124,7 +124,6 @@ Miscellaneous - :func:`sklearn.metrics.pairwise_distances_argmin_min` - :class:`sklearn.cluster.AffinityPropagation` - :class:`sklearn.cluster.Birch` - - :class:`sklearn.cluster.DBSCAN` - :class:`sklearn.cluster.MeanShift` - :class:`sklearn.cluster.OPTICS` - :class:`sklearn.cluster.SpectralClustering` @@ -143,7 +142,7 @@ Miscellaneous For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` can be up to 20× faster than in the previous versions'. - :pr:`20254` by :user:`Julien Jerphanion `. 
+    :pr:`21462` by :user:`Julien Jerphanion `.

 Code and Documentation Contributors

From effd89749b868bd61952fbfd2261936c7d7727e8 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 27 Oct 2021 14:29:29 +0200
Subject: [PATCH 249/290] Check for consistency when X_train is the query

---
 sklearn/neighbors/tests/test_neighbors.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 7702fcb39287d..644f90fd3117e 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -90,12 +90,14 @@ def _weight_func(dist):
 @pytest.mark.parametrize("n_features", [5, 100])
 @pytest.mark.parametrize("n_query_pts", [10, 100])
 @pytest.mark.parametrize("n_neighbors", [1, 10, 100])
+@pytest.mark.parametrize("query_is_train", [False, True])
 @pytest.mark.parametrize("metric", COMMON_VALID_METRICS)
 def test_unsupervised_kneighbors(
     n_samples,
     n_features,
     n_query_pts,
     n_neighbors,
+    query_is_train,
     metric,
 ):
     # The different algorithms must return identical results
@@ -106,7 +108,7 @@ def test_unsupervised_kneighbors(
     local_rng = np.random.RandomState(0)
     X = local_rng.rand(n_samples, n_features)
-    test = local_rng.rand(n_query_pts, n_features)
+    query = X if query_is_train else local_rng.rand(n_query_pts, n_features)

     results_nodist = []
     results = []
@@ -117,8 +119,8 @@ def test_unsupervised_kneighbors(
         )
         neigh.fit(X)

-        results_nodist.append(neigh.kneighbors(test, return_distance=False))
-        results.append(neigh.kneighbors(test, return_distance=True))
+        results_nodist.append(neigh.kneighbors(query, return_distance=False))
+        results.append(neigh.kneighbors(query, return_distance=True))

     for i in range(len(results) - 1):
         algorithm = ALGORITHMS[i]

From 00577c5ea62ee65e84b5673d377f681b3778b985 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 27 Oct 2021 15:04:59 +0200
Subject: [PATCH 250/290] Inject placeholder value for MeanShift.bandwidth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This modifies the test configuration so that it makes sense for when a
sole sample is provided for MeanShift. This test was passing previously
for this configuration but was not supposed to.

The new implementation strategy for kneighbors, which uses
PairwiseDistancesArgKmin, is numerically stabler for this case,
motivating this modification.

Co-authored-by: Guillaume Lemaitre
Co-authored-by: Jérémie du Boisberranger
---
 sklearn/utils/estimator_checks.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index ccc6ff23ed8fc..517680acdc88a 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -632,6 +632,11 @@ def _set_checking_parameters(estimator):
     if "n_init" in params:
         # K-Means
         estimator.set_params(n_init=2)
+    if name == "MeanShift":
+        # In the case of check_fit2d_1sample, bandwidth is set to None and
+        # is thus estimated. De facto it is 0.0 as a single sample is provided
+        # and this makes the test fail. Hence we give it a placeholder value.
+        estimator.set_params(bandwidth=1.0)

     if name == "TruncatedSVD":
         # TruncatedSVD doesn't run with n_components = n_features

From eab07b59a276eb73ab2124a18febd6530437615b Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Mon, 22 Nov 2021 09:31:13 +0100
Subject: [PATCH 251/290] Rename PairwiseDistancesReduction callbacks

As suggested by Thomas J. Fan.
_parallel_on_Y_init was also renamed to _parallel_on_Y_parallel_init. Co-authored-by: Thomas J. Fan --- .../metrics/_pairwise_distances_reduction.pyx | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 9147c97e41de5..2ef420ecdec35 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -308,7 +308,7 @@ cdef class PairwiseDistancesReduction: X_end = X_start + self.X_n_samples_chunk # Reinitializing thread datastructures for the new X chunk - self._parallel_on_X_threadwise_init_chunk(thread_num, X_start) + self._parallel_on_X_init_chunk(thread_num, X_start) for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -330,7 +330,7 @@ cdef class PairwiseDistancesReduction: # end: for X_chunk_idx # Deallocating thread datastructures - self._parallel_on_X_threadwise_finalize(thread_num) + self._parallel_on_X_parallel_finalize(thread_num) # end: with nogil, parallel return @@ -356,7 +356,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t thread_num # Allocating datastructures - self._parallel_on_Y_init(num_threads) + self._parallel_on_Y_parallel_init(num_threads) for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk @@ -369,7 +369,7 @@ cdef class PairwiseDistancesReduction: thread_num = _openmp_thread_num() # Initializing datastructures used in this thread - self._parallel_on_Y_threadwise_init(thread_num) + self._parallel_on_Y_init(thread_num) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -386,7 +386,7 @@ cdef class PairwiseDistancesReduction: ) # end: prange - # Note: we don't need a _parallel_on_Y_threadwise_finalize similarly. + # Note: we don't need a _parallel_on_Y_finalize similarly. # This can be introduced if needed. 
# end: with nogil, parallel @@ -436,7 +436,7 @@ cdef class PairwiseDistancesReduction: """Allocate datastructures used in a thread given its number.""" return - cdef void _parallel_on_X_threadwise_init_chunk( + cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -453,21 +453,21 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after a reduction on chunks.""" return - cdef void _parallel_on_X_threadwise_finalize( + cdef void _parallel_on_X_parallel_finalize( self, ITYPE_t thread_num ) nogil: """Interact with datastructures after executing all the reductions.""" return - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t num_threads, ) nogil: """Allocate datastructures used in all threads.""" return - cdef void _parallel_on_Y_threadwise_init( + cdef void _parallel_on_Y_init( self, ITYPE_t thread_num, ) nogil: @@ -662,7 +662,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) @final - cdef void _parallel_on_X_threadwise_init_chunk( + cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, @@ -690,7 +690,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.k ) - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t num_threads, ) nogil: @@ -715,7 +715,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) @final - cdef void _parallel_on_Y_threadwise_init( + cdef void _parallel_on_Y_init( self, ITYPE_t thread_num, ) nogil: @@ -887,15 +887,15 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) @final - cdef void _parallel_on_X_threadwise_finalize( + cdef void _parallel_on_X_parallel_finalize( self, ITYPE_t thread_num ) nogil: - PairwiseDistancesArgKmin._parallel_on_X_threadwise_finalize(self, thread_num) + PairwiseDistancesArgKmin._parallel_on_X_parallel_finalize(self, thread_num) free(self.dist_middle_terms_chunks[thread_num]) @final - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t num_threads, ) nogil: From 8a48ffdd1a1831f9aed8c4ba4630fdee1dd49705 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 22 Nov 2021 09:36:19 +0100 Subject: [PATCH 252/290] Link back to _openmp_effective_n_threads for n_threads' description Co-authored-by: Thomas J. Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 2ef420ecdec35..72eb37a4174d8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -120,7 +120,8 @@ cdef class PairwiseDistancesReduction: Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. - None and -1 means using all processors. + See _openmp_effective_n_threads, for details about + the specification of n_threads. """ cdef: @@ -511,7 +512,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on :method:`~ArgKmin.compute`. - None and -1 means using all processors. + See _openmp_effective_n_threads, for details about + the specification of n_threads. 
""" cdef: @@ -568,7 +570,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): depends on the `strategy` set on :method:`~PairwiseDistancesArgKmin.compute`. - None and -1 means using all processors. + See _openmp_effective_n_threads, for details about + the specification of n_threads. Returns ------- From 83854fa13dafaaf37f597032f13eea6105db1d42 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 22 Nov 2021 09:40:25 +0100 Subject: [PATCH 253/290] Use self.k directly Co-authored-by: Thomas J. Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 72eb37a4174d8..2ae5fb1e027a8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -648,7 +648,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t i, j ITYPE_t n_samples_X = X_end - X_start ITYPE_t n_samples_Y = Y_end - Y_start - ITYPE_t k = self.k DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] @@ -659,7 +658,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): heap_push( heaps_r_distances + i * self.k, heaps_indices + i * self.k, - k, + self.k, self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), Y_start + j, ) @@ -933,7 +932,6 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) nogil: cdef: ITYPE_t i, j - ITYPE_t k = self.k const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] @@ -971,9 +969,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): for i in range(X_c.shape[0]): for j in range(Y_c.shape[0]): heap_push( - heaps_r_distances + i * k, - heaps_indices + i * k, - k, + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + self.k, # Using the squared euclidean distance as the rank-preserving distance: # # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² From 445c86026dcdc7be0f71dbefefb5b619b794113b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 22 Nov 2021 09:53:39 +0100 Subject: [PATCH 254/290] Remove unneeded csr_dist and csr_rdist interfaces Co-authored-by: Thomas J. 
Fan --- sklearn/metrics/_dist_metrics.pxd | 16 ------------- sklearn/metrics/_dist_metrics.pyx | 37 ------------------------------- 2 files changed, 53 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index cd07d4e08700c..e7c2f2ea2f926 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -56,22 +56,6 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t csr_dist( - self, - const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1 - - cdef DTYPE_t csr_rdist( - self, - const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1 - cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 882f53bfc7c3f..ed9f0aa3dd659 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -308,43 +308,6 @@ cdef class DistanceMetric: """ return self.dist(x1, x2, size) - cdef DTYPE_t csr_dist( - self, - const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1: - """Compute the distance between vectors x1 and x2 of a CSR matrix. - - The computations is made given non null coordinates and - corresponding indices of the vectors CSR matrix. - - This should be overridden in a base class. - """ - return -999 - - cdef DTYPE_t csr_rdist( - self, - const DTYPE_t[:] x1_data, - const ITYPE_t[:] x1_indices, - const DTYPE_t[:] x2_data, - const ITYPE_t[:] x2_indices, - ) nogil except -1: - """Compute the rank-preserving surrogate distance between vectors x1 and x2 of a CSR matrix. - - The computations is made given non null coordinates and - corresponding indices of the vectors CSR matrix. - - This can optionally be overridden in a base class. - - The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, the - rank-preserving surrogate distance of the Euclidean metric is the - squared-euclidean distance. - """ - return self.csr_dist(x1_data, x1_indices, x2_data, x2_indices) - cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: """compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 From 6a4d7fedd026a6f41b161cad1827826ba8227324 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 22 Nov 2021 10:05:43 +0100 Subject: [PATCH 255/290] Add pairwise_dist_chunk_size keyword argument to config_context --- sklearn/_config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index e209010068d8c..8ceb795dd052f 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -114,7 +114,12 @@ def set_config( @contextmanager def config_context( - *, assume_finite=None, working_memory=None, print_changed_only=None, display=None + *, + assume_finite=None, + working_memory=None, + print_changed_only=None, + display=None, + pairwise_dist_chunk_size=None, ): """Context manager for global scikit-learn configuration. 
From 5399cc6b2096f16960cf23e6850adaf6dd5e2b12 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 24 Nov 2021 16:24:16 +0100
Subject: [PATCH 256/290] fixup! Rename PairwiseDistancesReduction callbacks

- Gotcha, improper method resolution!
- Noooo, can't segfault anymoreeee!

Lesson learnt: choose names carefully and don't reuse names.
---
 sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 2ae5fb1e027a8..a934cd9335d7a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -902,7 +902,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         ITYPE_t num_threads,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin._parallel_on_Y_init(self, num_threads)
+        PairwiseDistancesArgKmin._parallel_on_Y_parallel_init(self, num_threads)

         for thread_num in range(num_threads):
             # Temporary buffer for the `-2 * X_c @ Y_c.T` term

From d40b33372c3a996c349a103eeb1785ed36ee3a28 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 25 Nov 2021 10:13:52 +0100
Subject: [PATCH 257/290] TST Refactor test and adapt checks and tolerances

Fixtures for metric params have to be factorised.
---
 sklearn/neighbors/tests/test_neighbors.py | 86 +++++++++++++----------
 1 file changed, 48 insertions(+), 38 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 41d94da8dc28b..695872e626424 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from itertools import product

 import pytest
@@ -129,7 +130,7 @@ def test_unsupervised_kneighbors(
         indices_no_dist = results_nodist[i]
         distances, next_distances = results[i][0], results[i + 1][0]
         indices, next_indices = results[i][1], results[i + 1][1]
-        assert_allclose(
+        assert_array_equal(
             indices_no_dist,
             indices,
             err_msg=(
                 f"The '{algorithm}' and '{next_algorithm}' "
                 "indices depending on 'return_distances'."
             ),
         )
-        assert_allclose(
+        assert_array_equal(
             indices,
             next_indices,
             err_msg=(
                 f"The '{algorithm}' and '{next_algorithm}' "
                 "algorithms return different distances."
), + atol=1e-6, ) @@ -1508,45 +1510,45 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) -def test_neighbors_metrics( - metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 -): - # Test computing the neighbors for various metrics - # create a symmetric matrix +def _get_dummy_metric_params_list(metric, n_features): + V = rng.rand(n_features, n_features) VI = np.dot(V, V.T) - metrics = [ - ("euclidean", {}), - ("manhattan", {}), - ("minkowski", dict(p=1)), - ("minkowski", dict(p=2)), - ("minkowski", dict(p=3)), - ("minkowski", dict(p=np.inf)), - ("chebyshev", {}), - ("seuclidean", dict(V=rng.rand(n_features))), - ("mahalanobis", dict(VI=VI)), - ("haversine", {}), - ] + METRICS_PARAMS = defaultdict( + list, + { + "euclidean": [], + "manhattan": [], + "minkowski": [dict(p=1), dict(p=2), dict(p=3), dict(p=np.inf)], + "chebyshev": [], + "seuclidean": [dict(V=rng.rand(n_features))], + "mahalanobis": [dict(VI=VI)], + "haversine": [], + }, + ) + if sp_version < parse_version("1.8.0.dev0"): # TODO: remove once we no longer support scipy < 1.8.0. # wminkowski was removed in scipy 1.8.0 but should work for previous # versions. - metrics.append( - ("wminkowski", dict(p=3, w=rng.rand(n_features))), - ) + METRICS_PARAMS["wminkowski"].append(dict(p=3, w=rng.rand(n_features))) else: # Recent scipy versions accept weights in the Minkowski metric directly: - metrics.append( - ("minkowski", dict(p=3, w=rng.rand(n_features))), - ) + METRICS_PARAMS["minkowski"].append(dict(p=3, w=rng.rand(n_features))) + + return METRICS_PARAMS.get(metric, []) + +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +def test_neighbors_metrics( + metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 +): + # Test computing the neighbors for various metrics + # create a symmetric matrix algorithms = ["brute", "ball_tree", "kd_tree"] X = rng.rand(n_samples, n_features) - test = rng.rand(n_query_pts, n_features) - metric_params = _get_dummy_metric_kwargs(metric, n_features) # Haversine distance only accepts 2D data if metric == "haversine": @@ -1557,10 +1559,9 @@ def test_neighbors_metrics( X_train = X X_test = test - results = {} - p = metric_params.pop("p", 2) + metric_params_list = _get_dummy_metric_params_list(metric, n_features) - for metric, metric_params in metrics: + for metric_params in metric_params_list: results = {} p = metric_params.pop("p", 2) w = metric_params.get("w", None) @@ -1602,14 +1603,23 @@ def test_neighbors_metrics( test[:, feature_sl], return_distance=True ) - neigh.fit(X_train) - results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + neigh.fit(X_train) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + + brute_dst, brute_idx = results["brute"] + ball_tree_dst, ball_tree_idx = results["ball_tree"] + + assert_allclose(brute_dst, ball_tree_dst) + assert_array_equal(brute_idx, ball_tree_idx) + + if "kd_tree" in results: + # KD tree might not have been computed + kd_tree_dst, kd_tree_idx = results["kd_tree"] + assert_allclose(brute_dst, kd_tree_dst) + assert_array_equal(brute_idx, kd_tree_idx) - assert_allclose(results["brute"][0], results["ball_tree"][0]) - assert_allclose(results["brute"][1], results["ball_tree"][1]) - if "kd_tree" in results: - assert_allclose(results["brute"][0], results["kd_tree"][0]) - assert_allclose(results["brute"][1], results["kd_tree"][1]) + assert_allclose(ball_tree_dst, kd_tree_dst) + assert_array_equal(ball_tree_idx, kd_tree_idx) def 
test_callable_metric(): From 2f02350b4e7fb81932c9e252265c3722051e55fc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 25 Nov 2021 10:13:52 +0100 Subject: [PATCH 258/290] TST Factorise fixtures for metric params --- .../test_pairwise_distances_reduction.py | 48 +++++++++---- sklearn/neighbors/tests/test_neighbors.py | 71 ++++++------------- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 263c85aac065d..2188553c6d86d 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from collections import defaultdict from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import csr_matrix @@ -15,12 +16,13 @@ ) from sklearn.utils import _in_unstable_openblas_configuration - +from sklearn.utils.fixes import sp_version, parse_version from sklearn.utils._testing import fails_if_unstable_openblas -def _get_dummy_metric_kwargs(metric: str, n_features: int): - """Return dummy DistanceMetric kwargs for tests.""" +def _get_dummy_metric_params_list(metric: str, n_features: int): + """Return list of dummy DistanceMetric kwargs for tests.""" + rng = np.random.RandomState(1) weights = rng.random_sample(n_features) weights /= weights.sum() @@ -30,14 +32,33 @@ def _get_dummy_metric_kwargs(metric: str, n_features: int): # VI is positive-semidefinite, preferred for precision matrix VI = np.dot(V, V.T) + 3 * np.eye(n_features) - kwargs = { - "minkowski": dict(p=1.5), - "seuclidean": dict(V=weights), - "wminkowski": dict(p=1.5, w=weights), - "mahalanobis": dict(VI=VI), - } + METRICS_PARAMS = defaultdict( + list, + { + "euclidean": [{}], + "manhattan": [{}], + "minkowski": [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)], + "chebyshev": [{}], + "seuclidean": [dict(V=rng.rand(n_features))], + "haversine": [{}], + "wminkowski": [dict(p=1.5, w=weights)], + "mahalanobis": [dict(VI=VI)], + }, + ) + + wminkowski_kwargs = dict(p=3, w=rng.rand(n_features)) + + if sp_version < parse_version("1.8.0.dev0"): + # TODO: remove once we no longer support scipy < 1.8.0. + # wminkowski was removed in scipy 1.8.0 but should work for previous + # versions. 
+ METRICS_PARAMS["wminkowski"].append(wminkowski_kwargs) # type: ignore + else: + # Recent scipy versions accept weights in the Minkowski metric directly: + # type: ignore + METRICS_PARAMS["minkowski"].append(wminkowski_kwargs) # type: ignore - return kwargs.get(metric, {}) + return METRICS_PARAMS.get(metric, [{}]) def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): @@ -314,7 +335,8 @@ def test_strategies_consistency( Y, parameter, metric=metric, - metric_kwargs=_get_dummy_metric_kwargs(metric, n_features), + # Taking the first + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], # To be sure to use parallelization chunk_size=n_samples // 4, ) @@ -407,7 +429,7 @@ def test_fast_sqeuclidean_translation_invariance( Y, parameter, metric=metric, - metric_kwargs=_get_dummy_metric_kwargs(metric, n_features), + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], ).compute(return_distance=True) dist, indices = PairwiseDistancesReduction.get_for( @@ -415,7 +437,7 @@ def test_fast_sqeuclidean_translation_invariance( Y + 0, parameter, metric=metric, - metric_kwargs=_get_dummy_metric_kwargs(metric, n_features), + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], ).compute(return_distance=True) ASSERT_RESULT[PairwiseDistancesReduction]( diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 695872e626424..2de6b79527c5a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1,4 +1,3 @@ -from collections import defaultdict from itertools import product import pytest @@ -23,7 +22,7 @@ from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_pairwise_distances_reduction import ( - _get_dummy_metric_kwargs, + _get_dummy_metric_params_list, ) from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split @@ -1510,36 +1509,6 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") -def _get_dummy_metric_params_list(metric, n_features): - - V = rng.rand(n_features, n_features) - VI = np.dot(V, V.T) - - METRICS_PARAMS = defaultdict( - list, - { - "euclidean": [], - "manhattan": [], - "minkowski": [dict(p=1), dict(p=2), dict(p=3), dict(p=np.inf)], - "chebyshev": [], - "seuclidean": [dict(V=rng.rand(n_features))], - "mahalanobis": [dict(VI=VI)], - "haversine": [], - }, - ) - - if sp_version < parse_version("1.8.0.dev0"): - # TODO: remove once we no longer support scipy < 1.8.0. - # wminkowski was removed in scipy 1.8.0 but should work for previous - # versions. 
- METRICS_PARAMS["wminkowski"].append(dict(p=3, w=rng.rand(n_features))) - else: - # Recent scipy versions accept weights in the Minkowski metric directly: - METRICS_PARAMS["minkowski"].append(dict(p=3, w=rng.rand(n_features))) - - return METRICS_PARAMS.get(metric, []) - - @pytest.mark.parametrize("metric", COMMON_VALID_METRICS) def test_neighbors_metrics( metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 @@ -1648,7 +1617,7 @@ def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features= X = rng.rand(n_samples, n_features) Xcsr = csr_matrix(X) - metric_params = _get_dummy_metric_kwargs(metric, n_features) + metric_params_list = _get_dummy_metric_params_list(metric, n_features) if metric == "precomputed": X_precomputed = rng.random_sample((10, 4)) @@ -1662,24 +1631,28 @@ def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features= nb_p.kneighbors(DYX) else: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric, metric_params=metric_params - ) - # Haversine distance only accepts 2D data - if metric == "haversine": - feature_sl = slice(None, 2) - X = np.ascontiguousarray(X[:, feature_sl]) - else: - X = X + for metric_params in metric_params_list: + nn = neighbors.NearestNeighbors( + n_neighbors=3, + algorithm="auto", + metric=metric, + metric_params=metric_params, + ) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) + else: + X = X - nn.fit(X) - nn.kneighbors(X) + nn.fit(X) + nn.kneighbors(X) - if metric in VALID_METRICS_SPARSE["brute"]: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric - ).fit(Xcsr) - nn.kneighbors(Xcsr) + if metric in VALID_METRICS_SPARSE["brute"]: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric + ).fit(Xcsr) + nn.kneighbors(Xcsr) def test_metric_params_interface(): From 19dd7cab712d94ba8353c6039392e17c0c5d49a4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 25 Nov 2021 15:37:46 +0100 Subject: [PATCH 259/290] Change metric to fast_sqeuclidean for pairwise_distances_argmin* --- sklearn/metrics/pairwise.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0010c13c3cd83..3d7fb21337072 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -584,7 +584,7 @@ def _argmin_reduce(dist, start): def pairwise_distances_argmin_min( - X, Y, *, axis=1, metric="euclidean", metric_kwargs=None + X, Y, *, axis=1, metric="fast_euclidean", metric_kwargs=None ): """Compute minimum distances between one point and a set of points. @@ -692,7 +692,9 @@ def pairwise_distances_argmin_min( return indices, values -def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): +def pairwise_distances_argmin( + X, Y, *, axis=1, metric="fast_euclidean", metric_kwargs=None +): """Compute minimum distances between one point and a set of points. 
This function computes for each row in X, the index of the row of Y which From 54b4b964e2aed735ab0f2c704df1f9a8c1cce24a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 25 Nov 2021 17:59:53 +0100 Subject: [PATCH 260/290] TST Remove spurious skip for tests --- sklearn/neighbors/tests/test_neighbors.py | 29 ++++------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 2de6b79527c5a..1149800967a38 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -235,12 +235,6 @@ def test_neighs_predictions_fast_euclidean_correctness( ): # The fast euclidean strategy must return results # that are close to the ones obtained with the euclidean distance - if n_samples < n_neighbors: - pytest.skip( - f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", - allow_module_level=True, - ) - rng = np.random.RandomState(0) X = rng.rand(n_samples, n_features).astype(dtype) y = rng.randint(3, size=n_samples) @@ -1533,18 +1527,7 @@ def test_neighbors_metrics( for metric_params in metric_params_list: results = {} p = metric_params.pop("p", 2) - w = metric_params.get("w", None) for algorithm in algorithms: - # KD tree doesn't support all metrics - if algorithm == "kd_tree" and ( - metric not in neighbors.KDTree.valid_metrics or w is not None - ): - est = neighbors.NearestNeighbors( - algorithm=algorithm, metric=metric, metric_params=metric_params - ) - with pytest.raises(ValueError): - est.fit(X) - continue neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, @@ -1576,19 +1559,17 @@ def test_neighbors_metrics( results[algorithm] = neigh.kneighbors(X_test, return_distance=True) brute_dst, brute_idx = results["brute"] + kd_tree_dst, kd_tree_idx = results["kd_tree"] ball_tree_dst, ball_tree_idx = results["ball_tree"] assert_allclose(brute_dst, ball_tree_dst) assert_array_equal(brute_idx, ball_tree_idx) - if "kd_tree" in results: - # KD tree might not have been computed - kd_tree_dst, kd_tree_idx = results["kd_tree"] - assert_allclose(brute_dst, kd_tree_dst) - assert_array_equal(brute_idx, kd_tree_idx) + assert_allclose(brute_dst, kd_tree_dst) + assert_array_equal(brute_idx, kd_tree_idx) - assert_allclose(ball_tree_dst, kd_tree_dst) - assert_array_equal(ball_tree_idx, kd_tree_idx) + assert_allclose(ball_tree_dst, kd_tree_dst) + assert_array_equal(ball_tree_idx, kd_tree_idx) def test_callable_metric(): From 56e86ef7ed3002152af8055f0546200486991013 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 26 Nov 2021 15:19:32 +0100 Subject: [PATCH 261/290] TST Remove useless guard for haversine --- sklearn/neighbors/tests/test_neighbors.py | 25 ++++------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 1149800967a38..4cf77138be937 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1510,17 +1510,8 @@ def test_neighbors_metrics( # Test computing the neighbors for various metrics # create a symmetric matrix algorithms = ["brute", "ball_tree", "kd_tree"] - X = rng.rand(n_samples, n_features) - test = rng.rand(n_query_pts, n_features) - - # Haversine distance only accepts 2D data - if metric == "haversine": - feature_sl = slice(None, 2) - X_train = np.ascontiguousarray(X[:, feature_sl]) - X_test = np.ascontiguousarray(test[:, 
feature_sl]) - else: - X_train = X - X_test = test + X_train = rng.rand(n_samples, n_features) + X_test = rng.rand(n_query_pts, n_features) metric_params_list = _get_dummy_metric_params_list(metric, n_features) @@ -1536,10 +1527,7 @@ def test_neighbors_metrics( metric_params=metric_params, ) - # Haversine distance only accepts 2D data - feature_sl = slice(None, 2) if metric == "haversine" else slice(None) - - neigh.fit(X[:, feature_sl]) + neigh.fit(X_train) # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 ExceptionToAssert = None @@ -1551,12 +1539,7 @@ def test_neighbors_metrics( ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - results[algorithm] = neigh.kneighbors( - test[:, feature_sl], return_distance=True - ) - - neigh.fit(X_train) - results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) brute_dst, brute_idx = results["brute"] kd_tree_dst, kd_tree_idx = results["kd_tree"] From 355cbe24723a39bc4c75b8ce12c79ee7603dc506 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:12:51 +0100 Subject: [PATCH 262/290] DOC Clarify docstrings and comments Co-authored-by: Christian Lorentzen Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan --- sklearn/_config.py | 4 ++-- sklearn/cluster/_affinity_propagation.py | 7 ++++++- sklearn/cluster/_birch.py | 9 +++++--- sklearn/cluster/_mean_shift.py | 7 ++++++- sklearn/metrics/_dist_metrics.pyx | 6 +++--- sklearn/metrics/pairwise.py | 26 ++++++++++++++++++------ 6 files changed, 43 insertions(+), 16 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index 8ceb795dd052f..d6a02737f640d 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -89,7 +89,7 @@ def set_config( pairwise_dist_chunk_size : int, default=None The number of vectors per chunk for PairwiseDistancesReduction. - Default is 256 (optimal for most of modern laptops' caches and architectures). + Default is 256 (suitable for most of modern laptops' caches and architectures). .. versionadded:: 1.1 @@ -160,7 +160,7 @@ def config_context( pairwise_dist_chunk_size : int, default=None The number of vectors per chunk for PairwiseDistancesReduction. - Default is 256 (optimal for most of modern laptops' caches and architectures). + Default is 256 (suitable for most of modern laptops' caches and architectures). .. versionadded:: 1.1 diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 91322dba632d6..bf1d2d86dc77c 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -524,7 +524,12 @@ def predict(self, X): if self.cluster_centers_.shape[0] > 0: with config_context(assume_finite=True): return pairwise_distances_argmin( - X, self.cluster_centers_, metric="fast_euclidean" + # We use the fast squared euclidean metric alternative to get + # maximum acceleration as we are not concerned with the minimum + # values but only their indices. 
+ X, + self.cluster_centers_, + metric="fast_sqeuclidean", ) else: warnings.warn( diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 5c7c3d2a6d729..347756f089933 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -677,14 +677,17 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) - fast_euclidean_kwargs = {"Y_norm_squared": self._subcluster_norms} + metric_kwargs = {"Y_norm_squared": self._subcluster_norms} with config_context(assume_finite=True): argmin = pairwise_distances_argmin( X, self.subcluster_centers_, - metric="fast_euclidean", - metric_kwargs=fast_euclidean_kwargs, + # We use the fast squared euclidean metric alternative to get + # maximum acceleration as we are not concerned with the minimum + # values but only their indices. + metric="fast_sqeuclidean", + metric_kwargs=metric_kwargs, ) return self.subcluster_labels_[argmin] diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 542ed0dbc97aa..e93de71da52c9 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -513,5 +513,10 @@ def predict(self, X): X = self._validate_data(X, reset=False) with config_context(assume_finite=True): return pairwise_distances_argmin( - X, self.cluster_centers_, metric="fast_euclidean" + # We use the fast squared euclidean metric alternative to get + # maximum acceleration as we are not concerned with the minimum + # values but only their indices. + X, + self.cluster_centers_, + metric="fast_sqeuclidean", ) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 573261adba1eb..3def08da7965c 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1176,8 +1176,8 @@ cdef class DatasetsPair: of X and Y at a time given the pair of their indices (i, j). This class is specialized for each metric thanks to the :func:`get_for` factory classmethod. - The handling of chunking and parallelization to compute the distances - and aggregation for several rows at a time is handled in dedicated + The handling of parallelization over chunks to compute the distances + and aggregation for several rows at a time is done in dedicated subclasses of PairwiseDistancesReduction that in-turn rely on subclasses of DatasetsPair for each pair of rows in the data. The goal is to make it possible to decouple the generic parallelization and @@ -1189,7 +1189,7 @@ cdef class DatasetsPair: This class avoids the overhead of dispatching distance computations to :class:`sklearn.metrics.DistanceMetric` based on the physical representation of the vectors (sparse vs. dense). It makes use of - cython.final to remove the overhead of method calls' dispatch. + cython.final to remove the overhead of dispatching method calls. Parameters ---------- diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 3d7fb21337072..404b1b4be34c7 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -610,7 +610,7 @@ def pairwise_distances_argmin_min( axis : int, default=1 Axis along which the argmin and distances are to be computed. - metric : str or callable, default='euclidean' + metric : str or callable, default="fast_euclidean" Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. 
@@ -624,8 +624,8 @@ def pairwise_distances_argmin_min( Valid values for metric are: - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'fast_euclidean', + 'fast_euclidean', 'l1', 'l2', 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', @@ -633,6 +633,13 @@ def pairwise_distances_argmin_min( 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] + 'fast_euclidean' (the default metric) is a variant of the 'euclidean' + metric which has a superior arithmetic intensity and hence better + running time. However it can suffer from numerical instability caused + by catastrophic cancellation in rare configuration. + Hence when exact results are mandatory, 'euclidean' should be preferred. + The same remark applies for 'fast_sqeuclidean' regarding 'sqeuclidean'. + See the documentation for scipy.spatial.distance for details on these metrics. @@ -719,7 +726,7 @@ def pairwise_distances_argmin( axis : int, default=1 Axis along which the argmin and distances are to be computed. - metric : str or callable, default="euclidean" + metric : str or callable, default="fast_euclidean" Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -733,8 +740,8 @@ def pairwise_distances_argmin( Valid values for metric are: - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'fast_euclidean', + 'fast_euclidean', 'l1', 'l2', 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', @@ -742,6 +749,13 @@ def pairwise_distances_argmin( 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] + 'fast_euclidean' (the default metric) is a variant of the 'euclidean' + metric which has a superior arithmetic intensity and hence better + running time. However it can suffer from numerical instability caused + by catastrophic cancellation in rare configuration. + Hence when exact results are mandatory, 'euclidean' should be preferred. + The same remark applies for 'fast_sqeuclidean' regarding 'sqeuclidean'. + See the documentation for scipy.spatial.distance for details on these metrics. 
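The catastrophic cancellation mentioned in these docstrings is easy to reproduce outside of scikit-learn. The following standalone NumPy sketch (illustrative only: the toy points and variable names are assumptions, not part of the patch) contrasts the exact formulation with the expanded one used by the GEMM trick:

    import numpy as np

    # Two nearly identical points far from the origin: the true distance
    # is ~1.4142e-04, while each squared-norm term below is ~2e8.
    x = np.array([10_000.0, 10_000.0])
    y = x + 1e-4

    # Exact formulation: no cancellation.
    exact = np.sqrt(np.sum((x - y) ** 2))

    # Expanded formulation, ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2:
    # three terms of magnitude ~2e8 summing to ~2e-8, so nearly all
    # significant float64 digits cancel.
    expanded_sq = x @ x - 2.0 * (x @ y) + y @ y
    expanded = np.sqrt(np.maximum(expanded_sq, 0.0))

    print(exact)     # ~1.4142e-04
    print(expanded)  # typically 0.0, i.e. completely wrong

This is exactly the rare configuration the docstrings warn about, and why 'euclidean' and 'sqeuclidean' remain the safe choices when exact results are mandatory.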
From 56151ab28af1f3d06cea3a03fd9858c5b32fd232 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:14:09 +0100 Subject: [PATCH 263/290] MAINT Drop unneeded Cython directive Co-authored-by: Christian Lorentzen --- sklearn/metrics/_pairwise_distances_reduction.pyx | 6 ------ sklearn/metrics/setup.py | 1 - 2 files changed, 7 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index a934cd9335d7a..570ffdd54b8fc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -1,9 +1,3 @@ -# cython: boundscheck=False -# cython: cdivision=True -# cython: initializedcheck=False -# cython: wraparound=False -# distutils: language=c++ - # Pairwise Distances Reductions # ============================= # diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index cd32817574dd3..29d7c870202a1 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -22,7 +22,6 @@ def configuration(parent_package="", top_path=None): config.add_extension( "_pairwise_distances_reduction", sources=["_pairwise_distances_reduction.pyx"], - language="c++", libraries=libraries, ) From 96aaa0b3789d416051c35168b5aae6283a6bc881 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:17:58 +0100 Subject: [PATCH 264/290] MAINT Better validate and use chunk_size Co-authored-by: Thomas J. Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 570ffdd54b8fc..60ff7596b863d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -185,21 +185,19 @@ cdef class PairwiseDistancesReduction: if chunk_size is None: chunk_size = get_config().get("pairwise_dist_chunk_size", 256) - self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=1) + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) - n_samples_chunk = max(20, chunk_size) - self.datasets_pair = datasets_pair self.n_samples_X = datasets_pair.n_samples_X() - self.X_n_samples_chunk = min(self.n_samples_X, n_samples_chunk) + self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk self.n_samples_Y = datasets_pair.n_samples_Y() - self.Y_n_samples_chunk = min(self.n_samples_Y, n_samples_chunk) + self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk From 9d5f7f75ae57e5a72411adc6c178d939932dd959 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:18:37 +0100 Subject: [PATCH 265/290] MAINT Raise UserWarning when uneeded metric_params are specified Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. 
Fan
---
 sklearn/metrics/_pairwise_distances_reduction.pyx | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 60ff7596b863d..b803959a49c96 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -263,7 +263,8 @@ cdef class PairwiseDistancesReduction:
         elif strategy == 'parallel_on_X':
             self._parallel_on_X()
         else:
-            raise RuntimeError(f"strategy '{strategy}' not supported.")
+            raise RuntimeError(f"strategy must be 'parallel_on_X', 'parallel_on_Y', "
+                               f"or 'auto', but currently strategy='{strategy}'.")

         return self._finalize_results(return_distance)

@@ -831,6 +832,12 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         chunk_size=None,
         metric_kwargs=None,
     ):
+        if metric_kwargs is not None and len(metric_kwargs) > 0:
+            raise UserWarning(
+                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't"
+                f"usable for this case ({self.__class__.__name__}) and will be ignored."
+            )
+
         super().__init__(
             # The datasets pair here is used for exact distances computations
             datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),

From 5fa4cb1ca188efef3e18b422939f40552b343fc4 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 30 Nov 2021 17:19:42 +0100
Subject: [PATCH 266/290] Correctly fall back on standard metric

Co-authored-by: Olivier Grisel
---
 sklearn/metrics/pairwise.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 404b1b4be34c7..e53e6be053328 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -682,8 +682,8 @@ def pairwise_distances_argmin_min(
     # asked for a fast alternative, we need to revert to the standard
     # "euclidean" strategy to match the API.
     # Internally, the "euclidean" strategy still uses the GEMM trick.
-    if metric == "fast_euclidean":
-        metric = "euclidean"
+    if metric in ("fast_euclidean", "fast_sqeuclidean"):
+        metric = metric.replace("fast_", "")

     # Turn off check for finiteness because this is costly and because arrays
     # have already been validated.
@@ -796,8 +796,8 @@ def pairwise_distances_argmin(
     # asked for a fast alternative, we need to revert to the standard one.
     # "euclidean" strategy to match the API.
     # Internally, the "euclidean" strategy still uses the GEMM trick.
-    if metric == "fast_euclidean":
-        metric = "euclidean"
+    if metric in ("fast_euclidean", "fast_sqeuclidean"):
+        metric = metric.replace("fast_", "")

     # Turn off check for finiteness because this is costly and because arrays
     # have already been validated.

From 7c36f119d068845e60aa85d496caff690286d603 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 30 Nov 2021 17:20:33 +0100
Subject: [PATCH 267/290] TST Remove unneeded tests and use adapted version parsing

Co-authored-by: Thomas J.
Fan --- .../test_pairwise_distances_reduction.py | 28 +------------------ sklearn/utils/__init__.py | 5 ++-- 2 files changed, 3 insertions(+), 30 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 2188553c6d86d..eba3686217d97 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -61,22 +61,6 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): return METRICS_PARAMS.get(metric, [{}]) -def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): - # We get arrays of arrays and we need to check for individual pairs - for i in range(ref_dist.shape[0]): - assert_array_equal( - ref_indices[i], - indices[i], - err_msg=f"Query vector #{i} has different neighbors' indices", - ) - assert_allclose( - ref_dist[i], - dist[i], - err_msg=f"Query vector #{i} has different neighbors' distances", - rtol=1e-7, - ) - - def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): assert_array_equal( ref_indices, @@ -358,23 +342,13 @@ def test_strategies_consistency( @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("k, radius", [(50, 100)]) def test_fast_sqeuclidean_correctness( seed, n_samples, n_features, - k, - radius, + k=50, dtype=np.float64, ): - # The fast squared euclidean strategy must return results - # that are close to the ones obtained with the euclidean distance - if n_samples < k: - pytest.skip( - f"Skipping as n_samples (={n_samples}) < k (={k})", - allow_module_level=True, - ) - rng = np.random.RandomState(seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index f43d2a47231a1..4b2261ad7c2f4 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,7 +3,6 @@ """ import pkgutil import inspect -from distutils.version import LooseVersion from importlib import import_module from operator import itemgetter from collections.abc import Sequence @@ -97,7 +96,7 @@ def _in_unstable_openblas_configuration(): # OpenBLAS 0.3.16 fixed unstability for arm64, see: # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa - openblas_arm64_stable_version = LooseVersion("0.3.16") + openblas_arm64_stable_version = parse_version("0.3.16") for info in modules_info: if info["internal_api"] != "openblas": continue @@ -108,7 +107,7 @@ def _in_unstable_openblas_configuration(): return True if ( openblas_architecture == "neoversen1" - and openblas_version < openblas_arm64_stable_version + and parse_version(openblas_version) < openblas_arm64_stable_version ): # See discussions in https://github.com/numpy/numpy/issues/19411 return True From d2396a728faa9d226e3d301f27aa1d714a256680 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:21:19 +0100 Subject: [PATCH 268/290] Rename variable and fix docstring for the simultaneous swap Co-authored-by: Christian Lorentzen --- sklearn/utils/_heap.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 32e39ff37ff13..01808f4154212 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -9,15 +9,15 @@ from cython cimport floating, 
integral, numeric from ._typedefs cimport ITYPE_t cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """Swap the values at inex i1 and i2 of both darr and iarr""" - cdef floating dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp + ITYPE_t a, ITYPE_t b) nogil: + """Swap the values at index i1 and i2 of both darr and iarr""" + cdef floating dtmp = darr[a] + darr[a] = darr[b] + darr[b] = dtmp + + cdef ITYPE_t itmp = iarr[a] + iarr[a] = iarr[b] + iarr[b] = itmp cdef int simultaneous_sort( floating* values, From 02a0e922b042849a2737bcd49dc9391224999d60 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:32:58 +0100 Subject: [PATCH 269/290] fixup! MAINT Raise UserWarning when uneeded metric_params are specified This raises a warning correctly. --- sklearn/metrics/_pairwise_distances_reduction.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index b803959a49c96..1698f48db666e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -13,9 +13,9 @@ # Furthermore, the chunking strategy is also used to leverage OpenMP-based parallelism # (using Cython prange loops) which gives another multiplicative speed-up in # favorable cases on many-core machines. - cimport numpy as np import numpy as np +import warnings import scipy.sparse from .. import get_config @@ -833,9 +833,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): metric_kwargs=None, ): if metric_kwargs is not None and len(metric_kwargs) > 0: - raise UserWarning( + warnings.warn( f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't" - f"usable for this case ({self.__class__.__name__}) and will be ignored." + f"usable for this case ({self.__class__.__name__}) and will be ignored.", + UserWarning, + stacklevel=3, ) super().__init__( From e2e5282a6d9e3e9ecc276544f793c0e438fbcc53 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:40:47 +0100 Subject: [PATCH 270/290] fixup! DOC Clarify docstrings and comments --- sklearn/metrics/pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index e53e6be053328..c65a9e4f25d2a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -625,7 +625,7 @@ def pairwise_distances_argmin_min( Valid values for metric are: - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'fast_euclidean', - 'fast_euclidean', 'l1', 'l2', 'manhattan'] + 'fast_sqeuclidean', 'l1', 'l2', 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', @@ -741,7 +741,7 @@ def pairwise_distances_argmin( Valid values for metric are: - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'fast_euclidean', - 'fast_euclidean', 'l1', 'l2', 'manhattan'] + 'fast_sqeuclidean', 'l1', 'l2', 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', From 1b77d2f9b66e9384f1db30856ce29b383c6de747 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 30 Nov 2021 17:43:00 +0100 Subject: [PATCH 271/290] fixup! 
MAINT Drop unneeded Cython directive --- sklearn/utils/_heap.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 01808f4154212..89fc779877c4f 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -1,9 +1,3 @@ -# cython: boundscheck=False -# cython: cdivision=True -# cython: initializedcheck=False -# cython: wraparound=False - - from cython cimport floating, integral, numeric from ._typedefs cimport ITYPE_t From f9037f05b2b6607de2c98ef152955ded5d88cd18 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Dec 2021 11:50:00 +0100 Subject: [PATCH 272/290] [WIP] Rework PairwiseDistancesArgKmin.compute --- .../metrics/_pairwise_distances_reduction.pyx | 64 ++++++-- sklearn/metrics/pairwise.py | 24 ++- .../test_pairwise_distances_reduction.py | 139 +++++++++++------- sklearn/neighbors/_base.py | 3 +- 4 files changed, 156 insertions(+), 74 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 1698f48db666e..59585d9f6ebc4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -205,7 +205,7 @@ cdef class PairwiseDistancesReduction: self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) - def compute( + def _compute( self, str strategy=None, bint return_distance=False, @@ -520,7 +520,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t ** heaps_indices_chunks @classmethod - def get_for( + def compute( cls, X, Y, @@ -529,8 +529,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): chunk_size=None, dict metric_kwargs=None, n_threads=None, - ) -> PairwiseDistancesArgKmin: - """Return the PairwiseDistancesArgKmin implementation for the given arguments. + str strategy=None, + bint return_distance=False, + ): + """Return the results of the reduction for the given arguments. Parameters ---------- @@ -566,26 +568,60 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): See _openmp_effective_n_threads, for details about the specification of n_threads. + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + + - 'parallel_on__X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on__X' and 'parallel_on_Y'. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + Returns ------- - argkmin: PairwiseDistancesArgKmin - The suited PairwiseDistancesArgKmin implementation. + Indices of argkmin for each vector in X and its associated distances + if return_distance=True. """ - # This factory comes to handle specialisations. 
- if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): + # Note (jjerphan): Some design thoughts for future extensions. + # This factory comes to handle specialisations for the given arguments. + # For future work, this might can be an entrypoint to specialise operations + # for various back-end and/or hardware and/or datatypes, and/or fused + # {sparse, dense}-datasetspair etc. + if ( + metric in ("fast_euclidean", "fast_sqeuclidean") + and not issparse(X) + and not issparse(Y) + ): use_squared_distances = metric == "fast_sqeuclidean" - return FastEuclideanPairwiseDistancesArgKmin( + pda = FastEuclideanPairwiseDistancesArgKmin( X=X, Y=Y, k=k, use_squared_distances=use_squared_distances, chunk_size=chunk_size, metric_kwargs=metric_kwargs, ) + else: # Fall back on the default + pda = PairwiseDistancesArgKmin( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + ) - return PairwiseDistancesArgKmin( - datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - ) + return pda._compute(strategy=strategy, return_distance=return_distance) def __init__( self, diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c65a9e4f25d2a..0b84508a25415 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -669,9 +669,15 @@ def pairwise_distances_argmin_min( metric_kwargs = {} if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): - values, indices = PairwiseDistancesArgKmin.get_for( - X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs - ).compute(strategy="auto", return_distance=True) + values, indices = PairwiseDistancesArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=True, + ) values = values.flatten() indices = indices.flatten() else: @@ -784,9 +790,15 @@ def pairwise_distances_argmin( metric_kwargs = {} if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): - indices = PairwiseDistancesArgKmin.get_for( - X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs - ).compute(strategy="auto", return_distance=False) + indices = PairwiseDistancesArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=False, + ) indices = indices.flatten() else: # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index eba3686217d97..27aeb0f5df401 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -4,10 +4,6 @@ from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import csr_matrix -from sklearn.metrics._dist_metrics import ( - DenseDenseDatasetsPair, -) - from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, PairwiseDistancesArgKmin, @@ -113,33 +109,33 @@ def test_argkmin_factory_method_wrong_usages(): with pytest.raises( ValueError, match="Only 64bit float datasets are supported for X and Y." ): - PairwiseDistancesArgKmin.get_for( + PairwiseDistancesArgKmin.compute( X=X.astype(np.float32), Y=Y, k=k, metric=metric ) with pytest.raises( ValueError, match="Only 64bit float datasets are supported for X and Y." 
): - PairwiseDistancesArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) with pytest.raises(ValueError, match="k == -1, must be >= 1."): - PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=-1, metric=metric) with pytest.raises(ValueError, match="k == 0, must be >= 1."): - PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=0, metric=metric) + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=0, metric=metric) with pytest.raises(ValueError, match="Unrecognized metric"): - PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric") with pytest.raises( ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" ): - PairwiseDistancesArgKmin.get_for( + PairwiseDistancesArgKmin.compute( X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric ) with pytest.raises(ValueError, match="ndarray is not C-contiguous"): - PairwiseDistancesArgKmin.get_for( + PairwiseDistancesArgKmin.compute( X=np.asfortranarray(X), Y=Y, k=k, metric=metric ) @@ -164,32 +160,33 @@ def test_pairwise_distances_reduction_factory_method( # Dummy value for k or radius dummy_arg = 5 - dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric) - assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair) - with pytest.raises( ValueError, match="Only dense datasets are supported for X and Y." ): - PairwiseDistancesReduction.get_for( - csr_matrix(X), csr_matrix(Y), dummy_arg, metric + PairwiseDistancesReduction.compute( + csr_matrix(X), + csr_matrix(Y), + dummy_arg, + metric, ) with pytest.raises( ValueError, match="Only dense datasets are supported for X and Y." ): - PairwiseDistancesReduction.get_for(X, csr_matrix(Y), dummy_arg, metric=metric) + PairwiseDistancesReduction.compute(X, csr_matrix(Y), dummy_arg, metric=metric) with pytest.raises( ValueError, match="Only dense datasets are supported for X and Y." 
): - PairwiseDistancesReduction.get_for(csr_matrix(X), Y, dummy_arg, metric=metric) + PairwiseDistancesReduction.compute(csr_matrix(X), Y, dummy_arg, metric=metric) # Test specialisations creation - fast_euclidean_instance = PairwiseDistancesReduction.get_for( - X, Y, dummy_arg, metric="fast_euclidean" + PairwiseDistancesReduction.compute( + X, + Y, + dummy_arg, + metric="fast_euclidean", ) - assert isinstance(fast_euclidean_instance, PairwiseDistancesReduction) - assert isinstance(fast_euclidean_instance, FastPairwiseDistancesReduction) @fails_if_unstable_openblas @@ -222,13 +219,22 @@ def test_chunk_size_agnosticism( else 10 ** np.log(n_features) ) - ref_dist, ref_indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="euclidean" - ).compute(return_distance=True) + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric="euclidean", + return_distance=True, + ) - dist, indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric=metric, chunk_size=chunk_size - ).compute(return_distance=True) + dist, indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + chunk_size=chunk_size, + return_distance=True, + ) ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @@ -262,13 +268,22 @@ def test_n_threads_agnosticism( else 10 ** np.log(n_features) ) - ref_dist, ref_indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="fast_euclidean" - ).compute(return_distance=True) + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric="fast_euclidean", + return_distance=True, + ) - dist, indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="fast_euclidean", n_threads=1 - ).compute(return_distance=True) + dist, indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric="fast_euclidean", + n_threads=1, + return_distance=True, + ) ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @@ -314,7 +329,7 @@ def test_strategies_consistency( else 10 ** np.log(n_features) ) - pairwise_distances_reduction = PairwiseDistancesReduction.get_for( + dist_par_X, indices_par_X = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -323,18 +338,28 @@ def test_strategies_consistency( metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], # To be sure to use parallelization chunk_size=n_samples // 4, + strategy="parallel_on_X", + return_distance=True, ) - dist_par_X, indices_par_X = pairwise_distances_reduction.compute( - strategy="parallel_on_X", return_distance=True - ) - - dist_par_Y, indices_par_Y = pairwise_distances_reduction.compute( - strategy="parallel_on_Y", return_distance=True + dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + # To be sure to use parallelization + chunk_size=n_samples // 4, + strategy="parallel_on_Y", + return_distance=True, ) ASSERT_RESULT[PairwiseDistancesReduction]( - dist_par_X, dist_par_Y, indices_par_X, indices_par_Y + dist_par_X, + dist_par_Y, + indices_par_X, + indices_par_Y, ) @@ -354,12 +379,20 @@ def test_fast_sqeuclidean_correctness( X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread - eucl_dist, eucl_indices = PairwiseDistancesArgKmin.get_for( - X, Y, k, metric="euclidean" - ).compute(return_distance=True) - fse_dist, 
fse_indices = PairwiseDistancesArgKmin.get_for( - X, Y, k, metric="fast_euclidean" - ).compute(return_distance=True) + eucl_dist, eucl_indices = PairwiseDistancesArgKmin.compute( + X, + Y, + k, + metric="euclidean", + return_distance=True, + ) + fse_dist, fse_indices = PairwiseDistancesArgKmin.compute( + X, + Y, + k, + metric="fast_euclidean", + return_distance=True, + ) assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) @@ -398,21 +431,23 @@ def test_fast_sqeuclidean_translation_invariance( X = np.ascontiguousarray(X[:, :2]) Y = np.ascontiguousarray(Y[:, :2]) - reference_dist, reference_indices = PairwiseDistancesReduction.get_for( + reference_dist, reference_indices = PairwiseDistancesReduction.compute( X, Y, parameter, metric=metric, metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], - ).compute(return_distance=True) + return_distance=True, + ) - dist, indices = PairwiseDistancesReduction.get_for( + dist, indices = PairwiseDistancesReduction.compute( X + 0, Y + 0, parameter, metric=metric, metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], - ).compute(return_distance=True) + return_distance=True, + ) ASSERT_RESULT[PairwiseDistancesReduction]( reference_dist, dist, reference_indices, indices diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ac10a8768023a..c1e0fbae1d580 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -786,14 +786,13 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None if use_pairwise_distances_reductions: - results = PairwiseDistancesArgKmin.get_for( + results = PairwiseDistancesArgKmin.compute( X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_, metric_kwargs=self.effective_metric_params_, n_threads=self.n_jobs, - ).compute( strategy="auto", return_distance=return_distance, ) From 6983c3215497e8b9c1040ce49b2d736f32ef957d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Dec 2021 12:16:33 +0100 Subject: [PATCH 273/290] DOC Add notes for `PairwiseDistancesArgKmin.compute` Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 59585d9f6ebc4..4e94a2c3ce343 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -596,6 +596,19 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ------- Indices of argkmin for each vector in X and its associated distances if return_distance=True. + + Notes + ----- + This public classmethod is responsible of introspecting the arguments + values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute` + instance method of the most appropriate :class:`PairwiseDistancesArgKmin` + concrete implementation. + + All temporarily allocated datastructures necessary for the concrete + implementation are therefore freed when this classmethod returns. + + This allows entirely decoupling the interface entirely from the + implementation details whilst maintaining RAII. """ # Note (jjerphan): Some design thoughts for future extensions. # This factory comes to handle specialisations for the given arguments. 
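With this rework, callers no longer hold on to a reduction instance: a single classmethod call performs the dispatch and returns the results. A usage sketch against the API as it stands at this point of the series (the shapes and k are arbitrary; note that a later patch removes the 'fast_euclidean' alias again):

    import numpy as np

    from sklearn.metrics._pairwise_distances_reduction import (
        PairwiseDistancesArgKmin,
    )

    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)  # C-contiguous float64, as required
    Y = rng.rand(500, 10)

    # One call: argument introspection, dispatch to the specialised
    # implementation, and release of all temporary datastructures.
    dist, indices = PairwiseDistancesArgKmin.compute(
        X=X,
        Y=Y,
        k=3,
        metric="fast_euclidean",
        strategy="auto",
        return_distance=True,
    )

    assert dist.shape == indices.shape == (100, 3)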
From 449545c586f5b33cd674505d75f29c5806e1171a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 1 Dec 2021 17:08:27 +0100 Subject: [PATCH 274/290] DOC Better word Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 4e94a2c3ce343..d1f9f3f169503 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -599,7 +599,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Notes ----- - This public classmethod is responsible of introspecting the arguments + This public classmethod is responsible for introspecting the arguments values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute` instance method of the most appropriate :class:`PairwiseDistancesArgKmin` concrete implementation. From 0915a360d779f322723ce254fd11439bf72fa5ea Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Dec 2021 15:57:23 +0100 Subject: [PATCH 275/290] Remove 'fast_sqeuclidean' and 'fast_euclidean' --- sklearn/cluster/_affinity_propagation.py | 4 - sklearn/cluster/_birch.py | 4 - sklearn/cluster/_mean_shift.py | 4 - .../metrics/_pairwise_distances_reduction.pyx | 14 +-- sklearn/metrics/pairwise.py | 46 ++----- sklearn/metrics/tests/test_pairwise.py | 14 --- .../test_pairwise_distances_reduction.py | 56 +++------ sklearn/neighbors/_base.py | 42 ------- sklearn/neighbors/tests/test_neighbors.py | 115 ------------------ 9 files changed, 29 insertions(+), 270 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 89ee4370f81e6..3cf67427e49e4 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -524,12 +524,8 @@ def predict(self, X): if self.cluster_centers_.shape[0] > 0: with config_context(assume_finite=True): return pairwise_distances_argmin( - # We use the fast squared euclidean metric alternative to get - # maximum acceleration as we are not concerned with the minimum - # values but only their indices. X, self.cluster_centers_, - metric="fast_sqeuclidean", ) else: warnings.warn( diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 867f0da2bf84b..cc13d8aa699a3 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -686,10 +686,6 @@ def _predict(self, X): argmin = pairwise_distances_argmin( X, self.subcluster_centers_, - # We use the fast squared euclidean metric alternative to get - # maximum acceleration as we are not concerned with the minimum - # values but only their indices. - metric="fast_sqeuclidean", metric_kwargs=metric_kwargs, ) return self.subcluster_labels_[argmin] diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 80d275ee3e60e..7ba7bf35a3f68 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -513,10 +513,6 @@ def predict(self, X): X = self._validate_data(X, reset=False) with config_context(assume_finite=True): return pairwise_distances_argmin( - # We use the fast squared euclidean metric alternative to get - # maximum acceleration as we are not concerned with the minimum - # values but only their indices. 
X, self.cluster_centers_, - metric="fast_sqeuclidean", ) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 1698f48db666e..f5fde8fbfe1a4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -140,8 +140,7 @@ cdef class PairwiseDistancesReduction: "hamming", *BOOL_METRICS, } - return sorted({"fast_euclidean", "fast_sqeuclidean", - *METRIC_MAPPING.keys()}.difference(excluded)) + return sorted(set(METRIC_MAPPING.keys()).difference(excluded)) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -525,7 +524,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): X, Y, ITYPE_t k, - str metric="fast_euclidean", + str metric="euclidean", chunk_size=None, dict metric_kwargs=None, n_threads=None, @@ -543,9 +542,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): k : int The k for the argkmin reduction. - metric : str, default='fast_euclidean' - The distance metric to use for argkmin. The default metric is - a fast implementation of the standard Euclidean metric. + metric : str, default='euclidean' + The distance metric to use for argkmin. For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. @@ -572,8 +570,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): The suited PairwiseDistancesArgKmin implementation. """ # This factory comes to handle specialisations. - if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): - use_squared_distances = metric == "fast_sqeuclidean" + if metric in ("euclidean", "sqeuclidean") and not issparse(X) and not issparse(Y): + use_squared_distances = metric == "sqeuclidean" return FastEuclideanPairwiseDistancesArgKmin( X=X, Y=Y, k=k, use_squared_distances=use_squared_distances, diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c65a9e4f25d2a..e322b32be4155 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -584,7 +584,7 @@ def _argmin_reduce(dist, start): def pairwise_distances_argmin_min( - X, Y, *, axis=1, metric="fast_euclidean", metric_kwargs=None + X, Y, *, axis=1, metric="euclidean", metric_kwargs=None ): """Compute minimum distances between one point and a set of points. @@ -610,7 +610,7 @@ def pairwise_distances_argmin_min( axis : int, default=1 Axis along which the argmin and distances are to be computed. - metric : str or callable, default="fast_euclidean" + metric : str or callable, default="euclidean" Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -624,8 +624,8 @@ def pairwise_distances_argmin_min( Valid values for metric are: - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'fast_euclidean', - 'fast_sqeuclidean', 'l1', 'l2', 'manhattan'] + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', @@ -633,13 +633,6 @@ def pairwise_distances_argmin_min( 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] - 'fast_euclidean' (the default metric) is a variant of the 'euclidean' - metric which has a superior arithmetic intensity and hence better - running time. However it can suffer from numerical instability caused - by catastrophic cancellation in rare configuration. 
- Hence when exact results are mandatory, 'euclidean' should be preferred. - The same remark applies for 'fast_sqeuclidean' regarding 'sqeuclidean'. - See the documentation for scipy.spatial.distance for details on these metrics. @@ -677,13 +670,6 @@ def pairwise_distances_argmin_min( else: # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. - # - # When PairwiseDistancesArgKmin is not supported and when the user - # asked for a fast alternative, we need to revert to the standard - # "euclidean" strategy to match the API. - # Internally, the "euclidean" strategy still uses the GEMM trick. - if metric in ("fast_euclidean", "fast_sqeuclidean"): - metric = metric.replace("fast_", "") # Turn off check for finiteness because this is costly and because arrays # have already been validated. @@ -699,9 +685,7 @@ def pairwise_distances_argmin_min( return indices, values -def pairwise_distances_argmin( - X, Y, *, axis=1, metric="fast_euclidean", metric_kwargs=None -): +def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. This function computes for each row in X, the index of the row of Y which @@ -726,7 +710,7 @@ def pairwise_distances_argmin( axis : int, default=1 Axis along which the argmin and distances are to be computed. - metric : str or callable, default="fast_euclidean" + metric : str or callable, default="euclidean" Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -740,8 +724,8 @@ def pairwise_distances_argmin( Valid values for metric are: - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'fast_euclidean', - 'fast_sqeuclidean', 'l1', 'l2', 'manhattan'] + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', @@ -749,13 +733,6 @@ def pairwise_distances_argmin( 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] - 'fast_euclidean' (the default metric) is a variant of the 'euclidean' - metric which has a superior arithmetic intensity and hence better - running time. However it can suffer from numerical instability caused - by catastrophic cancellation in rare configuration. - Hence when exact results are mandatory, 'euclidean' should be preferred. - The same remark applies for 'fast_sqeuclidean' regarding 'sqeuclidean'. - See the documentation for scipy.spatial.distance for details on these metrics. @@ -791,13 +768,6 @@ def pairwise_distances_argmin( else: # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, # we won't need to fallback to pairwise_distances_chunked anymore. - # - # When PairwiseDistancesArgKmin is not supported and when the user - # asked for a fast alternative, we need to revert to the standard one. - # "euclidean" strategy to match the API. - # Internally, the "euclidean" strategy still uses the GEMM trick. - if metric in ("fast_euclidean", "fast_sqeuclidean"): - metric = metric.replace("fast_", "") # Turn off check for finiteness because this is costly and because arrays # have already been validated. 
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 3f65b6d6ea09d..e3989a9f985c6 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -464,20 +464,6 @@ def test_pairwise_distances_argmin_min(): assert_array_almost_equal(vals, expected_vals_sq) assert_array_almost_equal(idx2, expected_idx) - # Fast Euclidean metric - idx, vals = pairwise_distances_argmin_min(X, Y, metric="fast_euclidean") - idx2 = pairwise_distances_argmin(X, Y, metric="fast_euclidean") - assert_array_almost_equal(idx, expected_idx) - assert_array_almost_equal(vals, expected_vals) - assert_array_almost_equal(idx2, expected_idx) - - # Fast Squared Euclidean metric - idx, vals = pairwise_distances_argmin_min(X, Y, metric="fast_sqeuclidean") - idx2 = pairwise_distances_argmin(X, Y, metric="fast_sqeuclidean") - assert_array_almost_equal(idx, expected_idx) - assert_array_almost_equal(vals, expected_vals_sq) - assert_array_almost_equal(idx2, expected_idx) - # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") idx2 = pairwise_distances_argmin(X, Y, metric="manhattan") diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index eba3686217d97..dcab6920968f8 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -185,11 +185,11 @@ def test_pairwise_distances_reduction_factory_method( PairwiseDistancesReduction.get_for(csr_matrix(X), Y, dummy_arg, metric=metric) # Test specialisations creation - fast_euclidean_instance = PairwiseDistancesReduction.get_for( - X, Y, dummy_arg, metric="fast_euclidean" + euclidean_instance = PairwiseDistancesReduction.get_for( + X, Y, dummy_arg, metric="euclidean" ) - assert isinstance(fast_euclidean_instance, PairwiseDistancesReduction) - assert isinstance(fast_euclidean_instance, FastPairwiseDistancesReduction) + assert isinstance(euclidean_instance, PairwiseDistancesReduction) + assert isinstance(euclidean_instance, FastPairwiseDistancesReduction) @fails_if_unstable_openblas @@ -205,7 +205,6 @@ def test_chunk_size_agnosticism( seed, n_samples, chunk_size, - metric="fast_euclidean", n_features=100, dtype=np.float64, ): @@ -223,11 +222,13 @@ def test_chunk_size_agnosticism( ) ref_dist, ref_indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="euclidean" + X, + Y, + parameter, ).compute(return_distance=True) dist, indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric=metric, chunk_size=chunk_size + X, Y, parameter, chunk_size=chunk_size ).compute(return_distance=True) ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @@ -263,11 +264,13 @@ def test_n_threads_agnosticism( ) ref_dist, ref_indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="fast_euclidean" + X, + Y, + parameter, ).compute(return_distance=True) dist, indices = PairwiseDistancesReduction.get_for( - X, Y, parameter, metric="fast_euclidean", n_threads=1 + X, Y, parameter, n_threads=1 ).compute(return_distance=True) ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) @@ -289,12 +292,9 @@ def test_strategies_consistency( dtype=np.float64, ): # Results obtained using both parallelization strategies must be identical - if _in_unstable_openblas_configuration() and metric == { - "fast_sqeuclidean", - "fast_euclidean", - }: + if 
_in_unstable_openblas_configuration() and metric in ("sqeuclidean", "euclidean"): pytest.xfail( - "OpenBLAS (used for 'fast_(sq)euclidean') is unstable in this configuration" + "OpenBLAS (used for '(sq)euclidean') is unstable in this configuration" ) rng = np.random.RandomState(seed) @@ -338,32 +338,6 @@ def test_strategies_consistency( ) -@fails_if_unstable_openblas -@pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -def test_fast_sqeuclidean_correctness( - seed, - n_samples, - n_features, - k=50, - dtype=np.float64, -): - rng = np.random.RandomState(seed) - spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread - Y = rng.rand(n_samples, n_features).astype(dtype) * spread - - eucl_dist, eucl_indices = PairwiseDistancesArgKmin.get_for( - X, Y, k, metric="euclidean" - ).compute(return_distance=True) - fse_dist, fse_indices = PairwiseDistancesArgKmin.get_for( - X, Y, k, metric="fast_euclidean" - ).compute(return_distance=True) - - assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) - - @fails_if_unstable_openblas @pytest.mark.parametrize("n_features", [50, 500]) @pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) @@ -372,7 +346,7 @@ def test_fast_sqeuclidean_correctness( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin], ) -def test_fast_sqeuclidean_translation_invariance( +def test_euclidean_translation_invariance( n_features, translation, metric, diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ac10a8768023a..22fe1ed54920b 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -53,8 +53,6 @@ "correlation", "cosine", "dice", - "fast_euclidean", - "fast_sqeuclidean", "hamming", "jaccard", "kulsinski", @@ -368,20 +366,6 @@ def _check_algorithm_metric(self): else: alg_check = self.algorithm - if alg_check != "brute" and self._metric in ( - "fast_sqeuclidean", - "fast_euclidean", - ): - alternative = self._metric.replace("fast_", "") - warnings.warn( - f"'{self._metric}' is only available for algorithm='brute' but" - f" algorithm='{self.algorithm}' is used. Falling back on" - f" metric='{alternative}'.", - UserWarning, - stacklevel=3, - ) - self._metric = alternative - if callable(self._metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree @@ -524,10 +508,6 @@ def _fit(self, X, y=None): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") - if self._metric in ("fast_sqeuclidean", "fast_euclidean"): - # The fast alternatives are only available for dense datasets. - self.effective_metric_ = self.effective_metric_.replace("fast_", "") - if self.effective_metric_ not in VALID_METRICS_SPARSE[ "brute" ] and not callable(self.effective_metric_): @@ -571,8 +551,6 @@ def _fit(self, X, y=None): else: self._fit_method = "brute" - specialised_metrics = {"euclidean", "sqeuclidean"} - if self._fit_method == "ball_tree": self._tree = BallTree( X, @@ -588,16 +566,6 @@ def _fit(self, X, y=None): **self.effective_metric_params_, ) elif self._fit_method == "brute": - if ( - self.effective_metric_ in specialised_metrics - and self._metric not in specialised_metrics - # TODO: remove this condition once PairwiseDistancesRadiusNeighbors - # has been introduced. 
- and isinstance(self, KNeighborsMixin) - ): - # In that case, the standard stabler metric has not been explicitly - # specified by the user, so we prefer its fast alternative. - self.effective_metric_ = f"fast_{self.effective_metric_}" self._tree = None else: raise ValueError("algorithm = '%s' not recognized" % self.algorithm) @@ -809,11 +777,6 @@ class from an array representing our data set and ask who's elif self._fit_method == "brute": # TODO: support sparse matrices - # When PairwiseDistancesArgKmin is not supported and when the user ask for a - # fast alternative, we need to revert to the standard. - if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): - # The fast alternatives are only available for dense datasets. - self.effective_metric_ = self.effective_metric_.replace("fast_", "") reduce_func = partial( self._kneighbors_reduce_func, @@ -1125,11 +1088,6 @@ class from an array representing our data set and ask who's elif self._fit_method == "brute": # TODO: support sparse matrices - # When PairwiseDistancesRadiusNeighborhood is not supported and when - # the user ask for a fast alternative, we need to revert to the standard. - if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): - # The fast alternatives are only available for dense datasets. - self.effective_metric_ = self.effective_metric_.replace("fast_", "") # for efficiency, use squared euclidean distances if self.effective_metric_ == "euclidean": diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 4cf77138be937..e811fbee9b302 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -3,7 +3,6 @@ import pytest import re import numpy as np -import scipy from scipy.sparse import ( bsr_matrix, coo_matrix, @@ -215,120 +214,6 @@ def test_neigh_predictions_algorithm_agnosticity( ) -@pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) -@pytest.mark.parametrize( - "NeighborsMixinSubclass", - [ - neighbors.KNeighborsClassifier, - neighbors.KNeighborsRegressor, - ], -) -def test_neighs_predictions_fast_euclidean_correctness( - n_samples, - n_features, - n_neighbors, - radius, - NeighborsMixinSubclass, - dtype=np.float64, -): - # The fast euclidean strategy must return results - # that are close to the ones obtained with the euclidean distance - rng = np.random.RandomState(0) - X = rng.rand(n_samples, n_features).astype(dtype) - y = rng.randint(3, size=n_samples) - - parameter = ( - n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius - ) - - euclidean_est = NeighborsMixinSubclass( - parameter, algorithm="brute", metric="euclidean" - ).fit(X, y) - euclidean_pred = euclidean_est.predict(X) - - fast_euclidean_clf = NeighborsMixinSubclass( - parameter, algorithm="brute", metric="fast_euclidean" - ).fit(X, y) - fast_euclidean_pred = fast_euclidean_clf.predict(X) - - assert_allclose(euclidean_pred, fast_euclidean_pred) - - -@pytest.mark.parametrize( - "KNeighborsEstimator", - [ - neighbors.KNeighborsClassifier, - neighbors.KNeighborsRegressor, - ], -) -@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) -def test_knn_prediction_fast_alternatives_fall_back_on_tree( - KNeighborsEstimator, - algorithm, - specified_metric="fast_euclidean", - fall_back_metric="euclidean", - parameter=10, - n_samples=1000, - n_features=100, - dtype=np.float64, 
-):
-    # The fast euclidean metric can't be used on "kd_tree", "ball_tree".
-    rng = np.random.RandomState(0)
-    X = rng.rand(n_samples, n_features).astype(dtype)
-    y = rng.randint(3, size=n_samples)
-
-    est = KNeighborsEstimator(
-        parameter,
-        algorithm=algorithm,
-        metric=specified_metric,
-    )
-    with pytest.warns(
-        UserWarning,
-        match=(
-            f"'{specified_metric}' is only available for algorithm='brute' but "
-            f"algorithm='{algorithm}' is used. Falling "
-            f"back on metric='{fall_back_metric}'."
-        ),
-    ):
-        est.fit(X, y)
-
-    assert est.metric == specified_metric
-    assert est._metric == fall_back_metric
-    assert est.effective_metric_ == fall_back_metric
-
-
-@pytest.mark.parametrize(
-    "KNeighborsEstimator",
-    [
-        neighbors.KNeighborsClassifier,
-        neighbors.KNeighborsRegressor,
-    ],
-)
-def test_knn_prediction_fast_alternatives_fall_back_on_sparse(
-    KNeighborsEstimator,
-    specified_metric="fast_euclidean",
-    fall_back_metric="euclidean",
-    parameter=10,
-    n_samples=1000,
-    n_features=100,
-    dtype=np.float64,
-):
-    # The fast euclidean metric can't be used on sparse datasets.
-    rng = np.random.RandomState(0)
-    X = scipy.sparse.random(n_samples, n_features, density=0.25, random_state=rng)
-    y = rng.randint(3, size=n_samples)
-
-    est = KNeighborsEstimator(
-        parameter,
-        algorithm="brute",
-        metric=specified_metric,
-    )
-    est.fit(X, y)
-    assert est.effective_metric_ == fall_back_metric
-
-
 @pytest.mark.parametrize(
     "KNeighborsMixinSubclass",
     [

From 048b958e5856866037edb79d8ecf4c1b4c26c3ed Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 15 Dec 2021 17:53:40 +0100
Subject: [PATCH 276/290] Revert unneeded changes

Dear Santa,

My name is Julien.
I have been nice this year (I think).
My wish for Christmas is to have this PR merged in main.

Thank you,
Julien.
---
 sklearn/cluster/_affinity_propagation.py | 5 +----
 sklearn/cluster/_birch.py                | 9 +++------
 sklearn/cluster/_mean_shift.py           | 5 +----
 sklearn/metrics/pairwise.py              | 2 +-
 sklearn/metrics/tests/test_pairwise.py   | 3 +--
 5 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py
index 3cf67427e49e4..e6cdfd8946325 100644
--- a/sklearn/cluster/_affinity_propagation.py
+++ b/sklearn/cluster/_affinity_propagation.py
@@ -523,10 +523,7 @@ def predict(self, X):
         if self.cluster_centers_.shape[0] > 0:
             with config_context(assume_finite=True):
-                return pairwise_distances_argmin(
-                    X,
-                    self.cluster_centers_,
-                )
+                return pairwise_distances_argmin(X, self.cluster_centers_)
         else:
             warnings.warn(
                 "This model does not have any cluster centers "
diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py
index cc13d8aa699a3..8e86d8dd6ba08 100644
--- a/sklearn/cluster/_birch.py
+++ b/sklearn/cluster/_birch.py
@@ -680,13 +680,11 @@ def predict(self, X):
 
     def _predict(self, X):
         """Predict data using the ``centroids_`` of subclusters."""
-        metric_kwargs = {"Y_norm_squared": self._subcluster_norms}
+        kwargs = {"Y_norm_squared": self._subcluster_norms}
 
         with config_context(assume_finite=True):
             argmin = pairwise_distances_argmin(
-                X,
-                self.subcluster_centers_,
-                metric_kwargs=metric_kwargs,
+                X, self.subcluster_centers_, metric_kwargs=kwargs
             )
 
         return self.subcluster_labels_[argmin]
@@ -732,8 +730,7 @@ def _global_clustering(self, X=None):
                 "n_clusters should be an instance of ClusterMixin or an int"
             )
 
-        # We compute subcluster norms once here, so that we won't need to compute it
-        # again at each call of `Birch.predict`.
+        # To use in predict to avoid recalculation.
self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) if clusterer is None or not_enough_centroids: diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 7ba7bf35a3f68..f9ecf1ace0c21 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -512,7 +512,4 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, reset=False) with config_context(assume_finite=True): - return pairwise_distances_argmin( - X, - self.cluster_centers_, - ) + return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index e322b32be4155..2ca10b71b80dc 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -610,7 +610,7 @@ def pairwise_distances_argmin_min( axis : int, default=1 Axis along which the argmin and distances are to be computed. - metric : str or callable, default="euclidean" + metric : str or callable, default='euclidean' Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index e3989a9f985c6..dd839166eb397 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -442,13 +442,12 @@ def test_pairwise_distances_argmin_min(): expected_vals = [2, 2] expected_vals_sq = [4, 4] - # Euclidean metric + # euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") idx2 = pairwise_distances_argmin(X, Y, metric="euclidean") assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(idx2, expected_idx) assert_array_almost_equal(vals, expected_vals) - # sparse matrix case idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") assert_array_almost_equal(idxsp, expected_idx) From 7485f4495b1edcf71e2712c01a52a6cfcbe89944 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Dec 2021 20:18:04 +0100 Subject: [PATCH 277/290] DOC Correct typos --- sklearn/metrics/_pairwise_distances_reduction.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 6e0cb593ee0f8..3f7609aa96857 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -218,7 +218,7 @@ cdef class PairwiseDistancesReduction: Strategies differs on the dispatching they use for chunks on threads: - - 'parallel_on__X' dispatches chunks of X uniformly on threads. + - 'parallel_on_X' dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is embarrassingly parallel and comes with no datastructures synchronisation. @@ -228,7 +228,7 @@ cdef class PairwiseDistancesReduction: synchronisation. - 'auto' relies on a simple heuristic to choose between - 'parallel_on__X' and 'parallel_on_Y'. + 'parallel_on_X' and 'parallel_on_Y'. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. 
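The following patch moves the choice of strategy from the compute step to `__init__`. The 'auto' heuristic it introduces is simple enough to sketch in pure Python for review purposes; the constant 4 and the strategy names below mirror the patched Cython code, while the default values for `chunk_size` and `effective_n_threads` are illustrative assumptions only, not the library's actual defaults:

    def choose_strategy(n_samples_X, chunk_size=256, effective_n_threads=8):
        # Parallelize over chunks of X only when X is large enough to keep
        # every thread busy with several whole chunks; otherwise iterate
        # over chunks of Y in parallel instead.
        if 4 * chunk_size * effective_n_threads < n_samples_X:
            return "parallel_on_X"
        return "parallel_on_Y"

    # With the assumed defaults, the crossover point sits at 8192 samples:
    assert choose_strategy(n_samples_X=100_000) == "parallel_on_X"
    assert choose_strategy(n_samples_X=5_000) == "parallel_on_Y"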
From 7b6b39925c4a93a6074bc148b5324b29f6e64bcc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 15:07:40 +0100 Subject: [PATCH 278/290] Choose the strategy at initialisation --- .../metrics/_pairwise_distances_reduction.pyx | 87 ++++++++++--------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 3f7609aa96857..972187e60353e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -116,6 +116,26 @@ cdef class PairwiseDistancesReduction: See _openmp_effective_n_threads, for details about the specification of n_threads. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. """ cdef: @@ -128,6 +148,8 @@ cdef class PairwiseDistancesReduction: ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + bint execute_in_parallel_on_Y + @classmethod def valid_metrics(cls) -> List[str]: excluded = { @@ -177,6 +199,7 @@ cdef class PairwiseDistancesReduction: DatasetsPair datasets_pair, chunk_size=None, n_threads=None, + strategy='auto', ): cdef: ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks @@ -204,35 +227,31 @@ cdef class PairwiseDistancesReduction: self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_samples_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + def _compute( self, - str strategy=None, bint return_distance=False, ): """Compute the pairwise distances and the reduction of vectors (rows) of X on Y. Parameters ---------- - strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - Strategies differs on the dispatching they use for chunks on threads: - - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. 
This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. - - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. - - - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y'. - - - None (default) looks-up in scikit-learn configuration for - `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. - return_distance : boolean, default=False Return distances between each X vector and its argkmin if set to True. @@ -243,27 +262,13 @@ cdef class PairwiseDistancesReduction: the samples of Y selected by the reduction function. """ - if strategy is None: - strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') - - if strategy == 'auto': - # This is a simple heuristic whose constant for the - # comparison has been chosen based on experiments. - if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_samples_X: - strategy = 'parallel_on_X' - else: - strategy = 'parallel_on_Y' - # Limit the number of threads in second level of nested parallelism for BLAS # to avoid threads over-subscription (in GEMM for instance). with threadpool_limits(limits=1, user_api="blas"): - if strategy == 'parallel_on_Y': + if self.execute_in_parallel_on_Y: self._parallel_on_Y() - elif strategy == 'parallel_on_X': - self._parallel_on_X() else: - raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " - f"or 'auto', but currently strategy='{strategy}'.") + self._parallel_on_X() return self._finalize_results(return_distance) @@ -623,6 +628,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): X=X, Y=Y, k=k, use_squared_distances=use_squared_distances, chunk_size=chunk_size, + strategy=strategy, metric_kwargs=metric_kwargs, ) else: # Fall back on the default @@ -630,9 +636,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), k=k, chunk_size=chunk_size, + strategy=strategy, ) - return pda._compute(strategy=strategy, return_distance=return_distance) + return pda._compute(return_distance=return_distance) def __init__( self, @@ -640,8 +647,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t k, chunk_size=None, n_threads=None, + strategy='auto', ): - super().__init__(datasets_pair, chunk_size, n_threads) + super().__init__(datasets_pair, chunk_size, n_threads, strategy) self.k = check_scalar(k, "k", Integral, min_val=1) @@ -877,6 +885,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ITYPE_t k, bint use_squared_distances=False, chunk_size=None, + strategy='auto', metric_kwargs=None, ): if metric_kwargs is not None and len(metric_kwargs) > 0: From 5749c0254ac161ef2702036adbd0154efc71a25c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 15:08:09 +0100 Subject: [PATCH 279/290] TST Adapt for the new `compute` interface --- .../test_pairwise_distances_reduction.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index e952447134d9e..b40ec64b895ea 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -180,13 +180,6 @@ def 
test_pairwise_distances_reduction_factory_method(
 ):
         PairwiseDistancesReduction.compute(csr_matrix(X), Y, dummy_arg, metric=metric)
 
-    # Test specialisations creation
-    euclidean_instance = PairwiseDistancesReduction.get_for(
-        X, Y, dummy_arg, metric="euclidean"
-    )
-    assert isinstance(euclidean_instance, PairwiseDistancesReduction)
-    assert isinstance(euclidean_instance, FastPairwiseDistancesReduction)
-
 
 @fails_if_unstable_openblas
 @pytest.mark.parametrize("seed", range(5))
@@ -217,15 +210,20 @@ def test_chunk_size_agnosticism(
         else 10 ** np.log(n_features)
     )
 
-    ref_dist, ref_indices = PairwiseDistancesReduction.get_for(
+    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
         X,
         Y,
         parameter,
-    ).compute(return_distance=True)
+        return_distance=True,
+    )
 
-    dist, indices = PairwiseDistancesReduction.get_for(
-        X, Y, parameter, chunk_size=chunk_size
-    ).compute(return_distance=True)
+    dist, indices = PairwiseDistancesReduction.compute(
+        X,
+        Y,
+        parameter,
+        chunk_size=chunk_size,
+        return_distance=True,
+    )
 
     ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
 
@@ -259,15 +257,16 @@ def test_n_threads_agnosticism(
         else 10 ** np.log(n_features)
     )
 
-    ref_dist, ref_indices = PairwiseDistancesReduction.get_for(
+    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
         X,
         Y,
         parameter,
-    ).compute(return_distance=True)
+        return_distance=True,
+    )
 
-    dist, indices = PairwiseDistancesReduction.get_for(
-        X, Y, parameter, n_threads=1
-    ).compute(return_distance=True)
+    dist, indices = PairwiseDistancesReduction.compute(
+        X, Y, parameter, n_threads=1, return_distance=True
+    )
 
     ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
 

From 2300c5e03e06e5159da05f307984604392add623 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 21 Dec 2021 16:58:36 +0100
Subject: [PATCH 280/290] Distinguish between effective and available threads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This computes the exact number of threads needed (effective
threads) at initialisation.

This makes it possible to allocate exactly the number of pointers
that we need. It also allows removing the `num_threads` parameters
from the callback methods' signatures (as the information is now
accessible via an attribute).

Co-authored-by: Jérémie du Boisberranger
Co-authored-by: Thomas J. 
Fan --- .../metrics/_pairwise_distances_reduction.pyx | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 972187e60353e..edc9dd3cd26a7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -142,7 +142,7 @@ cdef class PairwiseDistancesReduction: readonly DatasetsPair datasets_pair ITYPE_t n_threads - ITYPE_t effective_omp_n_thread + ITYPE_t effective_n_threads, available_n_threads ITYPE_t n_samples_chunk, chunk_size ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder @@ -209,7 +209,7 @@ cdef class PairwiseDistancesReduction: self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) - self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) + self.available_n_threads = _openmp_effective_n_threads(n_threads) self.datasets_pair = datasets_pair @@ -237,13 +237,20 @@ cdef class PairwiseDistancesReduction: if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. - if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_samples_X: + if 4 * self.chunk_size * self.available_n_threads < self.n_samples_X: strategy = 'parallel_on_X' else: strategy = 'parallel_on_Y' self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + # Not using less, not using more. + self.effective_n_threads = min( + self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, + self.available_n_threads, + ) + + def _compute( self, bint return_distance=False, @@ -288,10 +295,9 @@ cdef class PairwiseDistancesReduction: """ cdef: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) ITYPE_t thread_num - with nogil, parallel(num_threads=num_threads): + with nogil, parallel(num_threads=self.effective_n_threads): thread_num = _openmp_thread_num() # Allocating thread datastructures @@ -350,11 +356,10 @@ cdef class PairwiseDistancesReduction: """ cdef: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t num_threads = min(self.Y_n_chunks, self.effective_omp_n_thread) ITYPE_t thread_num # Allocating datastructures - self._parallel_on_Y_parallel_init(num_threads) + self._parallel_on_Y_parallel_init() for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk @@ -363,7 +368,7 @@ cdef class PairwiseDistancesReduction: else: X_end = X_start + self.X_n_samples_chunk - with nogil, parallel(num_threads=num_threads): + with nogil, parallel(num_threads=self.effective_n_threads): thread_num = _openmp_thread_num() # Initializing datastructures used in this thread @@ -390,11 +395,11 @@ cdef class PairwiseDistancesReduction: # end: with nogil, parallel # Synchronizing the thread datastructures with the main ones - self._parallel_on_Y_synchronize(num_threads, X_start, X_end) + self._parallel_on_Y_synchronize(X_start, X_end) # end: for X_chunk_idx # Deallocating temporary datastructures and adjusting main datastructures - self._parallel_on_Y_finalize(num_threads) + self._parallel_on_Y_finalize() return # Placeholder methods which have to be implemented @@ -460,7 +465,6 @@ cdef class PairwiseDistancesReduction: cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t num_threads, ) nogil: """Allocate datastructures used in all threads.""" return @@ -474,7 +478,6 @@ 
cdef class PairwiseDistancesReduction: cdef void _parallel_on_Y_synchronize( self, - ITYPE_t num_threads, ITYPE_t X_start, ITYPE_t X_end, ) nogil: @@ -483,7 +486,6 @@ cdef class PairwiseDistancesReduction: cdef void _parallel_on_Y_finalize( self, - ITYPE_t num_threads, ) nogil: """Update datastructures after executing all the reductions.""" return @@ -667,10 +669,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # small heaps which are thread-wise-allocated and whose content will be # merged with the main heaps'. self.heaps_r_distances_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_omp_n_thread + sizeof(DTYPE_t *) * self.effective_n_threads ) self.heaps_indices_chunks = malloc( - sizeof(ITYPE_t *) * self.effective_omp_n_thread + sizeof(ITYPE_t *) * self.effective_n_threads ) # Main heaps used by PairwiseDistancesArgKmin.compute to return results. @@ -742,7 +744,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t num_threads, ) nogil: cdef: # Maximum number of scalar elements (the last chunks can be smaller) @@ -752,8 +753,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # The allocation is done in parallel for data locality purposes: this way # the heaps used in each threads are allocated in pages which are closer # to processor core used by the thread. - for thread_num in prange(num_threads, schedule='static', nogil=True, - num_threads=num_threads): + for thread_num in prange(self.effective_n_threads, schedule='static', nogil=True, + num_threads=self.effective_n_threads): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. @@ -777,17 +778,16 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): @final cdef void _parallel_on_Y_synchronize( self, - ITYPE_t num_threads, ITYPE_t X_start, ITYPE_t X_end, ) nogil: cdef: ITYPE_t idx, jdx, thread_num - with nogil, parallel(num_threads=self.effective_omp_n_thread): + with nogil, parallel(num_threads=self.available_n_threads): # Synchronising the thread heaps with the main heaps # This is done in parallel samples-wise (no need for locks) for idx in prange(X_end - X_start, schedule="static"): - for thread_num in range(num_threads): + for thread_num in range(self.effective_n_threads): for jdx in range(self.k): heap_push( &self.argkmin_distances[X_start + idx, 0], @@ -799,14 +799,13 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): cdef void _parallel_on_Y_finalize( self, - ITYPE_t num_threads, ) nogil: cdef: ITYPE_t idx, thread_num - with nogil, parallel(num_threads=self.effective_omp_n_thread): + with nogil, parallel(num_threads=self.effective_n_threads): # Deallocating temporary datastructures - for thread_num in prange(num_threads, schedule='static'): + for thread_num in prange(self.effective_n_threads, schedule='static'): free(self.heaps_r_distances_chunks[thread_num]) free(self.heaps_indices_chunks[thread_num]) @@ -826,7 +825,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t[:, ::1] Y_indices = self.argkmin_indices DTYPE_t[:, ::1] distances = self.argkmin_distances for i in prange(self.n_samples_X, schedule='static', nogil=True, - num_threads=self.effective_omp_n_thread): + num_threads=self.available_n_threads): for j in range(self.k): distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( # Guard against eventual -0., causing nan production. 
@@ -910,18 +909,18 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None) else: - self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.available_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + _sqeuclidean_row_norms(self.X, self.available_n_threads) ) self.use_squared_distances = use_squared_distances # Temporary datastructures used in threads self.dist_middle_terms_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_omp_n_thread + sizeof(DTYPE_t *) * self.effective_n_threads ) def __dealloc__(self): @@ -956,12 +955,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): @final cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesArgKmin._parallel_on_Y_parallel_init(self, num_threads) + PairwiseDistancesArgKmin._parallel_on_Y_parallel_init(self) - for thread_num in range(num_threads): + for thread_num in range(self.effective_n_threads): # Temporary buffer for the `-2 * X_c @ Y_c.T` term self.dist_middle_terms_chunks[thread_num] = malloc( self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) @@ -970,12 +968,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): @final cdef void _parallel_on_Y_finalize( self, - ITYPE_t num_threads, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesArgKmin._parallel_on_Y_finalize(self, num_threads) + PairwiseDistancesArgKmin._parallel_on_Y_finalize(self) - for thread_num in range(num_threads): + for thread_num in range(self.effective_n_threads): free(self.dist_middle_terms_chunks[thread_num]) @final From a6ab85f771db7e2d28d8c03c273712dbcb65b6da Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 17:27:17 +0100 Subject: [PATCH 281/290] DOC Add and correct comments --- sklearn/metrics/_pairwise_distances_reduction.pyx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index edc9dd3cd26a7..545ce9114fc63 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -20,6 +20,7 @@ import scipy.sparse from .. import get_config from libc.stdlib cimport free, malloc +from libc.stdio cimport printf from libc.float cimport DBL_MAX from cython cimport final from cython.parallel cimport parallel, prange @@ -675,7 +676,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.effective_n_threads ) - # Main heaps used by PairwiseDistancesArgKmin.compute to return results. + # Main heaps used by PairwiseDistancesArgKmin._compute to return results. 
self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -719,8 +720,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t thread_num, ITYPE_t X_start, ) nogil: - # As this strategy is embarrassingly parallel, we can set the - # thread heaps pointers to the proper position on the main heaps + # As this strategy is embarrassingly parallel, we can set each + # thread's heaps pointer to the proper position on the main heaps. self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] @@ -784,8 +785,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): cdef: ITYPE_t idx, jdx, thread_num with nogil, parallel(num_threads=self.available_n_threads): - # Synchronising the thread heaps with the main heaps - # This is done in parallel samples-wise (no need for locks) + # Synchronising the thread heaps with the main heaps. + # This is done in parallel sample-wise (no need for locks). + # This might break each thread's data locality a bit but + # but this is negligible and this parallel pattern has + # shown to be efficient in practice. for idx in prange(X_end - X_start, schedule="static"): for thread_num in range(self.effective_n_threads): for jdx in range(self.k): From 7435885f9bc856cc93c9a5e3adfb719d66afc63f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 21:55:58 +0100 Subject: [PATCH 282/290] Update sklearn/metrics/_pairwise_distances_reduction.pyx Co-authored-by: Thomas J. Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 545ce9114fc63..68e373824a3d1 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -20,7 +20,6 @@ import scipy.sparse from .. import get_config from libc.stdlib cimport free, malloc -from libc.stdio cimport printf from libc.float cimport DBL_MAX from cython cimport final from cython.parallel cimport parallel, prange From 27329c1bdc017f831915ef33f2484fcf5c0f19bc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 17:50:05 +0100 Subject: [PATCH 283/290] fixup! DOC Add and correct comments --- sklearn/metrics/_pairwise_distances_reduction.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 68e373824a3d1..b2375d3550a18 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -656,11 +656,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.k = check_scalar(k, "k", Integral, min_val=1) # Allocating pointers to datastructures but not the datastructures themselves. - # There are as many pointers as available threads. - # However, when reducing on small datasets, there can be more pointers than - # actual threads. - # In this case, some pointers will be dynamically allocated but there won't - # be allocated yet unused data-structures referenced by them. + # There are as many pointers as effective threads. 
# # For the sake of explicitness: # - when parallelizing on X, those heaps pointers are referencing From 8754998d073be5dd254d321154b91afb97828106 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 17:54:57 +0100 Subject: [PATCH 284/290] FIX Change strategy default value to None --- sklearn/metrics/_pairwise_distances_reduction.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index b2375d3550a18..88fbf147812a9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -199,7 +199,7 @@ cdef class PairwiseDistancesReduction: DatasetsPair datasets_pair, chunk_size=None, n_threads=None, - strategy='auto', + strategy=None, ): cdef: ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks @@ -649,7 +649,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t k, chunk_size=None, n_threads=None, - strategy='auto', + strategy=None, ): super().__init__(datasets_pair, chunk_size, n_threads, strategy) @@ -883,7 +883,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ITYPE_t k, bint use_squared_distances=False, chunk_size=None, - strategy='auto', + strategy=None, metric_kwargs=None, ): if metric_kwargs is not None and len(metric_kwargs) > 0: From 9cda95f03e69ef9e04c2429ed233be11254e5e82 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Dec 2021 09:27:38 +0100 Subject: [PATCH 285/290] MAINT Rename n_threads variables and document them. Co-authored-by: Thomas J. Fan --- .../metrics/_pairwise_distances_reduction.pyx | 58 ++++++++++++------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 88fbf147812a9..63da687c88d5b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -141,8 +141,22 @@ cdef class PairwiseDistancesReduction: cdef: readonly DatasetsPair datasets_pair - ITYPE_t n_threads - ITYPE_t effective_n_threads, available_n_threads + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelisation strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, less threads might be needed to loop over pair of chunks. + # + # Hence the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. + # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + ITYPE_t n_samples_chunk, chunk_size ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder @@ -209,7 +223,7 @@ cdef class PairwiseDistancesReduction: self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) - self.available_n_threads = _openmp_effective_n_threads(n_threads) + self.effective_n_threads = _openmp_effective_n_threads(n_threads) self.datasets_pair = datasets_pair @@ -237,7 +251,7 @@ cdef class PairwiseDistancesReduction: if strategy == 'auto': # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. 
- if 4 * self.chunk_size * self.available_n_threads < self.n_samples_X: + if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: strategy = 'parallel_on_X' else: strategy = 'parallel_on_Y' @@ -245,9 +259,9 @@ cdef class PairwiseDistancesReduction: self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" # Not using less, not using more. - self.effective_n_threads = min( + self.chunks_n_threads = min( self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, - self.available_n_threads, + self.effective_n_threads, ) @@ -297,7 +311,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx ITYPE_t thread_num - with nogil, parallel(num_threads=self.effective_n_threads): + with nogil, parallel(num_threads=self.chunks_n_threads): thread_num = _openmp_thread_num() # Allocating thread datastructures @@ -368,7 +382,7 @@ cdef class PairwiseDistancesReduction: else: X_end = X_start + self.X_n_samples_chunk - with nogil, parallel(num_threads=self.effective_n_threads): + with nogil, parallel(num_threads=self.chunks_n_threads): thread_num = _openmp_thread_num() # Initializing datastructures used in this thread @@ -665,10 +679,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # small heaps which are thread-wise-allocated and whose content will be # merged with the main heaps'. self.heaps_r_distances_chunks = malloc( - sizeof(DTYPE_t *) * self.effective_n_threads + sizeof(DTYPE_t *) * self.chunks_n_threads ) self.heaps_indices_chunks = malloc( - sizeof(ITYPE_t *) * self.effective_n_threads + sizeof(ITYPE_t *) * self.chunks_n_threads ) # Main heaps used by PairwiseDistancesArgKmin._compute to return results. @@ -749,8 +763,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # The allocation is done in parallel for data locality purposes: this way # the heaps used in each threads are allocated in pages which are closer # to processor core used by the thread. - for thread_num in prange(self.effective_n_threads, schedule='static', nogil=True, - num_threads=self.effective_n_threads): + for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, + num_threads=self.chunks_n_threads): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. @@ -779,14 +793,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) nogil: cdef: ITYPE_t idx, jdx, thread_num - with nogil, parallel(num_threads=self.available_n_threads): + with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. # This is done in parallel sample-wise (no need for locks). # This might break each thread's data locality a bit but # but this is negligible and this parallel pattern has # shown to be efficient in practice. 
             for idx in prange(X_end - X_start, schedule="static"):
-                for thread_num in range(self.effective_n_threads):
+                for thread_num in range(self.chunks_n_threads):
                     for jdx in range(self.k):
                         heap_push(
                             &self.argkmin_distances[X_start + idx, 0],
@@ -802,9 +816,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         cdef:
             ITYPE_t idx, thread_num
 
-        with nogil, parallel(num_threads=self.effective_n_threads):
+        with nogil, parallel(num_threads=self.chunks_n_threads):
             # Deallocating temporary datastructures
-            for thread_num in prange(self.effective_n_threads, schedule='static'):
+            for thread_num in prange(self.chunks_n_threads, schedule='static'):
                 free(self.heaps_r_distances_chunks[thread_num])
                 free(self.heaps_indices_chunks[thread_num])
 
@@ -824,7 +838,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
             DTYPE_t[:, ::1] distances = self.argkmin_distances
         for i in prange(self.n_samples_X, schedule='static', nogil=True,
-                        num_threads=self.available_n_threads):
+                        num_threads=self.effective_n_threads):
             for j in range(self.k):
                 distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(
                     # Guard against eventual -0., causing nan production.
@@ -908,18 +922,18 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None)
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.available_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms(self.X, self.available_n_threads)
+            _sqeuclidean_row_norms(self.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
         # Temporary datastructures used in threads
         self.dist_middle_terms_chunks = malloc(
-            sizeof(DTYPE_t *) * self.effective_n_threads
+            sizeof(DTYPE_t *) * self.chunks_n_threads
         )
 
     def __dealloc__(self):
@@ -958,7 +972,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         cdef ITYPE_t thread_num
         PairwiseDistancesArgKmin._parallel_on_Y_parallel_init(self)
 
-        for thread_num in range(self.effective_n_threads):
+        for thread_num in range(self.chunks_n_threads):
             # Temporary buffer for the `-2 * X_c @ Y_c.T` term
             self.dist_middle_terms_chunks[thread_num] = malloc(
                 self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
             )
@@ -971,7 +985,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         cdef ITYPE_t thread_num
         PairwiseDistancesArgKmin._parallel_on_Y_finalize(self)
 
-        for thread_num in range(self.effective_n_threads):
+        for thread_num in range(self.chunks_n_threads):
             free(self.dist_middle_terms_chunks[thread_num])
 
     @final

From 5d77467e949e8416d0160da676268e83655aeb91 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 22 Dec 2021 09:40:31 +0100
Subject: [PATCH 286/290] DOC Improve remark regarding support for discrete
 distance metrics

This was mentioned in one of Olivier's reviews but I forgot it.
Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 63da687c88d5b..3ec4808a0f59d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -171,8 +171,8 @@ cdef class PairwiseDistancesReduction: "mahalanobis", # is numerically unstable # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::sort and a Comparator which might need - # AoS instead of SoA (currently used). + # The best might be using std::stable_sort and a Comparator taking an + # Arrays of Structures instead of Structure of Arrays (currently used). "hamming", *BOOL_METRICS, } From af5a991b0abf44f99431cc7b1f3227f56acb3995 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 09:16:46 +0100 Subject: [PATCH 287/290] MAINT Remove useless private _compute method Co-authored-by: Thomas J. Fan --- .../metrics/_pairwise_distances_reduction.pyx | 39 +++++-------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 3ec4808a0f59d..e08a63e8fe4ab 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -264,35 +264,6 @@ cdef class PairwiseDistancesReduction: self.effective_n_threads, ) - - def _compute( - self, - bint return_distance=False, - ): - """Compute the pairwise distances and the reduction of vectors (rows) of X on Y. - - Parameters - ---------- - return_distance : boolean, default=False - Return distances between each X vector and its - argkmin if set to True. - - Returns - ------- - If True, return the distances between each sample of X and - the samples of Y selected by the reduction function. - """ - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if self.execute_in_parallel_on_Y: - self._parallel_on_Y() - else: - self._parallel_on_X() - - return self._finalize_results(return_distance) - @final cdef void _parallel_on_X(self) nogil: """Compute the pairwise distances of each vector (row) of X on Y @@ -655,7 +626,15 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): strategy=strategy, ) - return pda._compute(return_distance=return_distance) + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+        with threadpool_limits(limits=1, user_api="blas"):
+            if pda.execute_in_parallel_on_Y:
+                pda._parallel_on_Y()
+            else:
+                pda._parallel_on_X()
+
+        return pda._finalize_results(return_distance)
 
     def __init__(
         self,

From decb029d6f453764d62c250b5974ac5f3f0830d6 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 23 Dec 2021 09:34:56 +0100
Subject: [PATCH 288/290] Remove duplicated submodules registration

---
 sklearn/utils/setup.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py
index db4765e72a55a..9b75979d5a41f 100644
--- a/sklearn/utils/setup.py
+++ b/sklearn/utils/setup.py
@@ -96,20 +96,6 @@ def configuration(parent_package="", top_path=None):
         libraries=libraries,
     )
 
-    config.add_extension(
-        "_heap",
-        sources=["_heap.pyx"],
-        include_dirs=[numpy.get_include()],
-        libraries=libraries,
-    )
-
-    config.add_extension(
-        "_typedefs",
-        sources=["_typedefs.pyx"],
-        include_dirs=[numpy.get_include()],
-        libraries=libraries,
-    )
-
     config.add_extension(
         "_heap",
         sources=["_heap.pyx"],

From 527555db79ec2d5a14306124794de2f7960f930b Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 23 Dec 2021 09:56:24 +0100
Subject: [PATCH 289/290] fixup! Remove duplicated submodules registration

---
 sklearn/utils/setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py
index 9b75979d5a41f..e3ceab6c52bbf 100644
--- a/sklearn/utils/setup.py
+++ b/sklearn/utils/setup.py
@@ -85,7 +85,6 @@ def configuration(parent_package="", top_path=None):
     config.add_extension(
         "_readonly_array_wrapper",
         sources=["_readonly_array_wrapper.pyx"],
-        include_dirs=[numpy.get_include()],
         libraries=libraries,
     )
 

From b11109ce4b86d7c3c262c0fa226cf51c110001b4 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 11 Feb 2022 16:51:51 +0100
Subject: [PATCH 290/290] Move changelog entry to the top

---
 doc/whats_new/v1.1.rst | 62 ++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 10de1efae5400..b05ebc8a9f630 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -65,6 +65,34 @@ Changelog
    :pr:`123456` by :user:`Joe Bloggs `.
    where 123456 is the *pull request* number, not the issue number.
 
+- |Efficiency| Low-level routines for reductions on pairwise distances
+  for dense float64 datasets have been refactored. The following functions
+  and estimators now benefit from improved performances, in particular on
+  multi-cores machines:
+  - :func:`sklearn.metrics.pairwise_distances_argmin`
+  - :func:`sklearn.metrics.pairwise_distances_argmin_min`
+  - :class:`sklearn.cluster.AffinityPropagation`
+  - :class:`sklearn.cluster.Birch`
+  - :class:`sklearn.cluster.MeanShift`
+  - :class:`sklearn.cluster.OPTICS`
+  - :class:`sklearn.cluster.SpectralClustering`
+  - :func:`sklearn.feature_selection.mutual_info_regression`
+  - :class:`sklearn.neighbors.KNeighborsClassifier`
+  - :class:`sklearn.neighbors.KNeighborsRegressor`
+  - :class:`sklearn.neighbors.LocalOutlierFactor`
+  - :class:`sklearn.neighbors.NearestNeighbors`
+  - :class:`sklearn.manifold.Isomap`
+  - :class:`sklearn.manifold.LocallyLinearEmbedding`
+  - :class:`sklearn.manifold.TSNE`
+  - :func:`sklearn.manifold.trustworthiness`
+  - :class:`sklearn.semi_supervised.LabelPropagation`
+  - :class:`sklearn.semi_supervised.LabelSpreading`
+
+  For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors`
+  can be up to 20× faster than in the previous versions'.
+ + :pr:`21462` by :user:`Julien Jerphanion `. + - |Enhancement| All scikit-learn models now generate a more informative error message when some input contains unexpected `NaN` or infinite values. In particular the message contains the input name ("X", "y" or @@ -148,7 +176,7 @@ Changelog :user:`Sebastian Pujalte `. - |Enhancement| :func:`datasets.make_blobs` no longer copies data during the generation - process, therefore uses less memory. + process, therefore uses less memory. :pr:`22412` by :user:`Zhehao Liu `. - |Enhancement| :func:`datasets.load_diabetes` now accepts the parameter @@ -621,38 +649,6 @@ Changelog left corner of the HTML representation to show how the elements are clickable. :pr:`21298` by `Thomas Fan`_. - -Miscellaneous -............. - -- |Efficiency| Low-level routines for reductions on pairwise distances - for dense float64 datasets have been refactored. The following functions - and estimators now benefit from improved performances, in particular on - multi-cores machines: - - :func:`sklearn.metrics.pairwise_distances_argmin` - - :func:`sklearn.metrics.pairwise_distances_argmin_min` - - :class:`sklearn.cluster.AffinityPropagation` - - :class:`sklearn.cluster.Birch` - - :class:`sklearn.cluster.MeanShift` - - :class:`sklearn.cluster.OPTICS` - - :class:`sklearn.cluster.SpectralClustering` - - :func:`sklearn.feature_selection.mutual_info_regression` - - :class:`sklearn.neighbors.KNeighborsClassifier` - - :class:`sklearn.neighbors.KNeighborsRegressor` - - :class:`sklearn.neighbors.LocalOutlierFactor` - - :class:`sklearn.neighbors.NearestNeighbors` - - :class:`sklearn.manifold.Isomap` - - :class:`sklearn.manifold.LocallyLinearEmbedding` - - :class:`sklearn.manifold.TSNE` - - :func:`sklearn.manifold.trustworthiness` - - :class:`sklearn.semi_supervised.LabelPropagation` - - :class:`sklearn.semi_supervised.LabelSpreading` - - For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` - can be up to 20× faster than in the previous versions'. - - :pr:`21462` by :user:`Julien Jerphanion `. - - |Enhancement| :func:`utils.validation.check_scalar` now has better messages when displaying the type. :pr:`22218` by `Thomas Fan`_.
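Taken together, patches 278 through 287 replace the former `get_for(...).compute(...)` idiom with a single `compute` classmethod that takes the parallelization `strategy` up front. A minimal usage sketch follows; the private module path, the positional `k` argument, and the `(distances, indices)` return ordering are inferred from the adapted tests above, and since this is internal API all of them may change without deprecation:

    import numpy as np

    from sklearn.metrics._pairwise_distances_reduction import (
        PairwiseDistancesArgKmin,
    )

    rng = np.random.RandomState(0)
    X = rng.rand(1000, 50)  # dense, C-contiguous float64, as required
    Y = rng.rand(2000, 50)

    # strategy=None (the default) reads `pairwise_dist_parallel_strategy`
    # from the scikit-learn configuration and falls back on 'auto'.
    distances, indices = PairwiseDistancesArgKmin.compute(
        X,
        Y,
        10,  # k nearest neighbours of each row of X among the rows of Y
        strategy="auto",
        return_distance=True,
    )
    assert distances.shape == (1000, 10)
    assert indices.shape == (1000, 10)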