From 24fe8a64d3cfd322755645d84f7f9a61387423a8 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 30 Sep 2022 22:10:29 +0200
Subject: [PATCH 01/27] wip

---
 .../_argkmin.pxd.tp                           |  4 +-
 .../_argkmin.pyx.tp                           | 15 +++-
 .../_dispatcher.py                            |  2 +-
 .../_gemm_term_computer.pxd.tp                | 30 +++++++-
 .../_gemm_term_computer.pyx.tp                | 74 ++++++++++++++++++-
 .../_radius_neighborhood.pxd.tp               |  4 +-
 .../_radius_neighborhood.pyx.tp               | 16 +++-
 7 files changed, 130 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
index 37dd7f5836857..fcdaf6e63887a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
@@ -22,7 +22,7 @@ cnp.import_array()
 {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 from ._base cimport BaseDistanceReducer{{name_suffix}}
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
     """{{name_suffix}}bit implementation of BaseDistanceReducer{{name_suffix}} for the `ArgKmin` reduction."""
@@ -41,7 +41,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
 cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
     """EuclideanDistance-specialized {{name_suffix}}bit implementation of ArgKmin{{name_suffix}}."""
     cdef:
-        GEMMTermComputer{{name_suffix}} gemm_term_computer
+        MiddleTermComputer{{name_suffix}} gemm_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 1c1459e27f210..a765bad4ec9bb 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -47,9 +47,13 @@ from ._base cimport (
 from ._datasets_pair cimport (
     DatasetsPair{{name_suffix}},
     DenseDenseDatasetsPair{{name_suffix}},
+    SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._gemm_term_computer cimport (
+    DenseDenseMiddleTermComputer{{name_suffix}},
+    SparseSparseMiddleTermComputer{{name_suffix}},
+)
 
 
 cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
@@ -82,8 +86,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not issparse(X)
-            and not issparse(Y)
+            and not (issparse(X) ^ issparse(Y))
         ):
             # Specialized implementation with improved arithmetic intensity
             # and vector instructions (SIMD) by processing several vectors
@@ -372,7 +375,11 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.gemm_term_computer = GEMMTermComputer{{name_suffix}}(
+        if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}):
+            MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}}
+        else:
+            MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}}
+        self.gemm_term_computer = MiddleTermComputer{{name_suffix}}(
             datasets_pair.X,
             datasets_pair.Y,
             self.effective_n_threads,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 809d683b52ced..4b57e8327b2a4 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -131,7 +131,7 @@ def is_valid_sparse_matrix(X):
         # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
         # using sparse-dense routines for matrix-vector multiplications.
         fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y))
+            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))
             and isinstance(metric, str)
             and "euclidean" in metric
         )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
index 5978cfee9ebee..4956e0fb40fff 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
@@ -20,7 +20,7 @@ from libcpp.vector cimport vector
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
-cdef class GEMMTermComputer{{name_suffix}}:
+cdef class MiddleTermComputer{{name_suffix}}:
     cdef:
         const {{INPUT_DTYPE_t}}[:, ::1] X
         const {{INPUT_DTYPE_t}}[:, ::1] Y
@@ -76,6 +76,34 @@ cdef class GEMMTermComputer{{name_suffix}}:
         ITYPE_t thread_num
     ) nogil
 
+
+cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef void _middle_term_sparse_sparse_{{name_suffix}}(
+    const {{INPUT_DTYPE_t}}[:] X_data,
+    const ITYPE_t[:] X_indices,
+    const ITYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const {{INPUT_DTYPE_t}}[:] Y_data,
+    const ITYPE_t[:] Y_indices,
+    const ITYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t[:, ::1] D
+) nogil
+
+
+cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index 35e57219a96a7..9e4dce4199459 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -31,7 +31,7 @@ from ...utils._cython_blas cimport (
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
-cdef class GEMMTermComputer{{name_suffix}}:
+cdef class MiddleTermComputer{{name_suffix}}:
     """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM.
 
     `FastEuclidean*` classes internally compute the squared Euclidean distances between
@@ -167,6 +167,9 @@ cdef class GEMMTermComputer{{name_suffix}}:
         return
 {{endif}}
 
+
+cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -214,4 +217,73 @@ cdef class GEMMTermComputer{{name_suffix}}:
 
         return dist_middle_terms
 
+
+cdef void _middle_term_sparse_sparse_{{name_suffix}}(
+    const {{INPUT_DTYPE_t}}[:] X_data,
+    const ITYPE_t[:] X_indices,
+    const ITYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const {{INPUT_DTYPE_t}}[:] Y_data,
+    const ITYPE_t[:] Y_indices,
+    const ITYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t[:, ::1] D,
+) nogil:
+    cdef:
+        ITYPE_t i, j
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr
+
+    for i in range(n_X):
+        for _X_indptr in range(X_indptr[i], X_indptr[i+1]):
+            _X_indices = X_indices[_X_indptr]
+            for j in range(n_Y):
+                for _Y_indptr in range(Y_indptr[j], Y_indptr[j+1]):
+                    _Y_indices = Y_indices[_Y_indptr]
+                    if _X_indices == _Y_indices:
+                        D[X_start + i, Y_start + j] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr]
+
+
+cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
+            const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            const {{INPUT_DTYPE_t}}[:] X_data = X_c.data
+            const ITYPE_t[:] X_indices =  X_c.indices
+            const ITYPE_t[:] X_indptr = X_c.indptr
+
+            const {{INPUT_DTYPE_t}}[:] Y_data = Y_c.data
+            const ITYPE_t[:] Y_indices =  Y_c.indices
+            const ITYPE_t[:] Y_indptr = Y_c.indptr
+
+            _middle_term_sparse_sparse_{{name_suffix}}(
+                X_data,
+                X_indices,
+                X_indptr,
+                X_start,
+                X_end,
+                Y_data,
+                Y_indices,
+                Y_indptr,
+                Y_start,
+                Y_end,
+                dist_middle_terms,
+            )
+
+        return dist_middle_terms
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp
index 9bbe186589f20..7fddb77365579 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp
@@ -44,7 +44,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
 {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 from ._base cimport BaseDistanceReducer{{name_suffix}}
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
     """
@@ -99,7 +99,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
 cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
     """EuclideanDistance-specialized {{name_suffix}}bit implementation for RadiusNeighbors{{name_suffix}}."""
     cdef:
-        GEMMTermComputer{{name_suffix}} gemm_term_computer
+        MiddleTermComputer{{name_suffix}} gemm_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 8d50a6e04abf9..9a8fb9500477c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -66,9 +66,13 @@ from ._base cimport (
 from ._datasets_pair cimport (
     DatasetsPair{{name_suffix}},
     DenseDenseDatasetsPair{{name_suffix}},
+    SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._gemm_term_computer cimport (
+    DenseDenseMiddleTermComputer{{name_suffix}},
+    SparseSparseMiddleTermComputer{{name_suffix}},
+)
 
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
@@ -105,8 +109,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not issparse(X)
-            and not issparse(Y)
+            and not (issparse(X) ^ issparse(Y))
         ):
             # Specialized implementation with improved arithmetic intensity
             # and vector instructions (SIMD) by processing several vectors
@@ -381,7 +384,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.gemm_term_computer = GEMMTermComputer{{name_suffix}}(
+        if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}):
+            MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}}
+        else:
+            MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}}
+
+        self.gemm_term_computer = MiddleTermComputer{{name_suffix}}(
             datasets_pair.X,
             datasets_pair.Y,
             self.effective_n_threads,

From 0132e562fdbc8673296cd3c448d1799a7d9d4e8a Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 30 Sep 2022 22:21:31 +0200
Subject: [PATCH 02/27] remove potential bug

---
 .../_argkmin.pyx.tp                           | 29 +++++++++++-------
 .../_radius_neighborhood.pyx.tp               | 30 +++++++++++--------
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index a765bad4ec9bb..d003b9be08f75 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -376,18 +376,25 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
         if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}):
-            MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}}
+            self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}(
+                datasets_pair.X,
+                datasets_pair.Y,
+                self.effective_n_threads,
+                self.chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features=datasets_pair.X.shape[1],
+                chunk_size=self.chunk_size,
+            )
         else:
-            MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}}
-        self.gemm_term_computer = MiddleTermComputer{{name_suffix}}(
-            datasets_pair.X,
-            datasets_pair.Y,
-            self.effective_n_threads,
-            self.chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1],
-            chunk_size=self.chunk_size,
-        )
+            self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}(
+                datasets_pair.X,
+                datasets_pair.Y,
+                self.effective_n_threads,
+                self.chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features=datasets_pair.X.shape[1],
+                chunk_size=self.chunk_size,
+            )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = check_array(
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 9a8fb9500477c..7b6195154def3 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -385,19 +385,25 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
         if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}):
-            MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}}
+            self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}(
+                datasets_pair.X,
+                datasets_pair.Y,
+                self.effective_n_threads,
+                self.chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features=datasets_pair.X.shape[1],
+                chunk_size=self.chunk_size,
+            )
         else:
-            MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}}
-
-        self.gemm_term_computer = MiddleTermComputer{{name_suffix}}(
-            datasets_pair.X,
-            datasets_pair.Y,
-            self.effective_n_threads,
-            self.chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1],
-            chunk_size=self.chunk_size,
-        )
+            self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}(
+                datasets_pair.X,
+                datasets_pair.Y,
+                self.effective_n_threads,
+                self.chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features=datasets_pair.X.shape[1],
+                chunk_size=self.chunk_size,
+            )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = check_array(

From c0ac7db46f7cbd211008bf27cb893d0cf431cd27 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Tue, 11 Oct 2022 18:36:03 +0200
Subject: [PATCH 03/27] much fix such wow

---
 .../_argkmin.pxd.tp                           |   2 +-
 .../_argkmin.pyx.tp                           |  43 ++--
 .../_gemm_term_computer.pxd.tp                |  59 +++--
 .../_gemm_term_computer.pyx.tp                | 217 +++++++++++++-----
 .../_radius_neighborhood.pxd.tp               |   2 +-
 .../_radius_neighborhood.pyx.tp               |  43 ++--
 6 files changed, 227 insertions(+), 139 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
index fcdaf6e63887a..4751c5fa04026 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
@@ -41,7 +41,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
 cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
     """EuclideanDistance-specialized {{name_suffix}}bit implementation of ArgKmin{{name_suffix}}."""
     cdef:
-        MiddleTermComputer{{name_suffix}} gemm_term_computer
+        MiddleTermComputer{{name_suffix}} middle_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index d003b9be08f75..99e6c1ac2d77d 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -375,26 +375,15 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}):
-            self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}(
-                datasets_pair.X,
-                datasets_pair.Y,
-                self.effective_n_threads,
-                self.chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features=datasets_pair.X.shape[1],
-                chunk_size=self.chunk_size,
-            )
-        else:
-            self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}(
-                datasets_pair.X,
-                datasets_pair.Y,
-                self.effective_n_threads,
-                self.chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features=datasets_pair.X.shape[1],
-                chunk_size=self.chunk_size,
-            )
+        self.middle_term_computer = MiddleTermComputer.get_for(
+            X,
+            Y,
+            self.effective_n_threads,
+            self.chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features=datasets_pair.X.shape[1],
+            chunk_size=self.chunk_size,
+        )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = check_array(
@@ -424,7 +413,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ITYPE_t thread_num,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
-        self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
+        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
 
 
     @final
@@ -435,7 +424,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ITYPE_t X_end,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
 
     @final
@@ -453,7 +442,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
@@ -464,7 +453,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
     ) nogil:
         cdef ITYPE_t thread_num
         ArgKmin{{name_suffix}}._parallel_on_Y_init(self)
-        self.gemm_term_computer._parallel_on_Y_init()
+        self.middle_term_computer._parallel_on_Y_init()
 
 
     @final
@@ -475,7 +464,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ITYPE_t X_end,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
 
     @final
@@ -493,7 +482,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num
         )
 
@@ -512,7 +501,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             DTYPE_t squared_dist_i_j
             ITYPE_t n_X = X_end - X_start
             ITYPE_t n_Y = Y_end - Y_start
-            DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks(
+            DTYPE_t * dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks(
                 X_start, X_end, Y_start, Y_end, thread_num
             )
             DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
index 4956e0fb40fff..d9c5f96cd13e6 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
@@ -15,16 +15,29 @@ implementation_specific_values = [
 }}
 cimport numpy as cnp
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from libcpp.vector cimport vector
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
+
+cdef void _middle_term_sparse_sparse_{{name_suffix}}(
+    const {{INPUT_DTYPE_t}}[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const {{INPUT_DTYPE_t}}[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil
+
+
 cdef class MiddleTermComputer{{name_suffix}}:
     cdef:
-        const {{INPUT_DTYPE_t}}[:, ::1] X
-        const {{INPUT_DTYPE_t}}[:, ::1] Y
-
         ITYPE_t effective_n_threads
         ITYPE_t chunks_n_threads
         ITYPE_t dist_middle_terms_chunks_size
@@ -34,12 +47,6 @@ cdef class MiddleTermComputer{{name_suffix}}:
         # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
         vector[vector[DTYPE_t]] dist_middle_terms_chunks
 
-{{if upcast_to_float64}}
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
-        vector[vector[DTYPE_t]] X_c_upcast
-        vector[vector[DTYPE_t]] Y_c_upcast
-{{endif}}
-
     cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -78,6 +85,16 @@ cdef class MiddleTermComputer{{name_suffix}}:
 
 
 cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:, ::1] X
+        const {{INPUT_DTYPE_t}}[:, ::1] Y
+
+    {{if upcast_to_float64}}
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
+    {{endif}}
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -88,22 +105,16 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
     ) nogil
 
 
-cdef void _middle_term_sparse_sparse_{{name_suffix}}(
-    const {{INPUT_DTYPE_t}}[:] X_data,
-    const ITYPE_t[:] X_indices,
-    const ITYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const {{INPUT_DTYPE_t}}[:] Y_data,
-    const ITYPE_t[:] Y_indices,
-    const ITYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t[:, ::1] D
-) nogil
+cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
 
+        const {{INPUT_DTYPE_t}}[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
 
-cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index 9e4dce4199459..a034401566ca5 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -14,10 +14,13 @@ implementation_specific_values = [
 
 }}
 cimport numpy as cnp
+import numpy as np
 
 from libcpp.vector cimport vector
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
+from scipy.sparse import issparse, csr_matrix
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+from ...utils._typedefs import SPARSE_INDEX_TYPE
 
 from ...utils._cython_blas cimport (
   BLAS_Order,
@@ -31,6 +34,37 @@ from ...utils._cython_blas cimport (
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
+
+cdef void _middle_term_sparse_sparse_{{name_suffix}}(
+    const {{INPUT_DTYPE_t}}[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const {{INPUT_DTYPE_t}}[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil:
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr
+
+    for i in range(n_X):
+        for _X_indptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+            _X_indices = X_indices[_X_indptr]
+            for j in range(n_Y):
+                for _Y_indptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
+                    _Y_indices = Y_indices[_Y_indptr]
+                    if _X_indices == _Y_indices:
+                        k = (X_start + i) * n_X + Y_start + j
+                        D[k] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr]
+
+
 cdef class MiddleTermComputer{{name_suffix}}:
     """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM.
 
@@ -46,17 +80,79 @@ cdef class MiddleTermComputer{{name_suffix}}:
     arithmetic intensity.
     """
 
-    def __init__(self,
-        const {{INPUT_DTYPE_t}}[:, ::1] X,
-        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        effective_n_threads,
+        chunks_n_threads,
+        dist_middle_terms_chunks_size,
+        n_features,
+        chunk_size,
+    ) -> MiddleTermComputer{{name_suffix}}:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Returns
+        -------
+        middle_term_computer: MiddleTermComputer{{name_suffix}}
+            The suited MiddleTermComputer{{name_suffix}} implementation.
+        """
+        X_is_sparse = issparse(X)
+        Y_is_sparse = issparse(Y)
+
+        if not X_is_sparse and not Y_is_sparse:
+            return DenseDenseMiddleTermComputer{{name_suffix}}(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+        if X_is_sparse and Y_is_sparse:
+            return SparseSparseMiddleTermComputer{{name_suffix}}(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+
+        raise NotImplementedError("X and Y must be both sparse or dense")
+
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(
+        self,
         ITYPE_t effective_n_threads,
         ITYPE_t chunks_n_threads,
         ITYPE_t dist_middle_terms_chunks_size,
         ITYPE_t n_features,
         ITYPE_t chunk_size,
     ):
-        self.X = X
-        self.Y = Y
         self.effective_n_threads = effective_n_threads
         self.chunks_n_threads = chunks_n_threads
         self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
@@ -170,6 +266,26 @@ cdef class MiddleTermComputer{{name_suffix}}:
 
 cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
 
+    def __init__(
+        self,
+        const {{INPUT_DTYPE_t}}[:, ::1] X,
+        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X = X
+        self.Y = Y
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -218,37 +334,28 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
         return dist_middle_terms
 
 
-cdef void _middle_term_sparse_sparse_{{name_suffix}}(
-    const {{INPUT_DTYPE_t}}[:] X_data,
-    const ITYPE_t[:] X_indices,
-    const ITYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const {{INPUT_DTYPE_t}}[:] Y_data,
-    const ITYPE_t[:] Y_indices,
-    const ITYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t[:, ::1] D,
-) nogil:
-    cdef:
-        ITYPE_t i, j
-        ITYPE_t n_X = X_end - X_start
-        ITYPE_t n_Y = Y_end - Y_start
-        ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr
-
-    for i in range(n_X):
-        for _X_indptr in range(X_indptr[i], X_indptr[i+1]):
-            _X_indices = X_indices[_X_indptr]
-            for j in range(n_Y):
-                for _Y_indptr in range(Y_indptr[j], Y_indptr[j+1]):
-                    _Y_indices = Y_indices[_Y_indptr]
-                    if _X_indices == _Y_indices:
-                        D[X_start + i, Y_start + j] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr]
-
-
 cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
 
+    def __init__(
+        self,
+        const {{INPUT_DTYPE_t}}[:, ::1] X,
+        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -258,32 +365,24 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
         ITYPE_t thread_num,
     ) nogil:
         cdef:
-            const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
-            const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            const {{INPUT_DTYPE_t}}[:] X_data = X_c.data
-            const ITYPE_t[:] X_indices =  X_c.indices
-            const ITYPE_t[:] X_indptr = X_c.indptr
-
-            const {{INPUT_DTYPE_t}}[:] Y_data = Y_c.data
-            const ITYPE_t[:] Y_indices =  Y_c.indices
-            const ITYPE_t[:] Y_indptr = Y_c.indptr
-
-            _middle_term_sparse_sparse_{{name_suffix}}(
-                X_data,
-                X_indices,
-                X_indptr,
-                X_start,
-                X_end,
-                Y_data,
-                Y_indices,
-                Y_indptr,
-                Y_start,
-                Y_end,
-                dist_middle_terms,
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
             )
 
+        _middle_term_sparse_sparse_{{name_suffix}}(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y_data,
+            self.Y_indices,
+            self.Y_indptr,
+            Y_start,
+            Y_end,
+            dist_middle_terms,
+        )
+
         return dist_middle_terms
 
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp
index 7fddb77365579..f0dd22da6faed 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp
@@ -99,7 +99,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
 cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
     """EuclideanDistance-specialized {{name_suffix}}bit implementation for RadiusNeighbors{{name_suffix}}."""
     cdef:
-        MiddleTermComputer{{name_suffix}} gemm_term_computer
+        MiddleTermComputer{{name_suffix}} middle_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 7b6195154def3..688edf72fa696 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -384,26 +384,15 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}):
-            self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}(
-                datasets_pair.X,
-                datasets_pair.Y,
-                self.effective_n_threads,
-                self.chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features=datasets_pair.X.shape[1],
-                chunk_size=self.chunk_size,
-            )
-        else:
-            self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}(
-                datasets_pair.X,
-                datasets_pair.Y,
-                self.effective_n_threads,
-                self.chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features=datasets_pair.X.shape[1],
-                chunk_size=self.chunk_size,
-            )
+        self.middle_term_computer = MiddleTermComputer.get_for(
+            X,
+            Y,
+            self.effective_n_threads,
+            self.chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features=datasets_pair.X.shape[1],
+            chunk_size=self.chunk_size,
+        )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = check_array(
@@ -433,7 +422,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         ITYPE_t thread_num,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
-        self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
+        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
 
     @final
     cdef void _parallel_on_X_init_chunk(
@@ -443,7 +432,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         ITYPE_t X_end,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
     @final
     cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
@@ -460,7 +449,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
@@ -470,7 +459,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
     ) nogil:
         cdef ITYPE_t thread_num
         RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self)
-        self.gemm_term_computer._parallel_on_Y_init()
+        self.middle_term_computer._parallel_on_Y_init()
 
     @final
     cdef void _parallel_on_Y_parallel_init(
@@ -480,7 +469,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         ITYPE_t X_end,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
     @final
     cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
@@ -497,7 +486,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num
         )
 
@@ -520,7 +509,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             DTYPE_t squared_dist_i_j
             ITYPE_t n_X = X_end - X_start
             ITYPE_t n_Y = Y_end - Y_start
-            DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks(
+            DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks(
                 X_start, X_end, Y_start, Y_end, thread_num
             )
 

From 27b9427911e56592fdef13f23c6e9ad45b02c3b2 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 12 Oct 2022 22:24:52 +0200
Subject: [PATCH 04/27] compiling !

---
 .../_argkmin.pyx.tp                           |   7 +-
 .../_gemm_term_computer.pxd.tp                |  41 ++++++
 .../_gemm_term_computer.pyx.tp                | 124 ++++++++++++------
 .../_radius_neighborhood.pyx.tp               |   7 +-
 4 files changed, 131 insertions(+), 48 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 99e6c1ac2d77d..baddc41cef7e9 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -50,10 +50,7 @@ from ._datasets_pair cimport (
     SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport (
-    DenseDenseMiddleTermComputer{{name_suffix}},
-    SparseSparseMiddleTermComputer{{name_suffix}},
-)
+from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 
 cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
@@ -375,7 +372,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.middle_term_computer = MiddleTermComputer.get_for(
+        self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
             X,
             Y,
             self.effective_n_threads,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
index d9c5f96cd13e6..0e5d744720ffb 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
@@ -83,6 +83,15 @@ cdef class MiddleTermComputer{{name_suffix}}:
         ITYPE_t thread_num
     ) nogil
 
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
 
 cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
     cdef:
@@ -95,6 +104,38 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
         vector[vector[DTYPE_t]] Y_c_upcast
     {{endif}}
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index a034401566ca5..9dbeca75bcd87 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -20,7 +20,7 @@ from libcpp.vector cimport vector
 
 from scipy.sparse import issparse, csr_matrix
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
-from ...utils._typedefs import SPARSE_INDEX_TYPE
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
 
 from ...utils._cython_blas cimport (
   BLAS_Order,
@@ -61,7 +61,7 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}(
                 for _Y_indptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
                     _Y_indices = Y_indices[_Y_indptr]
                     if _X_indices == _Y_indices:
-                        k = (X_start + i) * n_X + Y_start + j
+                        k = i * n_X + j
                         D[k] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr]
 
 
@@ -140,7 +140,7 @@ cdef class MiddleTermComputer{{name_suffix}}:
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
         X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
         return X_data, X_indices, X_indptr
@@ -161,6 +161,87 @@ cdef class MiddleTermComputer{{name_suffix}}:
 
         self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = vector[DTYPE_t](1).data()
+        return dist_middle_terms
+
+
+
+cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+
+    def __init__(
+        self,
+        const {{INPUT_DTYPE_t}}[:, ::1] X,
+        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X = X
+        self.Y = Y
+
 {{if upcast_to_float64}}
         # We populate the buffer for upcasting chunks of X and Y from 32bit to 64bit.
         self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
@@ -194,10 +275,6 @@ cdef class MiddleTermComputer{{name_suffix}}:
         return
 {{endif}}
 
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
     cdef void _parallel_on_X_init_chunk(
         self,
         ITYPE_t thread_num,
@@ -217,12 +294,6 @@ cdef class MiddleTermComputer{{name_suffix}}:
         return
 {{endif}}
 
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
@@ -263,29 +334,6 @@ cdef class MiddleTermComputer{{name_suffix}}:
         return
 {{endif}}
 
-
-cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
-
-    def __init__(
-        self,
-        const {{INPUT_DTYPE_t}}[:, ::1] X,
-        const {{INPUT_DTYPE_t}}[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-        )
-        self.X = X
-        self.Y = Y
-
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -338,8 +386,8 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
 
     def __init__(
         self,
-        const {{INPUT_DTYPE_t}}[:, ::1] X,
-        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+        X,
+        Y,
         ITYPE_t effective_n_threads,
         ITYPE_t chunks_n_threads,
         ITYPE_t dist_middle_terms_chunks_size,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 688edf72fa696..9b6d22f8c25b2 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -69,10 +69,7 @@ from ._datasets_pair cimport (
     SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport (
-    DenseDenseMiddleTermComputer{{name_suffix}},
-    SparseSparseMiddleTermComputer{{name_suffix}},
-)
+from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
@@ -384,7 +381,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.middle_term_computer = MiddleTermComputer.get_for(
+        self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
             X,
             Y,
             self.effective_n_threads,

From 2a1415455677494bb66ce757aa6dff739d2e1af9 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sun, 16 Oct 2022 16:18:20 +0200
Subject: [PATCH 05/27] add sparse method for sq_euclidean_norm

---
 .../_argkmin.pxd.tp                           |  3 +
 .../_argkmin.pyx.tp                           | 17 ++++-
 .../_base.pxd.tp                              | 29 +++++++-
 .../_base.pyx.tp                              | 74 ++++++++++++++++++-
 .../_dispatcher.py                            |  9 +--
 .../_gemm_term_computer.pxd.tp                |  1 +
 .../_gemm_term_computer.pyx.tp                | 37 ++++++++--
 .../_radius_neighborhood.pyx.tp               |  6 +-
 8 files changed, 151 insertions(+), 25 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
index 4751c5fa04026..0e5fa6dcc13cd 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
@@ -16,6 +16,9 @@ implementation_specific_values = [
 
 cimport numpy as cnp
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
+from libcpp.vector cimport vector
+from ...utils._vector_sentinel cimport vector_to_nd_array
+from ...utils._typedefs cimport DTYPECODE
 
 cnp.import_array()
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index baddc41cef7e9..c1822eac3d40f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -33,6 +33,9 @@ from scipy.sparse import issparse
 from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration
 from ...utils.fixes import threadpool_limits
 from ...utils._typedefs import ITYPE, DTYPE
+from libcpp.vector cimport vector
+from ...utils._vector_sentinel cimport vector_to_nd_array
+from ...utils._typedefs cimport DTYPECODE
 
 
 cnp.import_array()
@@ -41,7 +44,7 @@ cnp.import_array()
 
 from ._base cimport (
     BaseDistanceReducer{{name_suffix}},
-    _sqeuclidean_row_norms{{name_suffix}},
+    SqeuclideanRowNorm{{name_suffix}},
 )
 
 from ._datasets_pair cimport (
@@ -390,14 +393,22 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = SqeuclideanRowNorm{{name_suffix}}.get_for(
+                Y, self.effective_n_threads
+            ).compute()
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads)
+            SqeuclideanRowNorm{{name_suffix}}.get_for(
+                X, self.effective_n_threads
+            ).compute()
         )
         self.use_squared_distances = use_squared_distances
+        print("DEBUG X_norm_squared")
+        print(np.asarray(self.X_norm_squared))
+        print("DEBUG Y_norm_squared")
+        print(np.asarray(self.Y_norm_squared))
 
     @final
     cdef void compute_exact_distances(self) nogil:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index 77a689a9e4e94..9c09fdd71c38c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -17,7 +17,7 @@ cimport numpy as cnp
 
 from cython cimport final
 
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
+from ...utils._typedefs cimport ITYPE_t, DTYPE_t, SPARSE_INDEX_TYPE_t
 
 cnp.import_array()
 
@@ -25,11 +25,36 @@ cnp.import_array()
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_dense(
     const {{INPUT_DTYPE_t}}[:, ::1] X,
     ITYPE_t num_threads,
 )
 
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
+    const {{INPUT_DTYPE_t}}[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t num_threads,
+)
+
+cdef class SqeuclideanRowNorm{{name_suffix}}:
+    cdef:
+        ITYPE_t num_threads
+
+    cpdef DTYPE_t[::1] compute(self)
+
+
+cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:, ::1] X
+
+
+cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+
 cdef class BaseDistanceReducer{{name_suffix}}:
     """
     Base {{name_suffix}}bit implementation template of the pairwise-distances reduction
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index d0c06de0f8761..3a573126d892a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -27,17 +27,18 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
 
+from scipy.sparse import issparse, csr_matrix
 from numbers import Integral
 from sklearn import get_config
 from sklearn.utils import check_scalar
 from ...utils._openmp_helpers import _openmp_effective_n_threads
-from ...utils._typedefs import ITYPE, DTYPE
+from ...utils._typedefs import ITYPE, DTYPE, SPARSE_INDEX_TYPE
 
 cnp.import_array()
 
 #####################
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense(
     const DTYPE_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -62,7 +63,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
     return squared_row_norms
 
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     const cnp.float32_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -103,6 +104,73 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
 
 {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
+    const {{INPUT_DTYPE_t}}[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t num_threads,
+):
+    cdef:
+        ITYPE_t n = X_indptr.shape[0] - 1
+        ITYPE_t idx = 0
+        ITYPE_t thread_num
+        ITYPE_t X_i_ptr
+        DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE)
+
+    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
+        for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]):
+            squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr]
+
+    return squared_row_norms
+
+
+cdef class SqeuclideanRowNorm{{name_suffix}}:
+
+    @classmethod
+    def get_for(cls, X, num_threads):
+        if issparse(X):
+            return SparseSqeuclideanRowNorm{{name_suffix}}(X, num_threads)
+        else:
+            return DenseSqeuclideanRowNorm{{name_suffix}}(X, num_threads)
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(self, num_threads):
+        self.num_threads = num_threads
+
+    cpdef DTYPE_t[::1] compute(self):
+        return None
+
+
+cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
+
+    def __init__(self, X, num_threads):
+        super().__init__(num_threads)
+        self.X = X
+
+    cpdef DTYPE_t[::1] compute(self):
+        return _sqeuclidean_row_norms{{name_suffix}}_dense(self.X, self.num_threads)
+
+
+cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
+
+    def __init__(self, X, num_threads):
+        super().__init__(num_threads)
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+
+    cpdef DTYPE_t[::1] compute(self):
+        return _sqeuclidean_row_norms{{name_suffix}}_sparse(
+            self.X_data,
+            self.X_indptr,
+            self.num_threads,
+        )
+
+
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 cdef class BaseDistanceReducer{{name_suffix}}:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 4b57e8327b2a4..7f46df232f720 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -8,10 +8,7 @@
 
 from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING
 
-from ._base import (
-    _sqeuclidean_row_norms64,
-    _sqeuclidean_row_norms32,
-)
+from ._base import SqeuclideanRowNorm32, SqeuclideanRowNorm64
 from ._argkmin import (
     ArgKmin64,
     ArgKmin32,
@@ -41,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads):
         Arrays containing the squared euclidean norm of each row of X.
     """
     if X.dtype == np.float64:
-        return _sqeuclidean_row_norms64(X, num_threads)
+        return SqeuclideanRowNorm64.get_for(X, num_threads).compute()
     if X.dtype == np.float32:
-        return _sqeuclidean_row_norms32(X, num_threads)
+        return SqeuclideanRowNorm32.get_for(X, num_threads).compute()
 
     raise ValueError(
         "Only float64 or float32 datasets are supported at this time, "
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
index 0e5d744720ffb..dbae5ef42119c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
@@ -17,6 +17,7 @@ cimport numpy as cnp
 
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from libcpp.vector cimport vector
+from ...utils._vector_sentinel cimport vector_to_nd_array
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index 9dbeca75bcd87..bee05aa2b528b 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -21,6 +21,7 @@ from libcpp.vector cimport vector
 from scipy.sparse import issparse, csr_matrix
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
+from ...utils._vector_sentinel cimport vector_to_nd_array
 
 from ...utils._cython_blas cimport (
   BLAS_Order,
@@ -52,17 +53,17 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}(
         ITYPE_t i, j, k
         ITYPE_t n_X = X_end - X_start
         ITYPE_t n_Y = Y_end - Y_start
-        ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr
+        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
 
     for i in range(n_X):
-        for _X_indptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
-            _X_indices = X_indices[_X_indptr]
+        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+            X_i_col_idx = X_indices[X_i_ptr]
             for j in range(n_Y):
-                for _Y_indptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
-                    _Y_indices = Y_indices[_Y_indptr]
-                    if _X_indices == _Y_indices:
-                        k = i * n_X + j
-                        D[k] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr]
+                k = i * n_Y + j
+                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
+                    Y_j_col_idx = Y_indices[Y_j_ptr]
+                    if X_i_col_idx == Y_j_col_idx:
+                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
 
 
 cdef class MiddleTermComputer{{name_suffix}}:
@@ -416,6 +417,17 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
             DTYPE_t *dist_middle_terms = (
                 self.dist_middle_terms_chunks[thread_num].data()
             )
+            vector[DTYPE_t] vec_to_inspect
+
+        with gil:
+            print("DEBUG Before _middle_term_sparse_sparse_{{name_suffix}}")
+            print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=")
+            vec_to_inspect = self.dist_middle_terms_chunks[thread_num]
+            vec_as_np_array = vector_to_nd_array(
+                # This is the address of the vector
+                &vec_to_inspect,
+            )
+            print(vec_as_np_array)
 
         _middle_term_sparse_sparse_{{name_suffix}}(
             self.X_data,
@@ -430,6 +442,15 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
             Y_end,
             dist_middle_terms,
         )
+        with gil:
+            print("DEBUG After _middle_term_sparse_sparse_{{name_suffix}}")
+            print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=")
+            vec_to_inspect = self.dist_middle_terms_chunks[thread_num]
+            vec_as_np_array = vector_to_nd_array(
+               # This is the address of the vector
+               &vec_to_inspect
+            )
+            print(vec_as_np_array)
 
         return dist_middle_terms
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 9b6d22f8c25b2..885d9191ad2b0 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -60,7 +60,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
 
 from ._base cimport (
     BaseDistanceReducer{{name_suffix}},
-    _sqeuclidean_row_norms{{name_suffix}}
+    _sqeuclidean_row_norms{{name_suffix}}_dense
 )
 
 from ._datasets_pair cimport (
@@ -399,12 +399,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 

From 4060271fd2276c1ea787974c6002cf17cb4efef8 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sun, 16 Oct 2022 16:24:43 +0200
Subject: [PATCH 06/27] remove print

---
 .../_argkmin.pxd.tp                           |  3 ---
 .../_argkmin.pyx.tp                           |  8 --------
 .../_gemm_term_computer.pxd.tp                |  1 -
 .../_gemm_term_computer.pyx.tp                | 20 -------------------
 4 files changed, 32 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
index 0e5fa6dcc13cd..4751c5fa04026 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
@@ -16,9 +16,6 @@ implementation_specific_values = [
 
 cimport numpy as cnp
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
-from libcpp.vector cimport vector
-from ...utils._vector_sentinel cimport vector_to_nd_array
-from ...utils._typedefs cimport DTYPECODE
 
 cnp.import_array()
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index c1822eac3d40f..2237c00f05f05 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -33,10 +33,6 @@ from scipy.sparse import issparse
 from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration
 from ...utils.fixes import threadpool_limits
 from ...utils._typedefs import ITYPE, DTYPE
-from libcpp.vector cimport vector
-from ...utils._vector_sentinel cimport vector_to_nd_array
-from ...utils._typedefs cimport DTYPECODE
-
 
 cnp.import_array()
 
@@ -405,10 +401,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             ).compute()
         )
         self.use_squared_distances = use_squared_distances
-        print("DEBUG X_norm_squared")
-        print(np.asarray(self.X_norm_squared))
-        print("DEBUG Y_norm_squared")
-        print(np.asarray(self.Y_norm_squared))
 
     @final
     cdef void compute_exact_distances(self) nogil:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
index dbae5ef42119c..0e5d744720ffb 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
@@ -17,7 +17,6 @@ cimport numpy as cnp
 
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from libcpp.vector cimport vector
-from ...utils._vector_sentinel cimport vector_to_nd_array
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index bee05aa2b528b..52d7136423cc2 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -21,7 +21,6 @@ from libcpp.vector cimport vector
 from scipy.sparse import issparse, csr_matrix
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
-from ...utils._vector_sentinel cimport vector_to_nd_array
 
 from ...utils._cython_blas cimport (
   BLAS_Order,
@@ -419,16 +418,6 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
             )
             vector[DTYPE_t] vec_to_inspect
 
-        with gil:
-            print("DEBUG Before _middle_term_sparse_sparse_{{name_suffix}}")
-            print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=")
-            vec_to_inspect = self.dist_middle_terms_chunks[thread_num]
-            vec_as_np_array = vector_to_nd_array(
-                # This is the address of the vector
-                &vec_to_inspect,
-            )
-            print(vec_as_np_array)
-
         _middle_term_sparse_sparse_{{name_suffix}}(
             self.X_data,
             self.X_indices,
@@ -442,15 +431,6 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
             Y_end,
             dist_middle_terms,
         )
-        with gil:
-            print("DEBUG After _middle_term_sparse_sparse_{{name_suffix}}")
-            print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=")
-            vec_to_inspect = self.dist_middle_terms_chunks[thread_num]
-            vec_as_np_array = vector_to_nd_array(
-               # This is the address of the vector
-               &vec_to_inspect
-            )
-            print(vec_as_np_array)
 
         return dist_middle_terms
 

From c8e35fd47fddfcadb1e3db4a2c8cc0766eb4dacd Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Tue, 18 Oct 2022 10:45:00 +0200
Subject: [PATCH 07/27] underkill the overkill

---
 .../_argkmin.pyx.tp                           | 10 ++--
 .../_base.pxd.tp                              | 22 ++------
 .../_base.pyx.tp                              | 52 +++----------------
 .../_dispatcher.py                            |  6 +--
 .../_radius_neighborhood.pyx.tp               |  6 +--
 5 files changed, 23 insertions(+), 73 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 2237c00f05f05..83ba2bf31e7d0 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -40,7 +40,7 @@ cnp.import_array()
 
 from ._base cimport (
     BaseDistanceReducer{{name_suffix}},
-    SqeuclideanRowNorm{{name_suffix}},
+    _sqeuclidean_row_norms{{name_suffix}}
 )
 
 from ._datasets_pair cimport (
@@ -389,16 +389,16 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = SqeuclideanRowNorm{{name_suffix}}.get_for(
+            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
                 Y, self.effective_n_threads
-            ).compute()
+            )
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            SqeuclideanRowNorm{{name_suffix}}.get_for(
+            _sqeuclidean_row_norms{{name_suffix}}(
                 X, self.effective_n_threads
-            ).compute()
+            )
         )
         self.use_squared_distances = use_squared_distances
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index 9c09fdd71c38c..3058d3946e56c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -36,24 +36,10 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
     ITYPE_t num_threads,
 )
 
-cdef class SqeuclideanRowNorm{{name_suffix}}:
-    cdef:
-        ITYPE_t num_threads
-
-    cpdef DTYPE_t[::1] compute(self)
-
-
-cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
-    cdef:
-        const {{INPUT_DTYPE_t}}[:, ::1] X
-
-
-cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
-    cdef:
-        const {{INPUT_DTYPE_t}}[:] X_data
-        const SPARSE_INDEX_TYPE_t[:] X_indices
-        const SPARSE_INDEX_TYPE_t[:] X_indptr
-
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
+    X,
+    ITYPE_t num_threads,
+)
 
 cdef class BaseDistanceReducer{{name_suffix}}:
     """
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 3a573126d892a..c9fe70f6dbf30 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -123,52 +123,16 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
     return squared_row_norms
 
 
-cdef class SqeuclideanRowNorm{{name_suffix}}:
-
-    @classmethod
-    def get_for(cls, X, num_threads):
-        if issparse(X):
-            return SparseSqeuclideanRowNorm{{name_suffix}}(X, num_threads)
-        else:
-            return DenseSqeuclideanRowNorm{{name_suffix}}(X, num_threads)
-
-    @classmethod
-    def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
+    X,
+    ITYPE_t num_threads,
+):
+    if issparse(X):
         X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
-        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return X_data, X_indices, X_indptr
-
-    def __init__(self, num_threads):
-        self.num_threads = num_threads
-
-    cpdef DTYPE_t[::1] compute(self):
-        return None
-
-
-cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
-
-    def __init__(self, X, num_threads):
-        super().__init__(num_threads)
-        self.X = X
-
-    cpdef DTYPE_t[::1] compute(self):
-        return _sqeuclidean_row_norms{{name_suffix}}_dense(self.X, self.num_threads)
-
-
-cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}):
-
-    def __init__(self, X, num_threads):
-        super().__init__(num_threads)
-        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
-
-    cpdef DTYPE_t[::1] compute(self):
-        return _sqeuclidean_row_norms{{name_suffix}}_sparse(
-            self.X_data,
-            self.X_indptr,
-            self.num_threads,
-        )
+        return _sqeuclidean_row_norms{{name_suffix}}_sparse(X_data, X_indptr, num_threads)
+    else:
+        return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
 
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 7f46df232f720..43ae1330a1cac 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -8,7 +8,7 @@
 
 from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING
 
-from ._base import SqeuclideanRowNorm32, SqeuclideanRowNorm64
+from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64
 from ._argkmin import (
     ArgKmin64,
     ArgKmin32,
@@ -38,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads):
         Arrays containing the squared euclidean norm of each row of X.
     """
     if X.dtype == np.float64:
-        return SqeuclideanRowNorm64.get_for(X, num_threads).compute()
+        return _sqeuclidean_row_norms64(X, num_threads)
     if X.dtype == np.float32:
-        return SqeuclideanRowNorm32.get_for(X, num_threads).compute()
+        return _sqeuclidean_row_norms32(X, num_threads)
 
     raise ValueError(
         "Only float64 or float32 datasets are supported at this time, "
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 885d9191ad2b0..eca246cfd029a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -60,7 +60,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
 
 from ._base cimport (
     BaseDistanceReducer{{name_suffix}},
-    _sqeuclidean_row_norms{{name_suffix}}_dense
+    _sqeuclidean_row_norms{{name_suffix}},
 )
 
 from ._datasets_pair cimport (
@@ -399,12 +399,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 

From c9aed9db3f6a3846bc27483702fde23d11cbe48a Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Tue, 18 Oct 2022 17:51:11 +0200
Subject: [PATCH 08/27] baptism by fire

---
 .../_argkmin.pyx.tp                           |  1 -
 .../_gemm_term_computer.pxd.tp                | 18 ++++++++
 .../_gemm_term_computer.pyx.tp                | 41 +++++++++++++++++++
 .../_radius_neighborhood.pyx.tp               |  4 +-
 4 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 83ba2bf31e7d0..b55175c288797 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -507,7 +507,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num]
 
-
         # Pushing the distance and their associated indices on heaps
         # which keep tracks of the argkmin.
         for i in range(n_X):
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
index 0e5d744720ffb..c28a81100aa3e 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
@@ -156,6 +156,24 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
         const SPARSE_INDEX_TYPE_t[:] Y_indices
         const SPARSE_INDEX_TYPE_t[:] Y_indptr
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index 52d7136423cc2..d2ab669543ea4 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -32,6 +32,13 @@ from ...utils._cython_blas cimport (
   _gemm,
 )
 
+# TODO: change for `libcpp.algorithm.move` once Cython 3 is used
+# Introduction in Cython:
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
+cdef extern from "<algorithm>" namespace "std" nogil:
+    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
+
+
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 
@@ -404,6 +411,40 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
         self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
         self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t val = 0
+        # reset the vector
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            val,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t val = 0
+        # reset the vector
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            val,
+        )
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index eca246cfd029a..5bf367d00c90e 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -399,12 +399,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{name_suffix}}(X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 

From c5bd316311e6833de0bdb86867e8bbc04191cb9d Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 19 Oct 2022 10:42:07 +0200
Subject: [PATCH 09/27] clean it up, yo

---
 .../_pairwise_distances_reduction/_argkmin.pyx.tp    |  1 +
 .../_pairwise_distances_reduction/_base.pxd.tp       | 11 -----------
 .../_pairwise_distances_reduction/_base.pyx.tp       |  6 +++---
 .../_gemm_term_computer.pyx.tp                       | 12 ++++--------
 4 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 1ae6b97fbc3fb..38725c9a8b831 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -18,6 +18,7 @@ from ...utils import check_array, check_scalar, _in_unstable_openblas_configurat
 from ...utils.fixes import threadpool_limits
 from ...utils._typedefs import ITYPE, DTYPE
 
+
 cnp.import_array()
 
 {{for name_suffix in ['64', '32']}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index 0584e3570e104..be44f3a98a263 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -11,17 +11,6 @@ cnp.import_array()
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_dense(
-    const {{INPUT_DTYPE_t}}[:, ::1] X,
-    ITYPE_t num_threads,
-)
-
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
-    const {{INPUT_DTYPE_t}}[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t num_threads,
-)
-
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
     X,
     ITYPE_t num_threads,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 2149203615080..54f8cca66a746 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -38,7 +38,7 @@ cnp.import_array()
 
 #####################
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense(
+cdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense(
     const DTYPE_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -63,7 +63,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense(
     return squared_row_norms
 
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
+cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     const cnp.float32_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -108,7 +108,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
+cdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
     const {{INPUT_DTYPE_t}}[:] X_data,
     const SPARSE_INDEX_TYPE_t[:] X_indptr,
     ITYPE_t num_threads,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index dc40bfc023522..3ace80ef230ee 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -413,13 +413,11 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
         ITYPE_t Y_end,
         ITYPE_t thread_num,
     ) nogil:
-        cdef:
-            DTYPE_t val = 0
-        # reset the vector
+        # Flush the thread dist_middle_terms_chunks to 0.0
         fill(
             self.dist_middle_terms_chunks[thread_num].begin(),
             self.dist_middle_terms_chunks[thread_num].end(),
-            val,
+            0.0,
         )
 
     cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
@@ -430,13 +428,11 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
         ITYPE_t Y_end,
         ITYPE_t thread_num,
     ) nogil:
-        cdef:
-            DTYPE_t val = 0
-        # reset the vector
+        # Flush the thread dist_middle_terms_chunks to 0.0
         fill(
             self.dist_middle_terms_chunks[thread_num].begin(),
             self.dist_middle_terms_chunks[thread_num].end(),
-            val,
+            0.0,
         )
 
     cdef DTYPE_t * _compute_dist_middle_terms(

From 50bbf47d7cebaa08621073b426861bc1ba7e74d6 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 19 Oct 2022 10:51:33 +0200
Subject: [PATCH 10/27] worldwide renaming

---
 .gitignore                                    |   4 +-
 setup.cfg                                     |   4 +-
 setup.py                                      |   2 +-
 .../_argkmin.pxd.tp                           |   2 +-
 .../_argkmin.pyx.tp                           |   2 +-
 .../_base.pyx.tp                              |   8 +-
 .../_gemm_term_computer.pxd                   | 323 ++++++++
 .../_gemm_term_computer.pyx                   | 784 ++++++++++++++++++
 ...er.pxd.tp => _middle_term_computer.pxd.tp} |   0
 ...er.pyx.tp => _middle_term_computer.pyx.tp} |   0
 .../_radius_neighbors.pxd.tp                  |   2 +-
 .../_radius_neighbors.pyx.tp                  |   2 +-
 .../_pairwise_distances_reduction/setup.py    |   6 +-
 13 files changed, 1123 insertions(+), 16 deletions(-)
 create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
 create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
 rename sklearn/metrics/_pairwise_distances_reduction/{_gemm_term_computer.pxd.tp => _middle_term_computer.pxd.tp} (100%)
 rename sklearn/metrics/_pairwise_distances_reduction/{_gemm_term_computer.pyx.tp => _middle_term_computer.pyx.tp} (100%)

diff --git a/.gitignore b/.gitignore
index 68299a202b7c7..47ec8fa2faf79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -93,7 +93,7 @@ sklearn/metrics/_pairwise_distances_reduction/_base.pxd
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx
 sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
 sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
-sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
-sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
+sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
+sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
diff --git a/setup.cfg b/setup.cfg
index 3aa89052a4b8e..a7e632e9887eb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -77,8 +77,8 @@ ignore =
     sklearn/metrics/_pairwise_distances_reduction/_base.pyx
     sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
     sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
-    sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
-    sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
+    sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
+    sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
 
diff --git a/setup.py b/setup.py
index cc5b6f11749cb..467edff1ba360 100755
--- a/setup.py
+++ b/setup.py
@@ -89,7 +89,7 @@
     "sklearn.manifold._barnes_hut_tsne",
     "sklearn.metrics.cluster._expected_mutual_info_fast",
     "sklearn.metrics._pairwise_distances_reduction._datasets_pair",
-    "sklearn.metrics._pairwise_distances_reduction._gemm_term_computer",
+    "sklearn.metrics._pairwise_distances_reduction._middle_term_computer",
     "sklearn.metrics._pairwise_distances_reduction._base",
     "sklearn.metrics._pairwise_distances_reduction._argkmin",
     "sklearn.metrics._pairwise_distances_reduction._radius_neighbors",
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
index 2337516f22726..2bbe9e53518b3 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
@@ -6,7 +6,7 @@ cnp.import_array()
 {{for name_suffix in ['64', '32']}}
 
 from ._base cimport BaseDistancesReduction{{name_suffix}}
-from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
     """float{{name_suffix}} implementation of the ArgKmin."""
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 38725c9a8b831..6c948d0abc018 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -34,7 +34,7 @@ from ._datasets_pair cimport (
     SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 
 cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 54f8cca66a746..b20381677fc8d 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -408,7 +408,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_X_init_chunk(
+        self.middle_term_computer._parallel_on_X_init_chunk(
             thread_num, X_start, X_end,
         )
 
@@ -431,7 +431,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
@@ -474,7 +474,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_Y_parallel_init(
+        self.middle_term_computer._parallel_on_Y_parallel_init(
             thread_num, X_start, X_end,
         )
 
@@ -497,7 +497,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
new file mode 100644
index 0000000000000..4f0eab969de3e
--- /dev/null
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
@@ -0,0 +1,323 @@
+cimport numpy as cnp
+
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+from libcpp.vector cimport vector
+
+
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil
+
+
+cdef class MiddleTermComputer64:
+    cdef:
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
+
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_init(self) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64):
+    cdef:
+        const DTYPE_t[:, ::1] X
+        const DTYPE_t[:, ::1] Y
+
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64):
+    cdef:
+        const DTYPE_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const DTYPE_t[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef void _middle_term_sparse_sparse_32(
+    const cnp.float32_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const cnp.float32_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil
+
+
+cdef class MiddleTermComputer32:
+    cdef:
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
+
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_init(self) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32):
+    cdef:
+        const cnp.float32_t[:, ::1] X
+        const cnp.float32_t[:, ::1] Y
+
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32):
+    cdef:
+        const cnp.float32_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const cnp.float32_t[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
new file mode 100644
index 0000000000000..75e44f6c6b097
--- /dev/null
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
@@ -0,0 +1,784 @@
+cimport numpy as cnp
+
+from libcpp.vector cimport vector
+
+from ...utils._cython_blas cimport (
+  BLAS_Order,
+  BLAS_Trans,
+  NoTrans,
+  RowMajor,
+  Trans,
+  _gemm,
+)
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+import numpy as np
+from scipy.sparse import issparse, csr_matrix
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
+
+# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
+# Introduction in Cython:
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
+cdef extern from "<algorithm>" namespace "std" nogil:
+    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
+
+
+
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil:
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
+
+    for i in range(n_X):
+        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+            X_i_col_idx = X_indices[X_i_ptr]
+            for j in range(n_Y):
+                k = i * n_Y + j
+                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
+                    Y_j_col_idx = Y_indices[Y_j_ptr]
+                    if X_i_col_idx == Y_j_col_idx:
+                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
+
+
+cdef class MiddleTermComputer64:
+    """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM.
+
+    `EuclideanDistance` subclasses internally compute the squared Euclidean distances
+    between chunks of vectors X_c and Y_c using the following decomposition:
+
+
+                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
+    arithmetic intensity.
+    """
+
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        effective_n_threads,
+        chunks_n_threads,
+        dist_middle_terms_chunks_size,
+        n_features,
+        chunk_size,
+    ) -> MiddleTermComputer64:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Returns
+        -------
+        middle_term_computer: MiddleTermComputer64
+            The suited MiddleTermComputer64 implementation.
+        """
+        X_is_sparse = issparse(X)
+        Y_is_sparse = issparse(Y)
+
+        if not X_is_sparse and not Y_is_sparse:
+            return DenseDenseMiddleTermComputer64(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+        if X_is_sparse and Y_is_sparse:
+            return SparseSparseMiddleTermComputer64(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+
+        raise NotImplementedError("X and Y must be both sparse or dense")
+
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(
+        self,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = n_features
+        self.chunk_size = chunk_size
+
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return NULL
+
+
+cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64):
+
+    def __init__(
+        self,
+        const DTYPE_t[:, ::1] X,
+        const DTYPE_t[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X = X
+        self.Y = Y
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentations, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            # See: https://github.com/scipy/scipy/issues/14262
+            DTYPE_t * A = <DTYPE_t *> &X_c[0, 0]
+            DTYPE_t * B = <DTYPE_t *> &Y_c[0, 0]
+            ITYPE_t lda = X_c.shape[1]
+            ITYPE_t ldb = X_c.shape[1]
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_c.shape[0]
+
+        # dist_middle_terms = `-2 * X_c @ Y_c.T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+
+cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64):
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        _middle_term_sparse_sparse_64(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y_data,
+            self.Y_indices,
+            self.Y_indptr,
+            Y_start,
+            Y_end,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
+
+
+cdef void _middle_term_sparse_sparse_32(
+    const cnp.float32_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const cnp.float32_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil:
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
+
+    for i in range(n_X):
+        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+            X_i_col_idx = X_indices[X_i_ptr]
+            for j in range(n_Y):
+                k = i * n_Y + j
+                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
+                    Y_j_col_idx = Y_indices[Y_j_ptr]
+                    if X_i_col_idx == Y_j_col_idx:
+                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
+
+
+cdef class MiddleTermComputer32:
+    """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM.
+
+    `EuclideanDistance` subclasses internally compute the squared Euclidean distances
+    between chunks of vectors X_c and Y_c using the following decomposition:
+
+
+                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
+    arithmetic intensity.
+    """
+
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        effective_n_threads,
+        chunks_n_threads,
+        dist_middle_terms_chunks_size,
+        n_features,
+        chunk_size,
+    ) -> MiddleTermComputer32:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Returns
+        -------
+        middle_term_computer: MiddleTermComputer32
+            The suited MiddleTermComputer32 implementation.
+        """
+        X_is_sparse = issparse(X)
+        Y_is_sparse = issparse(Y)
+
+        if not X_is_sparse and not Y_is_sparse:
+            return DenseDenseMiddleTermComputer32(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+        if X_is_sparse and Y_is_sparse:
+            return SparseSparseMiddleTermComputer32(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+
+        raise NotImplementedError("X and Y must be both sparse or dense")
+
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=np.float32)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(
+        self,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = n_features
+        self.chunk_size = chunk_size
+
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return NULL
+
+
+cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32):
+
+    def __init__(
+        self,
+        const cnp.float32_t[:, ::1] X,
+        const cnp.float32_t[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X = X
+        self.Y = Y
+        # We populate the buffer for upcasting chunks of X and Y from float32 to float64.
+        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+        upcast_buffer_n_elements = self.chunk_size * n_features
+
+        for thread_num in range(self.effective_n_threads):
+            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            const cnp.float32_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const cnp.float32_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentations, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
+            DTYPE_t * A = self.X_c_upcast[thread_num].data()
+            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
+            ITYPE_t lda = X_c.shape[1]
+            ITYPE_t ldb = X_c.shape[1]
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_c.shape[0]
+
+        # dist_middle_terms = `-2 * X_c @ Y_c.T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+
+cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32):
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        _middle_term_sparse_sparse_32(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y_data,
+            self.Y_indices,
+            self.Y_indptr,
+            Y_start,
+            Y_end,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
similarity index 100%
rename from sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
rename to sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
similarity index 100%
rename from sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
rename to sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp
index 0b0829f82d975..b6e4508468d2b 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp
@@ -29,7 +29,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
 {{for name_suffix in ['64', '32']}}
 
 from ._base cimport BaseDistancesReduction{{name_suffix}}
-from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
     """float{{name_suffix}} implementation of the RadiusNeighbors."""
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 92decf15f1155..94272f9aa8f3c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -54,7 +54,7 @@ from ._datasets_pair cimport (
     SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py
index e1fbbceea7eb8..4c8632ebb9088 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/setup.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/setup.py
@@ -15,8 +15,8 @@ def configuration(parent_package="", top_path=None):
     templates = [
         "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp",
+        "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp",
+        "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp",
@@ -29,7 +29,7 @@ def configuration(parent_package="", top_path=None):
 
     cython_sources = [
         "_datasets_pair.pyx",
-        "_gemm_term_computer.pyx",
+        "_middle_term_computer.pyx",
         "_base.pyx",
         "_argkmin.pyx",
         "_radius_neighbors.pyx",

From e9cc5ad04ead355fc9be23cd1662cb372dcc1e59 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 19 Oct 2022 11:16:57 +0200
Subject: [PATCH 11/27] add sparse test for test_sqeuclidean_row_norms

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index f929a55105509..2a3ef30a5f975 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1148,10 +1148,15 @@ def test_sqeuclidean_row_norms(
     spread = 100
     X = rng.rand(n_samples, n_features).astype(dtype) * spread
 
+    X_csr = csr_matrix(X)
+
     sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
     sq_row_norm = np.asarray(sqeuclidean_row_norms(X, num_threads=num_threads))
 
+    sq_row_norm_csr = np.asarray(sqeuclidean_row_norms(X_csr, num_threads=num_threads))
+
     assert_allclose(sq_row_norm_reference, sq_row_norm)
+    assert_allclose(sq_row_norm_reference, sq_row_norm_csr)
 
     with pytest.raises(ValueError):
         X = np.asfortranarray(X)

From 2ee517d7c73f524a8cbf0a237957e1d26b48ebcd Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sun, 23 Oct 2022 19:21:26 +0200
Subject: [PATCH 12/27] Remove previous Cython templates and sources

---
 .../_gemm_term_computer.pxd                   | 323 ------
 .../_gemm_term_computer.pyx                   | 784 --------------
 .../_radius_neighborhood.pxd                  | 152 ---
 .../_radius_neighborhood.pyx                  | 979 ------------------
 4 files changed, 2238 deletions(-)
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
deleted file mode 100644
index 4f0eab969de3e..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
+++ /dev/null
@@ -1,323 +0,0 @@
-cimport numpy as cnp
-
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
-from libcpp.vector cimport vector
-
-
-cdef void _middle_term_sparse_sparse_64(
-    const DTYPE_t[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indices,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const DTYPE_t[:] Y_data,
-    const SPARSE_INDEX_TYPE_t[:] Y_indices,
-    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t * D,
-) nogil
-
-
-cdef class MiddleTermComputer64:
-    cdef:
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-        ITYPE_t dist_middle_terms_chunks_size
-        ITYPE_t n_features
-        ITYPE_t chunk_size
-
-        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
-        vector[vector[DTYPE_t]] dist_middle_terms_chunks
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_init(self) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64):
-    cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
-
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64):
-    cdef:
-        const DTYPE_t[:] X_data
-        const SPARSE_INDEX_TYPE_t[:] X_indices
-        const SPARSE_INDEX_TYPE_t[:] X_indptr
-
-        const DTYPE_t[:] Y_data
-        const SPARSE_INDEX_TYPE_t[:] Y_indices
-        const SPARSE_INDEX_TYPE_t[:] Y_indptr
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef void _middle_term_sparse_sparse_32(
-    const cnp.float32_t[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indices,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const cnp.float32_t[:] Y_data,
-    const SPARSE_INDEX_TYPE_t[:] Y_indices,
-    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t * D,
-) nogil
-
-
-cdef class MiddleTermComputer32:
-    cdef:
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-        ITYPE_t dist_middle_terms_chunks_size
-        ITYPE_t n_features
-        ITYPE_t chunk_size
-
-        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
-        vector[vector[DTYPE_t]] dist_middle_terms_chunks
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_init(self) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32):
-    cdef:
-        const cnp.float32_t[:, ::1] X
-        const cnp.float32_t[:, ::1] Y
-
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
-        vector[vector[DTYPE_t]] X_c_upcast
-        vector[vector[DTYPE_t]] Y_c_upcast
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32):
-    cdef:
-        const cnp.float32_t[:] X_data
-        const SPARSE_INDEX_TYPE_t[:] X_indices
-        const SPARSE_INDEX_TYPE_t[:] X_indptr
-
-        const cnp.float32_t[:] Y_data
-        const SPARSE_INDEX_TYPE_t[:] Y_indices
-        const SPARSE_INDEX_TYPE_t[:] Y_indptr
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
deleted file mode 100644
index 75e44f6c6b097..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
+++ /dev/null
@@ -1,784 +0,0 @@
-cimport numpy as cnp
-
-from libcpp.vector cimport vector
-
-from ...utils._cython_blas cimport (
-  BLAS_Order,
-  BLAS_Trans,
-  NoTrans,
-  RowMajor,
-  Trans,
-  _gemm,
-)
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
-
-import numpy as np
-from scipy.sparse import issparse, csr_matrix
-from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
-
-# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
-# Introduction in Cython:
-# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
-cdef extern from "<algorithm>" namespace "std" nogil:
-    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
-
-
-
-cdef void _middle_term_sparse_sparse_64(
-    const DTYPE_t[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indices,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const DTYPE_t[:] Y_data,
-    const SPARSE_INDEX_TYPE_t[:] Y_indices,
-    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t * D,
-) nogil:
-    cdef:
-        ITYPE_t i, j, k
-        ITYPE_t n_X = X_end - X_start
-        ITYPE_t n_Y = Y_end - Y_start
-        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
-
-    for i in range(n_X):
-        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
-            X_i_col_idx = X_indices[X_i_ptr]
-            for j in range(n_Y):
-                k = i * n_Y + j
-                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
-                    Y_j_col_idx = Y_indices[Y_j_ptr]
-                    if X_i_col_idx == Y_j_col_idx:
-                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
-
-
-cdef class MiddleTermComputer64:
-    """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM.
-
-    `EuclideanDistance` subclasses internally compute the squared Euclidean distances
-    between chunks of vectors X_c and Y_c using the following decomposition:
-
-
-                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-
-
-    This helper class is in charge of wrapping the common logic to compute
-    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
-    arithmetic intensity.
-    """
-
-    @classmethod
-    def get_for(
-        cls,
-        X,
-        Y,
-        effective_n_threads,
-        chunks_n_threads,
-        dist_middle_terms_chunks_size,
-        n_features,
-        chunk_size,
-    ) -> MiddleTermComputer64:
-        """Return the DatasetsPair implementation for the given arguments.
-
-        Parameters
-        ----------
-        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        Returns
-        -------
-        middle_term_computer: MiddleTermComputer64
-            The suited MiddleTermComputer64 implementation.
-        """
-        X_is_sparse = issparse(X)
-        Y_is_sparse = issparse(Y)
-
-        if not X_is_sparse and not Y_is_sparse:
-            return DenseDenseMiddleTermComputer64(
-                X,
-                Y,
-                effective_n_threads,
-                chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features,
-                chunk_size,
-            )
-        if X_is_sparse and Y_is_sparse:
-            return SparseSparseMiddleTermComputer64(
-                X,
-                Y,
-                effective_n_threads,
-                chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features,
-                chunk_size,
-            )
-
-        raise NotImplementedError("X and Y must be both sparse or dense")
-
-
-    @classmethod
-    def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype=DTYPE)
-        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
-        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return X_data, X_indices, X_indptr
-
-    def __init__(
-        self,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        self.effective_n_threads = effective_n_threads
-        self.chunks_n_threads = chunks_n_threads
-        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
-        self.n_features = n_features
-        self.chunk_size = chunk_size
-
-        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-        return
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return NULL
-
-
-cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64):
-
-    def __init__(
-        self,
-        const DTYPE_t[:, ::1] X,
-        const DTYPE_t[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-        )
-        self.X = X
-        self.Y = Y
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-        return
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_c.shape[0]
-            ITYPE_t n = Y_c.shape[0]
-            ITYPE_t K = X_c.shape[1]
-            DTYPE_t alpha = - 2.
-            # Casting for A and B to remove the const is needed because APIs exposed via
-            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
-            # See: https://github.com/scipy/scipy/issues/14262
-            DTYPE_t * A = <DTYPE_t *> &X_c[0, 0]
-            DTYPE_t * B = <DTYPE_t *> &Y_c[0, 0]
-            ITYPE_t lda = X_c.shape[1]
-            ITYPE_t ldb = X_c.shape[1]
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_c.shape[0]
-
-        # dist_middle_terms = `-2 * X_c @ Y_c.T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
-
-        return dist_middle_terms
-
-
-cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64):
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-        )
-        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
-        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            DTYPE_t *dist_middle_terms = (
-                self.dist_middle_terms_chunks[thread_num].data()
-            )
-
-        _middle_term_sparse_sparse_64(
-            self.X_data,
-            self.X_indices,
-            self.X_indptr,
-            X_start,
-            X_end,
-            self.Y_data,
-            self.Y_indices,
-            self.Y_indptr,
-            Y_start,
-            Y_end,
-            dist_middle_terms,
-        )
-
-        return dist_middle_terms
-
-
-cdef void _middle_term_sparse_sparse_32(
-    const cnp.float32_t[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indices,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const cnp.float32_t[:] Y_data,
-    const SPARSE_INDEX_TYPE_t[:] Y_indices,
-    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t * D,
-) nogil:
-    cdef:
-        ITYPE_t i, j, k
-        ITYPE_t n_X = X_end - X_start
-        ITYPE_t n_Y = Y_end - Y_start
-        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
-
-    for i in range(n_X):
-        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
-            X_i_col_idx = X_indices[X_i_ptr]
-            for j in range(n_Y):
-                k = i * n_Y + j
-                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
-                    Y_j_col_idx = Y_indices[Y_j_ptr]
-                    if X_i_col_idx == Y_j_col_idx:
-                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
-
-
-cdef class MiddleTermComputer32:
-    """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM.
-
-    `EuclideanDistance` subclasses internally compute the squared Euclidean distances
-    between chunks of vectors X_c and Y_c using the following decomposition:
-
-
-                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-
-
-    This helper class is in charge of wrapping the common logic to compute
-    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
-    arithmetic intensity.
-    """
-
-    @classmethod
-    def get_for(
-        cls,
-        X,
-        Y,
-        effective_n_threads,
-        chunks_n_threads,
-        dist_middle_terms_chunks_size,
-        n_features,
-        chunk_size,
-    ) -> MiddleTermComputer32:
-        """Return the DatasetsPair implementation for the given arguments.
-
-        Parameters
-        ----------
-        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        Returns
-        -------
-        middle_term_computer: MiddleTermComputer32
-            The suited MiddleTermComputer32 implementation.
-        """
-        X_is_sparse = issparse(X)
-        Y_is_sparse = issparse(Y)
-
-        if not X_is_sparse and not Y_is_sparse:
-            return DenseDenseMiddleTermComputer32(
-                X,
-                Y,
-                effective_n_threads,
-                chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features,
-                chunk_size,
-            )
-        if X_is_sparse and Y_is_sparse:
-            return SparseSparseMiddleTermComputer32(
-                X,
-                Y,
-                effective_n_threads,
-                chunks_n_threads,
-                dist_middle_terms_chunks_size,
-                n_features,
-                chunk_size,
-            )
-
-        raise NotImplementedError("X and Y must be both sparse or dense")
-
-
-    @classmethod
-    def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype=np.float32)
-        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
-        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return X_data, X_indices, X_indptr
-
-    def __init__(
-        self,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        self.effective_n_threads = effective_n_threads
-        self.chunks_n_threads = chunks_n_threads
-        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
-        self.n_features = n_features
-        self.chunk_size = chunk_size
-
-        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-        return
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return NULL
-
-
-cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32):
-
-    def __init__(
-        self,
-        const cnp.float32_t[:, ::1] X,
-        const cnp.float32_t[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-        )
-        self.X = X
-        self.Y = Y
-        # We populate the buffer for upcasting chunks of X and Y from float32 to float64.
-        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
-        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-        upcast_buffer_n_elements = self.chunk_size * n_features
-
-        for thread_num in range(self.effective_n_threads):
-            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
-            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
-
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
-
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
-
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
-
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            const cnp.float32_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            const cnp.float32_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_c.shape[0]
-            ITYPE_t n = Y_c.shape[0]
-            ITYPE_t K = X_c.shape[1]
-            DTYPE_t alpha = - 2.
-            DTYPE_t * A = self.X_c_upcast[thread_num].data()
-            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
-            ITYPE_t lda = X_c.shape[1]
-            ITYPE_t ldb = X_c.shape[1]
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_c.shape[0]
-
-        # dist_middle_terms = `-2 * X_c @ Y_c.T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
-
-        return dist_middle_terms
-
-
-cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32):
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-        )
-        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
-        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            DTYPE_t *dist_middle_terms = (
-                self.dist_middle_terms_chunks[thread_num].data()
-            )
-
-        _middle_term_sparse_sparse_32(
-            self.X_data,
-            self.X_indices,
-            self.X_indptr,
-            X_start,
-            X_end,
-            self.Y_data,
-            self.Y_indices,
-            self.Y_indptr,
-            Y_start,
-            Y_end,
-            dist_middle_terms,
-        )
-
-        return dist_middle_terms
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd
deleted file mode 100644
index b4467b9e14569..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd
+++ /dev/null
@@ -1,152 +0,0 @@
-cimport numpy as cnp
-
-from libcpp.memory cimport shared_ptr
-from libcpp.vector cimport vector
-from cython cimport final
-
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
-
-cnp.import_array()
-
-######################
-## std::vector to np.ndarray coercion
-# As type covariance is not supported for C++ containers via Cython,
-# we need to redefine fused types.
-ctypedef fused vector_DITYPE_t:
-    vector[ITYPE_t]
-    vector[DTYPE_t]
-
-
-ctypedef fused vector_vector_DITYPE_t:
-    vector[vector[ITYPE_t]]
-    vector[vector[DTYPE_t]]
-
-cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
-    shared_ptr[vector_vector_DITYPE_t] vecs
-)
-
-#####################
-
-from ._base cimport BaseDistanceReducer64
-from ._gemm_term_computer cimport MiddleTermComputer64
-
-cdef class RadiusNeighbors64(BaseDistanceReducer64):
-    """
-    64bit implementation of BaseDistanceReducer64 for the
-    `RadiusNeighbors` reduction.
-    """
-
-    cdef:
-        DTYPE_t radius
-
-        # DistanceMetric64 compute rank-preserving surrogate distance via rdist
-        # which are proxies necessitating less computations.
-        # We get the equivalent for the radius to be able to compare it against
-        # vectors' rank-preserving surrogate distances.
-        DTYPE_t r_radius
-
-        # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays.
-        #
-        # For this implementation, we want resizable buffers which we will wrap
-        # into numpy arrays at the end. std::vector comes as a handy container
-        # for interacting efficiently with resizable buffers.
-        #
-        # Though it is possible to access their buffer address with
-        # std::vector::data, they can't be stolen: buffers lifetime
-        # is tied to their std::vector and are deallocated when
-        # std::vectors are.
-        #
-        # To solve this, we dynamically allocate std::vectors and then
-        # encapsulate them in a StdVectorSentinel responsible for
-        # freeing them when the associated np.ndarray is freed.
-        #
-        # Shared pointers (defined via shared_ptr) are use for safer memory management.
-        # Unique pointers (defined via unique_ptr) can't be used as datastructures
-        # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk.
-        shared_ptr[vector[vector[ITYPE_t]]] neigh_indices
-        shared_ptr[vector[vector[DTYPE_t]]] neigh_distances
-
-        # Used as array of pointers to private datastructures used in threads.
-        vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks
-        vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks
-
-        bint sort_results
-
-    @final
-    cdef void _merge_vectors(
-        self,
-        ITYPE_t idx,
-        ITYPE_t num_threads,
-    ) nogil
-
-
-cdef class EuclideanRadiusNeighbors64(RadiusNeighbors64):
-    """EuclideanDistance-specialized 64bit implementation for RadiusNeighbors64."""
-    cdef:
-        MiddleTermComputer64 middle_term_computer
-        const DTYPE_t[::1] X_norm_squared
-        const DTYPE_t[::1] Y_norm_squared
-
-        bint use_squared_distances
-
-from ._base cimport BaseDistanceReducer32
-from ._gemm_term_computer cimport MiddleTermComputer32
-
-cdef class RadiusNeighbors32(BaseDistanceReducer32):
-    """
-    32bit implementation of BaseDistanceReducer32 for the
-    `RadiusNeighbors` reduction.
-    """
-
-    cdef:
-        DTYPE_t radius
-
-        # DistanceMetric32 compute rank-preserving surrogate distance via rdist
-        # which are proxies necessitating less computations.
-        # We get the equivalent for the radius to be able to compare it against
-        # vectors' rank-preserving surrogate distances.
-        DTYPE_t r_radius
-
-        # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays.
-        #
-        # For this implementation, we want resizable buffers which we will wrap
-        # into numpy arrays at the end. std::vector comes as a handy container
-        # for interacting efficiently with resizable buffers.
-        #
-        # Though it is possible to access their buffer address with
-        # std::vector::data, they can't be stolen: buffers lifetime
-        # is tied to their std::vector and are deallocated when
-        # std::vectors are.
-        #
-        # To solve this, we dynamically allocate std::vectors and then
-        # encapsulate them in a StdVectorSentinel responsible for
-        # freeing them when the associated np.ndarray is freed.
-        #
-        # Shared pointers (defined via shared_ptr) are use for safer memory management.
-        # Unique pointers (defined via unique_ptr) can't be used as datastructures
-        # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk.
-        shared_ptr[vector[vector[ITYPE_t]]] neigh_indices
-        shared_ptr[vector[vector[DTYPE_t]]] neigh_distances
-
-        # Used as array of pointers to private datastructures used in threads.
-        vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks
-        vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks
-
-        bint sort_results
-
-    @final
-    cdef void _merge_vectors(
-        self,
-        ITYPE_t idx,
-        ITYPE_t num_threads,
-    ) nogil
-
-
-cdef class EuclideanRadiusNeighbors32(RadiusNeighbors32):
-    """EuclideanDistance-specialized 32bit implementation for RadiusNeighbors32."""
-    cdef:
-        MiddleTermComputer32 middle_term_computer
-        const DTYPE_t[::1] X_norm_squared
-        const DTYPE_t[::1] Y_norm_squared
-
-        bint use_squared_distances
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx
deleted file mode 100644
index a133d23e0979e..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx
+++ /dev/null
@@ -1,979 +0,0 @@
-cimport numpy as cnp
-import numpy as np
-import warnings
-
-from libcpp.memory cimport shared_ptr, make_shared
-from libcpp.vector cimport vector
-from cython cimport final
-from cython.operator cimport dereference as deref
-from cython.parallel cimport parallel, prange
-
-from ...utils._sorting cimport simultaneous_sort
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
-from ...utils._vector_sentinel cimport vector_to_nd_array
-
-from numbers import Real
-from scipy.sparse import issparse
-from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration
-from ...utils.fixes import threadpool_limits
-
-cnp.import_array()
-
-# TODO: change for `libcpp.algorithm.move` once Cython 3 is used
-# Introduction in Cython:
-# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa
-cdef extern from "<algorithm>" namespace "std" nogil:
-    OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa
-
-######################
-
-cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
-    shared_ptr[vector_vector_DITYPE_t] vecs
-):
-    """Coerce a std::vector of std::vector to a ndarray of ndarray."""
-    cdef:
-        ITYPE_t n = deref(vecs).size()
-        cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray)
-
-    for i in range(n):
-        nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))
-
-    return nd_arrays_of_nd_arrays
-
-#####################
-
-from ._base cimport (
-    BaseDistanceReducer64,
-    _sqeuclidean_row_norms64,
-)
-
-from ._datasets_pair cimport (
-    DatasetsPair64,
-    DenseDenseDatasetsPair64,
-    SparseSparseDatasetsPair64,
-)
-
-from ._gemm_term_computer cimport MiddleTermComputer64
-
-
-cdef class RadiusNeighbors64(BaseDistanceReducer64):
-    """
-    64bit implementation of the pairwise-distance reduction BaseDistanceReducer64 for the
-    `RadiusNeighbors` reduction.
-    """
-
-    @classmethod
-    def compute(
-        cls,
-        X,
-        Y,
-        DTYPE_t radius,
-        str metric="euclidean",
-        chunk_size=None,
-        dict metric_kwargs=None,
-        str strategy=None,
-        bint return_distance=False,
-        bint sort_results=False,
-    ):
-        """Compute the radius-neighbors reduction.
-
-        This classmethod is responsible for introspecting the arguments
-        values to dispatch to the most appropriate implementation of
-        :class:`RadiusNeighbors64`.
-
-        This allows decoupling the API entirely from the implementation details
-        whilst maintaining RAII: all temporarily allocated datastructures necessary
-        for the concrete implementation are therefore freed when this classmethod
-        returns.
-
-        No instance should directly be created outside of this class method.
-        """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))
-        ):
-            # Specialized implementation with improved arithmetic intensity
-            # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to the BLAS GEMM routine as explained
-            # in more details in GEMMTermComputer docstring.
-            use_squared_distances = metric == "sqeuclidean"
-            pda = EuclideanRadiusNeighbors64(
-                X=X, Y=Y, radius=radius,
-                use_squared_distances=use_squared_distances,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                sort_results=sort_results,
-                metric_kwargs=metric_kwargs,
-            )
-        else:
-             # Fall back on a generic implementation that handles most scipy
-             # metrics by computing the distances between 2 vectors at a time.
-            pda = RadiusNeighbors64(
-                datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs),
-                radius=radius,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                sort_results=sort_results,
-            )
-
-        # Limit the number of threads in second level of nested parallelism for BLAS
-        # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
-            if pda.execute_in_parallel_on_Y:
-                pda._parallel_on_Y()
-            else:
-                pda._parallel_on_X()
-
-        return pda._finalize_results(return_distance)
-
-
-    def __init__(
-        self,
-        DatasetsPair64 datasets_pair,
-        DTYPE_t radius,
-        chunk_size=None,
-        strategy=None,
-        sort_results=False,
-    ):
-        super().__init__(
-            datasets_pair=datasets_pair,
-            chunk_size=chunk_size,
-            strategy=strategy,
-        )
-
-        self.radius = check_scalar(radius, "radius", Real, min_val=0)
-        self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
-        self.sort_results = sort_results
-
-        # Allocating pointers to datastructures but not the datastructures themselves.
-        # There are as many pointers as effective threads.
-        #
-        # For the sake of explicitness:
-        #   - when parallelizing on X, the pointers of those heaps are referencing
-        #   self.neigh_distances and self.neigh_indices
-        #   - when parallelizing on Y, the pointers of those heaps are referencing
-        #   std::vectors of std::vectors which are thread-wise-allocated and whose
-        #   content will be merged into self.neigh_distances and self.neigh_indices.
-        self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]](
-            self.chunks_n_threads
-        )
-        self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]](
-            self.chunks_n_threads
-        )
-
-        # Temporary datastructures which will be coerced to numpy arrays on before
-        # RadiusNeighbors.compute "return" and will be then freed.
-        self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X)
-        self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X)
-
-    cdef void _compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            DTYPE_t r_dist_i_j
-
-        for i in range(X_start, X_end):
-            for j in range(Y_start, Y_end):
-                r_dist_i_j = self.datasets_pair.surrogate_dist(i, j)
-                if r_dist_i_j <= self.r_radius:
-                    deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j)
-                    deref(self.neigh_indices_chunks[thread_num])[i].push_back(j)
-
-    def _finalize_results(self, bint return_distance=False):
-        if return_distance:
-            # We need to recompute distances because we relied on
-            # surrogate distances for the reduction.
-            self.compute_exact_distances()
-            return (
-                coerce_vectors_to_nd_arrays(self.neigh_distances),
-                coerce_vectors_to_nd_arrays(self.neigh_indices),
-            )
-
-        return coerce_vectors_to_nd_arrays(self.neigh_indices)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-
-        # As this strategy is embarrassingly parallel, we can set the
-        # thread vectors' pointers to the main vectors'.
-        self.neigh_distances_chunks[thread_num] = self.neigh_distances
-        self.neigh_indices_chunks[thread_num] = self.neigh_indices
-
-    @final
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, jdx
-
-        # Sorting neighbors for each query vector of X
-        if self.sort_results:
-            for idx in range(X_start, X_end):
-                simultaneous_sort(
-                    deref(self.neigh_distances)[idx].data(),
-                    deref(self.neigh_indices)[idx].data(),
-                    deref(self.neigh_indices)[idx].size()
-                )
-
-    cdef void _parallel_on_Y_init(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t thread_num
-        # As chunks of X are shared across threads, so must datastructures to avoid race
-        # conditions: each thread has its own vectors of n_samples_X vectors which are
-        # then merged back in the main n_samples_X vectors.
-        for thread_num in range(self.chunks_n_threads):
-            self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X)
-            self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X)
-
-    @final
-    cdef void _merge_vectors(
-        self,
-        ITYPE_t idx,
-        ITYPE_t num_threads,
-    ) nogil:
-        cdef:
-            ITYPE_t thread_num
-            ITYPE_t idx_n_elements = 0
-            ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size()
-
-        # Resizing buffers only once for the given number of elements.
-        for thread_num in range(num_threads):
-            idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size()
-
-        deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements)
-        deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements)
-
-        # Moving the elements by range using the range first element
-        # as the reference for the insertion.
-        for thread_num in range(num_threads):
-            move(
-                deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
-                deref(self.neigh_distances_chunks[thread_num])[idx].end(),
-                deref(self.neigh_distances)[idx].begin() + last_element_idx
-            )
-            move(
-                deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
-                deref(self.neigh_indices_chunks[thread_num])[idx].end(),
-                deref(self.neigh_indices)[idx].begin() + last_element_idx
-            )
-            last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
-
-
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
-
-        with nogil, parallel(num_threads=self.effective_n_threads):
-            # Merge vectors used in threads into the main ones.
-            # This is done in parallel sample-wise (no need for locks).
-            for idx in prange(self.n_samples_X, schedule='static'):
-                self._merge_vectors(idx, self.chunks_n_threads)
-
-            # The content of the vector have been std::moved.
-            # Hence they can't be used anymore and can be deleted.
-            # Their deletion is carried out automatically as the
-            # implementation relies on shared pointers.
-
-            # Sort in parallel in ascending order w.r.t the distances if requested.
-            if self.sort_results:
-                for idx in prange(self.n_samples_X, schedule='static'):
-                    simultaneous_sort(
-                        deref(self.neigh_distances)[idx].data(),
-                        deref(self.neigh_indices)[idx].data(),
-                        deref(self.neigh_indices)[idx].size()
-                    )
-
-        return
-
-    cdef void compute_exact_distances(self) nogil:
-        """Convert rank-preserving distances to pairwise distances in parallel."""
-        cdef:
-            ITYPE_t i, j
-
-        for i in prange(self.n_samples_X, nogil=True, schedule='static',
-                        num_threads=self.effective_n_threads):
-            for j in range(deref(self.neigh_indices)[i].size()):
-                deref(self.neigh_distances)[i][j] = (
-                        self.datasets_pair.distance_metric._rdist_to_dist(
-                            # Guard against eventual -0., causing nan production.
-                            max(deref(self.neigh_distances)[i][j], 0.)
-                        )
-                )
-
-
-cdef class EuclideanRadiusNeighbors64(RadiusNeighbors64):
-    """EuclideanDistance-specialized implementation for RadiusNeighbors64."""
-
-    @classmethod
-    def is_usable_for(cls, X, Y, metric) -> bool:
-        return (RadiusNeighbors64.is_usable_for(X, Y, metric)
-                and not _in_unstable_openblas_configuration())
-
-    def __init__(
-        self,
-        X,
-        Y,
-        DTYPE_t radius,
-        bint use_squared_distances=False,
-        chunk_size=None,
-        strategy=None,
-        sort_results=False,
-        metric_kwargs=None,
-    ):
-        if (
-            metric_kwargs is not None and
-            len(metric_kwargs) > 0 and
-            "Y_norm_squared" not in metric_kwargs
-        ):
-            warnings.warn(
-                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
-                f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.",
-                UserWarning,
-                stacklevel=3,
-            )
-
-        super().__init__(
-            # The datasets pair here is used for exact distances computations
-            datasets_pair=DatasetsPair64.get_for(X, Y, metric="euclidean"),
-            radius=radius,
-            chunk_size=chunk_size,
-            strategy=strategy,
-            sort_results=sort_results,
-        )
-        # X and Y are checked by the DatasetsPair64 implemented
-        # as a DenseDenseDatasetsPair64
-        cdef:
-            DenseDenseDatasetsPair64 datasets_pair = (
-                <DenseDenseDatasetsPair64> self.datasets_pair
-            )
-            ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
-
-        self.middle_term_computer = MiddleTermComputer64.get_for(
-            X,
-            Y,
-            self.effective_n_threads,
-            self.chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1],
-            chunk_size=self.chunk_size,
-        )
-
-        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
-            self.Y_norm_squared = check_array(
-                metric_kwargs.pop("Y_norm_squared"),
-                ensure_2d=False,
-                input_name="Y_norm_squared",
-                dtype=np.float64
-            )
-        else:
-            self.Y_norm_squared = _sqeuclidean_row_norms64(Y, self.effective_n_threads)
-
-        # Do not recompute norms if datasets are identical.
-        self.X_norm_squared = (
-            self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms64(X, self.effective_n_threads)
-        )
-        self.use_squared_distances = use_squared_distances
-
-        if use_squared_distances:
-            # In this specialisation and this setup, the value passed to the radius is
-            # already considered to be the adapted radius, so we overwrite it.
-            self.r_radius = radius
-
-    @final
-    cdef void _parallel_on_X_parallel_init(
-        self,
-        ITYPE_t thread_num,
-    ) nogil:
-        RadiusNeighbors64._parallel_on_X_parallel_init(self, thread_num)
-        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
-
-    @final
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        RadiusNeighbors64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
-        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
-
-    @final
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        RadiusNeighbors64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-            self,
-            X_start, X_end,
-            Y_start, Y_end,
-            thread_num,
-        )
-        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-            X_start, X_end, Y_start, Y_end, thread_num,
-        )
-
-    @final
-    cdef void _parallel_on_Y_init(
-        self,
-    ) nogil:
-        cdef ITYPE_t thread_num
-        RadiusNeighbors64._parallel_on_Y_init(self)
-        self.middle_term_computer._parallel_on_Y_init()
-
-    @final
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        RadiusNeighbors64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
-        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
-
-    @final
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        RadiusNeighbors64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-            self,
-            X_start, X_end,
-            Y_start, Y_end,
-            thread_num,
-        )
-        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-            X_start, X_end, Y_start, Y_end, thread_num
-        )
-
-    @final
-    cdef void compute_exact_distances(self) nogil:
-        if not self.use_squared_distances:
-            RadiusNeighbors64.compute_exact_distances(self)
-
-    @final
-    cdef void _compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            DTYPE_t squared_dist_i_j
-            ITYPE_t n_X = X_end - X_start
-            ITYPE_t n_Y = Y_end - Y_start
-            DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks(
-                X_start, X_end, Y_start, Y_end, thread_num
-            )
-
-        # Pushing the distance and their associated indices in vectors.
-        for i in range(n_X):
-            for j in range(n_Y):
-                # Using the squared euclidean distance as the rank-preserving distance:
-                #
-                #             ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                #
-                squared_dist_i_j = (
-                    self.X_norm_squared[i + X_start]
-                    + dist_middle_terms[i * n_Y + j]
-                    + self.Y_norm_squared[j + Y_start]
-                )
-                if squared_dist_i_j <= self.r_radius:
-                    deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j)
-                    deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)
-
-from ._base cimport (
-    BaseDistanceReducer32,
-    _sqeuclidean_row_norms32,
-)
-
-from ._datasets_pair cimport (
-    DatasetsPair32,
-    DenseDenseDatasetsPair32,
-    SparseSparseDatasetsPair32,
-)
-
-from ._gemm_term_computer cimport MiddleTermComputer32
-
-
-cdef class RadiusNeighbors32(BaseDistanceReducer32):
-    """
-    32bit implementation of the pairwise-distance reduction BaseDistanceReducer32 for the
-    `RadiusNeighbors` reduction.
-    """
-
-    @classmethod
-    def compute(
-        cls,
-        X,
-        Y,
-        DTYPE_t radius,
-        str metric="euclidean",
-        chunk_size=None,
-        dict metric_kwargs=None,
-        str strategy=None,
-        bint return_distance=False,
-        bint sort_results=False,
-    ):
-        """Compute the radius-neighbors reduction.
-
-        This classmethod is responsible for introspecting the arguments
-        values to dispatch to the most appropriate implementation of
-        :class:`RadiusNeighbors32`.
-
-        This allows decoupling the API entirely from the implementation details
-        whilst maintaining RAII: all temporarily allocated datastructures necessary
-        for the concrete implementation are therefore freed when this classmethod
-        returns.
-
-        No instance should directly be created outside of this class method.
-        """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))
-        ):
-            # Specialized implementation with improved arithmetic intensity
-            # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to the BLAS GEMM routine as explained
-            # in more details in GEMMTermComputer docstring.
-            use_squared_distances = metric == "sqeuclidean"
-            pda = EuclideanRadiusNeighbors32(
-                X=X, Y=Y, radius=radius,
-                use_squared_distances=use_squared_distances,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                sort_results=sort_results,
-                metric_kwargs=metric_kwargs,
-            )
-        else:
-             # Fall back on a generic implementation that handles most scipy
-             # metrics by computing the distances between 2 vectors at a time.
-            pda = RadiusNeighbors32(
-                datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs),
-                radius=radius,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                sort_results=sort_results,
-            )
-
-        # Limit the number of threads in second level of nested parallelism for BLAS
-        # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
-            if pda.execute_in_parallel_on_Y:
-                pda._parallel_on_Y()
-            else:
-                pda._parallel_on_X()
-
-        return pda._finalize_results(return_distance)
-
-
-    def __init__(
-        self,
-        DatasetsPair32 datasets_pair,
-        DTYPE_t radius,
-        chunk_size=None,
-        strategy=None,
-        sort_results=False,
-    ):
-        super().__init__(
-            datasets_pair=datasets_pair,
-            chunk_size=chunk_size,
-            strategy=strategy,
-        )
-
-        self.radius = check_scalar(radius, "radius", Real, min_val=0)
-        self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
-        self.sort_results = sort_results
-
-        # Allocating pointers to datastructures but not the datastructures themselves.
-        # There are as many pointers as effective threads.
-        #
-        # For the sake of explicitness:
-        #   - when parallelizing on X, the pointers of those heaps are referencing
-        #   self.neigh_distances and self.neigh_indices
-        #   - when parallelizing on Y, the pointers of those heaps are referencing
-        #   std::vectors of std::vectors which are thread-wise-allocated and whose
-        #   content will be merged into self.neigh_distances and self.neigh_indices.
-        self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]](
-            self.chunks_n_threads
-        )
-        self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]](
-            self.chunks_n_threads
-        )
-
-        # Temporary datastructures which will be coerced to numpy arrays on before
-        # RadiusNeighbors.compute "return" and will be then freed.
-        self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X)
-        self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X)
-
-    cdef void _compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            DTYPE_t r_dist_i_j
-
-        for i in range(X_start, X_end):
-            for j in range(Y_start, Y_end):
-                r_dist_i_j = self.datasets_pair.surrogate_dist(i, j)
-                if r_dist_i_j <= self.r_radius:
-                    deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j)
-                    deref(self.neigh_indices_chunks[thread_num])[i].push_back(j)
-
-    def _finalize_results(self, bint return_distance=False):
-        if return_distance:
-            # We need to recompute distances because we relied on
-            # surrogate distances for the reduction.
-            self.compute_exact_distances()
-            return (
-                coerce_vectors_to_nd_arrays(self.neigh_distances),
-                coerce_vectors_to_nd_arrays(self.neigh_indices),
-            )
-
-        return coerce_vectors_to_nd_arrays(self.neigh_indices)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-
-        # As this strategy is embarrassingly parallel, we can set the
-        # thread vectors' pointers to the main vectors'.
-        self.neigh_distances_chunks[thread_num] = self.neigh_distances
-        self.neigh_indices_chunks[thread_num] = self.neigh_indices
-
-    @final
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, jdx
-
-        # Sorting neighbors for each query vector of X
-        if self.sort_results:
-            for idx in range(X_start, X_end):
-                simultaneous_sort(
-                    deref(self.neigh_distances)[idx].data(),
-                    deref(self.neigh_indices)[idx].data(),
-                    deref(self.neigh_indices)[idx].size()
-                )
-
-    cdef void _parallel_on_Y_init(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t thread_num
-        # As chunks of X are shared across threads, so must datastructures to avoid race
-        # conditions: each thread has its own vectors of n_samples_X vectors which are
-        # then merged back in the main n_samples_X vectors.
-        for thread_num in range(self.chunks_n_threads):
-            self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X)
-            self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X)
-
-    @final
-    cdef void _merge_vectors(
-        self,
-        ITYPE_t idx,
-        ITYPE_t num_threads,
-    ) nogil:
-        cdef:
-            ITYPE_t thread_num
-            ITYPE_t idx_n_elements = 0
-            ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size()
-
-        # Resizing buffers only once for the given number of elements.
-        for thread_num in range(num_threads):
-            idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size()
-
-        deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements)
-        deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements)
-
-        # Moving the elements by range using the range first element
-        # as the reference for the insertion.
-        for thread_num in range(num_threads):
-            move(
-                deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
-                deref(self.neigh_distances_chunks[thread_num])[idx].end(),
-                deref(self.neigh_distances)[idx].begin() + last_element_idx
-            )
-            move(
-                deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
-                deref(self.neigh_indices_chunks[thread_num])[idx].end(),
-                deref(self.neigh_indices)[idx].begin() + last_element_idx
-            )
-            last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
-
-
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
-
-        with nogil, parallel(num_threads=self.effective_n_threads):
-            # Merge vectors used in threads into the main ones.
-            # This is done in parallel sample-wise (no need for locks).
-            for idx in prange(self.n_samples_X, schedule='static'):
-                self._merge_vectors(idx, self.chunks_n_threads)
-
-            # The content of the vector have been std::moved.
-            # Hence they can't be used anymore and can be deleted.
-            # Their deletion is carried out automatically as the
-            # implementation relies on shared pointers.
-
-            # Sort in parallel in ascending order w.r.t the distances if requested.
-            if self.sort_results:
-                for idx in prange(self.n_samples_X, schedule='static'):
-                    simultaneous_sort(
-                        deref(self.neigh_distances)[idx].data(),
-                        deref(self.neigh_indices)[idx].data(),
-                        deref(self.neigh_indices)[idx].size()
-                    )
-
-        return
-
-    cdef void compute_exact_distances(self) nogil:
-        """Convert rank-preserving distances to pairwise distances in parallel."""
-        cdef:
-            ITYPE_t i, j
-
-        for i in prange(self.n_samples_X, nogil=True, schedule='static',
-                        num_threads=self.effective_n_threads):
-            for j in range(deref(self.neigh_indices)[i].size()):
-                deref(self.neigh_distances)[i][j] = (
-                        self.datasets_pair.distance_metric._rdist_to_dist(
-                            # Guard against eventual -0., causing nan production.
-                            max(deref(self.neigh_distances)[i][j], 0.)
-                        )
-                )
-
-
-cdef class EuclideanRadiusNeighbors32(RadiusNeighbors32):
-    """EuclideanDistance-specialized implementation for RadiusNeighbors32."""
-
-    @classmethod
-    def is_usable_for(cls, X, Y, metric) -> bool:
-        return (RadiusNeighbors32.is_usable_for(X, Y, metric)
-                and not _in_unstable_openblas_configuration())
-
-    def __init__(
-        self,
-        X,
-        Y,
-        DTYPE_t radius,
-        bint use_squared_distances=False,
-        chunk_size=None,
-        strategy=None,
-        sort_results=False,
-        metric_kwargs=None,
-    ):
-        if (
-            metric_kwargs is not None and
-            len(metric_kwargs) > 0 and
-            "Y_norm_squared" not in metric_kwargs
-        ):
-            warnings.warn(
-                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
-                f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.",
-                UserWarning,
-                stacklevel=3,
-            )
-
-        super().__init__(
-            # The datasets pair here is used for exact distances computations
-            datasets_pair=DatasetsPair32.get_for(X, Y, metric="euclidean"),
-            radius=radius,
-            chunk_size=chunk_size,
-            strategy=strategy,
-            sort_results=sort_results,
-        )
-        # X and Y are checked by the DatasetsPair32 implemented
-        # as a DenseDenseDatasetsPair32
-        cdef:
-            DenseDenseDatasetsPair32 datasets_pair = (
-                <DenseDenseDatasetsPair32> self.datasets_pair
-            )
-            ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
-
-        self.middle_term_computer = MiddleTermComputer32.get_for(
-            X,
-            Y,
-            self.effective_n_threads,
-            self.chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1],
-            chunk_size=self.chunk_size,
-        )
-
-        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
-            self.Y_norm_squared = check_array(
-                metric_kwargs.pop("Y_norm_squared"),
-                ensure_2d=False,
-                input_name="Y_norm_squared",
-                dtype=np.float64
-            )
-        else:
-            self.Y_norm_squared = _sqeuclidean_row_norms32(Y, self.effective_n_threads)
-
-        # Do not recompute norms if datasets are identical.
-        self.X_norm_squared = (
-            self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms32(X, self.effective_n_threads)
-        )
-        self.use_squared_distances = use_squared_distances
-
-        if use_squared_distances:
-            # In this specialisation and this setup, the value passed to the radius is
-            # already considered to be the adapted radius, so we overwrite it.
-            self.r_radius = radius
-
-    @final
-    cdef void _parallel_on_X_parallel_init(
-        self,
-        ITYPE_t thread_num,
-    ) nogil:
-        RadiusNeighbors32._parallel_on_X_parallel_init(self, thread_num)
-        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
-
-    @final
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        RadiusNeighbors32._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
-        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
-
-    @final
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        RadiusNeighbors32._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-            self,
-            X_start, X_end,
-            Y_start, Y_end,
-            thread_num,
-        )
-        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-            X_start, X_end, Y_start, Y_end, thread_num,
-        )
-
-    @final
-    cdef void _parallel_on_Y_init(
-        self,
-    ) nogil:
-        cdef ITYPE_t thread_num
-        RadiusNeighbors32._parallel_on_Y_init(self)
-        self.middle_term_computer._parallel_on_Y_init()
-
-    @final
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        RadiusNeighbors32._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
-        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
-
-    @final
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        RadiusNeighbors32._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-            self,
-            X_start, X_end,
-            Y_start, Y_end,
-            thread_num,
-        )
-        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-            X_start, X_end, Y_start, Y_end, thread_num
-        )
-
-    @final
-    cdef void compute_exact_distances(self) nogil:
-        if not self.use_squared_distances:
-            RadiusNeighbors32.compute_exact_distances(self)
-
-    @final
-    cdef void _compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            DTYPE_t squared_dist_i_j
-            ITYPE_t n_X = X_end - X_start
-            ITYPE_t n_Y = Y_end - Y_start
-            DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks(
-                X_start, X_end, Y_start, Y_end, thread_num
-            )
-
-        # Pushing the distance and their associated indices in vectors.
-        for i in range(n_X):
-            for j in range(n_Y):
-                # Using the squared euclidean distance as the rank-preserving distance:
-                #
-                #             ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                #
-                squared_dist_i_j = (
-                    self.X_norm_squared[i + X_start]
-                    + dist_middle_terms[i * n_Y + j]
-                    + self.Y_norm_squared[j + Y_start]
-                )
-                if squared_dist_i_j <= self.r_radius:
-                    deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j)
-                    deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)

From 9df551d445e0cf3675d0dfb128f80dd10beefbe6 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sun, 23 Oct 2022 19:21:54 +0200
Subject: [PATCH 13/27] Apply suggestions

---
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp  | 6 +++---
 .../metrics/_pairwise_distances_reduction/_dispatcher.py    | 4 ++--
 .../_middle_term_computer.pxd.tp                            | 3 ++-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index b20381677fc8d..aeeb5ee126ddd 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -8,8 +8,8 @@ implementation_specific_values = [
     # We also use the float64 dtype and C-type names as defined in
     # `sklearn.utils._typedefs` to maintain consistency.
     #
-    ('64', False, 'DTYPE_t', 'DTYPE'),
-    ('32', True, 'cnp.float32_t', 'np.float32')
+    ('64', 'DTYPE_t', 'DTYPE'),
+    ('32', 'cnp.float32_t', 'np.float32')
 ]
 
 }}
@@ -103,7 +103,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     return squared_row_norms
 
 
-{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 57006420e6d49..5bc9bfcc35dbb 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -38,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads):
         Arrays containing the squared euclidean norm of each row of X.
     """
     if X.dtype == np.float64:
-        return _sqeuclidean_row_norms64(X, num_threads)
+        return np.asarray(_sqeuclidean_row_norms64(X, num_threads))
     if X.dtype == np.float32:
-        return _sqeuclidean_row_norms32(X, num_threads)
+        return np.asarray(_sqeuclidean_row_norms32(X, num_threads))
 
     raise ValueError(
         "Only float64 or float32 datasets are supported at this time, "
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
index 238ef829dc36e..6fd9a81ff6369 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -15,9 +15,10 @@ implementation_specific_values = [
 }}
 cimport numpy as cnp
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from libcpp.vector cimport vector
 
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 

From 76574eb85462bef3a10f497d0498d1e088a4411a Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Mon, 24 Oct 2022 09:56:47 +0200
Subject: [PATCH 14/27] remove np.asarray from test

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 2a3ef30a5f975..b78e50f014ae1 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1151,9 +1151,9 @@ def test_sqeuclidean_row_norms(
     X_csr = csr_matrix(X)
 
     sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
-    sq_row_norm = np.asarray(sqeuclidean_row_norms(X, num_threads=num_threads))
+    sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads)
 
-    sq_row_norm_csr = np.asarray(sqeuclidean_row_norms(X_csr, num_threads=num_threads))
+    sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads)
 
     assert_allclose(sq_row_norm_reference, sq_row_norm)
     assert_allclose(sq_row_norm_reference, sq_row_norm_csr)

From 07622be4a149b25bb92694f161f55353658a8726 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Thu, 27 Oct 2022 16:11:27 +0200
Subject: [PATCH 15/27] update some doc

---
 doc/whats_new/v1.2.rst                        |  3 +-
 .../_argkmin.pyx.tp                           |  6 ++--
 .../_dispatcher.py                            |  2 +-
 .../_middle_term_computer.pyx.tp              | 29 +++++++++++++++----
 .../_radius_neighbors.pyx.tp                  |  6 ++--
 .../test_pairwise_distances_reduction.py      |  3 ++
 6 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index 6430a0996f501..2d9cde0dceb9d 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -94,7 +94,8 @@ Changes impacting all modules
   - :func:`sklearn.manifold.trustworthiness`
 
   :pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion <jjerphan>`,
-  :user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_.
+  :user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_,
+  :pr:`24556` by :user:`Vincent Maladière <Vincent-Maladiere>`.
 
 - |Fix| Systematically check the sha256 digest of dataset tarballs used in code
   examples in the documentation.
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 6c948d0abc018..1aeb97e27f15b 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -71,8 +71,10 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         ):
             # Specialized implementation with improved arithmetic intensity
             # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to the BLAS GEMM routine as explained
-            # in more details in the docstring.
+            # at time to leverage a call to a matrix multiplication routine.
+            # Decomposing the euclidean distance into as a sum of squared
+            # matrices and a middle term cross product is explained in more
+            # details in the docstring of MiddleTermComputer.
             use_squared_distances = metric == "sqeuclidean"
             pda = EuclideanArgKmin{{name_suffix}}(
                 X=X, Y=Y, k=k,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 5bc9bfcc35dbb..921074c3ae968 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -26,7 +26,7 @@ def sqeuclidean_row_norms(X, num_threads):
 
     Parameters
     ----------
-    X : ndarray of shape (n_samples, n_features)
+    X : ndarray or CSR matrix of shape (n_samples, n_features)
         Input data. Must be c-contiguous.
 
     num_threads : int
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 3ace80ef230ee..442b0742ce278 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -72,18 +72,19 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}(
 
 
 cdef class MiddleTermComputer{{name_suffix}}:
-    """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM.
+    """Abstract base class for the `EuclideanDistance` specialisation
+    components wrapping the computation of the distance matrix chunks.
 
-    `EuclideanDistance` subclasses internally compute the squared Euclidean distances
-    between chunks of vectors X_c and Y_c using the following decomposition:
+    `EuclideanDistance` subclasses relies on the squared Euclidean
+    distances between chunks of vectors X_c and Y_c using the
+    following decomposition for the (i,j) pair :
 
 
-                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
 
 
     This helper class is in charge of wrapping the common logic to compute
-    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
-    arithmetic intensity.
+    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
     """
 
     @classmethod
@@ -224,6 +225,13 @@ cdef class MiddleTermComputer{{name_suffix}}:
 
 
 cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Computes the middle term of the Euclidean distance between two chunked dense matrices
+    X_c and Y_c.
+
+            dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    This class use the BLAS gemm routine to efficiently perform the dot product.
+    """
 
     def __init__(
         self,
@@ -384,6 +392,15 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
 
 
 cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Computes the middle term of the Euclidean distance between two chunked CSR matrices X_c and Y_c
+    and returns the result as a contiguous array.
+
+            dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_{{name_suffix}}.
+    This routine iterates over the data, indices and indptr arrays of the sparse matrices without
+    densifying them.
+    """
 
     def __init__(
         self,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 94272f9aa8f3c..0b0c8ebf728f2 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -92,8 +92,10 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         ):
             # Specialized implementation with improved arithmetic intensity
             # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to the BLAS GEMM routine as explained
-            # in more details in GEMMTermComputer docstring.
+            # at time to leverage a call to a matrix multiplication routine.
+            # Decomposing the euclidean distance into as a sum of squared
+            # matrices and a middle term cross product is explained in more
+            # details in the docstring of MiddleTermComputer.
             use_squared_distances = metric == "sqeuclidean"
             pda = EuclideanRadiusNeighbors{{name_suffix}}(
                 X=X, Y=Y, radius=radius,
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index b78e50f014ae1..37ca3f4bfd981 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -564,6 +564,9 @@ def test_pairwise_distances_reduction_is_usable_for():
     assert not BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
+    assert BaseDistancesReductionDispatcher.is_usable_for(
+        X_csr, Y_csr, metric="euclidean"
+    )
 
     # CSR matrices without non-zeros elements aren't currently supported
     # TODO: support CSR matrices without non-zeros elements

From fb9a3c9748bf5b80bf465a67e201200c67206d1a Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 28 Oct 2022 12:10:12 +0200
Subject: [PATCH 16/27] doc improvement

---
 .../_argkmin.pyx.tp                            | 18 ++++++++++--------
 .../_middle_term_computer.pyx.tp               |  3 ++-
 .../_radius_neighbors.pyx.tp                   | 16 ++++++++++------
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 1aeb97e27f15b..67f3f13deb4fc 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -69,12 +69,14 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
             metric in ("euclidean", "sqeuclidean")
             and not (issparse(X) ^ issparse(Y))
         ):
-            # Specialized implementation with improved arithmetic intensity
-            # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to a matrix multiplication routine.
-            # Decomposing the euclidean distance into as a sum of squared
-            # matrices and a middle term cross product is explained in more
-            # details in the docstring of MiddleTermComputer.
+            # This implementation processes the computation of the chunks of the
+            # distance using a decomposition of the chunks of the distance matrix
+            # for the Squared Euclidean distance.
+            # This specialisation has an improved arithmetic intensity for both
+            # the dense and sparse settings, allowing in most case speed-ups of
+            # several orders of magnitude compared to the generic ArgKmin
+            # implementation.
+            # For more information see MiddleTermComputer.
             use_squared_distances = metric == "sqeuclidean"
             pda = EuclideanArgKmin{{name_suffix}}(
                 X=X, Y=Y, k=k,
@@ -84,8 +86,8 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
                 metric_kwargs=metric_kwargs,
             )
         else:
-             # Fall back on a generic implementation that handles most scipy
-             # metrics by computing the distances between 2 vectors at a time.
+            # Fall back on a generic implementation that handles most scipy
+            # metrics by computing the distances between 2 vectors at a time.
             pda = ArgKmin{{name_suffix}}(
                 datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
                 k=k,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 442b0742ce278..099910a2f8de7 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -230,7 +230,8 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
 
             dist_middle_terms = - 2 X_c_i.Y_c_j^T
 
-    This class use the BLAS gemm routine to efficiently perform the dot product.
+    This class use the BLAS gemm routine to perform the dot product of each chunks
+    of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
     """
 
     def __init__(
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 0b0c8ebf728f2..ab7fd69a5c501 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -90,12 +90,16 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
             metric in ("euclidean", "sqeuclidean")
             and not (issparse(X) ^ issparse(Y))
         ):
-            # Specialized implementation with improved arithmetic intensity
-            # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to a matrix multiplication routine.
-            # Decomposing the euclidean distance into as a sum of squared
-            # matrices and a middle term cross product is explained in more
-            # details in the docstring of MiddleTermComputer.
+            # Specialized implementation of RadiusNeighbors for the Euclidean
+            # distance.
+            # This implementation processes the computation of the chunks of the
+            # distance using a decomposition of the chunks of the distance matrix
+            # for the Squared Euclidean distance.
+            # This specialisation has an improved arithmetic intensity for both
+            # the dense and sparse settings, allowing in most case speed-ups of
+            # several orders of magnitude compared to the generic RadiusNeighbors
+            # implementation.
+            # For more information see MiddleTermComputer.
             use_squared_distances = metric == "sqeuclidean"
             pda = EuclideanRadiusNeighbors{{name_suffix}}(
                 X=X, Y=Y, radius=radius,

From 88e074d33345033c65702f5301b8197b84fc0dff Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 28 Oct 2022 15:07:08 +0200
Subject: [PATCH 17/27] Update
 sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 67f3f13deb4fc..ef2c8e27816c6 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -69,6 +69,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
             metric in ("euclidean", "sqeuclidean")
             and not (issparse(X) ^ issparse(Y))
         ):
+            # Specialized implementation of ArgKmin for the Euclidean distance.
             # This implementation processes the computation of the chunks of the
             # distance using a decomposition of the chunks of the distance matrix
             # for the Squared Euclidean distance.

From d6bff5563a083431415409b53b7faa7e6114be90 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 28 Oct 2022 12:14:55 +0200
Subject: [PATCH 18/27] extend test_pairwise_distances_argkmin, fail with
 dtype=float32 and translation=1e7

---
 .../test_pairwise_distances_reduction.py      | 38 ++++++++++---------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 37ca3f4bfd981..c08fb5b898751 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -977,6 +977,9 @@ def test_pairwise_distances_argkmin(
     X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
 
+    X_csr = csr_matrix(X)
+    Y_csr = csr_matrix(Y)
+
     # Haversine distance only accepts 2D data
     if metric == "haversine":
         X = np.ascontiguousarray(X[:, :2])
@@ -999,24 +1002,25 @@ def test_pairwise_distances_argkmin(
             row_idx, argkmin_indices_ref[row_idx]
         ]
 
-    argkmin_distances, argkmin_indices = ArgKmin.compute(
-        X,
-        Y,
-        k,
-        metric=metric,
-        metric_kwargs=metric_kwargs,
-        return_distance=True,
-        # So as to have more than a chunk, forcing parallelism.
-        chunk_size=n_samples // 4,
-        strategy=strategy,
-    )
+    for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
+        argkmin_distances, argkmin_indices = ArgKmin.compute(
+            _X,
+            _Y,
+            k,
+            metric=metric,
+            metric_kwargs=metric_kwargs,
+            return_distance=True,
+            # So as to have more than a chunk, forcing parallelism.
+            chunk_size=n_samples // 4,
+            strategy=strategy,
+        )
 
-    ASSERT_RESULT[(ArgKmin, dtype)](
-        argkmin_distances,
-        argkmin_distances_ref,
-        argkmin_indices,
-        argkmin_indices_ref,
-    )
+        ASSERT_RESULT[(ArgKmin, dtype)](
+            argkmin_distances,
+            argkmin_distances_ref,
+            argkmin_indices,
+            argkmin_indices_ref,
+        )
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed

From 9f96865fda54412ee0b7b5a679026095e4400143 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 28 Oct 2022 15:08:21 +0200
Subject: [PATCH 19/27] branch SparseSparseMiddleTermComputer32 to
 _middle_term_sparse_sparse_64

---
 .../_middle_term_computer.pxd.tp              | 16 +++++++-------
 .../_middle_term_computer.pyx.tp              | 21 +++++++++++--------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
index 6fd9a81ff6369..e6ef5de2727b5 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -19,16 +19,14 @@ from libcpp.vector cimport vector
 
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 
-{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
-
 
-cdef void _middle_term_sparse_sparse_{{name_suffix}}(
-    const {{INPUT_DTYPE_t}}[:] X_data,
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
     const SPARSE_INDEX_TYPE_t[:] X_indices,
     const SPARSE_INDEX_TYPE_t[:] X_indptr,
     ITYPE_t X_start,
     ITYPE_t X_end,
-    const {{INPUT_DTYPE_t}}[:] Y_data,
+    const DTYPE_t[:] Y_data,
     const SPARSE_INDEX_TYPE_t[:] Y_indices,
     const SPARSE_INDEX_TYPE_t[:] Y_indptr,
     ITYPE_t Y_start,
@@ -37,6 +35,9 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}(
 ) nogil
 
 
+{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+
 cdef class MiddleTermComputer{{name_suffix}}:
     cdef:
         ITYPE_t effective_n_threads
@@ -149,11 +150,11 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
 
 cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
     cdef:
-        const {{INPUT_DTYPE_t}}[:] X_data
+        const DTYPE_t[:] X_data
         const SPARSE_INDEX_TYPE_t[:] X_indices
         const SPARSE_INDEX_TYPE_t[:] X_indptr
 
-        const {{INPUT_DTYPE_t}}[:] Y_data
+        const DTYPE_t[:] Y_data
         const SPARSE_INDEX_TYPE_t[:] Y_indices
         const SPARSE_INDEX_TYPE_t[:] Y_indptr
 
@@ -184,4 +185,5 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
         ITYPE_t thread_num,
     ) nogil
 
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 099910a2f8de7..49117c5401cb4 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -38,16 +38,13 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
 
 
-{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
-
-
-cdef void _middle_term_sparse_sparse_{{name_suffix}}(
-    const {{INPUT_DTYPE_t}}[:] X_data,
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
     const SPARSE_INDEX_TYPE_t[:] X_indices,
     const SPARSE_INDEX_TYPE_t[:] X_indptr,
     ITYPE_t X_start,
     ITYPE_t X_end,
-    const {{INPUT_DTYPE_t}}[:] Y_data,
+    const DTYPE_t[:] Y_data,
     const SPARSE_INDEX_TYPE_t[:] Y_indices,
     const SPARSE_INDEX_TYPE_t[:] Y_indptr,
     ITYPE_t Y_start,
@@ -71,6 +68,9 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}(
                         D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
 
 
+{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+
 cdef class MiddleTermComputer{{name_suffix}}:
     """Abstract base class for the `EuclideanDistance` specialisation
     components wrapping the computation of the distance matrix chunks.
@@ -147,7 +147,9 @@ cdef class MiddleTermComputer{{name_suffix}}:
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
+        print("DEBUG before casting", X.data.dtype)
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        print("DEBUG after casting", X_data.dtype)
         X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
         return X_data, X_indices, X_indptr
@@ -398,7 +400,7 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
 
             dist_middle_terms = - 2 X_c_i.Y_c_j^T
 
-    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_{{name_suffix}}.
+    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
     This routine iterates over the data, indices and indptr arrays of the sparse matrices without
     densifying them.
     """
@@ -466,7 +468,7 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
                 self.dist_middle_terms_chunks[thread_num].data()
             )
 
-        _middle_term_sparse_sparse_{{name_suffix}}(
+        _middle_term_sparse_sparse_64(
             self.X_data,
             self.X_indices,
             self.X_indptr,
@@ -482,4 +484,5 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
 
         return dist_middle_terms
 
+
 {{endfor}}

From 2df86bc3f28c0a5fa49b3b69e982a9415df155d1 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 28 Oct 2022 19:50:41 +0200
Subject: [PATCH 20/27] branch sparse_sqeuclidean_row_norm32 to
 sparse_sqeuclidean_row_norm64 to avoid overflow

---
 .../_pairwise_distances_reduction/_base.pyx.tp | 18 +++++++++---------
 .../_middle_term_computer.pyx.tp               |  2 --
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 886245598c3f9..1a5edc417834e 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -103,13 +103,8 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     return squared_row_norms
 
 
-{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
-
-from ._datasets_pair cimport DatasetsPair{{name_suffix}}
-
-
-cdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
-    const {{INPUT_DTYPE_t}}[:] X_data,
+cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse(
+    const DTYPE_t[:] X_data,
     const SPARSE_INDEX_TYPE_t[:] X_indptr,
     ITYPE_t num_threads,
 ):
@@ -127,14 +122,19 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse(
     return squared_row_norms
 
 
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
+
+from ._datasets_pair cimport DatasetsPair{{name_suffix}}
+
+
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
     X,
     ITYPE_t num_threads,
 ):
     if issparse(X):
-        X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
+        X_data = np.asarray(X.data, dtype=DTYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return _sqeuclidean_row_norms{{name_suffix}}_sparse(X_data, X_indptr, num_threads)
+        return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads)
     else:
         return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 49117c5401cb4..ecb259b6aaa38 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -147,9 +147,7 @@ cdef class MiddleTermComputer{{name_suffix}}:
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        print("DEBUG before casting", X.data.dtype)
         X_data = np.asarray(X.data, dtype=DTYPE)
-        print("DEBUG after casting", X_data.dtype)
         X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
         return X_data, X_indices, X_indptr

From c0675acc9fca4934161e58179c661bcb76dfbf56 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 4 Nov 2022 12:08:24 +0100
Subject: [PATCH 21/27] remove unused variables

---
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 2fb82029632d9..e4aef3c982589 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -26,12 +26,12 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
 
-from scipy.sparse import issparse, csr_matrix
+from scipy.sparse import issparse
 from numbers import Integral
 from sklearn import get_config
 from sklearn.utils import check_scalar
 from ...utils._openmp_helpers import _openmp_effective_n_threads
-from ...utils._typedefs import ITYPE, DTYPE, SPARSE_INDEX_TYPE
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
 
 cnp.import_array()
 
@@ -111,7 +111,6 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse(
     cdef:
         ITYPE_t n = X_indptr.shape[0] - 1
         ITYPE_t idx = 0
-        ITYPE_t thread_num
         ITYPE_t X_i_ptr
         DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE)
 
@@ -156,7 +155,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         strategy=None,
      ):
         cdef:
-            ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks
+            ITYPE_t X_n_full_chunks, Y_n_full_chunks
 
         if chunk_size is None:
             chunk_size = get_config().get("pairwise_dist_chunk_size", 256)

From 366548e68c024bfdfcfdca3fc33dbf58553cabce Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 4 Nov 2022 15:34:04 +0100
Subject: [PATCH 22/27] some suggestions from Maurice

---
 .../_middle_term_computer.pyx.tp              | 23 ++++++--
 .../_pairwise_distances_reduction/setup.py    | 55 -------------------
 2 files changed, 18 insertions(+), 60 deletions(-)
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/setup.py

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 7c9112f247d68..083c1366bf554 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -27,17 +27,25 @@ from ...utils._cython_blas cimport (
 )
 from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 
-import numpy as np
-from scipy.sparse import issparse, csr_matrix
-from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
-
 # TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
 # Introduction in Cython:
+#
 # https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
 cdef extern from "<algorithm>" namespace "std" nogil:
     void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
 
+import numpy as np
+from scipy.sparse import issparse, csr_matrix
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
 
+# TODO: If possible optimize this routine to efficiently treat cases where
+# `n_samples_X << n_samples_Y` met in practise when X_test consists of a
+# few samples, and thus when there's a single chunk of X whose number of
+# samples is less that the default chunk size.
+
+# TODO: compare this routine with the similar ones in SciPy, especially
+# `csr_matmat` which might implement a better algorithm.
+# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669  # noqa
 cdef void _middle_term_sparse_sparse_64(
     const DTYPE_t[:] X_data,
     const SPARSE_INDEX_TYPE_t[:] X_indices,
@@ -51,6 +59,9 @@ cdef void _middle_term_sparse_sparse_64(
     ITYPE_t Y_end,
     DTYPE_t * D,
 ) nogil:
+    # This routine assumes that D points to the first element of a
+    # zeroed buffer of length at least equal to n_X × n_Y, conceptually
+    # representing a 2-d C-ordered array.
     cdef:
         ITYPE_t i, j, k
         ITYPE_t n_X = X_end - X_start
@@ -141,7 +152,9 @@ cdef class MiddleTermComputer{{name_suffix}}:
                 chunk_size,
             )
 
-        raise NotImplementedError("X and Y must be both sparse or dense")
+        raise NotImplementedError(
+            "Both X and Y must be CSR sparse matrices xor numpy arrays."
+        )
 
 
     @classmethod
diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py
deleted file mode 100644
index 4c8632ebb9088..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/setup.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-
-import numpy as np
-from numpy.distutils.misc_util import Configuration
-
-from sklearn._build_utils import gen_from_templates
-
-
-def configuration(parent_package="", top_path=None):
-    config = Configuration("_pairwise_distances_reduction", parent_package, top_path)
-    libraries = []
-    if os.name == "posix":
-        libraries.append("m")
-
-    templates = [
-        "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp",
-    ]
-
-    gen_from_templates(templates)
-
-    cython_sources = [
-        "_datasets_pair.pyx",
-        "_middle_term_computer.pyx",
-        "_base.pyx",
-        "_argkmin.pyx",
-        "_radius_neighbors.pyx",
-    ]
-
-    for source_file in cython_sources:
-        private_extension_name = source_file.replace(".pyx", "")
-        config.add_extension(
-            name=private_extension_name,
-            sources=[source_file],
-            include_dirs=[np.get_include()],
-            language="c++",
-            libraries=libraries,
-            extra_compile_args=["-std=c++11"],
-        )
-
-    return config
-
-
-if __name__ == "__main__":
-    from numpy.distutils.core import setup
-
-    setup(**configuration().todict())

From 8ee5a294023e95e435e1b737c12e21cdc59abf74 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Mon, 7 Nov 2022 19:41:18 +0100
Subject: [PATCH 23/27] Apply suggestions from code review

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp  | 5 ++---
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 3 +--
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 7 ++++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 078c4d4fd8d5e..a69d7643738ad 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -66,9 +66,8 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
             and not (issparse(X) ^ issparse(Y))
         ):
             # Specialized implementation of ArgKmin for the Euclidean distance.
-            # This implementation processes the computation of the chunks of the
-            # distance using a decomposition of the chunks of the distance matrix
-            # for the Squared Euclidean distance.
+            # This implementation computes the distances by chunk using
+            # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
             # the dense and sparse settings, allowing in most case speed-ups of
             # several orders of magnitude compared to the generic ArgKmin
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index e4aef3c982589..932b8f9006d4e 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -110,8 +110,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse(
 ):
     cdef:
         ITYPE_t n = X_indptr.shape[0] - 1
-        ITYPE_t idx = 0
-        ITYPE_t X_i_ptr
+        SPARSE_INDEX_TYPE_t X_i_ptr, idx = 0
         DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE)
 
     for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index c08fb5b898751..04542f8b18ff7 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -564,9 +564,10 @@ def test_pairwise_distances_reduction_is_usable_for():
     assert not BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
-    assert BaseDistancesReductionDispatcher.is_usable_for(
-        X_csr, Y_csr, metric="euclidean"
-    )
+    # TODO: add this assertion once #24542 and #24715 are merged. 
+    # assert BaseDistancesReductionDispatcher.is_usable_for(
+    #     X_csr, Y_csr, metric="euclidean"
+    # )
 
     # CSR matrices without non-zeros elements aren't currently supported
     # TODO: support CSR matrices without non-zeros elements

From 8d814fa3c2da326dfb22cc494e4e9711ab623188 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Tue, 8 Nov 2022 21:16:11 +0100
Subject: [PATCH 24/27] Update
 sklearn/metrics/tests/test_pairwise_distances_reduction.py

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 04542f8b18ff7..299a3d57ca1f5 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -564,7 +564,7 @@ def test_pairwise_distances_reduction_is_usable_for():
     assert not BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
-    # TODO: add this assertion once #24542 and #24715 are merged. 
+    # TODO: add this assertion once #24542 and #24715 are merged.
     # assert BaseDistancesReductionDispatcher.is_usable_for(
     #     X_csr, Y_csr, metric="euclidean"
     # )

From 6fa803fcd6aeaddc0cfc74a84419d0ff33ced69e Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 16 Nov 2022 15:53:29 +0100
Subject: [PATCH 25/27] Update
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 932b8f9006d4e..1b2a8a31fb679 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -130,6 +130,8 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
     ITYPE_t num_threads,
 ):
     if issparse(X):
+        # TODO: remove this instruction which is a cast in the float32 case
+        # by moving squared row norms computations in MiddleTermComputer. 
         X_data = np.asarray(X.data, dtype=DTYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
         return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads)

From c08a5155ac389a657abb9765a3c17999f85c5d03 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 18 Nov 2022 15:14:38 +0100
Subject: [PATCH 26/27] apply suggestions :)

---
 .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp  | 5 +++--
 .../metrics/_pairwise_distances_reduction/_dispatcher.py   | 4 +++-
 .../_middle_term_computer.pyx.tp                           | 7 ++++---
 .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 4 ++--
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 7 +++----
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index a69d7643738ad..ad4e1c32acc70 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -63,9 +63,10 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))
+            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
         ):
-            # Specialized implementation of ArgKmin for the Euclidean distance.
+            # Specialized implementation of ArgKmin for the Euclidean distance
+            # for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
             # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 921074c3ae968..eb2979864f02d 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -130,8 +130,10 @@ def is_valid_sparse_matrix(X):
         # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
         # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
         # using sparse-dense routines for matrix-vector multiplications.
+        # Currently, only dense-dense and sparse-sparse are optimized for
+        # the Euclidean case.
         fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))
+            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))  # "^" is XOR
             and isinstance(metric, str)
             and "euclidean" in metric
         )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 083c1366bf554..3363eb9524263 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -153,7 +153,7 @@ cdef class MiddleTermComputer{{name_suffix}}:
             )
 
         raise NotImplementedError(
-            "Both X and Y must be CSR sparse matrices xor numpy arrays."
+            "X and Y must be both CSR sparse matrices or both numpy arrays."
         )
 
 
@@ -406,8 +406,9 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
 
 
 cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
-    """Computes the middle term of the Euclidean distance between two chunked CSR matrices X_c and Y_c
-    and returns the result as a contiguous array.
+    """Middle term of the Euclidean distance between two chunked CSR matrices.
+
+    The result is return as a contiguous array.
 
             dist_middle_terms = - 2 X_c_i.Y_c_j^T
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 36516af162b60..a498c6cd9856e 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -84,10 +84,10 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))
+            and not (issparse(X) ^ issparse(Y))  # "^" is XOR
         ):
             # Specialized implementation of RadiusNeighbors for the Euclidean
-            # distance.
+            # distance for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
             # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 299a3d57ca1f5..c08fb5b898751 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -564,10 +564,9 @@ def test_pairwise_distances_reduction_is_usable_for():
     assert not BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
-    # TODO: add this assertion once #24542 and #24715 are merged.
-    # assert BaseDistancesReductionDispatcher.is_usable_for(
-    #     X_csr, Y_csr, metric="euclidean"
-    # )
+    assert BaseDistancesReductionDispatcher.is_usable_for(
+        X_csr, Y_csr, metric="euclidean"
+    )
 
     # CSR matrices without non-zeros elements aren't currently supported
     # TODO: support CSR matrices without non-zeros elements

From 001fa5b108c9e440de26f25f131a4585ffebfd3b Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Fri, 18 Nov 2022 18:36:02 +0100
Subject: [PATCH 27/27] Fu... sion!

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index c08fb5b898751..c334087c65448 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -561,7 +561,7 @@ def test_pairwise_distances_reduction_is_usable_for():
     assert not BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y, metric="euclidean"
     )
-    assert not BaseDistancesReductionDispatcher.is_usable_for(
+    assert BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
     assert BaseDistancesReductionDispatcher.is_usable_for(