From 8bab40ffab17a0b9b55ad236f7ec4bac439c37fd Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Tue, 1 Nov 2022 17:30:35 +0100
Subject: [PATCH 1/4] first commit

---
 .gitignore                                    |   4 +-
 setup.cfg                                     |   4 +-
 setup.py                                      |   2 +-
 .../_argkmin.pxd.tp                           |   4 +-
 .../_argkmin.pyx.tp                           |  52 +++--
 .../_base.pxd.tp                              |  19 +-
 .../_base.pyx.tp                              |  41 +++-
 .../_dispatcher.py                            |  13 +-
 ...er.pxd.tp => _middle_term_computer.pxd.tp} |  70 ++++++-
 ...er.pyx.tp => _middle_term_computer.pyx.tp} | 189 +++++++++++++++---
 .../_radius_neighbors.pxd.tp                  |   4 +-
 .../_radius_neighbors.pyx.tp                  |  44 ++--
 .../_pairwise_distances_reduction/setup.py    |   6 +-
 13 files changed, 336 insertions(+), 116 deletions(-)
 rename sklearn/metrics/_pairwise_distances_reduction/{_gemm_term_computer.pxd.tp => _middle_term_computer.pxd.tp} (62%)
 rename sklearn/metrics/_pairwise_distances_reduction/{_gemm_term_computer.pyx.tp => _middle_term_computer.pyx.tp} (58%)

diff --git a/.gitignore b/.gitignore
index 68299a202b7c7..47ec8fa2faf79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -93,7 +93,7 @@ sklearn/metrics/_pairwise_distances_reduction/_base.pxd
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx
 sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
 sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
-sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
-sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
+sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
+sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
diff --git a/setup.cfg b/setup.cfg
index 3aa89052a4b8e..a7e632e9887eb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -77,8 +77,8 @@ ignore =
     sklearn/metrics/_pairwise_distances_reduction/_base.pyx
     sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
     sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
-    sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd
-    sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx
+    sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
+    sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
 
diff --git a/setup.py b/setup.py
index 6004bf127a334..d128aa84b7e42 100755
--- a/setup.py
+++ b/setup.py
@@ -89,7 +89,7 @@
     "sklearn.manifold._barnes_hut_tsne",
     "sklearn.metrics.cluster._expected_mutual_info_fast",
     "sklearn.metrics._pairwise_distances_reduction._datasets_pair",
-    "sklearn.metrics._pairwise_distances_reduction._gemm_term_computer",
+    "sklearn.metrics._pairwise_distances_reduction._middle_term_computer",
     "sklearn.metrics._pairwise_distances_reduction._base",
     "sklearn.metrics._pairwise_distances_reduction._argkmin",
     "sklearn.metrics._pairwise_distances_reduction._radius_neighbors",
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
index b738cda119c11..2bbe9e53518b3 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp
@@ -6,7 +6,7 @@ cnp.import_array()
 {{for name_suffix in ['64', '32']}}
 
 from ._base cimport BaseDistancesReduction{{name_suffix}}
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
     """float{{name_suffix}} implementation of the ArgKmin."""
@@ -25,7 +25,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
 cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
     """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}."""
     cdef:
-        GEMMTermComputer{{name_suffix}} gemm_term_computer
+        MiddleTermComputer{{name_suffix}} middle_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index a2ecb7c2266b6..550dcc03094f7 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -31,9 +31,10 @@ from ._base cimport (
 from ._datasets_pair cimport (
     DatasetsPair{{name_suffix}},
     DenseDenseDatasetsPair{{name_suffix}},
+    SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 
 cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
@@ -66,13 +67,17 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not issparse(X)
-            and not issparse(Y)
+            and not (issparse(X) or issparse(Y))
         ):
-            # Specialized implementation with improved arithmetic intensity
-            # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to the BLAS GEMM routine as explained
-            # in more details in the docstring.
+            # Specialized implementation of ArgKmin for the Euclidean distance.
+            # This implementation processes the computation of the chunks of the
+            # distance using a decomposition of the chunks of the distance matrix
+            # for the Squared Euclidean distance.
+            # This specialisation has an improved arithmetic intensity for both
+            # the dense and sparse settings, allowing in most case speed-ups of
+            # several orders of magnitude compared to the generic ArgKmin
+            # implementation.
+            # For more information see MiddleTermComputer.
             use_squared_distances = metric == "sqeuclidean"
             pda = EuclideanArgKmin{{name_suffix}}(
                 X=X, Y=Y, k=k,
@@ -82,8 +87,8 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
                 metric_kwargs=metric_kwargs,
             )
         else:
-             # Fall back on a generic implementation that handles most scipy
-             # metrics by computing the distances between 2 vectors at a time.
+            # Fall back on a generic implementation that handles most scipy
+            # metrics by computing the distances between 2 vectors at a time.
             pda = ArgKmin{{name_suffix}}(
                 datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
                 k=k,
@@ -355,9 +360,9 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.gemm_term_computer = GEMMTermComputer{{name_suffix}}(
-            datasets_pair.X,
-            datasets_pair.Y,
+        self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
+            X,
+            Y,
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
@@ -373,12 +378,16 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
+                Y, self.effective_n_threads
+            )
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{name_suffix}}(
+                X, self.effective_n_threads
+            )
         )
         self.use_squared_distances = use_squared_distances
 
@@ -393,7 +402,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ITYPE_t thread_num,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
-        self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
+        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
 
 
     @final
@@ -404,7 +413,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ITYPE_t X_end,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
 
     @final
@@ -422,7 +431,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
@@ -432,7 +441,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         self,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_Y_init(self)
-        self.gemm_term_computer._parallel_on_Y_init()
+        self.middle_term_computer._parallel_on_Y_init()
 
 
     @final
@@ -443,7 +452,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ITYPE_t X_end,
     ) nogil:
         ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
 
     @final
@@ -461,7 +470,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num
         )
 
@@ -480,13 +489,12 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             DTYPE_t squared_dist_i_j
             ITYPE_t n_X = X_end - X_start
             ITYPE_t n_Y = Y_end - Y_start
-            DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms(
+            DTYPE_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
                 X_start, X_end, Y_start, Y_end, thread_num
             )
             DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num]
 
-
         # Pushing the distance and their associated indices on heaps
         # which keep tracks of the argkmin.
         for i in range(n_X):
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index 44f48f5bf1558..be44f3a98a263 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -2,25 +2,20 @@ cimport numpy as cnp
 
 from cython cimport final
 
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
+from ...utils._typedefs cimport ITYPE_t, DTYPE_t, SPARSE_INDEX_TYPE_t
 
 cnp.import_array()
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
-    const DTYPE_t[:, ::1] X,
-    ITYPE_t num_threads,
-)
-
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
-    const cnp.float32_t[:, ::1] X,
-    ITYPE_t num_threads,
-)
-
-{{for name_suffix in ['64', '32']}}
+{{for name_suffix, INPUT_DTYPE_t in [('64', 'DTYPE_t'), ('32', 'cnp.float32_t')]}}
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
+    X,
+    ITYPE_t num_threads,
+)
+
 cdef class BaseDistancesReduction{{name_suffix}}:
     """
     Base float{{name_suffix}} implementation template of the pairwise-distances
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index c1bade148c988..dc365a4cc0641 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -1,3 +1,18 @@
+{{py:
+
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE
+    #
+    # We also use the float64 dtype and C-type names as defined in
+    # `sklearn.utils._typedefs` to maintain consistency.
+    #
+    ('64', 'DTYPE_t', 'DTYPE'),
+    ('32', 'cnp.float32_t', 'np.float32')
+]
+
+}}
 cimport numpy as cnp
 
 from cython cimport final
@@ -11,17 +26,18 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
 
+from scipy.sparse import issparse, csr_matrix
 from numbers import Integral
 from sklearn import get_config
 from sklearn.utils import check_scalar
 from ...utils._openmp_helpers import _openmp_effective_n_threads
-from ...utils._typedefs import DTYPE
+from ...utils._typedefs import ITYPE, DTYPE, SPARSE_INDEX_TYPE
 
 cnp.import_array()
 
 #####################
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
+cdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense(
     const DTYPE_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -46,7 +62,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
     return squared_row_norms
 
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
+cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     const cnp.float32_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -86,10 +102,19 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
 
     return squared_row_norms
 
-{{for name_suffix in ['64', '32']}}
+
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
+
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
+    X,
+    ITYPE_t num_threads,
+):
+    return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
+
+
 cdef class BaseDistancesReduction{{name_suffix}}:
     """
     Base float{{name_suffix}} implementation template of the pairwise-distances
@@ -359,7 +384,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_X_init_chunk(
+        self.middle_term_computer._parallel_on_X_init_chunk(
             thread_num, X_start, X_end,
         )
 
@@ -382,7 +407,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
@@ -425,7 +450,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_Y_parallel_init(
+        self.middle_term_computer._parallel_on_Y_parallel_init(
             thread_num, X_start, X_end,
         )
 
@@ -448,7 +473,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         In this method, EuclideanDistance specialisations of subclass of
         BaseDistancesReduction _must_ call:
 
-        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index cd693f352dbc5..921074c3ae968 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -8,10 +8,7 @@
 
 from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING
 
-from ._base import (
-    _sqeuclidean_row_norms64,
-    _sqeuclidean_row_norms32,
-)
+from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64
 from ._argkmin import (
     ArgKmin64,
     ArgKmin32,
@@ -29,7 +26,7 @@ def sqeuclidean_row_norms(X, num_threads):
 
     Parameters
     ----------
-    X : ndarray of shape (n_samples, n_features)
+    X : ndarray or CSR matrix of shape (n_samples, n_features)
         Input data. Must be c-contiguous.
 
     num_threads : int
@@ -41,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads):
         Arrays containing the squared euclidean norm of each row of X.
     """
     if X.dtype == np.float64:
-        return _sqeuclidean_row_norms64(X, num_threads)
+        return np.asarray(_sqeuclidean_row_norms64(X, num_threads))
     if X.dtype == np.float32:
-        return _sqeuclidean_row_norms32(X, num_threads)
+        return np.asarray(_sqeuclidean_row_norms32(X, num_threads))
 
     raise ValueError(
         "Only float64 or float32 datasets are supported at this time, "
@@ -134,7 +131,7 @@ def is_valid_sparse_matrix(X):
         # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
         # using sparse-dense routines for matrix-vector multiplications.
         fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y))
+            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))
             and isinstance(metric, str)
             and "euclidean" in metric
         )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
similarity index 62%
rename from sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
rename to sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
index 2f4f040b91ad8..c7ce631591ea7 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -15,16 +15,16 @@ implementation_specific_values = [
 }}
 cimport numpy as cnp
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
 from libcpp.vector cimport vector
 
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
-cdef class GEMMTermComputer{{name_suffix}}:
-    cdef:
-        const {{INPUT_DTYPE_t}}[:, ::1] X
-        const {{INPUT_DTYPE_t}}[:, ::1] Y
 
+cdef class MiddleTermComputer{{name_suffix}}:
+    cdef:
         ITYPE_t effective_n_threads
         ITYPE_t chunks_n_threads
         ITYPE_t dist_middle_terms_chunks_size
@@ -34,12 +34,6 @@ cdef class GEMMTermComputer{{name_suffix}}:
         # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
         vector[vector[DTYPE_t]] dist_middle_terms_chunks
 
-{{if upcast_to_float64}}
-        # Buffers for upcasting chunks of X and Y from float32 to float64
-        vector[vector[DTYPE_t]] X_c_upcast
-        vector[vector[DTYPE_t]] Y_c_upcast
-{{endif}}
-
     cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -85,4 +79,58 @@ cdef class GEMMTermComputer{{name_suffix}}:
         ITYPE_t thread_num,
     ) nogil
 
+
+cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:, ::1] X
+        const {{INPUT_DTYPE_t}}[:, ::1] Y
+
+    {{if upcast_to_float64}}
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
+    {{endif}}
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
similarity index 58%
rename from sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
rename to sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index e69d1c3df9f7d..d29905ac05c9a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -17,8 +17,6 @@ cimport numpy as cnp
 
 from libcpp.vector cimport vector
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
-
 from ...utils._cython_blas cimport (
   BLAS_Order,
   BLAS_Trans,
@@ -27,35 +25,101 @@ from ...utils._cython_blas cimport (
   Trans,
   _gemm,
 )
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+import numpy as np
+from scipy.sparse import issparse, csr_matrix
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
+
+# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
+# Introduction in Cython:
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
+cdef extern from "<algorithm>" namespace "std" nogil:
+    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
+
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
-cdef class GEMMTermComputer{{name_suffix}}:
-    """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM.
 
-    `EuclideanDistance` subclasses internally compute the squared Euclidean distances
-    between chunks of vectors X_c and Y_c using the following decomposition:
+cdef class MiddleTermComputer{{name_suffix}}:
+    """Abstract base class for the `EuclideanDistance` specialisation
+    components wrapping the computation of the distance matrix chunks.
 
+    `EuclideanDistance` subclasses relies on the squared Euclidean
+    distances between chunks of vectors X_c and Y_c using the
+    following decomposition for the (i,j) pair :
 
-                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+
+         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
 
 
     This helper class is in charge of wrapping the common logic to compute
-    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
-    arithmetic intensity.
+    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
     """
 
-    def __init__(self,
-        const {{INPUT_DTYPE_t}}[:, ::1] X,
-        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        effective_n_threads,
+        chunks_n_threads,
+        dist_middle_terms_chunks_size,
+        n_features,
+        chunk_size,
+    ) -> MiddleTermComputer{{name_suffix}}:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Returns
+        -------
+        middle_term_computer: MiddleTermComputer{{name_suffix}}
+            The suited MiddleTermComputer{{name_suffix}} implementation.
+        """
+        X_is_sparse = issparse(X)
+        Y_is_sparse = issparse(Y)
+
+        if not X_is_sparse and not Y_is_sparse:
+            return DenseDenseMiddleTermComputer{{name_suffix}}(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
+
+        raise NotImplementedError("X and Y must be both dense")
+
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(
+        self,
         ITYPE_t effective_n_threads,
         ITYPE_t chunks_n_threads,
         ITYPE_t dist_middle_terms_chunks_size,
         ITYPE_t n_features,
         ITYPE_t chunk_size,
     ):
-        self.X = X
-        self.Y = Y
         self.effective_n_threads = effective_n_threads
         self.chunks_n_threads = chunks_n_threads
         self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
@@ -64,6 +128,92 @@ cdef class GEMMTermComputer{{name_suffix}}:
 
         self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return NULL
+
+
+cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Computes the middle term of the Euclidean distance between two chunked dense matrices
+    X_c and Y_c.
+
+            dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    This class use the BLAS gemm routine to perform the dot product of each chunks
+    of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
+    """
+
+    def __init__(
+        self,
+        const {{INPUT_DTYPE_t}}[:, ::1] X,
+        const {{INPUT_DTYPE_t}}[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X = X
+        self.Y = Y
+
 {{if upcast_to_float64}}
         # We populate the buffer for upcasting chunks of X and Y from float32 to float64.
         self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
@@ -97,10 +247,6 @@ cdef class GEMMTermComputer{{name_suffix}}:
         return
 {{endif}}
 
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
     cdef void _parallel_on_X_init_chunk(
         self,
         ITYPE_t thread_num,
@@ -120,12 +266,6 @@ cdef class GEMMTermComputer{{name_suffix}}:
         return
 {{endif}}
 
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
@@ -211,4 +351,5 @@ cdef class GEMMTermComputer{{name_suffix}}:
 
         return dist_middle_terms
 
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp
index 80fd2775b5acd..b6e4508468d2b 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp
@@ -29,7 +29,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
 {{for name_suffix in ['64', '32']}}
 
 from ._base cimport BaseDistancesReduction{{name_suffix}}
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
     """float{{name_suffix}} implementation of the RadiusNeighbors."""
@@ -81,7 +81,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
 cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
     """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}."""
     cdef:
-        GEMMTermComputer{{name_suffix}} gemm_term_computer
+        MiddleTermComputer{{name_suffix}} middle_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 80747fd1c1a12..c3b5b19993eb1 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -51,9 +51,10 @@ from ._base cimport (
 from ._datasets_pair cimport (
     DatasetsPair{{name_suffix}},
     DenseDenseDatasetsPair{{name_suffix}},
+    SparseSparseDatasetsPair{{name_suffix}},
 )
 
-from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}}
+from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
 
 cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
@@ -87,13 +88,18 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not issparse(X)
-            and not issparse(Y)
+            and not (issparse(X) or issparse(Y))
         ):
-            # Specialized implementation with improved arithmetic intensity
-            # and vector instructions (SIMD) by processing several vectors
-            # at time to leverage a call to the BLAS GEMM routine as explained
-            # in more details in GEMMTermComputer docstring.
+            # Specialized implementation of RadiusNeighbors for the Euclidean
+            # distance.
+            # This implementation processes the computation of the chunks of the
+            # distance using a decomposition of the chunks of the distance matrix
+            # for the Squared Euclidean distance.
+            # This specialisation has an improved arithmetic intensity for both
+            # the dense and sparse settings, allowing in most case speed-ups of
+            # several orders of magnitude compared to the generic RadiusNeighbors
+            # implementation.
+            # For more information see MiddleTermComputer.
             use_squared_distances = metric == "sqeuclidean"
             pda = EuclideanRadiusNeighbors{{name_suffix}}(
                 X=X, Y=Y, radius=radius,
@@ -363,9 +369,9 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.gemm_term_computer = GEMMTermComputer{{name_suffix}}(
-            datasets_pair.X,
-            datasets_pair.Y,
+        self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
+            X,
+            Y,
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
@@ -381,12 +387,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
                 dtype=np.float64
             )
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{name_suffix}}(X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
@@ -401,7 +407,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         ITYPE_t thread_num,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
-        self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
+        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
 
     @final
     cdef void _parallel_on_X_init_chunk(
@@ -411,7 +417,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         ITYPE_t X_end,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
     @final
     cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
@@ -428,7 +434,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
@@ -437,7 +443,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         self,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self)
-        self.gemm_term_computer._parallel_on_Y_init()
+        self.middle_term_computer._parallel_on_Y_init()
 
     @final
     cdef void _parallel_on_Y_parallel_init(
@@ -447,7 +453,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
         ITYPE_t X_end,
     ) nogil:
         RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
-        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
     @final
     cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
@@ -464,7 +470,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             Y_start, Y_end,
             thread_num,
         )
-        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             X_start, X_end, Y_start, Y_end, thread_num
         )
 
@@ -487,7 +493,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             DTYPE_t squared_dist_i_j
             ITYPE_t n_X = X_end - X_start
             ITYPE_t n_Y = Y_end - Y_start
-            DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_dist_middle_terms(
+            DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
                 X_start, X_end, Y_start, Y_end, thread_num
             )
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py
index e1fbbceea7eb8..4c8632ebb9088 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/setup.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/setup.py
@@ -15,8 +15,8 @@ def configuration(parent_package="", top_path=None):
     templates = [
         "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp",
-        "sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp",
+        "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp",
+        "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp",
         "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp",
@@ -29,7 +29,7 @@ def configuration(parent_package="", top_path=None):
 
     cython_sources = [
         "_datasets_pair.pyx",
-        "_gemm_term_computer.pyx",
+        "_middle_term_computer.pyx",
         "_base.pyx",
         "_argkmin.pyx",
         "_radius_neighbors.pyx",

From 85790c4e68a285a5da522988cbabc201ecbdb0b2 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 2 Nov 2022 10:59:47 +0100
Subject: [PATCH 2/4] some revamp

---
 .../_argkmin.pyx.tp                           | 21 +++-------------
 .../_base.pxd.tp                              |  2 +-
 .../_base.pyx.tp                              |  5 ++--
 .../_dispatcher.py                            |  7 ++++--
 .../_middle_term_computer.pyx.tp              | 24 +++----------------
 .../_radius_neighbors.pyx.tp                  | 17 ++++---------
 6 files changed, 18 insertions(+), 58 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 550dcc03094f7..30dde842c4c86 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -28,11 +28,7 @@ from ._base cimport (
     _sqeuclidean_row_norms{{name_suffix}},
 )
 
-from ._datasets_pair cimport (
-    DatasetsPair{{name_suffix}},
-    DenseDenseDatasetsPair{{name_suffix}},
-    SparseSparseDatasetsPair{{name_suffix}},
-)
+from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
@@ -352,12 +348,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             strategy=strategy,
             k=k,
         )
-        # X and Y are checked by the DatasetsPair{{name_suffix}} implemented
-        # as a DenseDenseDatasetsPair{{name_suffix}}
         cdef:
-            DenseDenseDatasetsPair{{name_suffix}} datasets_pair = (
-            <DenseDenseDatasetsPair{{name_suffix}}> self.datasets_pair
-        )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
         self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
@@ -366,7 +357,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1],
+            n_features=X.shape[1],
             chunk_size=self.chunk_size,
         )
 
@@ -404,7 +395,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
         self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
 
-
     @final
     cdef void _parallel_on_X_init_chunk(
         self,
@@ -415,7 +405,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
         self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
-
     @final
     cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
         self,
@@ -435,7 +424,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             X_start, X_end, Y_start, Y_end, thread_num,
         )
 
-
     @final
     cdef void _parallel_on_Y_init(
         self,
@@ -443,7 +431,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ArgKmin{{name_suffix}}._parallel_on_Y_init(self)
         self.middle_term_computer._parallel_on_Y_init()
 
-
     @final
     cdef void _parallel_on_Y_parallel_init(
         self,
@@ -454,7 +441,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
         ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
         self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
-
     @final
     cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
         self,
@@ -474,7 +460,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             X_start, X_end, Y_start, Y_end, thread_num
         )
 
-
     @final
     cdef void _compute_and_reduce_distances_on_chunks(
         self,
@@ -486,7 +471,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
     ) nogil:
         cdef:
             ITYPE_t i, j
-            DTYPE_t squared_dist_i_j
             ITYPE_t n_X = X_end - X_start
             ITYPE_t n_Y = Y_end - Y_start
             DTYPE_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
@@ -495,6 +479,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num]
 
+
         # Pushing the distance and their associated indices on heaps
         # which keep tracks of the argkmin.
         for i in range(n_X):
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index be44f3a98a263..090000fe26663 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -2,7 +2,7 @@ cimport numpy as cnp
 
 from cython cimport final
 
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t, SPARSE_INDEX_TYPE_t
+from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 cnp.import_array()
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index dc365a4cc0641..53abf20f925e3 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -3,7 +3,7 @@
 implementation_specific_values = [
     # Values are the following ones:
     #
-    #       name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE
+    #       name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
     #
     # We also use the float64 dtype and C-type names as defined in
     # `sklearn.utils._typedefs` to maintain consistency.
@@ -26,12 +26,11 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
 
-from scipy.sparse import issparse, csr_matrix
 from numbers import Integral
 from sklearn import get_config
 from sklearn.utils import check_scalar
 from ...utils._openmp_helpers import _openmp_effective_n_threads
-from ...utils._typedefs import ITYPE, DTYPE, SPARSE_INDEX_TYPE
+from ...utils._typedefs import DTYPE
 
 cnp.import_array()
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 921074c3ae968..5bde2b063e89f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -8,7 +8,10 @@
 
 from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING
 
-from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64
+from ._base import (
+    _sqeuclidean_row_norms64,
+    _sqeuclidean_row_norms32,
+)
 from ._argkmin import (
     ArgKmin64,
     ArgKmin32,
@@ -131,7 +134,7 @@ def is_valid_sparse_matrix(X):
         # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
         # using sparse-dense routines for matrix-vector multiplications.
         fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))
+            (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y))
             and isinstance(metric, str)
             and "euclidean" in metric
         )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index d29905ac05c9a..162fccfcc7d7f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -25,17 +25,9 @@ from ...utils._cython_blas cimport (
   Trans,
   _gemm,
 )
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t
 
-import numpy as np
-from scipy.sparse import issparse, csr_matrix
-from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
-
-# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
-# Introduction in Cython:
-# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
-cdef extern from "<algorithm>" namespace "std" nogil:
-    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
+from scipy.sparse import issparse
 
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
@@ -103,15 +95,6 @@ cdef class MiddleTermComputer{{name_suffix}}:
 
         raise NotImplementedError("X and Y must be both dense")
 
-
-    @classmethod
-    def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype=DTYPE)
-        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
-        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return X_data, X_indices, X_indptr
-
     def __init__(
         self,
         ITYPE_t effective_n_threads,
@@ -188,7 +171,7 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
     """Computes the middle term of the Euclidean distance between two chunked dense matrices
     X_c and Y_c.
 
-            dist_middle_terms = - 2 X_c_i.Y_c_j^T
+                        dist_middle_terms = - 2 X_c_i.Y_c_j^T
 
     This class use the BLAS gemm routine to perform the dot product of each chunks
     of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
@@ -351,5 +334,4 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
 
         return dist_middle_terms
 
-
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index c3b5b19993eb1..f1923c5a9c16c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -48,11 +48,7 @@ from ._base cimport (
     _sqeuclidean_row_norms{{name_suffix}}
 )
 
-from ._datasets_pair cimport (
-    DatasetsPair{{name_suffix}},
-    DenseDenseDatasetsPair{{name_suffix}},
-    SparseSparseDatasetsPair{{name_suffix}},
-)
+from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
 
@@ -88,7 +84,8 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) or issparse(Y))
+            and not issparse(X)
+            and not issparse(Y)
         ):
             # Specialized implementation of RadiusNeighbors for the Euclidean
             # distance.
@@ -277,7 +274,6 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
             )
             last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
 
-
     cdef void _parallel_on_Y_finalize(
         self,
     ) nogil:
@@ -361,12 +357,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             strategy=strategy,
             sort_results=sort_results,
         )
-        # X and Y are checked by the DatasetsPair{{name_suffix}} implemented
-        # as a DenseDenseDatasetsPair{{name_suffix}}
         cdef:
-            DenseDenseDatasetsPair{{name_suffix}} datasets_pair = (
-                <DenseDenseDatasetsPair{{name_suffix}}> self.datasets_pair
-            )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
         self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
@@ -375,7 +366,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1],
+            n_features=X.shape[1],
             chunk_size=self.chunk_size,
         )
 

From 33ef832508e835b02b8f0d65daa49befc2540250 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 2 Nov 2022 13:28:12 +0100
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp  | 2 +-
 sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp  | 2 +-
 .../_middle_term_computer.pxd.tp                            | 2 +-
 .../_middle_term_computer.pyx.tp                            | 6 ++----
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index 090000fe26663..35c8184d25a6c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -12,7 +12,7 @@ from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
-    X,
+    const {{INPUT_DTYPE_t}}[:, ::1] X,
     ITYPE_t num_threads,
 )
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index 53abf20f925e3..c46d21aa25ed3 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -108,7 +108,7 @@ from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
-    X,
+    const {{INPUT_DTYPE_t}}[:, ::1] X,
     ITYPE_t num_threads,
 ):
     return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
index c7ce631591ea7..c49787188a05c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -17,7 +17,7 @@ cimport numpy as cnp
 
 from libcpp.vector cimport vector
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t
 
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 162fccfcc7d7f..cf4ed6d6d7c14 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -64,15 +64,13 @@ cdef class MiddleTermComputer{{name_suffix}}:
 
         Parameters
         ----------
-        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+        X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
             Input data.
             If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
 
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+        Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
             Input data.
             If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
 
         Returns
         -------

From 6e03bf5091752476a9cbe29523d758de2d661c56 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 3 Nov 2022 14:31:50 +0100
Subject: [PATCH 4/4] Apply suggestions from code review

---
 .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp   | 5 ++---
 .../_middle_term_computer.pyx.tp                            | 6 ++++--
 .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp  | 5 ++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 30dde842c4c86..fbd2c5852b62c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -66,9 +66,8 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
             and not (issparse(X) or issparse(Y))
         ):
             # Specialized implementation of ArgKmin for the Euclidean distance.
-            # This implementation processes the computation of the chunks of the
-            # distance using a decomposition of the chunks of the distance matrix
-            # for the Squared Euclidean distance.
+            # This implementation computes the distances by chunk using
+            # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
             # the dense and sparse settings, allowing in most case speed-ups of
             # several orders of magnitude compared to the generic ArgKmin
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index cf4ed6d6d7c14..692652b3e8d5a 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -34,8 +34,10 @@ from scipy.sparse import issparse
 
 
 cdef class MiddleTermComputer{{name_suffix}}:
-    """Abstract base class for the `EuclideanDistance` specialisation
-    components wrapping the computation of the distance matrix chunks.
+    """Helper class to compute a Euclidean distance matrix in chunks. 
+
+    This is an abstract base class that is further specialized depending
+    on the type of data (dense or sparse).
 
     `EuclideanDistance` subclasses relies on the squared Euclidean
     distances between chunks of vectors X_c and Y_c using the
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index f1923c5a9c16c..26d25e1040003 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -89,9 +89,8 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         ):
             # Specialized implementation of RadiusNeighbors for the Euclidean
             # distance.
-            # This implementation processes the computation of the chunks of the
-            # distance using a decomposition of the chunks of the distance matrix
-            # for the Squared Euclidean distance.
+            # This implementation computes the distances by chunk using
+            # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
             # the dense and sparse settings, allowing in most case speed-ups of
             # several orders of magnitude compared to the generic RadiusNeighbors