scikit-learn
diff --git a/‎sklearn/metrics/_pairwise_distances_reduction.pyx
Lines changed: 217 additions & 7 deletions b/‎sklearn/metrics/_pairwise_distances_reduction.pyx
Lines changed: 217 additions & 7 deletions
diff --git a/‎sklearn/metrics/tests/test_pairwise_distances_reduction.py
Lines changed: 42 additions & 0 deletions b/‎sklearn/metrics/tests/test_pairwise_distances_reduction.py
Lines changed: 42 additions & 0 deletions
@@ -605,13 +605,26 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         # For future work, this might can be an entrypoint to specialise operations
         # for various back-end and/or hardware and/or datatypes, and/or fused
         # {sparse, dense}-datasetspair etc.
-
-        pda = PairwiseDistancesArgKmin(
-            datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
-            k=k,
-            chunk_size=chunk_size,
-            strategy=strategy,
-        )
+        if (
+            metric in ("euclidean", "sqeuclidean")
+                and not issparse(X)
+                and not issparse(Y)
+        ):
+            use_squared_distances = metric == "sqeuclidean"
+            pda = FastEuclideanPairwiseDistancesArgKmin(
+                X=X, Y=Y, k=k,
+                use_squared_distances=use_squared_distances,
+                chunk_size=chunk_size,
+                strategy=strategy,
+                metric_kwargs=metric_kwargs,
+            )
+        else: # Fall back on the default
+            pda = PairwiseDistancesArgKmin(
+                datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+                k=k,
+                chunk_size=chunk_size,
+                strategy=strategy,
+            )
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
@@ -819,3 +832,200 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
 
         return np.asarray(self.argkmin_indices)
+
+
+cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
+    """Fast specialized alternative for PairwiseDistancesArgKmin on EuclideanDistance.
+
+    The full pairwise squared distances matrix is computed as follows:
+
+                  ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||²
+
+    The middle term gets computed efficiently bellow using BLAS Level 3 GEMM.
+
+    Notes
+    -----
+    This implementation has a superior arithmetic intensity and hence
+    better running time when the alternative is IO bound, but it can suffer
+    from numerical instability caused by catastrophic cancellation potentially
+    introduced by the subtraction in the arithmetic expression above.
+
+    PairwiseDistancesArgKmin with EuclideanDistance must be used when higher
+    numerical precision is needed.
+    """
+
+    cdef:
+        const DTYPE_t[:, ::1] X
+        const DTYPE_t[:, ::1] Y
+        const DTYPE_t[::1] X_norm_squared
+        const DTYPE_t[::1] Y_norm_squared
+
+        # Buffers for GEMM
+        DTYPE_t ** dist_middle_terms_chunks
+        bint use_squared_distances
+
+    @classmethod
+    def is_usable_for(cls, X, Y, metric) -> bool:
+        return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and
+                not _in_unstable_openblas_configuration())
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t k,
+        bint use_squared_distances=False,
+        chunk_size=None,
+        strategy=None,
+        metric_kwargs=None,
+    ):
+        if metric_kwargs is not None and len(metric_kwargs) > 0:
+            warnings.warn(
+                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't"
+                f"usable for this case ({self.__class__.__name__}) and will be ignored.",
+                UserWarning,
+                stacklevel=3,
+            )
+
+        super().__init__(
+            # The datasets pair here is used for exact distances computations
+            datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),
+            k=k,
+            chunk_size=chunk_size,
+        )
+        # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair
+        cdef:
+            DenseDenseDatasetsPair datasets_pair = <DenseDenseDatasetsPair> self.datasets_pair
+        self.X, self.Y = datasets_pair.X, datasets_pair.Y
+
+        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
+            self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared", None)
+        else:
+            self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_n_threads)
+
+        # Do not recompute norms if datasets are identical.
+        self.X_norm_squared = (
+            self.Y_norm_squared if X is Y else
+            _sqeuclidean_row_norms(self.X, self.effective_n_threads)
+        )
+        self.use_squared_distances = use_squared_distances
+
+        # Temporary datastructures used in threads
+        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(
+            sizeof(DTYPE_t *) * self.chunks_n_threads
+        )
+
+    def __dealloc__(self):
+        if self.dist_middle_terms_chunks is not NULL:
+            free(self.dist_middle_terms_chunks)
+
+    @final
+    cdef void compute_exact_distances(self) nogil:
+        if not self.use_squared_distances:
+            PairwiseDistancesArgKmin.compute_exact_distances(self)
+
+    @final
+    cdef void _parallel_on_X_parallel_init(
+        self,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num)
+
+        # Temporary buffer for the `-2 * X_c @ Y_c.T` term
+        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
+            self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
+        )
+
+    @final
+    cdef void _parallel_on_X_parallel_finalize(
+        self,
+        ITYPE_t thread_num
+    ) nogil:
+        PairwiseDistancesArgKmin._parallel_on_X_parallel_finalize(self, thread_num)
+        free(self.dist_middle_terms_chunks[thread_num])
+
+    @final
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+    ) nogil:
+        cdef ITYPE_t thread_num
+        PairwiseDistancesArgKmin._parallel_on_Y_parallel_init(self)
+
+        for thread_num in range(self.chunks_n_threads):
+            # Temporary buffer for the `-2 * X_c @ Y_c.T` term
+            self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
+                self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
+            )
+
+    @final
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef ITYPE_t thread_num
+        PairwiseDistancesArgKmin._parallel_on_Y_finalize(self)
+
+        for thread_num in range(self.chunks_n_threads):
+            free(self.dist_middle_terms_chunks[thread_num])
+
+    @final
+    cdef void _compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+
+            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
+            DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
+            ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentations, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            DTYPE_t * A = <DTYPE_t*> & X_c[0, 0]
+            ITYPE_t lda = X_c.shape[1]
+            DTYPE_t * B = <DTYPE_t*> & Y_c[0, 0]
+            ITYPE_t ldb = X_c.shape[1]
+            DTYPE_t beta = 0.
+            DTYPE_t * C = dist_middle_terms
+            ITYPE_t ldc = Y_c.shape[0]
+
+        # dist_middle_terms = `-2 * X_c @ Y_c.T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+        # Pushing the distance and their associated indices on heaps
+        # which keep tracks of the argkmin.
+        for i in range(X_c.shape[0]):
+            for j in range(Y_c.shape[0]):
+                heap_push(
+                    heaps_r_distances + i * self.k,
+                    heaps_indices + i * self.k,
+                    self.k,
+                    # Using the squared euclidean distance as the rank-preserving distance:
+                    #
+                    #             ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                    #
+                    (
+                        self.X_nor
10000
m_squared[i + X_start] +
+                        dist_middle_terms[i * Y_c.shape[0] + j] +
+                        self.Y_norm_squared[j + Y_start]
+                    ),
+                    j + Y_start,
+                )
@@ -7,6 +7,7 @@
 from sklearn.metrics._pairwise_distances_reduction import (
     PairwiseDistancesReduction,
     PairwiseDistancesArgKmin,
+    FastEuclideanPairwiseDistancesArgKmin,
     _sqeuclidean_row_norms,
 )
 
@@ -139,6 +140,47 @@ def test_argkmin_factory_method_wrong_usages():
         )
 
 
+@fails_if_unstable_openblas
+@pytest.mark.filterwarnings("ignore:Constructing a DIA matrix")
+@pytest.mark.parametrize(
+    "PairwiseDistancesReduction, FastPairwiseDistancesReduction",
+    [
+        (PairwiseDistancesArgKmin, FastEuclideanPairwiseDistancesArgKmin),
+    ],
+)
+def test_pairwise_distances_reduction_factory_method(
+    PairwiseDistancesReduction, FastPairwiseDistancesReduction
+):
+    # Test all the combinations of DatasetsPair for creation
+    rng = np.random.RandomState(1)
+    X = rng.rand(100, 10)
+    Y = rng.rand(100, 10)
+    metric = "euclidean"
+
+    # Dummy value for k or radius
+    dummy_arg = 5
+
+    with pytest.raises(
+        ValueError, match="Only dense datasets are supported for X and Y."
+    ):
+        PairwiseDistancesReduction.compute(
+            csr_matrix(X),
+            csr_matrix(Y),
+            dummy_arg,
+            metric,
+        )
+
+    with pytest.raises(
+        ValueError, match="Only dense datasets are supported for X and Y."
+    ):
+        PairwiseDistancesReduction.compute(X, csr_matrix(Y), dummy_arg, metric=metric)
+
+    with pytest.raises(
+        ValueError, match="Only dense datasets are supported for X and Y."
+    ):
+        PairwiseDistancesReduction.compute(csr_matrix(X), Y, dummy_arg, metric=metric)
+
+
 @fails_if_unstable_openblas
 @pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])