From 24fe8a64d3cfd322755645d84f7f9a61387423a8 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 30 Sep 2022 22:10:29 +0200 Subject: [PATCH 01/27] wip --- .../_argkmin.pxd.tp | 4 +- .../_argkmin.pyx.tp | 15 +++- .../_dispatcher.py | 2 +- .../_gemm_term_computer.pxd.tp | 30 +++++++- .../_gemm_term_computer.pyx.tp | 74 ++++++++++++++++++- .../_radius_neighborhood.pxd.tp | 4 +- .../_radius_neighborhood.pyx.tp | 16 +++- 7 files changed, 130 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 37dd7f5836857..fcdaf6e63887a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -22,7 +22,7 @@ cnp.import_array() {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} from ._base cimport BaseDistanceReducer{{name_suffix}} -from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} +from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): """{{name_suffix}}bit implementation of BaseDistanceReducer{{name_suffix}} for the `ArgKmin` reduction.""" @@ -41,7 +41,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): """EuclideanDistance-specialized {{name_suffix}}bit implementation of ArgKmin{{name_suffix}}.""" cdef: - GEMMTermComputer{{name_suffix}} gemm_term_computer + MiddleTermComputer{{name_suffix}} gemm_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1c1459e27f210..a765bad4ec9bb 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -47,9 +47,13 @@ from ._base cimport ( from ._datasets_pair cimport ( DatasetsPair{{name_suffix}}, DenseDenseDatasetsPair{{name_suffix}}, + SparseSparseDatasetsPair{{name_suffix}}, ) -from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} +from ._gemm_term_computer cimport ( + DenseDenseMiddleTermComputer{{name_suffix}}, + SparseSparseMiddleTermComputer{{name_suffix}}, +) cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): @@ -82,8 +86,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): """ if ( metric in ("euclidean", "sqeuclidean") - and not issparse(X) - and not issparse(Y) + and not (issparse(X) ^ issparse(Y)) ): # Specialized implementation with improved arithmetic intensity # and vector instructions (SIMD) by processing several vectors @@ -372,7 +375,11 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.gemm_term_computer = GEMMTermComputer{{name_suffix}}( + if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}): + MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}} + else: + MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}} + self.gemm_term_computer = MiddleTermComputer{{name_suffix}}( datasets_pair.X, datasets_pair.Y, self.effective_n_threads, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 809d683b52ced..4b57e8327b2a4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -131,7 +131,7 @@ def is_valid_sparse_matrix(X): # TODO: implement specialisation for (sq)euclidean on fused sparse-dense # using sparse-dense routines for matrix-vector multiplications. fused_sparse_dense_euclidean_case_guard = not ( - (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y)) + (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y)) and isinstance(metric, str) and "euclidean" in metric ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index 5978cfee9ebee..4956e0fb40fff 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -20,7 +20,7 @@ from libcpp.vector cimport vector {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef class GEMMTermComputer{{name_suffix}}: +cdef class MiddleTermComputer{{name_suffix}}: cdef: const {{INPUT_DTYPE_t}}[:, ::1] X const {{INPUT_DTYPE_t}}[:, ::1] Y @@ -76,6 +76,34 @@ cdef class GEMMTermComputer{{name_suffix}}: ITYPE_t thread_num ) nogil + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef DTYPE_t * _compute_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef void _middle_term_sparse_sparse_{{name_suffix}}( + const {{INPUT_DTYPE_t}}[:] X_data, + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const {{INPUT_DTYPE_t}}[:] Y_data, + const ITYPE_t[:] Y_indices, + const ITYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t[:, ::1] D +) nogil + + +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 35e57219a96a7..9e4dce4199459 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -31,7 +31,7 @@ from ...utils._cython_blas cimport ( {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef class GEMMTermComputer{{name_suffix}}: +cdef class MiddleTermComputer{{name_suffix}}: """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. `FastEuclidean*` classes internally compute the squared Euclidean distances between @@ -167,6 +167,9 @@ cdef class GEMMTermComputer{{name_suffix}}: return {{endif}} + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, @@ -214,4 +217,73 @@ cdef class GEMMTermComputer{{name_suffix}}: return dist_middle_terms + +cdef void _middle_term_sparse_sparse_{{name_suffix}}( + const {{INPUT_DTYPE_t}}[:] X_data, + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const {{INPUT_DTYPE_t}}[:] Y_data, + const ITYPE_t[:] Y_indices, + const ITYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t[:, ::1] D, +) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr + + for i in range(n_X): + for _X_indptr in range(X_indptr[i], X_indptr[i+1]): + _X_indices = X_indices[_X_indptr] + for j in range(n_Y): + for _Y_indptr in range(Y_indptr[j], Y_indptr[j+1]): + _Y_indices = Y_indices[_Y_indptr] + if _X_indices == _Y_indices: + D[X_start + i, Y_start + j] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr] + + +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + + cdef DTYPE_t * _compute_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] + const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + const {{INPUT_DTYPE_t}}[:] X_data = X_c.data + const ITYPE_t[:] X_indices = X_c.indices + const ITYPE_t[:] X_indptr = X_c.indptr + + const {{INPUT_DTYPE_t}}[:] Y_data = Y_c.data + const ITYPE_t[:] Y_indices = Y_c.indices + const ITYPE_t[:] Y_indptr = Y_c.indptr + + _middle_term_sparse_sparse_{{name_suffix}}( + X_data, + X_indices, + X_indptr, + X_start, + X_end, + Y_data, + Y_indices, + Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms + {{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp index 9bbe186589f20..7fddb77365579 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp @@ -44,7 +44,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} from ._base cimport BaseDistanceReducer{{name_suffix}} -from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} +from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): """ @@ -99,7 +99,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): """EuclideanDistance-specialized {{name_suffix}}bit implementation for RadiusNeighbors{{name_suffix}}.""" cdef: - GEMMTermComputer{{name_suffix}} gemm_term_computer + MiddleTermComputer{{name_suffix}} gemm_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 8d50a6e04abf9..9a8fb9500477c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -66,9 +66,13 @@ from ._base cimport ( from ._datasets_pair cimport ( DatasetsPair{{name_suffix}}, DenseDenseDatasetsPair{{name_suffix}}, + SparseSparseDatasetsPair{{name_suffix}}, ) -from ._gemm_term_computer cimport GEMMTermComputer{{name_suffix}} +from ._gemm_term_computer cimport ( + DenseDenseMiddleTermComputer{{name_suffix}}, + SparseSparseMiddleTermComputer{{name_suffix}}, +) cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): @@ -105,8 +109,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): """ if ( metric in ("euclidean", "sqeuclidean") - and not issparse(X) - and not issparse(Y) + and not (issparse(X) ^ issparse(Y)) ): # Specialized implementation with improved arithmetic intensity # and vector instructions (SIMD) by processing several vectors @@ -381,7 +384,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.gemm_term_computer = GEMMTermComputer{{name_suffix}}( + if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}): + MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}} + else: + MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}} + + self.gemm_term_computer = MiddleTermComputer{{name_suffix}}( datasets_pair.X, datasets_pair.Y, self.effective_n_threads, From 0132e562fdbc8673296cd3c448d1799a7d9d4e8a Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 30 Sep 2022 22:21:31 +0200 Subject: [PATCH 02/27] remove potential bug --- .../_argkmin.pyx.tp | 29 +++++++++++------- .../_radius_neighborhood.pyx.tp | 30 +++++++++++-------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index a765bad4ec9bb..d003b9be08f75 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -376,18 +376,25 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}): - MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}} + self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}( + datasets_pair.X, + datasets_pair.Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) else: - MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}} - self.gemm_term_computer = MiddleTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) + self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}( + datasets_pair.X, + datasets_pair.Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = check_array( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 9a8fb9500477c..7b6195154def3 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -385,19 +385,25 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}): - MiddleTermComputer{{name_suffix}} = DenseDenseMiddleTermComputer{{name_suffix}} + self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}( + datasets_pair.X, + datasets_pair.Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) else: - MiddleTermComputer{{name_suffix}} = SparseSparseMiddleTermComputer{{name_suffix}} - - self.gemm_term_computer = MiddleTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) + self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}( + datasets_pair.X, + datasets_pair.Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = check_array( From c0ac7db46f7cbd211008bf27cb893d0cf431cd27 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 11 Oct 2022 18:36:03 +0200 Subject: [PATCH 03/27] much fix such wow --- .../_argkmin.pxd.tp | 2 +- .../_argkmin.pyx.tp | 43 ++-- .../_gemm_term_computer.pxd.tp | 59 +++-- .../_gemm_term_computer.pyx.tp | 217 +++++++++++++----- .../_radius_neighborhood.pxd.tp | 2 +- .../_radius_neighborhood.pyx.tp | 43 ++-- 6 files changed, 227 insertions(+), 139 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index fcdaf6e63887a..4751c5fa04026 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -41,7 +41,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): """EuclideanDistance-specialized {{name_suffix}}bit implementation of ArgKmin{{name_suffix}}.""" cdef: - MiddleTermComputer{{name_suffix}} gemm_term_computer + MiddleTermComputer{{name_suffix}} middle_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index d003b9be08f75..99e6c1ac2d77d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -375,26 +375,15 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}): - self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) - else: - self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) + self.middle_term_computer = MiddleTermComputer.get_for( + X, + Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = check_array( @@ -424,7 +413,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ITYPE_t thread_num, ) nogil: ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num) - self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + self.middle_term_computer._parallel_on_X_parallel_init(thread_num) @final @@ -435,7 +424,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ITYPE_t X_end, ) nogil: ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) @final @@ -453,7 +442,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): Y_start, Y_end, thread_num, ) - self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num, ) @@ -464,7 +453,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) nogil: cdef ITYPE_t thread_num ArgKmin{{name_suffix}}._parallel_on_Y_init(self) - self.gemm_term_computer._parallel_on_Y_init() + self.middle_term_computer._parallel_on_Y_init() @final @@ -475,7 +464,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ITYPE_t X_end, ) nogil: ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) @final @@ -493,7 +482,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): Y_start, Y_end, thread_num, ) - self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num ) @@ -512,7 +501,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): DTYPE_t squared_dist_i_j ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start - DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + DTYPE_t * dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num ) DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index 4956e0fb40fff..d9c5f96cd13e6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -15,16 +15,29 @@ implementation_specific_values = [ }} cimport numpy as cnp -from ...utils._typedefs cimport DTYPE_t, ITYPE_t +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from libcpp.vector cimport vector {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef void _middle_term_sparse_sparse_{{name_suffix}}( + const {{INPUT_DTYPE_t}}[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const {{INPUT_DTYPE_t}}[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil + + cdef class MiddleTermComputer{{name_suffix}}: cdef: - const {{INPUT_DTYPE_t}}[:, ::1] X - const {{INPUT_DTYPE_t}}[:, ::1] Y - ITYPE_t effective_n_threads ITYPE_t chunks_n_threads ITYPE_t dist_middle_terms_chunks_size @@ -34,12 +47,6 @@ cdef class MiddleTermComputer{{name_suffix}}: # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM vector[vector[DTYPE_t]] dist_middle_terms_chunks -{{if upcast_to_float64}} - # Buffers for upcasting chunks of X and Y from 32bit to 64bit - vector[vector[DTYPE_t]] X_c_upcast - vector[vector[DTYPE_t]] Y_c_upcast -{{endif}} - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, ITYPE_t X_start, @@ -78,6 +85,16 @@ cdef class MiddleTermComputer{{name_suffix}}: cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X + const {{INPUT_DTYPE_t}}[:, ::1] Y + + {{if upcast_to_float64}} + # Buffers for upcasting chunks of X and Y from 32bit to 64bit + vector[vector[DTYPE_t]] X_c_upcast + vector[vector[DTYPE_t]] Y_c_upcast + {{endif}} + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, @@ -88,22 +105,16 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ ) nogil -cdef void _middle_term_sparse_sparse_{{name_suffix}}( - const {{INPUT_DTYPE_t}}[:] X_data, - const ITYPE_t[:] X_indices, - const ITYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const {{INPUT_DTYPE_t}}[:] Y_data, - const ITYPE_t[:] Y_indices, - const ITYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t[:, ::1] D -) nogil +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + const {{INPUT_DTYPE_t}}[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr -cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 9e4dce4199459..a034401566ca5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -14,10 +14,13 @@ implementation_specific_values = [ }} cimport numpy as cnp +import numpy as np from libcpp.vector cimport vector -from ...utils._typedefs cimport DTYPE_t, ITYPE_t +from scipy.sparse import issparse, csr_matrix +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t +from ...utils._typedefs import SPARSE_INDEX_TYPE from ...utils._cython_blas cimport ( BLAS_Order, @@ -31,6 +34,37 @@ from ...utils._cython_blas cimport ( {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef void _middle_term_sparse_sparse_{{name_suffix}}( + const {{INPUT_DTYPE_t}}[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const {{INPUT_DTYPE_t}}[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil: + cdef: + ITYPE_t i, j, k + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr + + for i in range(n_X): + for _X_indptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + _X_indices = X_indices[_X_indptr] + for j in range(n_Y): + for _Y_indptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + _Y_indices = Y_indices[_Y_indptr] + if _X_indices == _Y_indices: + k = (X_start + i) * n_X + Y_start + j + D[k] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr] + + cdef class MiddleTermComputer{{name_suffix}}: """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. @@ -46,17 +80,79 @@ cdef class MiddleTermComputer{{name_suffix}}: arithmetic intensity. """ - def __init__(self, - const {{INPUT_DTYPE_t}}[:, ::1] X, - const {{INPUT_DTYPE_t}}[:, ::1] Y, + @classmethod + def get_for( + cls, + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) -> MiddleTermComputer{{name_suffix}}: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Returns + ------- + middle_term_computer: MiddleTermComputer{{name_suffix}} + The suited MiddleTermComputer{{name_suffix}} implementation. + """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + + raise NotImplementedError("X and Y must be both sparse or dense") + + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__( + self, ITYPE_t effective_n_threads, ITYPE_t chunks_n_threads, ITYPE_t dist_middle_terms_chunks_size, ITYPE_t n_features, ITYPE_t chunk_size, ): - self.X = X - self.Y = Y self.effective_n_threads = effective_n_threads self.chunks_n_threads = chunks_n_threads self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size @@ -170,6 +266,26 @@ cdef class MiddleTermComputer{{name_suffix}}: cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X = X + self.Y = Y + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, @@ -218,37 +334,28 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ return dist_middle_terms -cdef void _middle_term_sparse_sparse_{{name_suffix}}( - const {{INPUT_DTYPE_t}}[:] X_data, - const ITYPE_t[:] X_indices, - const ITYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const {{INPUT_DTYPE_t}}[:] Y_data, - const ITYPE_t[:] Y_indices, - const ITYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t[:, ::1] D, -) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr - - for i in range(n_X): - for _X_indptr in range(X_indptr[i], X_indptr[i+1]): - _X_indices = X_indices[_X_indptr] - for j in range(n_Y): - for _Y_indptr in range(Y_indptr[j], Y_indptr[j+1]): - _Y_indices = Y_indices[_Y_indptr] - if _X_indices == _Y_indices: - D[X_start + i, Y_start + j] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr] - - cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, @@ -258,32 +365,24 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam ITYPE_t thread_num, ) nogil: cdef: - const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] - const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :] - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() - - const {{INPUT_DTYPE_t}}[:] X_data = X_c.data - const ITYPE_t[:] X_indices = X_c.indices - const ITYPE_t[:] X_indptr = X_c.indptr - - const {{INPUT_DTYPE_t}}[:] Y_data = Y_c.data - const ITYPE_t[:] Y_indices = Y_c.indices - const ITYPE_t[:] Y_indptr = Y_c.indptr - - _middle_term_sparse_sparse_{{name_suffix}}( - X_data, - X_indices, - X_indptr, - X_start, - X_end, - Y_data, - Y_indices, - Y_indptr, - Y_start, - Y_end, - dist_middle_terms, + DTYPE_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() ) + _middle_term_sparse_sparse_{{name_suffix}}( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + return dist_middle_terms {{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp index 7fddb77365579..f0dd22da6faed 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd.tp @@ -99,7 +99,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): """EuclideanDistance-specialized {{name_suffix}}bit implementation for RadiusNeighbors{{name_suffix}}.""" cdef: - MiddleTermComputer{{name_suffix}} gemm_term_computer + MiddleTermComputer{{name_suffix}} middle_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 7b6195154def3..688edf72fa696 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -384,26 +384,15 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - if isinstance(self.datasets_pair, DenseDenseDatasetsPair{{name_suffix}}): - self.gemm_term_computer = DenseDenseMiddleTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) - else: - self.gemm_term_computer = SparseSparseMiddleTermComputer{{name_suffix}}( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) + self.middle_term_computer = MiddleTermComputer.get_for( + X, + Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = check_array( @@ -433,7 +422,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ITYPE_t thread_num, ) nogil: RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num) - self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + self.middle_term_computer._parallel_on_X_parallel_init(thread_num) @final cdef void _parallel_on_X_init_chunk( @@ -443,7 +432,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ITYPE_t X_end, ) nogil: RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) @final cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( @@ -460,7 +449,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} Y_start, Y_end, thread_num, ) - self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num, ) @@ -470,7 +459,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ) nogil: cdef ITYPE_t thread_num RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self) - self.gemm_term_computer._parallel_on_Y_init() + self.middle_term_computer._parallel_on_Y_init() @final cdef void _parallel_on_Y_parallel_init( @@ -480,7 +469,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ITYPE_t X_end, ) nogil: RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) @final cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( @@ -497,7 +486,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} Y_start, Y_end, thread_num, ) - self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num ) @@ -520,7 +509,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} DTYPE_t squared_dist_i_j ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start - DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num ) From 27b9427911e56592fdef13f23c6e9ad45b02c3b2 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 12 Oct 2022 22:24:52 +0200 Subject: [PATCH 04/27] compiling ! --- .../_argkmin.pyx.tp | 7 +- .../_gemm_term_computer.pxd.tp | 41 ++++++ .../_gemm_term_computer.pyx.tp | 124 ++++++++++++------ .../_radius_neighborhood.pyx.tp | 7 +- 4 files changed, 131 insertions(+), 48 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 99e6c1ac2d77d..baddc41cef7e9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -50,10 +50,7 @@ from ._datasets_pair cimport ( SparseSparseDatasetsPair{{name_suffix}}, ) -from ._gemm_term_computer cimport ( - DenseDenseMiddleTermComputer{{name_suffix}}, - SparseSparseMiddleTermComputer{{name_suffix}}, -) +from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): @@ -375,7 +372,7 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.middle_term_computer = MiddleTermComputer.get_for( + self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for( X, Y, self.effective_n_threads, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index d9c5f96cd13e6..0e5d744720ffb 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -83,6 +83,15 @@ cdef class MiddleTermComputer{{name_suffix}}: ITYPE_t thread_num ) nogil + cdef DTYPE_t * _compute_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): cdef: @@ -95,6 +104,38 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ vector[vector[DTYPE_t]] Y_c_upcast {{endif}} + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index a034401566ca5..9dbeca75bcd87 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -20,7 +20,7 @@ from libcpp.vector cimport vector from scipy.sparse import issparse, csr_matrix from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t -from ...utils._typedefs import SPARSE_INDEX_TYPE +from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE from ...utils._cython_blas cimport ( BLAS_Order, @@ -61,7 +61,7 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}( for _Y_indptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): _Y_indices = Y_indices[_Y_indptr] if _X_indices == _Y_indices: - k = (X_start + i) * n_X + Y_start + j + k = i * n_X + j D[k] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr] @@ -140,7 +140,7 @@ cdef class MiddleTermComputer{{name_suffix}}: @classmethod def unpack_csr_matrix(cls, X: csr_matrix): """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - X_data = np.asarray(X.data, dtype=DTYPE) + X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) return X_data, X_indices, X_indptr @@ -161,6 +161,87 @@ cdef class MiddleTermComputer{{name_suffix}}: self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_init(self) nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + return + + cdef DTYPE_t * _compute_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = vector[DTYPE_t](1).data() + return dist_middle_terms + + + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X = X + self.Y = Y + {{if upcast_to_float64}} # We populate the buffer for upcasting chunks of X and Y from 32bit to 64bit. self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) @@ -194,10 +275,6 @@ cdef class MiddleTermComputer{{name_suffix}}: return {{endif}} - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: - self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) - cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, @@ -217,12 +294,6 @@ cdef class MiddleTermComputer{{name_suffix}}: return {{endif}} - cdef void _parallel_on_Y_init(self) nogil: - for thread_num in range(self.chunks_n_threads): - self.dist_middle_terms_chunks[thread_num].resize( - self.dist_middle_terms_chunks_size - ) - cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, @@ -263,29 +334,6 @@ cdef class MiddleTermComputer{{name_suffix}}: return {{endif}} - -cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): - - def __init__( - self, - const {{INPUT_DTYPE_t}}[:, ::1] X, - const {{INPUT_DTYPE_t}}[:, ::1] Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - self.X = X - self.Y = Y - cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, @@ -338,8 +386,8 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam def __init__( self, - const {{INPUT_DTYPE_t}}[:, ::1] X, - const {{INPUT_DTYPE_t}}[:, ::1] Y, + X, + Y, ITYPE_t effective_n_threads, ITYPE_t chunks_n_threads, ITYPE_t dist_middle_terms_chunks_size, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 688edf72fa696..9b6d22f8c25b2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -69,10 +69,7 @@ from ._datasets_pair cimport ( SparseSparseDatasetsPair{{name_suffix}}, ) -from ._gemm_term_computer cimport ( - DenseDenseMiddleTermComputer{{name_suffix}}, - SparseSparseMiddleTermComputer{{name_suffix}}, -) +from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): @@ -384,7 +381,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.middle_term_computer = MiddleTermComputer.get_for( + self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for( X, Y, self.effective_n_threads, From 2a1415455677494bb66ce757aa6dff739d2e1af9 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sun, 16 Oct 2022 16:18:20 +0200 Subject: [PATCH 05/27] add sparse method for sq_euclidean_norm --- .../_argkmin.pxd.tp | 3 + .../_argkmin.pyx.tp | 17 ++++- .../_base.pxd.tp | 29 +++++++- .../_base.pyx.tp | 74 ++++++++++++++++++- .../_dispatcher.py | 9 +-- .../_gemm_term_computer.pxd.tp | 1 + .../_gemm_term_computer.pyx.tp | 37 ++++++++-- .../_radius_neighborhood.pyx.tp | 6 +- 8 files changed, 151 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 4751c5fa04026..0e5fa6dcc13cd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -16,6 +16,9 @@ implementation_specific_values = [ cimport numpy as cnp from ...utils._typedefs cimport ITYPE_t, DTYPE_t +from libcpp.vector cimport vector +from ...utils._vector_sentinel cimport vector_to_nd_array +from ...utils._typedefs cimport DTYPECODE cnp.import_array() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index baddc41cef7e9..c1822eac3d40f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -33,6 +33,9 @@ from scipy.sparse import issparse from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE +from libcpp.vector cimport vector +from ...utils._vector_sentinel cimport vector_to_nd_array +from ...utils._typedefs cimport DTYPECODE cnp.import_array() @@ -41,7 +44,7 @@ cnp.import_array() from ._base cimport ( BaseDistanceReducer{{name_suffix}}, - _sqeuclidean_row_norms{{name_suffix}}, + SqeuclideanRowNorm{{name_suffix}}, ) from ._datasets_pair cimport ( @@ -390,14 +393,22 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): dtype=np.float64 ) else: - self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = SqeuclideanRowNorm{{name_suffix}}.get_for( + Y, self.effective_n_threads + ).compute() # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads) + SqeuclideanRowNorm{{name_suffix}}.get_for( + X, self.effective_n_threads + ).compute() ) self.use_squared_distances = use_squared_distances + print("DEBUG X_norm_squared") + print(np.asarray(self.X_norm_squared)) + print("DEBUG Y_norm_squared") + print(np.asarray(self.Y_norm_squared)) @final cdef void compute_exact_distances(self) nogil: diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 77a689a9e4e94..9c09fdd71c38c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -17,7 +17,7 @@ cimport numpy as cnp from cython cimport final -from ...utils._typedefs cimport ITYPE_t, DTYPE_t +from ...utils._typedefs cimport ITYPE_t, DTYPE_t, SPARSE_INDEX_TYPE_t cnp.import_array() @@ -25,11 +25,36 @@ cnp.import_array() from ._datasets_pair cimport DatasetsPair{{name_suffix}} -cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( +cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_dense( const {{INPUT_DTYPE_t}}[:, ::1] X, ITYPE_t num_threads, ) +cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( + const {{INPUT_DTYPE_t}}[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t num_threads, +) + +cdef class SqeuclideanRowNorm{{name_suffix}}: + cdef: + ITYPE_t num_threads + + cpdef DTYPE_t[::1] compute(self) + + +cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X + + +cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + cdef class BaseDistanceReducer{{name_suffix}}: """ Base {{name_suffix}}bit implementation template of the pairwise-distances reduction diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index d0c06de0f8761..3a573126d892a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -27,17 +27,18 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np +from scipy.sparse import issparse, csr_matrix from numbers import Integral from sklearn import get_config from sklearn.utils import check_scalar from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils._typedefs import ITYPE, DTYPE +from ...utils._typedefs import ITYPE, DTYPE, SPARSE_INDEX_TYPE cnp.import_array() ##################### -cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( +cpdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense( const DTYPE_t[:, ::1] X, ITYPE_t num_threads, ): @@ -62,7 +63,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( return squared_row_norms -cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( +cpdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense( const cnp.float32_t[:, ::1] X, ITYPE_t num_threads, ): @@ -103,6 +104,73 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( + const {{INPUT_DTYPE_t}}[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t num_threads, +): + cdef: + ITYPE_t n = X_indptr.shape[0] - 1 + ITYPE_t idx = 0 + ITYPE_t thread_num + ITYPE_t X_i_ptr + DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]): + squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr] + + return squared_row_norms + + +cdef class SqeuclideanRowNorm{{name_suffix}}: + + @classmethod + def get_for(cls, X, num_threads): + if issparse(X): + return SparseSqeuclideanRowNorm{{name_suffix}}(X, num_threads) + else: + return DenseSqeuclideanRowNorm{{name_suffix}}(X, num_threads) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__(self, num_threads): + self.num_threads = num_threads + + cpdef DTYPE_t[::1] compute(self): + return None + + +cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): + + def __init__(self, X, num_threads): + super().__init__(num_threads) + self.X = X + + cpdef DTYPE_t[::1] compute(self): + return _sqeuclidean_row_norms{{name_suffix}}_dense(self.X, self.num_threads) + + +cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): + + def __init__(self, X, num_threads): + super().__init__(num_threads) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + cpdef DTYPE_t[::1] compute(self): + return _sqeuclidean_row_norms{{name_suffix}}_sparse( + self.X_data, + self.X_indptr, + self.num_threads, + ) + + from ._datasets_pair cimport DatasetsPair{{name_suffix}} cdef class BaseDistanceReducer{{name_suffix}}: diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 4b57e8327b2a4..7f46df232f720 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -8,10 +8,7 @@ from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ._base import ( - _sqeuclidean_row_norms64, - _sqeuclidean_row_norms32, -) +from ._base import SqeuclideanRowNorm32, SqeuclideanRowNorm64 from ._argkmin import ( ArgKmin64, ArgKmin32, @@ -41,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads): Arrays containing the squared euclidean norm of each row of X. """ if X.dtype == np.float64: - return _sqeuclidean_row_norms64(X, num_threads) + return SqeuclideanRowNorm64.get_for(X, num_threads).compute() if X.dtype == np.float32: - return _sqeuclidean_row_norms32(X, num_threads) + return SqeuclideanRowNorm32.get_for(X, num_threads).compute() raise ValueError( "Only float64 or float32 datasets are supported at this time, " diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index 0e5d744720ffb..dbae5ef42119c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -17,6 +17,7 @@ cimport numpy as cnp from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from libcpp.vector cimport vector +from ...utils._vector_sentinel cimport vector_to_nd_array {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 9dbeca75bcd87..bee05aa2b528b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -21,6 +21,7 @@ from libcpp.vector cimport vector from scipy.sparse import issparse, csr_matrix from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE +from ...utils._vector_sentinel cimport vector_to_nd_array from ...utils._cython_blas cimport ( BLAS_Order, @@ -52,17 +53,17 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}( ITYPE_t i, j, k ITYPE_t n_X = X_end - X_start ITYPE_t n_Y = Y_end - Y_start - ITYPE_t _X_indices, _X_indptr, _Y_indices, _Y_indptr + ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr for i in range(n_X): - for _X_indptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): - _X_indices = X_indices[_X_indptr] + for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + X_i_col_idx = X_indices[X_i_ptr] for j in range(n_Y): - for _Y_indptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): - _Y_indices = Y_indices[_Y_indptr] - if _X_indices == _Y_indices: - k = i * n_X + j - D[k] += -2 * X_data[_X_indptr] * Y_data[_Y_indptr] + k = i * n_Y + j + for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + Y_j_col_idx = Y_indices[Y_j_ptr] + if X_i_col_idx == Y_j_col_idx: + D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] cdef class MiddleTermComputer{{name_suffix}}: @@ -416,6 +417,17 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam DTYPE_t *dist_middle_terms = ( self.dist_middle_terms_chunks[thread_num].data() ) + vector[DTYPE_t] vec_to_inspect + + with gil: + print("DEBUG Before _middle_term_sparse_sparse_{{name_suffix}}") + print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=") + vec_to_inspect = self.dist_middle_terms_chunks[thread_num] + vec_as_np_array = vector_to_nd_array( + # This is the address of the vector + &vec_to_inspect, + ) + print(vec_as_np_array) _middle_term_sparse_sparse_{{name_suffix}}( self.X_data, @@ -430,6 +442,15 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam Y_end, dist_middle_terms, ) + with gil: + print("DEBUG After _middle_term_sparse_sparse_{{name_suffix}}") + print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=") + vec_to_inspect = self.dist_middle_terms_chunks[thread_num] + vec_as_np_array = vector_to_nd_array( + # This is the address of the vector + &vec_to_inspect + ) + print(vec_as_np_array) return dist_middle_terms diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 9b6d22f8c25b2..885d9191ad2b0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -60,7 +60,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( from ._base cimport ( BaseDistanceReducer{{name_suffix}}, - _sqeuclidean_row_norms{{name_suffix}} + _sqeuclidean_row_norms{{name_suffix}}_dense ) from ._datasets_pair cimport ( @@ -399,12 +399,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} dtype=np.float64 ) else: - self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances From 4060271fd2276c1ea787974c6002cf17cb4efef8 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sun, 16 Oct 2022 16:24:43 +0200 Subject: [PATCH 06/27] remove print --- .../_argkmin.pxd.tp | 3 --- .../_argkmin.pyx.tp | 8 -------- .../_gemm_term_computer.pxd.tp | 1 - .../_gemm_term_computer.pyx.tp | 20 ------------------- 4 files changed, 32 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 0e5fa6dcc13cd..4751c5fa04026 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -16,9 +16,6 @@ implementation_specific_values = [ cimport numpy as cnp from ...utils._typedefs cimport ITYPE_t, DTYPE_t -from libcpp.vector cimport vector -from ...utils._vector_sentinel cimport vector_to_nd_array -from ...utils._typedefs cimport DTYPECODE cnp.import_array() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index c1822eac3d40f..2237c00f05f05 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -33,10 +33,6 @@ from scipy.sparse import issparse from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE -from libcpp.vector cimport vector -from ...utils._vector_sentinel cimport vector_to_nd_array -from ...utils._typedefs cimport DTYPECODE - cnp.import_array() @@ -405,10 +401,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): ).compute() ) self.use_squared_distances = use_squared_distances - print("DEBUG X_norm_squared") - print(np.asarray(self.X_norm_squared)) - print("DEBUG Y_norm_squared") - print(np.asarray(self.Y_norm_squared)) @final cdef void compute_exact_distances(self) nogil: diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index dbae5ef42119c..0e5d744720ffb 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -17,7 +17,6 @@ cimport numpy as cnp from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from libcpp.vector cimport vector -from ...utils._vector_sentinel cimport vector_to_nd_array {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index bee05aa2b528b..52d7136423cc2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -21,7 +21,6 @@ from libcpp.vector cimport vector from scipy.sparse import issparse, csr_matrix from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE -from ...utils._vector_sentinel cimport vector_to_nd_array from ...utils._cython_blas cimport ( BLAS_Order, @@ -419,16 +418,6 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam ) vector[DTYPE_t] vec_to_inspect - with gil: - print("DEBUG Before _middle_term_sparse_sparse_{{name_suffix}}") - print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=") - vec_to_inspect = self.dist_middle_terms_chunks[thread_num] - vec_as_np_array = vector_to_nd_array( - # This is the address of the vector - &vec_to_inspect, - ) - print(vec_as_np_array) - _middle_term_sparse_sparse_{{name_suffix}}( self.X_data, self.X_indices, @@ -442,15 +431,6 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam Y_end, dist_middle_terms, ) - with gil: - print("DEBUG After _middle_term_sparse_sparse_{{name_suffix}}") - print(f"DEBUG self.dist_middle_terms_chunks[{thread_num}]=") - vec_to_inspect = self.dist_middle_terms_chunks[thread_num] - vec_as_np_array = vector_to_nd_array( - # This is the address of the vector - &vec_to_inspect - ) - print(vec_as_np_array) return dist_middle_terms From c8e35fd47fddfcadb1e3db4a2c8cc0766eb4dacd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 18 Oct 2022 10:45:00 +0200 Subject: [PATCH 07/27] underkill the overkill --- .../_argkmin.pyx.tp | 10 ++-- .../_base.pxd.tp | 22 ++------ .../_base.pyx.tp | 52 +++---------------- .../_dispatcher.py | 6 +-- .../_radius_neighborhood.pyx.tp | 6 +-- 5 files changed, 23 insertions(+), 73 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 2237c00f05f05..83ba2bf31e7d0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -40,7 +40,7 @@ cnp.import_array() from ._base cimport ( BaseDistanceReducer{{name_suffix}}, - SqeuclideanRowNorm{{name_suffix}}, + _sqeuclidean_row_norms{{name_suffix}} ) from ._datasets_pair cimport ( @@ -389,16 +389,16 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): dtype=np.float64 ) else: - self.Y_norm_squared = SqeuclideanRowNorm{{name_suffix}}.get_for( + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( Y, self.effective_n_threads - ).compute() + ) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - SqeuclideanRowNorm{{name_suffix}}.get_for( + _sqeuclidean_row_norms{{name_suffix}}( X, self.effective_n_threads - ).compute() + ) ) self.use_squared_distances = use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 9c09fdd71c38c..3058d3946e56c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -36,24 +36,10 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( ITYPE_t num_threads, ) -cdef class SqeuclideanRowNorm{{name_suffix}}: - cdef: - ITYPE_t num_threads - - cpdef DTYPE_t[::1] compute(self) - - -cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): - cdef: - const {{INPUT_DTYPE_t}}[:, ::1] X - - -cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): - cdef: - const {{INPUT_DTYPE_t}}[:] X_data - const SPARSE_INDEX_TYPE_t[:] X_indices - const SPARSE_INDEX_TYPE_t[:] X_indptr - +cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( + X, + ITYPE_t num_threads, +) cdef class BaseDistanceReducer{{name_suffix}}: """ diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 3a573126d892a..c9fe70f6dbf30 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -123,52 +123,16 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( return squared_row_norms -cdef class SqeuclideanRowNorm{{name_suffix}}: - - @classmethod - def get_for(cls, X, num_threads): - if issparse(X): - return SparseSqeuclideanRowNorm{{name_suffix}}(X, num_threads) - else: - return DenseSqeuclideanRowNorm{{name_suffix}}(X, num_threads) - - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" +cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( + X, + ITYPE_t num_threads, +): + if issparse(X): X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) - X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - return X_data, X_indices, X_indptr - - def __init__(self, num_threads): - self.num_threads = num_threads - - cpdef DTYPE_t[::1] compute(self): - return None - - -cdef class DenseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): - - def __init__(self, X, num_threads): - super().__init__(num_threads) - self.X = X - - cpdef DTYPE_t[::1] compute(self): - return _sqeuclidean_row_norms{{name_suffix}}_dense(self.X, self.num_threads) - - -cdef class SparseSqeuclideanRowNorm{{name_suffix}}(SqeuclideanRowNorm{{name_suffix}}): - - def __init__(self, X, num_threads): - super().__init__(num_threads) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - - cpdef DTYPE_t[::1] compute(self): - return _sqeuclidean_row_norms{{name_suffix}}_sparse( - self.X_data, - self.X_indptr, - self.num_threads, - ) + return _sqeuclidean_row_norms{{name_suffix}}_sparse(X_data, X_indptr, num_threads) + else: + return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads) from ._datasets_pair cimport DatasetsPair{{name_suffix}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 7f46df232f720..43ae1330a1cac 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -8,7 +8,7 @@ from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ._base import SqeuclideanRowNorm32, SqeuclideanRowNorm64 +from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64 from ._argkmin import ( ArgKmin64, ArgKmin32, @@ -38,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads): Arrays containing the squared euclidean norm of each row of X. """ if X.dtype == np.float64: - return SqeuclideanRowNorm64.get_for(X, num_threads).compute() + return _sqeuclidean_row_norms64(X, num_threads) if X.dtype == np.float32: - return SqeuclideanRowNorm32.get_for(X, num_threads).compute() + return _sqeuclidean_row_norms32(X, num_threads) raise ValueError( "Only float64 or float32 datasets are supported at this time, " diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 885d9191ad2b0..eca246cfd029a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -60,7 +60,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( from ._base cimport ( BaseDistanceReducer{{name_suffix}}, - _sqeuclidean_row_norms{{name_suffix}}_dense + _sqeuclidean_row_norms{{name_suffix}}, ) from ._datasets_pair cimport ( @@ -399,12 +399,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} dtype=np.float64 ) else: - self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms{{name_suffix}}_dense(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances From c9aed9db3f6a3846bc27483702fde23d11cbe48a Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 18 Oct 2022 17:51:11 +0200 Subject: [PATCH 08/27] baptism by fire --- .../_argkmin.pyx.tp | 1 - .../_gemm_term_computer.pxd.tp | 18 ++++++++ .../_gemm_term_computer.pyx.tp | 41 +++++++++++++++++++ .../_radius_neighborhood.pyx.tp | 4 +- 4 files changed, 61 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 83ba2bf31e7d0..b55175c288797 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -507,7 +507,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num] - # Pushing the distance and their associated indices on heaps # which keep tracks of the argkmin. for i in range(n_X): diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp index 0e5d744720ffb..c28a81100aa3e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp @@ -156,6 +156,24 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam const SPARSE_INDEX_TYPE_t[:] Y_indices const SPARSE_INDEX_TYPE_t[:] Y_indptr + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 52d7136423cc2..d2ab669543ea4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -32,6 +32,13 @@ from ...utils._cython_blas cimport ( _gemm, ) +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa +cdef extern from "" namespace "std" nogil: + void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa + + {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -404,6 +411,40 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t val = 0 + # reset the vector + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + val, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t val = 0 + # reset the vector + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + val, + ) + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index eca246cfd029a..5bf367d00c90e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -399,12 +399,12 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} dtype=np.float64 ) else: - self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms{{name_suffix}}(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms{{name_suffix}}(X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances From c5bd316311e6833de0bdb86867e8bbc04191cb9d Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 19 Oct 2022 10:42:07 +0200 Subject: [PATCH 09/27] clean it up, yo --- .../_pairwise_distances_reduction/_argkmin.pyx.tp | 1 + .../_pairwise_distances_reduction/_base.pxd.tp | 11 ----------- .../_pairwise_distances_reduction/_base.pyx.tp | 6 +++--- .../_gemm_term_computer.pyx.tp | 12 ++++-------- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1ae6b97fbc3fb..38725c9a8b831 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -18,6 +18,7 @@ from ...utils import check_array, check_scalar, _in_unstable_openblas_configurat from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE + cnp.import_array() {{for name_suffix in ['64', '32']}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 0584e3570e104..be44f3a98a263 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -11,17 +11,6 @@ cnp.import_array() from ._datasets_pair cimport DatasetsPair{{name_suffix}} -cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_dense( - const {{INPUT_DTYPE_t}}[:, ::1] X, - ITYPE_t num_threads, -) - -cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( - const {{INPUT_DTYPE_t}}[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t num_threads, -) - cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( X, ITYPE_t num_threads, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 2149203615080..54f8cca66a746 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -38,7 +38,7 @@ cnp.import_array() ##################### -cpdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense( +cdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense( const DTYPE_t[:, ::1] X, ITYPE_t num_threads, ): @@ -63,7 +63,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms64_dense( return squared_row_norms -cpdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense( +cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense( const cnp.float32_t[:, ::1] X, ITYPE_t num_threads, ): @@ -108,7 +108,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense( from ._datasets_pair cimport DatasetsPair{{name_suffix}} -cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( +cdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( const {{INPUT_DTYPE_t}}[:] X_data, const SPARSE_INDEX_TYPE_t[:] X_indptr, ITYPE_t num_threads, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index dc40bfc023522..3ace80ef230ee 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -413,13 +413,11 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam ITYPE_t Y_end, ITYPE_t thread_num, ) nogil: - cdef: - DTYPE_t val = 0 - # reset the vector + # Flush the thread dist_middle_terms_chunks to 0.0 fill( self.dist_middle_terms_chunks[thread_num].begin(), self.dist_middle_terms_chunks[thread_num].end(), - val, + 0.0, ) cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( @@ -430,13 +428,11 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam ITYPE_t Y_end, ITYPE_t thread_num, ) nogil: - cdef: - DTYPE_t val = 0 - # reset the vector + # Flush the thread dist_middle_terms_chunks to 0.0 fill( self.dist_middle_terms_chunks[thread_num].begin(), self.dist_middle_terms_chunks[thread_num].end(), - val, + 0.0, ) cdef DTYPE_t * _compute_dist_middle_terms( From 50bbf47d7cebaa08621073b426861bc1ba7e74d6 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 19 Oct 2022 10:51:33 +0200 Subject: [PATCH 10/27] worldwide renaming --- .gitignore | 4 +- setup.cfg | 4 +- setup.py | 2 +- .../_argkmin.pxd.tp | 2 +- .../_argkmin.pyx.tp | 2 +- .../_base.pyx.tp | 8 +- .../_gemm_term_computer.pxd | 323 ++++++++ .../_gemm_term_computer.pyx | 784 ++++++++++++++++++ ...er.pxd.tp => _middle_term_computer.pxd.tp} | 0 ...er.pyx.tp => _middle_term_computer.pyx.tp} | 0 .../_radius_neighbors.pxd.tp | 2 +- .../_radius_neighbors.pyx.tp | 2 +- .../_pairwise_distances_reduction/setup.py | 6 +- 13 files changed, 1123 insertions(+), 16 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx rename sklearn/metrics/_pairwise_distances_reduction/{_gemm_term_computer.pxd.tp => _middle_term_computer.pxd.tp} (100%) rename sklearn/metrics/_pairwise_distances_reduction/{_gemm_term_computer.pyx.tp => _middle_term_computer.pyx.tp} (100%) diff --git a/.gitignore b/.gitignore index 68299a202b7c7..47ec8fa2faf79 100644 --- a/.gitignore +++ b/.gitignore @@ -93,7 +93,7 @@ sklearn/metrics/_pairwise_distances_reduction/_base.pxd sklearn/metrics/_pairwise_distances_reduction/_base.pyx sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx -sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd -sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx +sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd +sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx diff --git a/setup.cfg b/setup.cfg index 3aa89052a4b8e..a7e632e9887eb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,8 +77,8 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_base.pyx sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx - sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd - sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx + sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd + sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx diff --git a/setup.py b/setup.py index cc5b6f11749cb..467edff1ba360 100755 --- a/setup.py +++ b/setup.py @@ -89,7 +89,7 @@ "sklearn.manifold._barnes_hut_tsne", "sklearn.metrics.cluster._expected_mutual_info_fast", "sklearn.metrics._pairwise_distances_reduction._datasets_pair", - "sklearn.metrics._pairwise_distances_reduction._gemm_term_computer", + "sklearn.metrics._pairwise_distances_reduction._middle_term_computer", "sklearn.metrics._pairwise_distances_reduction._base", "sklearn.metrics._pairwise_distances_reduction._argkmin", "sklearn.metrics._pairwise_distances_reduction._radius_neighbors", diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 2337516f22726..2bbe9e53518b3 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -6,7 +6,7 @@ cnp.import_array() {{for name_suffix in ['64', '32']}} from ._base cimport BaseDistancesReduction{{name_suffix}} -from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): """float{{name_suffix}} implementation of the ArgKmin.""" diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 38725c9a8b831..6c948d0abc018 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -34,7 +34,7 @@ from ._datasets_pair cimport ( SparseSparseDatasetsPair{{name_suffix}}, ) -from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 54f8cca66a746..b20381677fc8d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -408,7 +408,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: In this method, EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ call: - self.gemm_term_computer._parallel_on_X_init_chunk( + self.middle_term_computer._parallel_on_X_init_chunk( thread_num, X_start, X_end, ) @@ -431,7 +431,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: In this method, EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ call: - self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num, ) @@ -474,7 +474,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: In this method, EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ call: - self.gemm_term_computer._parallel_on_Y_parallel_init( + self.middle_term_computer._parallel_on_Y_parallel_init( thread_num, X_start, X_end, ) @@ -497,7 +497,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: In this method, EuclideanDistance specialisations of subclass of BaseDistancesReduction _must_ call: - self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num, ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd new file mode 100644 index 0000000000000..4f0eab969de3e --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd @@ -0,0 +1,323 @@ +cimport numpy as cnp + +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t +from libcpp.vector cimport vector + + +cdef void _middle_term_sparse_sparse_64( + const DTYPE_t[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const DTYPE_t[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil + + +cdef class MiddleTermComputer64: + cdef: + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + ITYPE_t dist_middle_terms_chunks_size + ITYPE_t n_features + ITYPE_t chunk_size + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[DTYPE_t]] dist_middle_terms_chunks + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_init(self) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64): + cdef: + const DTYPE_t[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + const DTYPE_t[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef void _middle_term_sparse_sparse_32( + const cnp.float32_t[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const cnp.float32_t[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil + + +cdef class MiddleTermComputer32: + cdef: + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + ITYPE_t dist_middle_terms_chunks_size + ITYPE_t n_features + ITYPE_t chunk_size + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[DTYPE_t]] dist_middle_terms_chunks + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_init(self) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32): + cdef: + const cnp.float32_t[:, ::1] X + const cnp.float32_t[:, ::1] Y + + # Buffers for upcasting chunks of X and Y from 32bit to 64bit + vector[vector[DTYPE_t]] X_c_upcast + vector[vector[DTYPE_t]] Y_c_upcast + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32): + cdef: + const cnp.float32_t[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + const cnp.float32_t[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx new file mode 100644 index 0000000000000..75e44f6c6b097 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx @@ -0,0 +1,784 @@ +cimport numpy as cnp + +from libcpp.vector cimport vector + +from ...utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + NoTrans, + RowMajor, + Trans, + _gemm, +) +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t + +import numpy as np +from scipy.sparse import issparse, csr_matrix +from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE + +# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa +cdef extern from "" namespace "std" nogil: + void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa + + + +cdef void _middle_term_sparse_sparse_64( + const DTYPE_t[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const DTYPE_t[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil: + cdef: + ITYPE_t i, j, k + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr + + for i in range(n_X): + for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + X_i_col_idx = X_indices[X_i_ptr] + for j in range(n_Y): + k = i * n_Y + j + for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + Y_j_col_idx = Y_indices[Y_j_ptr] + if X_i_col_idx == Y_j_col_idx: + D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] + + +cdef class MiddleTermComputer64: + """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM. + + `EuclideanDistance` subclasses internally compute the squared Euclidean distances + between chunks of vectors X_c and Y_c using the following decomposition: + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high + arithmetic intensity. + """ + + @classmethod + def get_for( + cls, + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) -> MiddleTermComputer64: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Returns + ------- + middle_term_computer: MiddleTermComputer64 + The suited MiddleTermComputer64 implementation. + """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseMiddleTermComputer64( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseMiddleTermComputer64( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + + raise NotImplementedError("X and Y must be both sparse or dense") + + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__( + self, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = n_features + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_init(self) nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + return + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return NULL + + +cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64): + + def __init__( + self, + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X = X + self.Y = Y + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + return + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + DTYPE_t * A = &X_c[0, 0] + DTYPE_t * B = &Y_c[0, 0] + ITYPE_t lda = X_c.shape[1] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = `-2 * X_c @ Y_c.T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms + + +cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64): + + def __init__( + self, + X, + Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + _middle_term_sparse_sparse_64( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms + + +cdef void _middle_term_sparse_sparse_32( + const cnp.float32_t[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const cnp.float32_t[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil: + cdef: + ITYPE_t i, j, k + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr + + for i in range(n_X): + for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + X_i_col_idx = X_indices[X_i_ptr] + for j in range(n_Y): + k = i * n_Y + j + for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + Y_j_col_idx = Y_indices[Y_j_ptr] + if X_i_col_idx == Y_j_col_idx: + D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] + + +cdef class MiddleTermComputer32: + """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM. + + `EuclideanDistance` subclasses internally compute the squared Euclidean distances + between chunks of vectors X_c and Y_c using the following decomposition: + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high + arithmetic intensity. + """ + + @classmethod + def get_for( + cls, + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) -> MiddleTermComputer32: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Returns + ------- + middle_term_computer: MiddleTermComputer32 + The suited MiddleTermComputer32 implementation. + """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseMiddleTermComputer32( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseMiddleTermComputer32( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + + raise NotImplementedError("X and Y must be both sparse or dense") + + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + X_data = np.asarray(X.data, dtype=np.float32) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__( + self, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = n_features + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_init(self) nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + return + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return NULL + + +cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32): + + def __init__( + self, + const cnp.float32_t[:, ::1] X, + const cnp.float32_t[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X = X + self.Y = Y + # We populate the buffer for upcasting chunks of X and Y from float32 to float64. + self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) + self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) + + upcast_buffer_n_elements = self.chunk_size * n_features + + for thread_num in range(self.effective_n_threads): + self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) + self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + const cnp.float32_t[:, ::1] X_c = self.X[X_start:X_end, :] + const cnp.float32_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + DTYPE_t * A = self.X_c_upcast[thread_num].data() + DTYPE_t * B = self.Y_c_upcast[thread_num].data() + ITYPE_t lda = X_c.shape[1] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = `-2 * X_c @ Y_c.T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms + + +cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32): + + def __init__( + self, + X, + Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + _middle_term_sparse_sparse_32( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp similarity index 100% rename from sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp rename to sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp similarity index 100% rename from sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp rename to sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp index 0b0829f82d975..b6e4508468d2b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp @@ -29,7 +29,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( {{for name_suffix in ['64', '32']}} from ._base cimport BaseDistancesReduction{{name_suffix}} -from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): """float{{name_suffix}} implementation of the RadiusNeighbors.""" diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 92decf15f1155..94272f9aa8f3c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -54,7 +54,7 @@ from ._datasets_pair cimport ( SparseSparseDatasetsPair{{name_suffix}}, ) -from ._gemm_term_computer cimport MiddleTermComputer{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py index e1fbbceea7eb8..4c8632ebb9088 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/setup.py +++ b/sklearn/metrics/_pairwise_distances_reduction/setup.py @@ -15,8 +15,8 @@ def configuration(parent_package="", top_path=None): templates = [ "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp", "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp", - "sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd.tp", + "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp", + "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp", "sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp", "sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp", "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp", @@ -29,7 +29,7 @@ def configuration(parent_package="", top_path=None): cython_sources = [ "_datasets_pair.pyx", - "_gemm_term_computer.pyx", + "_middle_term_computer.pyx", "_base.pyx", "_argkmin.pyx", "_radius_neighbors.pyx", From e9cc5ad04ead355fc9be23cd1662cb372dcc1e59 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 19 Oct 2022 11:16:57 +0200 Subject: [PATCH 11/27] add sparse test for test_sqeuclidean_row_norms --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index f929a55105509..2a3ef30a5f975 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1148,10 +1148,15 @@ def test_sqeuclidean_row_norms( spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread + X_csr = csr_matrix(X) + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 sq_row_norm = np.asarray(sqeuclidean_row_norms(X, num_threads=num_threads)) + sq_row_norm_csr = np.asarray(sqeuclidean_row_norms(X_csr, num_threads=num_threads)) + assert_allclose(sq_row_norm_reference, sq_row_norm) + assert_allclose(sq_row_norm_reference, sq_row_norm_csr) with pytest.raises(ValueError): X = np.asfortranarray(X) From 2ee517d7c73f524a8cbf0a237957e1d26b48ebcd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sun, 23 Oct 2022 19:21:26 +0200 Subject: [PATCH 12/27] Remove previous Cython templates and sources --- .../_gemm_term_computer.pxd | 323 ------ .../_gemm_term_computer.pyx | 784 -------------- .../_radius_neighborhood.pxd | 152 --- .../_radius_neighborhood.pyx | 979 ------------------ 4 files changed, 2238 deletions(-) delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd deleted file mode 100644 index 4f0eab969de3e..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd +++ /dev/null @@ -1,323 +0,0 @@ -cimport numpy as cnp - -from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t -from libcpp.vector cimport vector - - -cdef void _middle_term_sparse_sparse_64( - const DTYPE_t[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indices, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const DTYPE_t[:] Y_data, - const SPARSE_INDEX_TYPE_t[:] Y_indices, - const SPARSE_INDEX_TYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t * D, -) nogil - - -cdef class MiddleTermComputer64: - cdef: - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - ITYPE_t dist_middle_terms_chunks_size - ITYPE_t n_features - ITYPE_t chunk_size - - # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM - vector[vector[DTYPE_t]] dist_middle_terms_chunks - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_init(self) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64): - cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y - - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64): - cdef: - const DTYPE_t[:] X_data - const SPARSE_INDEX_TYPE_t[:] X_indices - const SPARSE_INDEX_TYPE_t[:] X_indptr - - const DTYPE_t[:] Y_data - const SPARSE_INDEX_TYPE_t[:] Y_indices - const SPARSE_INDEX_TYPE_t[:] Y_indptr - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef void _middle_term_sparse_sparse_32( - const cnp.float32_t[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indices, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const cnp.float32_t[:] Y_data, - const SPARSE_INDEX_TYPE_t[:] Y_indices, - const SPARSE_INDEX_TYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t * D, -) nogil - - -cdef class MiddleTermComputer32: - cdef: - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - ITYPE_t dist_middle_terms_chunks_size - ITYPE_t n_features - ITYPE_t chunk_size - - # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM - vector[vector[DTYPE_t]] dist_middle_terms_chunks - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_init(self) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32): - cdef: - const cnp.float32_t[:, ::1] X - const cnp.float32_t[:, ::1] Y - - # Buffers for upcasting chunks of X and Y from 32bit to 64bit - vector[vector[DTYPE_t]] X_c_upcast - vector[vector[DTYPE_t]] Y_c_upcast - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32): - cdef: - const cnp.float32_t[:] X_data - const SPARSE_INDEX_TYPE_t[:] X_indices - const SPARSE_INDEX_TYPE_t[:] X_indptr - - const cnp.float32_t[:] Y_data - const SPARSE_INDEX_TYPE_t[:] Y_indices - const SPARSE_INDEX_TYPE_t[:] Y_indptr - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx deleted file mode 100644 index 75e44f6c6b097..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx +++ /dev/null @@ -1,784 +0,0 @@ -cimport numpy as cnp - -from libcpp.vector cimport vector - -from ...utils._cython_blas cimport ( - BLAS_Order, - BLAS_Trans, - NoTrans, - RowMajor, - Trans, - _gemm, -) -from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t - -import numpy as np -from scipy.sparse import issparse, csr_matrix -from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE - -# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used -# Introduction in Cython: -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa -cdef extern from "" namespace "std" nogil: - void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa - - - -cdef void _middle_term_sparse_sparse_64( - const DTYPE_t[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indices, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const DTYPE_t[:] Y_data, - const SPARSE_INDEX_TYPE_t[:] Y_indices, - const SPARSE_INDEX_TYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t * D, -) nogil: - cdef: - ITYPE_t i, j, k - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr - - for i in range(n_X): - for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): - X_i_col_idx = X_indices[X_i_ptr] - for j in range(n_Y): - k = i * n_Y + j - for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): - Y_j_col_idx = Y_indices[Y_j_ptr] - if X_i_col_idx == Y_j_col_idx: - D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] - - -cdef class MiddleTermComputer64: - """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM. - - `EuclideanDistance` subclasses internally compute the squared Euclidean distances - between chunks of vectors X_c and Y_c using the following decomposition: - - - ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - - - This helper class is in charge of wrapping the common logic to compute - the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high - arithmetic intensity. - """ - - @classmethod - def get_for( - cls, - X, - Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) -> MiddleTermComputer64: - """Return the DatasetsPair implementation for the given arguments. - - Parameters - ---------- - X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - If provided as a sparse matrix, it must be in CSR format. - - Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - If provided as a sparse matrix, it must be in CSR format. - - Returns - ------- - middle_term_computer: MiddleTermComputer64 - The suited MiddleTermComputer64 implementation. - """ - X_is_sparse = issparse(X) - Y_is_sparse = issparse(Y) - - if not X_is_sparse and not Y_is_sparse: - return DenseDenseMiddleTermComputer64( - X, - Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - if X_is_sparse and Y_is_sparse: - return SparseSparseMiddleTermComputer64( - X, - Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - - raise NotImplementedError("X and Y must be both sparse or dense") - - - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - X_data = np.asarray(X.data, dtype=DTYPE) - X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - return X_data, X_indices, X_indptr - - def __init__( - self, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - self.effective_n_threads = effective_n_threads - self.chunks_n_threads = chunks_n_threads - self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size - self.n_features = n_features - self.chunk_size = chunk_size - - self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: - self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_init(self) nogil: - for thread_num in range(self.chunks_n_threads): - self.dist_middle_terms_chunks[thread_num].resize( - self.dist_middle_terms_chunks_size - ) - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - return - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return NULL - - -cdef class DenseDenseMiddleTermComputer64(MiddleTermComputer64): - - def __init__( - self, - const DTYPE_t[:, ::1] X, - const DTYPE_t[:, ::1] Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - self.X = X - self.Y = Y - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - return - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] - const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() - - # Careful: LDA, LDB and LDC are given for F-ordered arrays - # in BLAS documentations, for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa - # - # Here, we use their counterpart values to work with C-ordered arrays. - BLAS_Order order = RowMajor - BLAS_Trans ta = NoTrans - BLAS_Trans tb = Trans - ITYPE_t m = X_c.shape[0] - ITYPE_t n = Y_c.shape[0] - ITYPE_t K = X_c.shape[1] - DTYPE_t alpha = - 2. - # Casting for A and B to remove the const is needed because APIs exposed via - # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. - # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * A = &X_c[0, 0] - DTYPE_t * B = &Y_c[0, 0] - ITYPE_t lda = X_c.shape[1] - ITYPE_t ldb = X_c.shape[1] - DTYPE_t beta = 0. - ITYPE_t ldc = Y_c.shape[0] - - # dist_middle_terms = `-2 * X_c @ Y_c.T` - _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) - - return dist_middle_terms - - -cdef class SparseSparseMiddleTermComputer64(MiddleTermComputer64): - - def __init__( - self, - X, - Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - DTYPE_t *dist_middle_terms = ( - self.dist_middle_terms_chunks[thread_num].data() - ) - - _middle_term_sparse_sparse_64( - self.X_data, - self.X_indices, - self.X_indptr, - X_start, - X_end, - self.Y_data, - self.Y_indices, - self.Y_indptr, - Y_start, - Y_end, - dist_middle_terms, - ) - - return dist_middle_terms - - -cdef void _middle_term_sparse_sparse_32( - const cnp.float32_t[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indices, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const cnp.float32_t[:] Y_data, - const SPARSE_INDEX_TYPE_t[:] Y_indices, - const SPARSE_INDEX_TYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t * D, -) nogil: - cdef: - ITYPE_t i, j, k - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr - - for i in range(n_X): - for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): - X_i_col_idx = X_indices[X_i_ptr] - for j in range(n_Y): - k = i * n_Y + j - for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): - Y_j_col_idx = Y_indices[Y_j_ptr] - if X_i_col_idx == Y_j_col_idx: - D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] - - -cdef class MiddleTermComputer32: - """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM. - - `EuclideanDistance` subclasses internally compute the squared Euclidean distances - between chunks of vectors X_c and Y_c using the following decomposition: - - - ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - - - This helper class is in charge of wrapping the common logic to compute - the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high - arithmetic intensity. - """ - - @classmethod - def get_for( - cls, - X, - Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) -> MiddleTermComputer32: - """Return the DatasetsPair implementation for the given arguments. - - Parameters - ---------- - X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - If provided as a sparse matrix, it must be in CSR format. - - Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - If provided as a sparse matrix, it must be in CSR format. - - Returns - ------- - middle_term_computer: MiddleTermComputer32 - The suited MiddleTermComputer32 implementation. - """ - X_is_sparse = issparse(X) - Y_is_sparse = issparse(Y) - - if not X_is_sparse and not Y_is_sparse: - return DenseDenseMiddleTermComputer32( - X, - Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - if X_is_sparse and Y_is_sparse: - return SparseSparseMiddleTermComputer32( - X, - Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - - raise NotImplementedError("X and Y must be both sparse or dense") - - - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - X_data = np.asarray(X.data, dtype=np.float32) - X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - return X_data, X_indices, X_indptr - - def __init__( - self, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - self.effective_n_threads = effective_n_threads - self.chunks_n_threads = chunks_n_threads - self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size - self.n_features = n_features - self.chunk_size = chunk_size - - self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: - self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_init(self) nogil: - for thread_num in range(self.chunks_n_threads): - self.dist_middle_terms_chunks[thread_num].resize( - self.dist_middle_terms_chunks_size - ) - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - return - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return NULL - - -cdef class DenseDenseMiddleTermComputer32(MiddleTermComputer32): - - def __init__( - self, - const cnp.float32_t[:, ::1] X, - const cnp.float32_t[:, ::1] Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - self.X = X - self.Y = Y - # We populate the buffer for upcasting chunks of X and Y from float32 to float64. - self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) - self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) - - upcast_buffer_n_elements = self.chunk_size * n_features - - for thread_num in range(self.effective_n_threads): - self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) - self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = Y_end - Y_start - - # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = X_end - X_start - - # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = X_end - X_start - - # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = Y_end - Y_start - - # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - const cnp.float32_t[:, ::1] X_c = self.X[X_start:X_end, :] - const cnp.float32_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() - - # Careful: LDA, LDB and LDC are given for F-ordered arrays - # in BLAS documentations, for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa - # - # Here, we use their counterpart values to work with C-ordered arrays. - BLAS_Order order = RowMajor - BLAS_Trans ta = NoTrans - BLAS_Trans tb = Trans - ITYPE_t m = X_c.shape[0] - ITYPE_t n = Y_c.shape[0] - ITYPE_t K = X_c.shape[1] - DTYPE_t alpha = - 2. - DTYPE_t * A = self.X_c_upcast[thread_num].data() - DTYPE_t * B = self.Y_c_upcast[thread_num].data() - ITYPE_t lda = X_c.shape[1] - ITYPE_t ldb = X_c.shape[1] - DTYPE_t beta = 0. - ITYPE_t ldc = Y_c.shape[0] - - # dist_middle_terms = `-2 * X_c @ Y_c.T` - _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) - - return dist_middle_terms - - -cdef class SparseSparseMiddleTermComputer32(MiddleTermComputer32): - - def __init__( - self, - X, - Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - ) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - DTYPE_t *dist_middle_terms = ( - self.dist_middle_terms_chunks[thread_num].data() - ) - - _middle_term_sparse_sparse_32( - self.X_data, - self.X_indices, - self.X_indptr, - X_start, - X_end, - self.Y_data, - self.Y_indices, - self.Y_indptr, - Y_start, - Y_end, - dist_middle_terms, - ) - - return dist_middle_terms diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd deleted file mode 100644 index b4467b9e14569..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd +++ /dev/null @@ -1,152 +0,0 @@ -cimport numpy as cnp - -from libcpp.memory cimport shared_ptr -from libcpp.vector cimport vector -from cython cimport final - -from ...utils._typedefs cimport ITYPE_t, DTYPE_t - -cnp.import_array() - -###################### -## std::vector to np.ndarray coercion -# As type covariance is not supported for C++ containers via Cython, -# we need to redefine fused types. -ctypedef fused vector_DITYPE_t: - vector[ITYPE_t] - vector[DTYPE_t] - - -ctypedef fused vector_vector_DITYPE_t: - vector[vector[ITYPE_t]] - vector[vector[DTYPE_t]] - -cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( - shared_ptr[vector_vector_DITYPE_t] vecs -) - -##################### - -from ._base cimport BaseDistanceReducer64 -from ._gemm_term_computer cimport MiddleTermComputer64 - -cdef class RadiusNeighbors64(BaseDistanceReducer64): - """ - 64bit implementation of BaseDistanceReducer64 for the - `RadiusNeighbors` reduction. - """ - - cdef: - DTYPE_t radius - - # DistanceMetric64 compute rank-preserving surrogate distance via rdist - # which are proxies necessitating less computations. - # We get the equivalent for the radius to be able to compare it against - # vectors' rank-preserving surrogate distances. - DTYPE_t r_radius - - # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. - # - # For this implementation, we want resizable buffers which we will wrap - # into numpy arrays at the end. std::vector comes as a handy container - # for interacting efficiently with resizable buffers. - # - # Though it is possible to access their buffer address with - # std::vector::data, they can't be stolen: buffers lifetime - # is tied to their std::vector and are deallocated when - # std::vectors are. - # - # To solve this, we dynamically allocate std::vectors and then - # encapsulate them in a StdVectorSentinel responsible for - # freeing them when the associated np.ndarray is freed. - # - # Shared pointers (defined via shared_ptr) are use for safer memory management. - # Unique pointers (defined via unique_ptr) can't be used as datastructures - # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. - shared_ptr[vector[vector[ITYPE_t]]] neigh_indices - shared_ptr[vector[vector[DTYPE_t]]] neigh_distances - - # Used as array of pointers to private datastructures used in threads. - vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks - vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks - - bint sort_results - - @final - cdef void _merge_vectors( - self, - ITYPE_t idx, - ITYPE_t num_threads, - ) nogil - - -cdef class EuclideanRadiusNeighbors64(RadiusNeighbors64): - """EuclideanDistance-specialized 64bit implementation for RadiusNeighbors64.""" - cdef: - MiddleTermComputer64 middle_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared - - bint use_squared_distances - -from ._base cimport BaseDistanceReducer32 -from ._gemm_term_computer cimport MiddleTermComputer32 - -cdef class RadiusNeighbors32(BaseDistanceReducer32): - """ - 32bit implementation of BaseDistanceReducer32 for the - `RadiusNeighbors` reduction. - """ - - cdef: - DTYPE_t radius - - # DistanceMetric32 compute rank-preserving surrogate distance via rdist - # which are proxies necessitating less computations. - # We get the equivalent for the radius to be able to compare it against - # vectors' rank-preserving surrogate distances. - DTYPE_t r_radius - - # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. - # - # For this implementation, we want resizable buffers which we will wrap - # into numpy arrays at the end. std::vector comes as a handy container - # for interacting efficiently with resizable buffers. - # - # Though it is possible to access their buffer address with - # std::vector::data, they can't be stolen: buffers lifetime - # is tied to their std::vector and are deallocated when - # std::vectors are. - # - # To solve this, we dynamically allocate std::vectors and then - # encapsulate them in a StdVectorSentinel responsible for - # freeing them when the associated np.ndarray is freed. - # - # Shared pointers (defined via shared_ptr) are use for safer memory management. - # Unique pointers (defined via unique_ptr) can't be used as datastructures - # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. - shared_ptr[vector[vector[ITYPE_t]]] neigh_indices - shared_ptr[vector[vector[DTYPE_t]]] neigh_distances - - # Used as array of pointers to private datastructures used in threads. - vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks - vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks - - bint sort_results - - @final - cdef void _merge_vectors( - self, - ITYPE_t idx, - ITYPE_t num_threads, - ) nogil - - -cdef class EuclideanRadiusNeighbors32(RadiusNeighbors32): - """EuclideanDistance-specialized 32bit implementation for RadiusNeighbors32.""" - cdef: - MiddleTermComputer32 middle_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared - - bint use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx deleted file mode 100644 index a133d23e0979e..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx +++ /dev/null @@ -1,979 +0,0 @@ -cimport numpy as cnp -import numpy as np -import warnings - -from libcpp.memory cimport shared_ptr, make_shared -from libcpp.vector cimport vector -from cython cimport final -from cython.operator cimport dereference as deref -from cython.parallel cimport parallel, prange - -from ...utils._sorting cimport simultaneous_sort -from ...utils._typedefs cimport ITYPE_t, DTYPE_t -from ...utils._vector_sentinel cimport vector_to_nd_array - -from numbers import Real -from scipy.sparse import issparse -from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration -from ...utils.fixes import threadpool_limits - -cnp.import_array() - -# TODO: change for `libcpp.algorithm.move` once Cython 3 is used -# Introduction in Cython: -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa -cdef extern from "" namespace "std" nogil: - OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa - -###################### - -cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( - shared_ptr[vector_vector_DITYPE_t] vecs -): - """Coerce a std::vector of std::vector to a ndarray of ndarray.""" - cdef: - ITYPE_t n = deref(vecs).size() - cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) - - for i in range(n): - nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) - - return nd_arrays_of_nd_arrays - -##################### - -from ._base cimport ( - BaseDistanceReducer64, - _sqeuclidean_row_norms64, -) - -from ._datasets_pair cimport ( - DatasetsPair64, - DenseDenseDatasetsPair64, - SparseSparseDatasetsPair64, -) - -from ._gemm_term_computer cimport MiddleTermComputer64 - - -cdef class RadiusNeighbors64(BaseDistanceReducer64): - """ - 64bit implementation of the pairwise-distance reduction BaseDistanceReducer64 for the - `RadiusNeighbors` reduction. - """ - - @classmethod - def compute( - cls, - X, - Y, - DTYPE_t radius, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - bint return_distance=False, - bint sort_results=False, - ): - """Compute the radius-neighbors reduction. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`RadiusNeighbors64`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance should directly be created outside of this class method. - """ - if ( - metric in ("euclidean", "sqeuclidean") - and not (issparse(X) ^ issparse(Y)) - ): - # Specialized implementation with improved arithmetic intensity - # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to the BLAS GEMM routine as explained - # in more details in GEMMTermComputer docstring. - use_squared_distances = metric == "sqeuclidean" - pda = EuclideanRadiusNeighbors64( - X=X, Y=Y, radius=radius, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - metric_kwargs=metric_kwargs, - ) - else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = RadiusNeighbors64( - datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), - radius=radius, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results(return_distance) - - - def __init__( - self, - DatasetsPair64 datasets_pair, - DTYPE_t radius, - chunk_size=None, - strategy=None, - sort_results=False, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - ) - - self.radius = check_scalar(radius, "radius", Real, min_val=0) - self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) - self.sort_results = sort_results - - # Allocating pointers to datastructures but not the datastructures themselves. - # There are as many pointers as effective threads. - # - # For the sake of explicitness: - # - when parallelizing on X, the pointers of those heaps are referencing - # self.neigh_distances and self.neigh_indices - # - when parallelizing on Y, the pointers of those heaps are referencing - # std::vectors of std::vectors which are thread-wise-allocated and whose - # content will be merged into self.neigh_distances and self.neigh_indices. - self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]]( - self.chunks_n_threads - ) - self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]]( - self.chunks_n_threads - ) - - # Temporary datastructures which will be coerced to numpy arrays on before - # RadiusNeighbors.compute "return" and will be then freed. - self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t r_dist_i_j - - for i in range(X_start, X_end): - for j in range(Y_start, Y_end): - r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) - if r_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) - - def _finalize_results(self, bint return_distance=False): - if return_distance: - # We need to recompute distances because we relied on - # surrogate distances for the reduction. - self.compute_exact_distances() - return ( - coerce_vectors_to_nd_arrays(self.neigh_distances), - coerce_vectors_to_nd_arrays(self.neigh_indices), - ) - - return coerce_vectors_to_nd_arrays(self.neigh_indices) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - - # As this strategy is embarrassingly parallel, we can set the - # thread vectors' pointers to the main vectors'. - self.neigh_distances_chunks[thread_num] = self.neigh_distances - self.neigh_indices_chunks[thread_num] = self.neigh_indices - - @final - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, jdx - - # Sorting neighbors for each query vector of X - if self.sort_results: - for idx in range(X_start, X_end): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef: - ITYPE_t thread_num - # As chunks of X are shared across threads, so must datastructures to avoid race - # conditions: each thread has its own vectors of n_samples_X vectors which are - # then merged back in the main n_samples_X vectors. - for thread_num in range(self.chunks_n_threads): - self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) - - @final - cdef void _merge_vectors( - self, - ITYPE_t idx, - ITYPE_t num_threads, - ) nogil: - cdef: - ITYPE_t thread_num - ITYPE_t idx_n_elements = 0 - ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() - - # Resizing buffers only once for the given number of elements. - for thread_num in range(num_threads): - idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) - deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) - - # Moving the elements by range using the range first element - # as the reference for the insertion. - for thread_num in range(num_threads): - move( - deref(self.neigh_distances_chunks[thread_num])[idx].begin(), - deref(self.neigh_distances_chunks[thread_num])[idx].end(), - deref(self.neigh_distances)[idx].begin() + last_element_idx - ) - move( - deref(self.neigh_indices_chunks[thread_num])[idx].begin(), - deref(self.neigh_indices_chunks[thread_num])[idx].end(), - deref(self.neigh_indices)[idx].begin() + last_element_idx - ) - last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current - - with nogil, parallel(num_threads=self.effective_n_threads): - # Merge vectors used in threads into the main ones. - # This is done in parallel sample-wise (no need for locks). - for idx in prange(self.n_samples_X, schedule='static'): - self._merge_vectors(idx, self.chunks_n_threads) - - # The content of the vector have been std::moved. - # Hence they can't be used anymore and can be deleted. - # Their deletion is carried out automatically as the - # implementation relies on shared pointers. - - # Sort in parallel in ascending order w.r.t the distances if requested. - if self.sort_results: - for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - return - - cdef void compute_exact_distances(self) nogil: - """Convert rank-preserving distances to pairwise distances in parallel.""" - cdef: - ITYPE_t i, j - - for i in prange(self.n_samples_X, nogil=True, schedule='static', - num_threads=self.effective_n_threads): - for j in range(deref(self.neigh_indices)[i].size()): - deref(self.neigh_distances)[i][j] = ( - self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. - max(deref(self.neigh_distances)[i][j], 0.) - ) - ) - - -cdef class EuclideanRadiusNeighbors64(RadiusNeighbors64): - """EuclideanDistance-specialized implementation for RadiusNeighbors64.""" - - @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: - return (RadiusNeighbors64.is_usable_for(X, Y, metric) - and not _in_unstable_openblas_configuration()) - - def __init__( - self, - X, - Y, - DTYPE_t radius, - bint use_squared_distances=False, - chunk_size=None, - strategy=None, - sort_results=False, - metric_kwargs=None, - ): - if ( - metric_kwargs is not None and - len(metric_kwargs) > 0 and - "Y_norm_squared" not in metric_kwargs - ): - warnings.warn( - f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " - f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.", - UserWarning, - stacklevel=3, - ) - - super().__init__( - # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair64.get_for(X, Y, metric="euclidean"), - radius=radius, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - ) - # X and Y are checked by the DatasetsPair64 implemented - # as a DenseDenseDatasetsPair64 - cdef: - DenseDenseDatasetsPair64 datasets_pair = ( - self.datasets_pair - ) - ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - - self.middle_term_computer = MiddleTermComputer64.get_for( - X, - Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = check_array( - metric_kwargs.pop("Y_norm_squared"), - ensure_2d=False, - input_name="Y_norm_squared", - dtype=np.float64 - ) - else: - self.Y_norm_squared = _sqeuclidean_row_norms64(Y, self.effective_n_threads) - - # Do not recompute norms if datasets are identical. - self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms64(X, self.effective_n_threads) - ) - self.use_squared_distances = use_squared_distances - - if use_squared_distances: - # In this specialisation and this setup, the value passed to the radius is - # already considered to be the adapted radius, so we overwrite it. - self.r_radius = radius - - @final - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - RadiusNeighbors64._parallel_on_X_parallel_init(self, thread_num) - self.middle_term_computer._parallel_on_X_parallel_init(thread_num) - - @final - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - RadiusNeighbors64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) - self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) - - @final - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - RadiusNeighbors64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num, - ) - - @final - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef ITYPE_t thread_num - RadiusNeighbors64._parallel_on_Y_init(self) - self.middle_term_computer._parallel_on_Y_init() - - @final - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - RadiusNeighbors64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) - self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) - - @final - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - RadiusNeighbors64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - RadiusNeighbors64.compute_exact_distances(self) - - @final - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - # Pushing the distance and their associated indices in vectors. - for i in range(n_X): - for j in range(n_Y): - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # - squared_dist_i_j = ( - self.X_norm_squared[i + X_start] - + dist_middle_terms[i * n_Y + j] - + self.Y_norm_squared[j + Y_start] - ) - if squared_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) - -from ._base cimport ( - BaseDistanceReducer32, - _sqeuclidean_row_norms32, -) - -from ._datasets_pair cimport ( - DatasetsPair32, - DenseDenseDatasetsPair32, - SparseSparseDatasetsPair32, -) - -from ._gemm_term_computer cimport MiddleTermComputer32 - - -cdef class RadiusNeighbors32(BaseDistanceReducer32): - """ - 32bit implementation of the pairwise-distance reduction BaseDistanceReducer32 for the - `RadiusNeighbors` reduction. - """ - - @classmethod - def compute( - cls, - X, - Y, - DTYPE_t radius, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - bint return_distance=False, - bint sort_results=False, - ): - """Compute the radius-neighbors reduction. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`RadiusNeighbors32`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance should directly be created outside of this class method. - """ - if ( - metric in ("euclidean", "sqeuclidean") - and not (issparse(X) ^ issparse(Y)) - ): - # Specialized implementation with improved arithmetic intensity - # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to the BLAS GEMM routine as explained - # in more details in GEMMTermComputer docstring. - use_squared_distances = metric == "sqeuclidean" - pda = EuclideanRadiusNeighbors32( - X=X, Y=Y, radius=radius, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - metric_kwargs=metric_kwargs, - ) - else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = RadiusNeighbors32( - datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), - radius=radius, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results(return_distance) - - - def __init__( - self, - DatasetsPair32 datasets_pair, - DTYPE_t radius, - chunk_size=None, - strategy=None, - sort_results=False, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - ) - - self.radius = check_scalar(radius, "radius", Real, min_val=0) - self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) - self.sort_results = sort_results - - # Allocating pointers to datastructures but not the datastructures themselves. - # There are as many pointers as effective threads. - # - # For the sake of explicitness: - # - when parallelizing on X, the pointers of those heaps are referencing - # self.neigh_distances and self.neigh_indices - # - when parallelizing on Y, the pointers of those heaps are referencing - # std::vectors of std::vectors which are thread-wise-allocated and whose - # content will be merged into self.neigh_distances and self.neigh_indices. - self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]]( - self.chunks_n_threads - ) - self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]]( - self.chunks_n_threads - ) - - # Temporary datastructures which will be coerced to numpy arrays on before - # RadiusNeighbors.compute "return" and will be then freed. - self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t r_dist_i_j - - for i in range(X_start, X_end): - for j in range(Y_start, Y_end): - r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) - if r_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) - - def _finalize_results(self, bint return_distance=False): - if return_distance: - # We need to recompute distances because we relied on - # surrogate distances for the reduction. - self.compute_exact_distances() - return ( - coerce_vectors_to_nd_arrays(self.neigh_distances), - coerce_vectors_to_nd_arrays(self.neigh_indices), - ) - - return coerce_vectors_to_nd_arrays(self.neigh_indices) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - - # As this strategy is embarrassingly parallel, we can set the - # thread vectors' pointers to the main vectors'. - self.neigh_distances_chunks[thread_num] = self.neigh_distances - self.neigh_indices_chunks[thread_num] = self.neigh_indices - - @final - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, jdx - - # Sorting neighbors for each query vector of X - if self.sort_results: - for idx in range(X_start, X_end): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef: - ITYPE_t thread_num - # As chunks of X are shared across threads, so must datastructures to avoid race - # conditions: each thread has its own vectors of n_samples_X vectors which are - # then merged back in the main n_samples_X vectors. - for thread_num in range(self.chunks_n_threads): - self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) - - @final - cdef void _merge_vectors( - self, - ITYPE_t idx, - ITYPE_t num_threads, - ) nogil: - cdef: - ITYPE_t thread_num - ITYPE_t idx_n_elements = 0 - ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() - - # Resizing buffers only once for the given number of elements. - for thread_num in range(num_threads): - idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) - deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) - - # Moving the elements by range using the range first element - # as the reference for the insertion. - for thread_num in range(num_threads): - move( - deref(self.neigh_distances_chunks[thread_num])[idx].begin(), - deref(self.neigh_distances_chunks[thread_num])[idx].end(), - deref(self.neigh_distances)[idx].begin() + last_element_idx - ) - move( - deref(self.neigh_indices_chunks[thread_num])[idx].begin(), - deref(self.neigh_indices_chunks[thread_num])[idx].end(), - deref(self.neigh_indices)[idx].begin() + last_element_idx - ) - last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current - - with nogil, parallel(num_threads=self.effective_n_threads): - # Merge vectors used in threads into the main ones. - # This is done in parallel sample-wise (no need for locks). - for idx in prange(self.n_samples_X, schedule='static'): - self._merge_vectors(idx, self.chunks_n_threads) - - # The content of the vector have been std::moved. - # Hence they can't be used anymore and can be deleted. - # Their deletion is carried out automatically as the - # implementation relies on shared pointers. - - # Sort in parallel in ascending order w.r.t the distances if requested. - if self.sort_results: - for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - return - - cdef void compute_exact_distances(self) nogil: - """Convert rank-preserving distances to pairwise distances in parallel.""" - cdef: - ITYPE_t i, j - - for i in prange(self.n_samples_X, nogil=True, schedule='static', - num_threads=self.effective_n_threads): - for j in range(deref(self.neigh_indices)[i].size()): - deref(self.neigh_distances)[i][j] = ( - self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. - max(deref(self.neigh_distances)[i][j], 0.) - ) - ) - - -cdef class EuclideanRadiusNeighbors32(RadiusNeighbors32): - """EuclideanDistance-specialized implementation for RadiusNeighbors32.""" - - @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: - return (RadiusNeighbors32.is_usable_for(X, Y, metric) - and not _in_unstable_openblas_configuration()) - - def __init__( - self, - X, - Y, - DTYPE_t radius, - bint use_squared_distances=False, - chunk_size=None, - strategy=None, - sort_results=False, - metric_kwargs=None, - ): - if ( - metric_kwargs is not None and - len(metric_kwargs) > 0 and - "Y_norm_squared" not in metric_kwargs - ): - warnings.warn( - f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " - f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.", - UserWarning, - stacklevel=3, - ) - - super().__init__( - # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair32.get_for(X, Y, metric="euclidean"), - radius=radius, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - ) - # X and Y are checked by the DatasetsPair32 implemented - # as a DenseDenseDatasetsPair32 - cdef: - DenseDenseDatasetsPair32 datasets_pair = ( - self.datasets_pair - ) - ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - - self.middle_term_computer = MiddleTermComputer32.get_for( - X, - Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = check_array( - metric_kwargs.pop("Y_norm_squared"), - ensure_2d=False, - input_name="Y_norm_squared", - dtype=np.float64 - ) - else: - self.Y_norm_squared = _sqeuclidean_row_norms32(Y, self.effective_n_threads) - - # Do not recompute norms if datasets are identical. - self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms32(X, self.effective_n_threads) - ) - self.use_squared_distances = use_squared_distances - - if use_squared_distances: - # In this specialisation and this setup, the value passed to the radius is - # already considered to be the adapted radius, so we overwrite it. - self.r_radius = radius - - @final - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - RadiusNeighbors32._parallel_on_X_parallel_init(self, thread_num) - self.middle_term_computer._parallel_on_X_parallel_init(thread_num) - - @final - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - RadiusNeighbors32._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) - self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) - - @final - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - RadiusNeighbors32._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num, - ) - - @final - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef ITYPE_t thread_num - RadiusNeighbors32._parallel_on_Y_init(self) - self.middle_term_computer._parallel_on_Y_init() - - @final - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - RadiusNeighbors32._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) - self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) - - @final - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - RadiusNeighbors32._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - RadiusNeighbors32.compute_exact_distances(self) - - @final - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - DTYPE_t *dist_middle_terms = self.middle_term_computer._compute_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - # Pushing the distance and their associated indices in vectors. - for i in range(n_X): - for j in range(n_Y): - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # - squared_dist_i_j = ( - self.X_norm_squared[i + X_start] - + dist_middle_terms[i * n_Y + j] - + self.Y_norm_squared[j + Y_start] - ) - if squared_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) From 9df551d445e0cf3675d0dfb128f80dd10beefbe6 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sun, 23 Oct 2022 19:21:54 +0200 Subject: [PATCH 13/27] Apply suggestions --- sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 6 +++--- .../metrics/_pairwise_distances_reduction/_dispatcher.py | 4 ++-- .../_middle_term_computer.pxd.tp | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index b20381677fc8d..aeeb5ee126ddd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -8,8 +8,8 @@ implementation_specific_values = [ # We also use the float64 dtype and C-type names as defined in # `sklearn.utils._typedefs` to maintain consistency. # - ('64', False, 'DTYPE_t', 'DTYPE'), - ('32', True, 'cnp.float32_t', 'np.float32') + ('64', 'DTYPE_t', 'DTYPE'), + ('32', 'cnp.float32_t', 'np.float32') ] }} @@ -103,7 +103,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense( return squared_row_norms -{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} from ._datasets_pair cimport DatasetsPair{{name_suffix}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 57006420e6d49..5bc9bfcc35dbb 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -38,9 +38,9 @@ def sqeuclidean_row_norms(X, num_threads): Arrays containing the squared euclidean norm of each row of X. """ if X.dtype == np.float64: - return _sqeuclidean_row_norms64(X, num_threads) + return np.asarray(_sqeuclidean_row_norms64(X, num_threads)) if X.dtype == np.float32: - return _sqeuclidean_row_norms32(X, num_threads) + return np.asarray(_sqeuclidean_row_norms32(X, num_threads)) raise ValueError( "Only float64 or float32 datasets are supported at this time, " diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp index 238ef829dc36e..6fd9a81ff6369 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp @@ -15,9 +15,10 @@ implementation_specific_values = [ }} cimport numpy as cnp -from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from libcpp.vector cimport vector +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t + {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} From 76574eb85462bef3a10f497d0498d1e088a4411a Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 24 Oct 2022 09:56:47 +0200 Subject: [PATCH 14/27] remove np.asarray from test --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 2a3ef30a5f975..b78e50f014ae1 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1151,9 +1151,9 @@ def test_sqeuclidean_row_norms( X_csr = csr_matrix(X) sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 - sq_row_norm = np.asarray(sqeuclidean_row_norms(X, num_threads=num_threads)) + sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads) - sq_row_norm_csr = np.asarray(sqeuclidean_row_norms(X_csr, num_threads=num_threads)) + sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads) assert_allclose(sq_row_norm_reference, sq_row_norm) assert_allclose(sq_row_norm_reference, sq_row_norm_csr) From 07622be4a149b25bb92694f161f55353658a8726 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 27 Oct 2022 16:11:27 +0200 Subject: [PATCH 15/27] update some doc --- doc/whats_new/v1.2.rst | 3 +- .../_argkmin.pyx.tp | 6 ++-- .../_dispatcher.py | 2 +- .../_middle_term_computer.pyx.tp | 29 +++++++++++++++---- .../_radius_neighbors.pyx.tp | 6 ++-- .../test_pairwise_distances_reduction.py | 3 ++ 6 files changed, 37 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 6430a0996f501..2d9cde0dceb9d 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -94,7 +94,8 @@ Changes impacting all modules - :func:`sklearn.manifold.trustworthiness` :pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion `, - :user:`Olivier Grisel `, and `Thomas Fan`_. + :user:`Olivier Grisel `, and `Thomas Fan`_, + :pr:`24556` by :user:`Vincent Maladière `. - |Fix| Systematically check the sha256 digest of dataset tarballs used in code examples in the documentation. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 6c948d0abc018..1aeb97e27f15b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -71,8 +71,10 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): ): # Specialized implementation with improved arithmetic intensity # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to the BLAS GEMM routine as explained - # in more details in the docstring. + # at time to leverage a call to a matrix multiplication routine. + # Decomposing the euclidean distance into as a sum of squared + # matrices and a middle term cross product is explained in more + # details in the docstring of MiddleTermComputer. use_squared_distances = metric == "sqeuclidean" pda = EuclideanArgKmin{{name_suffix}}( X=X, Y=Y, k=k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 5bc9bfcc35dbb..921074c3ae968 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -26,7 +26,7 @@ def sqeuclidean_row_norms(X, num_threads): Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : ndarray or CSR matrix of shape (n_samples, n_features) Input data. Must be c-contiguous. num_threads : int diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 3ace80ef230ee..442b0742ce278 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -72,18 +72,19 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}( cdef class MiddleTermComputer{{name_suffix}}: - """Component for `EuclideanDistance` specialisation wrapping the logic for the call to GEMM. + """Abstract base class for the `EuclideanDistance` specialisation + components wrapping the computation of the distance matrix chunks. - `EuclideanDistance` subclasses internally compute the squared Euclidean distances - between chunks of vectors X_c and Y_c using the following decomposition: + `EuclideanDistance` subclasses relies on the squared Euclidean + distances between chunks of vectors X_c and Y_c using the + following decomposition for the (i,j) pair : - ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² This helper class is in charge of wrapping the common logic to compute - the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high - arithmetic intensity. + the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. """ @classmethod @@ -224,6 +225,13 @@ cdef class MiddleTermComputer{{name_suffix}}: cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Computes the middle term of the Euclidean distance between two chunked dense matrices + X_c and Y_c. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + This class use the BLAS gemm routine to efficiently perform the dot product. + """ def __init__( self, @@ -384,6 +392,15 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Computes the middle term of the Euclidean distance between two chunked CSR matrices X_c and Y_c + and returns the result as a contiguous array. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_{{name_suffix}}. + This routine iterates over the data, indices and indptr arrays of the sparse matrices without + densifying them. + """ def __init__( self, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 94272f9aa8f3c..0b0c8ebf728f2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -92,8 +92,10 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) ): # Specialized implementation with improved arithmetic intensity # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to the BLAS GEMM routine as explained - # in more details in GEMMTermComputer docstring. + # at time to leverage a call to a matrix multiplication routine. + # Decomposing the euclidean distance into as a sum of squared + # matrices and a middle term cross product is explained in more + # details in the docstring of MiddleTermComputer. use_squared_distances = metric == "sqeuclidean" pda = EuclideanRadiusNeighbors{{name_suffix}}( X=X, Y=Y, radius=radius, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index b78e50f014ae1..37ca3f4bfd981 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -564,6 +564,9 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) + assert BaseDistancesReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="euclidean" + ) # CSR matrices without non-zeros elements aren't currently supported # TODO: support CSR matrices without non-zeros elements From fb9a3c9748bf5b80bf465a67e201200c67206d1a Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 28 Oct 2022 12:10:12 +0200 Subject: [PATCH 16/27] doc improvement --- .../_argkmin.pyx.tp | 18 ++++++++++-------- .../_middle_term_computer.pyx.tp | 3 ++- .../_radius_neighbors.pyx.tp | 16 ++++++++++------ 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1aeb97e27f15b..67f3f13deb4fc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -69,12 +69,14 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): metric in ("euclidean", "sqeuclidean") and not (issparse(X) ^ issparse(Y)) ): - # Specialized implementation with improved arithmetic intensity - # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to a matrix multiplication routine. - # Decomposing the euclidean distance into as a sum of squared - # matrices and a middle term cross product is explained in more - # details in the docstring of MiddleTermComputer. + # This implementation processes the computation of the chunks of the + # distance using a decomposition of the chunks of the distance matrix + # for the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic ArgKmin + # implementation. + # For more information see MiddleTermComputer. use_squared_distances = metric == "sqeuclidean" pda = EuclideanArgKmin{{name_suffix}}( X=X, Y=Y, k=k, @@ -84,8 +86,8 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): metric_kwargs=metric_kwargs, ) else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. pda = ArgKmin{{name_suffix}}( datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), k=k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 442b0742ce278..099910a2f8de7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -230,7 +230,8 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ dist_middle_terms = - 2 X_c_i.Y_c_j^T - This class use the BLAS gemm routine to efficiently perform the dot product. + This class use the BLAS gemm routine to perform the dot product of each chunks + of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). """ def __init__( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 0b0c8ebf728f2..ab7fd69a5c501 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -90,12 +90,16 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) metric in ("euclidean", "sqeuclidean") and not (issparse(X) ^ issparse(Y)) ): - # Specialized implementation with improved arithmetic intensity - # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to a matrix multiplication routine. - # Decomposing the euclidean distance into as a sum of squared - # matrices and a middle term cross product is explained in more - # details in the docstring of MiddleTermComputer. + # Specialized implementation of RadiusNeighbors for the Euclidean + # distance. + # This implementation processes the computation of the chunks of the + # distance using a decomposition of the chunks of the distance matrix + # for the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic RadiusNeighbors + # implementation. + # For more information see MiddleTermComputer. use_squared_distances = metric == "sqeuclidean" pda = EuclideanRadiusNeighbors{{name_suffix}}( X=X, Y=Y, radius=radius, From 88e074d33345033c65702f5301b8197b84fc0dff Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 28 Oct 2022 15:07:08 +0200 Subject: [PATCH 17/27] Update sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp Co-authored-by: Julien Jerphanion --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 67f3f13deb4fc..ef2c8e27816c6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -69,6 +69,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): metric in ("euclidean", "sqeuclidean") and not (issparse(X) ^ issparse(Y)) ): + # Specialized implementation of ArgKmin for the Euclidean distance. # This implementation processes the computation of the chunks of the # distance using a decomposition of the chunks of the distance matrix # for the Squared Euclidean distance. From d6bff5563a083431415409b53b7faa7e6114be90 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 28 Oct 2022 12:14:55 +0200 Subject: [PATCH 18/27] extend test_pairwise_distances_argkmin, fail with dtype=float32 and translation=1e7 --- .../test_pairwise_distances_reduction.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 37ca3f4bfd981..c08fb5b898751 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -977,6 +977,9 @@ def test_pairwise_distances_argkmin( X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + X_csr = csr_matrix(X) + Y_csr = csr_matrix(Y) + # Haversine distance only accepts 2D data if metric == "haversine": X = np.ascontiguousarray(X[:, :2]) @@ -999,24 +1002,25 @@ def test_pairwise_distances_argkmin( row_idx, argkmin_indices_ref[row_idx] ] - argkmin_distances, argkmin_indices = ArgKmin.compute( - X, - Y, - k, - metric=metric, - metric_kwargs=metric_kwargs, - return_distance=True, - # So as to have more than a chunk, forcing parallelism. - chunk_size=n_samples // 4, - strategy=strategy, - ) + for _X, _Y in [(X, Y), (X_csr, Y_csr)]: + argkmin_distances, argkmin_indices = ArgKmin.compute( + _X, + _Y, + k, + metric=metric, + metric_kwargs=metric_kwargs, + return_distance=True, + # So as to have more than a chunk, forcing parallelism. + chunk_size=n_samples // 4, + strategy=strategy, + ) - ASSERT_RESULT[(ArgKmin, dtype)]( - argkmin_distances, - argkmin_distances_ref, - argkmin_indices, - argkmin_indices_ref, - ) + ASSERT_RESULT[(ArgKmin, dtype)]( + argkmin_distances, + argkmin_distances_ref, + argkmin_indices, + argkmin_indices_ref, + ) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed From 9f96865fda54412ee0b7b5a679026095e4400143 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 28 Oct 2022 15:08:21 +0200 Subject: [PATCH 19/27] branch SparseSparseMiddleTermComputer32 to _middle_term_sparse_sparse_64 --- .../_middle_term_computer.pxd.tp | 16 +++++++------- .../_middle_term_computer.pyx.tp | 21 +++++++++++-------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp index 6fd9a81ff6369..e6ef5de2727b5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp @@ -19,16 +19,14 @@ from libcpp.vector cimport vector from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t -{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} - -cdef void _middle_term_sparse_sparse_{{name_suffix}}( - const {{INPUT_DTYPE_t}}[:] X_data, +cdef void _middle_term_sparse_sparse_64( + const DTYPE_t[:] X_data, const SPARSE_INDEX_TYPE_t[:] X_indices, const SPARSE_INDEX_TYPE_t[:] X_indptr, ITYPE_t X_start, ITYPE_t X_end, - const {{INPUT_DTYPE_t}}[:] Y_data, + const DTYPE_t[:] Y_data, const SPARSE_INDEX_TYPE_t[:] Y_indices, const SPARSE_INDEX_TYPE_t[:] Y_indptr, ITYPE_t Y_start, @@ -37,6 +35,9 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}( ) nogil +{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + + cdef class MiddleTermComputer{{name_suffix}}: cdef: ITYPE_t effective_n_threads @@ -149,11 +150,11 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): cdef: - const {{INPUT_DTYPE_t}}[:] X_data + const DTYPE_t[:] X_data const SPARSE_INDEX_TYPE_t[:] X_indices const SPARSE_INDEX_TYPE_t[:] X_indptr - const {{INPUT_DTYPE_t}}[:] Y_data + const DTYPE_t[:] Y_data const SPARSE_INDEX_TYPE_t[:] Y_indices const SPARSE_INDEX_TYPE_t[:] Y_indptr @@ -184,4 +185,5 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam ITYPE_t thread_num, ) nogil + {{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 099910a2f8de7..49117c5401cb4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -38,16 +38,13 @@ cdef extern from "" namespace "std" nogil: void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa -{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} - - -cdef void _middle_term_sparse_sparse_{{name_suffix}}( - const {{INPUT_DTYPE_t}}[:] X_data, +cdef void _middle_term_sparse_sparse_64( + const DTYPE_t[:] X_data, const SPARSE_INDEX_TYPE_t[:] X_indices, const SPARSE_INDEX_TYPE_t[:] X_indptr, ITYPE_t X_start, ITYPE_t X_end, - const {{INPUT_DTYPE_t}}[:] Y_data, + const DTYPE_t[:] Y_data, const SPARSE_INDEX_TYPE_t[:] Y_indices, const SPARSE_INDEX_TYPE_t[:] Y_indptr, ITYPE_t Y_start, @@ -71,6 +68,9 @@ cdef void _middle_term_sparse_sparse_{{name_suffix}}( D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] +{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + + cdef class MiddleTermComputer{{name_suffix}}: """Abstract base class for the `EuclideanDistance` specialisation components wrapping the computation of the distance matrix chunks. @@ -147,7 +147,9 @@ cdef class MiddleTermComputer{{name_suffix}}: @classmethod def unpack_csr_matrix(cls, X: csr_matrix): """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + print("DEBUG before casting", X.data.dtype) + X_data = np.asarray(X.data, dtype=DTYPE) + print("DEBUG after casting", X_data.dtype) X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) return X_data, X_indices, X_indptr @@ -398,7 +400,7 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam dist_middle_terms = - 2 X_c_i.Y_c_j^T - The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_{{name_suffix}}. + The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. This routine iterates over the data, indices and indptr arrays of the sparse matrices without densifying them. """ @@ -466,7 +468,7 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam self.dist_middle_terms_chunks[thread_num].data() ) - _middle_term_sparse_sparse_{{name_suffix}}( + _middle_term_sparse_sparse_64( self.X_data, self.X_indices, self.X_indptr, @@ -482,4 +484,5 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam return dist_middle_terms + {{endfor}} From 2df86bc3f28c0a5fa49b3b69e982a9415df155d1 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 28 Oct 2022 19:50:41 +0200 Subject: [PATCH 20/27] branch sparse_sqeuclidean_row_norm32 to sparse_sqeuclidean_row_norm64 to avoid overflow --- .../_pairwise_distances_reduction/_base.pyx.tp | 18 +++++++++--------- .../_middle_term_computer.pyx.tp | 2 -- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 886245598c3f9..1a5edc417834e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -103,13 +103,8 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense( return squared_row_norms -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} - -from ._datasets_pair cimport DatasetsPair{{name_suffix}} - - -cdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( - const {{INPUT_DTYPE_t}}[:] X_data, +cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse( + const DTYPE_t[:] X_data, const SPARSE_INDEX_TYPE_t[:] X_indptr, ITYPE_t num_threads, ): @@ -127,14 +122,19 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}_sparse( return squared_row_norms +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( X, ITYPE_t num_threads, ): if issparse(X): - X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + X_data = np.asarray(X.data, dtype=DTYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - return _sqeuclidean_row_norms{{name_suffix}}_sparse(X_data, X_indptr, num_threads) + return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads) else: return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 49117c5401cb4..ecb259b6aaa38 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -147,9 +147,7 @@ cdef class MiddleTermComputer{{name_suffix}}: @classmethod def unpack_csr_matrix(cls, X: csr_matrix): """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - print("DEBUG before casting", X.data.dtype) X_data = np.asarray(X.data, dtype=DTYPE) - print("DEBUG after casting", X_data.dtype) X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) return X_data, X_indices, X_indptr From c0675acc9fca4934161e58179c661bcb76dfbf56 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 4 Nov 2022 12:08:24 +0100 Subject: [PATCH 21/27] remove unused variables --- sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 2fb82029632d9..e4aef3c982589 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -26,12 +26,12 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np -from scipy.sparse import issparse, csr_matrix +from scipy.sparse import issparse from numbers import Integral from sklearn import get_config from sklearn.utils import check_scalar from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils._typedefs import ITYPE, DTYPE, SPARSE_INDEX_TYPE +from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE cnp.import_array() @@ -111,7 +111,6 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse( cdef: ITYPE_t n = X_indptr.shape[0] - 1 ITYPE_t idx = 0 - ITYPE_t thread_num ITYPE_t X_i_ptr DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE) @@ -156,7 +155,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: strategy=None, ): cdef: - ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks + ITYPE_t X_n_full_chunks, Y_n_full_chunks if chunk_size is None: chunk_size = get_config().get("pairwise_dist_chunk_size", 256) From 366548e68c024bfdfcfdca3fc33dbf58553cabce Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 4 Nov 2022 15:34:04 +0100 Subject: [PATCH 22/27] some suggestions from Maurice --- .../_middle_term_computer.pyx.tp | 23 ++++++-- .../_pairwise_distances_reduction/setup.py | 55 ------------------- 2 files changed, 18 insertions(+), 60 deletions(-) delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/setup.py diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 7c9112f247d68..083c1366bf554 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -27,17 +27,25 @@ from ...utils._cython_blas cimport ( ) from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t -import numpy as np -from scipy.sparse import issparse, csr_matrix -from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE - # TODO: change for `libcpp.algorithm.fill` once Cython 3 is used # Introduction in Cython: +# # https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa cdef extern from "" namespace "std" nogil: void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa +import numpy as np +from scipy.sparse import issparse, csr_matrix +from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE +# TODO: If possible optimize this routine to efficiently treat cases where +# `n_samples_X << n_samples_Y` met in practise when X_test consists of a +# few samples, and thus when there's a single chunk of X whose number of +# samples is less that the default chunk size. + +# TODO: compare this routine with the similar ones in SciPy, especially +# `csr_matmat` which might implement a better algorithm. +# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669 # noqa cdef void _middle_term_sparse_sparse_64( const DTYPE_t[:] X_data, const SPARSE_INDEX_TYPE_t[:] X_indices, @@ -51,6 +59,9 @@ cdef void _middle_term_sparse_sparse_64( ITYPE_t Y_end, DTYPE_t * D, ) nogil: + # This routine assumes that D points to the first element of a + # zeroed buffer of length at least equal to n_X × n_Y, conceptually + # representing a 2-d C-ordered array. cdef: ITYPE_t i, j, k ITYPE_t n_X = X_end - X_start @@ -141,7 +152,9 @@ cdef class MiddleTermComputer{{name_suffix}}: chunk_size, ) - raise NotImplementedError("X and Y must be both sparse or dense") + raise NotImplementedError( + "Both X and Y must be CSR sparse matrices xor numpy arrays." + ) @classmethod diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py deleted file mode 100644 index 4c8632ebb9088..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/setup.py +++ /dev/null @@ -1,55 +0,0 @@ -import os - -import numpy as np -from numpy.distutils.misc_util import Configuration - -from sklearn._build_utils import gen_from_templates - - -def configuration(parent_package="", top_path=None): - config = Configuration("_pairwise_distances_reduction", parent_package, top_path) - libraries = [] - if os.name == "posix": - libraries.append("m") - - templates = [ - "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp", - "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp", - "sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp", - "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp", - "sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp", - "sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp", - ] - - gen_from_templates(templates) - - cython_sources = [ - "_datasets_pair.pyx", - "_middle_term_computer.pyx", - "_base.pyx", - "_argkmin.pyx", - "_radius_neighbors.pyx", - ] - - for source_file in cython_sources: - private_extension_name = source_file.replace(".pyx", "") - config.add_extension( - name=private_extension_name, - sources=[source_file], - include_dirs=[np.get_include()], - language="c++", - libraries=libraries, - extra_compile_args=["-std=c++11"], - ) - - return config - - -if __name__ == "__main__": - from numpy.distutils.core import setup - - setup(**configuration().todict()) From 8ee5a294023e95e435e1b737c12e21cdc59abf74 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 7 Nov 2022 19:41:18 +0100 Subject: [PATCH 23/27] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 5 ++--- sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 3 +-- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 7 ++++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 078c4d4fd8d5e..a69d7643738ad 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -66,9 +66,8 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): and not (issparse(X) ^ issparse(Y)) ): # Specialized implementation of ArgKmin for the Euclidean distance. - # This implementation processes the computation of the chunks of the - # distance using a decomposition of the chunks of the distance matrix - # for the Squared Euclidean distance. + # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. # This specialisation has an improved arithmetic intensity for both # the dense and sparse settings, allowing in most case speed-ups of # several orders of magnitude compared to the generic ArgKmin diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index e4aef3c982589..932b8f9006d4e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -110,8 +110,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse( ): cdef: ITYPE_t n = X_indptr.shape[0] - 1 - ITYPE_t idx = 0 - ITYPE_t X_i_ptr + SPARSE_INDEX_TYPE_t X_i_ptr, idx = 0 DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE) for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c08fb5b898751..04542f8b18ff7 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -564,9 +564,10 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) - assert BaseDistancesReductionDispatcher.is_usable_for( - X_csr, Y_csr, metric="euclidean" - ) + # TODO: add this assertion once #24542 and #24715 are merged. + # assert BaseDistancesReductionDispatcher.is_usable_for( + # X_csr, Y_csr, metric="euclidean" + # ) # CSR matrices without non-zeros elements aren't currently supported # TODO: support CSR matrices without non-zeros elements From 8d814fa3c2da326dfb22cc494e4e9711ab623188 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 8 Nov 2022 21:16:11 +0100 Subject: [PATCH 24/27] Update sklearn/metrics/tests/test_pairwise_distances_reduction.py Co-authored-by: Julien Jerphanion --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 04542f8b18ff7..299a3d57ca1f5 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -564,7 +564,7 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) - # TODO: add this assertion once #24542 and #24715 are merged. + # TODO: add this assertion once #24542 and #24715 are merged. # assert BaseDistancesReductionDispatcher.is_usable_for( # X_csr, Y_csr, metric="euclidean" # ) From 6fa803fcd6aeaddc0cfc74a84419d0ff33ced69e Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 16 Nov 2022 15:53:29 +0100 Subject: [PATCH 25/27] Update sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp Co-authored-by: Julien Jerphanion --- sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 932b8f9006d4e..1b2a8a31fb679 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -130,6 +130,8 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}( ITYPE_t num_threads, ): if issparse(X): + # TODO: remove this instruction which is a cast in the float32 case + # by moving squared row norms computations in MiddleTermComputer. X_data = np.asarray(X.data, dtype=DTYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads) From c08a5155ac389a657abb9765a3c17999f85c5d03 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 18 Nov 2022 15:14:38 +0100 Subject: [PATCH 26/27] apply suggestions :) --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 5 +++-- .../metrics/_pairwise_distances_reduction/_dispatcher.py | 4 +++- .../_middle_term_computer.pyx.tp | 7 ++++--- .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 4 ++-- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 7 +++---- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index a69d7643738ad..ad4e1c32acc70 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -63,9 +63,10 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): """ if ( metric in ("euclidean", "sqeuclidean") - and not (issparse(X) ^ issparse(Y)) + and not (issparse(X) ^ issparse(Y)) # "^" is the XOR operator ): - # Specialized implementation of ArgKmin for the Euclidean distance. + # Specialized implementation of ArgKmin for the Euclidean distance + # for the dense-dense and sparse-sparse cases. # This implementation computes the distances by chunk using # a decomposition of the Squared Euclidean distance. # This specialisation has an improved arithmetic intensity for both diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 921074c3ae968..eb2979864f02d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -130,8 +130,10 @@ def is_valid_sparse_matrix(X): # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa # TODO: implement specialisation for (sq)euclidean on fused sparse-dense # using sparse-dense routines for matrix-vector multiplications. + # Currently, only dense-dense and sparse-sparse are optimized for + # the Euclidean case. fused_sparse_dense_euclidean_case_guard = not ( - (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y)) + (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y)) # "^" is XOR and isinstance(metric, str) and "euclidean" in metric ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 083c1366bf554..3363eb9524263 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -153,7 +153,7 @@ cdef class MiddleTermComputer{{name_suffix}}: ) raise NotImplementedError( - "Both X and Y must be CSR sparse matrices xor numpy arrays." + "X and Y must be both CSR sparse matrices or both numpy arrays." ) @@ -406,8 +406,9 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): - """Computes the middle term of the Euclidean distance between two chunked CSR matrices X_c and Y_c - and returns the result as a contiguous array. + """Middle term of the Euclidean distance between two chunked CSR matrices. + + The result is return as a contiguous array. dist_middle_terms = - 2 X_c_i.Y_c_j^T diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 36516af162b60..a498c6cd9856e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -84,10 +84,10 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) """ if ( metric in ("euclidean", "sqeuclidean") - and not (issparse(X) ^ issparse(Y)) + and not (issparse(X) ^ issparse(Y)) # "^" is XOR ): # Specialized implementation of RadiusNeighbors for the Euclidean - # distance. + # distance for the dense-dense and sparse-sparse cases. # This implementation computes the distances by chunk using # a decomposition of the Squared Euclidean distance. # This specialisation has an improved arithmetic intensity for both diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 299a3d57ca1f5..c08fb5b898751 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -564,10 +564,9 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) - # TODO: add this assertion once #24542 and #24715 are merged. - # assert BaseDistancesReductionDispatcher.is_usable_for( - # X_csr, Y_csr, metric="euclidean" - # ) + assert BaseDistancesReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="euclidean" + ) # CSR matrices without non-zeros elements aren't currently supported # TODO: support CSR matrices without non-zeros elements From 001fa5b108c9e440de26f25f131a4585ffebfd3b Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 18 Nov 2022 18:36:02 +0100 Subject: [PATCH 27/27] Fu... sion! --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c08fb5b898751..c334087c65448 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -561,7 +561,7 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y, metric="euclidean" ) - assert not BaseDistancesReductionDispatcher.is_usable_for( + assert BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) assert BaseDistancesReductionDispatcher.is_usable_for(