From 4ffe1c56f131bcb3d456a0275efb35294e3e47f3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 9 Aug 2022 13:31:37 -0400 Subject: [PATCH 1/6] MNT Use float64_t and intp_t directly in Cython --- sklearn/feature_extraction/_hashing_fast.pyx | 7 +- .../_argkmin.pxd | 16 +- .../_argkmin.pyx | 148 +++++++++--------- .../_pairwise_distances_reduction/_base.pxd | 73 +++++---- .../_pairwise_distances_reduction/_base.pyx | 84 +++++----- .../_datasets_pair.pxd | 16 +- .../_datasets_pair.pyx | 17 +- .../_radius_neighborhood.pxd | 36 ++--- .../_radius_neighborhood.pyx | 125 ++++++++------- sklearn/utils/_vector_sentinel.pxd | 9 +- sklearn/utils/_vector_sentinel.pyx | 31 ++-- 11 files changed, 274 insertions(+), 288 deletions(-) diff --git a/sklearn/feature_extraction/_hashing_fast.pyx b/sklearn/feature_extraction/_hashing_fast.pyx index 48dbd928a03d3..34be67e166dea 100644 --- a/sklearn/feature_extraction/_hashing_fast.pyx +++ b/sklearn/feature_extraction/_hashing_fast.pyx @@ -9,7 +9,6 @@ from libcpp.vector cimport vector cimport numpy as cnp import numpy as np -from ..utils._typedefs cimport INT32TYPE_t, INT64TYPE_t from ..utils.murmurhash cimport murmurhash3_bytes_s32 from ..utils._vector_sentinel cimport vector_to_nd_array @@ -27,11 +26,11 @@ def transform(raw_X, Py_ssize_t n_features, dtype, For constructing a scipy.sparse.csr_matrix. """ - cdef INT32TYPE_t h + cdef cnp.int32_t h cdef double value - cdef vector[INT32TYPE_t] indices - cdef vector[INT64TYPE_t] indptr + cdef vector[cnp.int32_t] indices + cdef vector[cnp.int64_t] indptr indptr.push_back(0) # Since Python array does not understand Numpy dtypes, we grow the indices diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd index 34d3339e1c9e0..1758eb4690dac 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd @@ -5,29 +5,27 @@ from ._base cimport ( ) from ._gemm_term_computer cimport GEMMTermComputer64 -from ...utils._typedefs cimport ITYPE_t, DTYPE_t - cnp.import_array() cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): """64bit implementation of PairwiseDistancesArgKmin.""" cdef: - ITYPE_t k + cnp.intp_t k - ITYPE_t[:, ::1] argkmin_indices - DTYPE_t[:, ::1] argkmin_distances + cnp.intp_t[:, ::1] argkmin_indices + cnp.float64_t[:, ::1] argkmin_distances # Used as array of pointers to private datastructures used in threads. 
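An aside on the rename this patch applies throughout: judging by the patch title and the strictly one-to-one substitutions below, ITYPE_t and DTYPE_t were thin aliases from sklearn/utils/_typedefs for the index and distance types. A minimal sketch of the mapping (alias definitions paraphrased, not quoted from the repository):

    cimport numpy as cnp

    # ctypedef cnp.intp_t    ITYPE_t   ->  spell cnp.intp_t directly (indices)
    # ctypedef cnp.float64_t DTYPE_t   ->  spell cnp.float64_t directly (distances)
    cdef cnp.intp_t sample_idx = 0     # an index into X or Y
    cdef cnp.float64_t dist = 0.0      # a 64-bit distance value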
- DTYPE_t ** heaps_r_distances_chunks - ITYPE_t ** heaps_indices_chunks + cnp.float64_t ** heaps_r_distances_chunks + cnp.intp_t ** heaps_indices_chunks cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesArgKmin.""" cdef: GEMMTermComputer64 gemm_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared + const cnp.float64_t[::1] X_norm_squared + const cnp.float64_t[::1] Y_norm_squared bint use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx index 2f378543e1f97..bbb7300518295 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -19,7 +19,6 @@ from ._gemm_term_computer cimport GEMMTermComputer64 from ...utils._heap cimport heap_push from ...utils._sorting cimport simultaneous_sort -from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np import warnings @@ -28,7 +27,6 @@ from numbers import Integral from scipy.sparse import issparse from sklearn.utils import check_scalar, _in_unstable_openblas_configuration from sklearn.utils.fixes import threadpool_limits -from ...utils._typedefs import ITYPE, DTYPE cnp.import_array() @@ -41,7 +39,7 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): cls, X, Y, - ITYPE_t k, + cnp.intp_t k, str metric="euclidean", chunk_size=None, dict metric_kwargs=None, @@ -103,7 +101,7 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): DatasetsPair datasets_pair, chunk_size=None, strategy=None, - ITYPE_t k=1, + cnp.intp_t k=1, ): super().__init__( datasets_pair=datasets_pair, @@ -121,16 +119,16 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): # - when parallelizing on Y, the pointers of those heaps are referencing # small heaps which are thread-wise-allocated and whose content will be # merged with the main heaps'. - self.heaps_r_distances_chunks = malloc( - sizeof(DTYPE_t *) * self.chunks_n_threads + self.heaps_r_distances_chunks = malloc( + sizeof(cnp.float64_t *) * self.chunks_n_threads ) - self.heaps_indices_chunks = malloc( - sizeof(ITYPE_t *) * self.chunks_n_threads + self.heaps_indices_chunks = malloc( + sizeof(cnp.intp_t *) * self.chunks_n_threads ) # Main heaps which will be returned as results by `PairwiseDistancesArgKmin64.compute`. 
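A note on the initialisation that follows: the distance heaps are filled with DBL_MAX so that every real distance pushed later displaces a sentinel, which removes any need for an empty-heap special case. In NumPy terms the initial state is roughly (illustrative sizes only):

    import numpy as np

    n_samples_X, k = 100, 5                       # example sizes
    # np.finfo(np.float64).max is DBL_MAX, the sentinel value used below
    argkmin_distances = np.full((n_samples_X, k), np.finfo(np.float64).max)
    argkmin_indices = np.zeros((n_samples_X, k), dtype=np.intp)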
- self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) - self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=np.intp) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.float64) def __dealloc__(self): if self.heaps_indices_chunks is not NULL: @@ -141,18 +139,18 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): cdef void _compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: cdef: - ITYPE_t i, j - ITYPE_t n_samples_X = X_end - X_start - ITYPE_t n_samples_Y = Y_end - Y_start - DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] - ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + cnp.intp_t i, j + cnp.intp_t n_samples_X = X_end - X_start + cnp.intp_t n_samples_Y = Y_end - Y_start + cnp.float64_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + cnp.intp_t *heaps_indices = self.heaps_indices_chunks[thread_num] # Pushing the distances and their associated indices on a heap # which by construction will keep track of the argkmin. @@ -168,9 +166,9 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): cdef void _parallel_on_X_init_chunk( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set each # thread's heaps pointer to the proper position on the main heaps. @@ -180,12 +178,12 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): @final cdef void _parallel_on_X_prange_iter_finalize( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: cdef: - ITYPE_t idx, jdx + cnp.intp_t idx, jdx # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. @@ -201,8 +199,8 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): ) nogil: cdef: # Maximum number of scalar elements (the last chunks can be smaller) - ITYPE_t heaps_size = self.X_n_samples_chunk * self.k - ITYPE_t thread_num + cnp.intp_t heaps_size = self.X_n_samples_chunk * self.k + cnp.intp_t thread_num # The allocation is done in parallel for data locality purposes: this way # the heaps used in each threads are allocated in pages which are closer @@ -214,18 +212,18 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): # As chunks of X are shared across threads, so must their # heaps. To solve this, each thread has its own heaps # which are then synchronised back in the main ones. 
- self.heaps_r_distances_chunks[thread_num] = malloc( - heaps_size * sizeof(DTYPE_t) + self.heaps_r_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(cnp.float64_t) ) - self.heaps_indices_chunks[thread_num] = malloc( - heaps_size * sizeof(ITYPE_t) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(cnp.intp_t) ) cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): @@ -235,11 +233,11 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): @final cdef void _parallel_on_Y_synchronize( self, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: cdef: - ITYPE_t idx, jdx, thread_num + cnp.intp_t idx, jdx, thread_num with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. # This is done in parallel sample-wise (no need for locks). @@ -263,7 +261,7 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): self, ) nogil: cdef: - ITYPE_t idx, thread_num + cnp.intp_t idx, thread_num with nogil, parallel(num_threads=self.chunks_n_threads): # Deallocating temporary datastructures @@ -283,9 +281,9 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): cdef void compute_exact_distances(self) nogil: cdef: - ITYPE_t i, j - ITYPE_t[:, ::1] Y_indices = self.argkmin_indices - DTYPE_t[:, ::1] distances = self.argkmin_distances + cnp.intp_t i, j + cnp.intp_t[:, ::1] Y_indices = self.argkmin_indices + cnp.float64_t[:, ::1] distances = self.argkmin_distances for i in prange(self.n_samples_X, schedule='static', nogil=True, num_threads=self.effective_n_threads): for j in range(self.k): @@ -320,7 +318,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): self, X, Y, - ITYPE_t k, + cnp.intp_t k, bint use_squared_distances=False, chunk_size=None, strategy=None, @@ -350,7 +348,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): DenseDenseDatasetsPair datasets_pair = ( self.datasets_pair ) - ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + cnp.intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk self.gemm_term_computer = GEMMTermComputer64( datasets_pair.X, @@ -382,7 +380,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): @final cdef void _parallel_on_X_parallel_init( self, - ITYPE_t thread_num, + cnp.intp_t thread_num, ) nogil: PairwiseDistancesArgKmin64._parallel_on_X_parallel_init(self, thread_num) self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) @@ -391,9 +389,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): @final cdef void _parallel_on_X_init_chunk( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: PairwiseDistancesArgKmin64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) @@ -402,11 +400,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): @final cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - 
ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: PairwiseDistancesArgKmin64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, @@ -423,7 +421,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): cdef void _parallel_on_Y_init( self, ) nogil: - cdef ITYPE_t thread_num + cdef cnp.intp_t thread_num PairwiseDistancesArgKmin64._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() @@ -431,9 +429,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): @final cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: PairwiseDistancesArgKmin64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) @@ -442,11 +440,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): @final cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: PairwiseDistancesArgKmin64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( self, @@ -462,22 +460,22 @@ cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): @final cdef void _compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + cnp.intp_t i, j + cnp.float64_t squared_dist_i_j + cnp.intp_t n_X = X_end - X_start + cnp.intp_t n_Y = Y_end - Y_start + cnp.float64_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num ) - DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] - ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num] + cnp.float64_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + cnp.intp_t * heaps_indices = self.heaps_indices_chunks[thread_num] # Pushing the distance and their associated indices on heaps diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd index 9f6ad45cb839a..297423b47b4fa 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd @@ -3,14 +3,13 @@ cimport numpy as cnp from cython cimport final from ._datasets_pair cimport DatasetsPair -from ...utils._typedefs cimport ITYPE_t, DTYPE_t cnp.import_array() -cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( - const DTYPE_t[:, ::1] X, - ITYPE_t num_threads, +cpdef cnp.float64_t[::1] _sqeuclidean_row_norms64( + const cnp.float64_t[:, ::1] X, + cnp.intp_t num_threads, ) cdef class PairwiseDistancesReduction64: @@ -32,13 +31,13 @@ cdef class PairwiseDistancesReduction64: # # chunks_n_threads <= effective_n_threads # - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads + 
cnp.intp_t effective_n_threads + cnp.intp_t chunks_n_threads - ITYPE_t n_samples_chunk, chunk_size + cnp.intp_t n_samples_chunk, chunk_size - ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk - ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk + cnp.intp_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + cnp.intp_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk bint execute_in_parallel_on_Y @@ -52,11 +51,11 @@ cdef class PairwiseDistancesReduction64: cdef void _compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil @@ -66,35 +65,35 @@ cdef class PairwiseDistancesReduction64: cdef void _parallel_on_X_parallel_init( self, - ITYPE_t thread_num, + cnp.intp_t thread_num, ) nogil cdef void _parallel_on_X_init_chunk( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil cdef void _parallel_on_X_prange_iter_finalize( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil cdef void _parallel_on_X_parallel_finalize( self, - ITYPE_t thread_num + cnp.intp_t thread_num ) nogil cdef void _parallel_on_Y_init( @@ -103,24 +102,24 @@ cdef class PairwiseDistancesReduction64: cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil cdef void _parallel_on_Y_synchronize( self, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil cdef void _parallel_on_Y_finalize( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx index 07506e3616a74..cb07cba7f615b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx @@ -8,20 +8,18 @@ from cython.parallel cimport parallel, prange from ._datasets_pair cimport DatasetsPair from ...utils._cython_blas cimport _dot from ...utils._openmp_helpers cimport _openmp_thread_num -from ...utils._typedefs cimport ITYPE_t, DTYPE_t from numbers import Integral from sklearn.utils import check_scalar from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils._typedefs import ITYPE, DTYPE cnp.import_array() ##################### -cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( - const DTYPE_t[:, ::1] X, - ITYPE_t num_threads, +cpdef cnp.float64_t[::1] _sqeuclidean_row_norms64( + const cnp.float64_t[:, ::1] X, + cnp.intp_t num_threads, ): """Compute the squared euclidean norm of the rows of X in parallel. 
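For intuition, the helper declared above computes in parallel, via BLAS dot products, what the following NumPy one-liner computes serially (an equivalence sketch, not code from the patch):

    import numpy as np

    X = np.random.rand(4, 3)                         # any C-contiguous float64 array
    squared_row_norms = np.einsum("ij,ij->i", X, X)  # matches _sqeuclidean_row_norms64(X, num_threads)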
@@ -32,11 +30,11 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' # const qualifier. # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * X_ptr = &X[0, 0] - ITYPE_t idx = 0 - ITYPE_t n = X.shape[0] - ITYPE_t d = X.shape[1] - DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) + cnp.float64_t * X_ptr = &X[0, 0] + cnp.intp_t idx = 0 + cnp.intp_t n = X.shape[0] + cnp.intp_t d = X.shape[1] + cnp.float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64) for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) @@ -54,7 +52,7 @@ cdef class PairwiseDistancesReduction64: strategy=None, ): cdef: - ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks + cnp.intp_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks if chunk_size is None: chunk_size = get_config().get("pairwise_dist_chunk_size", 256) @@ -128,8 +126,8 @@ cdef class PairwiseDistancesReduction64: interact with those datastructures at various stages. """ cdef: - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t thread_num + cnp.intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + cnp.intp_t thread_num with nogil, parallel(num_threads=self.chunks_n_threads): thread_num = _openmp_thread_num() @@ -197,8 +195,8 @@ cdef class PairwiseDistancesReduction64: interact with those datastructures at various stages. """ cdef: - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t thread_num + cnp.intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + cnp.intp_t thread_num # Allocating datastructures shared by all threads self._parallel_on_Y_init() @@ -255,11 +253,11 @@ cdef class PairwiseDistancesReduction64: cdef void _compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: """Compute the pairwise distances on two chunks of X and Y and reduce them. @@ -284,27 +282,27 @@ cdef class PairwiseDistancesReduction64: cdef void _parallel_on_X_parallel_init( self, - ITYPE_t thread_num, + cnp.intp_t thread_num, ) nogil: """Allocate datastructures used in a thread given its number.""" return cdef void _parallel_on_X_init_chunk( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: """Initialise datastructures used in a thread given its number.""" return cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. 
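Because this base class is mostly template-method hooks, a condensed sketch of their call order may help; this paraphrases the _parallel_on_X driver shown earlier in this file, with the chunk bookkeeping elided:

    with nogil, parallel(num_threads=self.chunks_n_threads):
        thread_num = _openmp_thread_num()
        self._parallel_on_X_parallel_init(thread_num)
        for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
            # X_start/X_end derived from X_chunk_idx
            self._parallel_on_X_init_chunk(thread_num, X_start, X_end)
            for Y_chunk_idx in range(self.Y_n_chunks):
                # Y_start/Y_end derived from Y_chunk_idx
                self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
                    X_start, X_end, Y_start, Y_end, thread_num,
                )
                self._compute_and_reduce_distances_on_chunks(
                    X_start, X_end, Y_start, Y_end, thread_num,
                )
            self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end)
        self._parallel_on_X_parallel_finalize(thread_num)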
@@ -314,16 +312,16 @@ cdef class PairwiseDistancesReduction64: cdef void _parallel_on_X_prange_iter_finalize( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: """Interact with datastructures after a reduction on chunks.""" return cdef void _parallel_on_X_parallel_finalize( self, - ITYPE_t thread_num + cnp.intp_t thread_num ) nogil: """Interact with datastructures after executing all the reductions.""" return @@ -336,20 +334,20 @@ cdef class PairwiseDistancesReduction64: cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: """Initialise datastructures used in a thread given its number.""" return cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. @@ -359,8 +357,8 @@ cdef class PairwiseDistancesReduction64: cdef void _parallel_on_Y_synchronize( self, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: """Update thread datastructures before leaving a parallel region.""" return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd index de6458f8c6f26..e8838c546205a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd @@ -1,21 +1,21 @@ -from ...utils._typedefs cimport DTYPE_t, ITYPE_t +cimport numpy as cnp from ...metrics._dist_metrics cimport DistanceMetric cdef class DatasetsPair: cdef DistanceMetric distance_metric - cdef ITYPE_t n_samples_X(self) nogil + cdef cnp.intp_t n_samples_X(self) nogil - cdef ITYPE_t n_samples_Y(self) nogil + cdef cnp.intp_t n_samples_Y(self) nogil - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil cdef class DenseDenseDatasetsPair(DatasetsPair): cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y - ITYPE_t d + const cnp.float64_t[:, ::1] X + const cnp.float64_t[:, ::1] Y + cnp.intp_t d diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx index abef1bed098ed..3ea6c0d5ac17f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx @@ -4,7 +4,6 @@ cimport numpy as cnp from cython cimport final from scipy.sparse import issparse -from ...utils._typedefs cimport DTYPE_t, ITYPE_t from ...metrics._dist_metrics cimport DistanceMetric cnp.import_array() @@ -100,24 +99,24 @@ cdef class DatasetsPair: def __init__(self, DistanceMetric distance_metric): self.distance_metric = distance_metric - cdef ITYPE_t n_samples_X(self) nogil: + cdef cnp.intp_t n_samples_X(self) nogil: """Number of samples in X.""" # This is an abstract method. # This _must_ always be overwritten in subclasses.
# TODO: add "with gil: raise" here when supporting Cython 3.0 return -999 - cdef ITYPE_t n_samples_Y(self) nogil: + cdef cnp.intp_t n_samples_Y(self) nogil: """Number of samples in Y.""" # This is a abstract method. # This _must_ always be overwritten in subclasses. # TODO: add "with gil: raise" here when supporting Cython 3.0 return -999 - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.dist(i, j) - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: # This is a abstract method. # This _must_ always be overwritten in subclasses. # TODO: add "with gil: raise" here when supporting Cython 3.0 @@ -148,17 +147,17 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): self.d = X.shape[1] @final - cdef ITYPE_t n_samples_X(self) nogil: + cdef cnp.intp_t n_samples_X(self) nogil: return self.X.shape[0] @final - cdef ITYPE_t n_samples_Y(self) nogil: + cdef cnp.intp_t n_samples_Y(self) nogil: return self.Y.shape[0] @final - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d) @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.d) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd index 737e6888a8a55..6d043db0fa096 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd @@ -9,25 +9,23 @@ from ._base cimport ( ) from ._gemm_term_computer cimport GEMMTermComputer64 -from ...utils._typedefs cimport ITYPE_t, DTYPE_t - cnp.import_array() ###################### ## std::vector to np.ndarray coercion # As type covariance is not supported for C++ containers via Cython, # we need to redefine fused types. -ctypedef fused vector_DITYPE_t: - vector[ITYPE_t] - vector[DTYPE_t] +ctypedef fused vector_INTP_FLOAT64_t: + vector[cnp.intp_t] + vector[cnp.float64_t] -ctypedef fused vector_vector_DITYPE_t: - vector[vector[ITYPE_t]] - vector[vector[DTYPE_t]] +ctypedef fused vector_vector_INTP_FLOAT64_t: + vector[vector[cnp.intp_t]] + vector[vector[cnp.float64_t]] cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( - shared_ptr[vector_vector_DITYPE_t] vecs + shared_ptr[vector_vector_INTP_FLOAT64_t] vecs ) ##################### @@ -36,13 +34,13 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): """64bit implementation of PairwiseDistancesRadiusNeighborhood .""" cdef: - DTYPE_t radius + cnp.float64_t radius # DistanceMetric compute rank-preserving surrogate distance via rdist # which are proxies necessitating less computations. # We get the equivalent for the radius to be able to compare it against # vectors' rank-preserving surrogate distances. - DTYPE_t r_radius + cnp.float64_t r_radius # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. # @@ -62,20 +60,20 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): # Shared pointers (defined via shared_ptr) are use for safer memory management. 
# Unique pointers (defined via unique_ptr) can't be used as datastructures # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. - shared_ptr[vector[vector[ITYPE_t]]] neigh_indices - shared_ptr[vector[vector[DTYPE_t]]] neigh_distances + shared_ptr[vector[vector[cnp.intp_t]]] neigh_indices + shared_ptr[vector[vector[cnp.float64_t]]] neigh_distances # Used as array of pointers to private datastructures used in threads. - vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks - vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks + vector[shared_ptr[vector[vector[cnp.intp_t]]]] neigh_indices_chunks + vector[shared_ptr[vector[vector[cnp.float64_t]]]] neigh_distances_chunks bint sort_results @final cdef void _merge_vectors( self, - ITYPE_t idx, - ITYPE_t num_threads, + cnp.intp_t idx, + cnp.intp_t num_threads, ) nogil @@ -83,7 +81,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesRadiusNeighborhood.""" cdef: GEMMTermComputer64 gemm_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared + const cnp.float64_t[::1] X_norm_squared + const cnp.float64_t[::1] Y_norm_squared bint use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx index db2c22e89d06d..84408ad69217f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx @@ -21,7 +21,6 @@ from ._datasets_pair cimport ( from ._gemm_term_computer cimport GEMMTermComputer64 from ...utils._sorting cimport simultaneous_sort -from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._vector_sentinel cimport vector_to_nd_array from numbers import Real @@ -40,11 +39,11 @@ cdef extern from "" namespace "std" nogil: ###################### cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( - shared_ptr[vector_vector_DITYPE_t] vecs + shared_ptr[vector_vector_INTP_FLOAT64_t] vecs ): """Coerce a std::vector of std::vector to a ndarray of ndarray.""" cdef: - ITYPE_t n = deref(vecs).size() + cnp.intp_t n = deref(vecs).size() cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) for i in range(n): @@ -63,7 +62,7 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): cls, X, Y, - DTYPE_t radius, + cnp.float64_t radius, str metric="euclidean", chunk_size=None, dict metric_kwargs=None, @@ -128,7 +127,7 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): def __init__( self, DatasetsPair datasets_pair, - DTYPE_t radius, + cnp.float64_t radius, chunk_size=None, strategy=None, sort_results=False, @@ -153,29 +152,29 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): # - when parallelizing on Y, the pointers of those heaps are referencing # std::vectors of std::vectors which are thread-wise-allocated and whose # content will be merged into self.neigh_distances and self.neigh_indices. 
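As an aside on the shared_ptr-wrapped vectors declared above: they are created with make_shared and manipulated through cython.operator.dereference, the same pattern the .pyx hunks below use. A self-contained sketch (same libcpp imports as these files):

    from cython.operator cimport dereference as deref
    from libcpp.memory cimport shared_ptr, make_shared
    from libcpp.vector cimport vector
    cimport numpy as cnp

    # One inner vector per query sample, reference-counted so that several
    # threads can safely share ownership.
    cdef shared_ptr[vector[vector[cnp.intp_t]]] neigh = (
        make_shared[vector[vector[cnp.intp_t]]](10)
    )
    deref(neigh)[0].push_back(42)   # record neighbor 42 for query sample 0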
- self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]]( + self.neigh_distances_chunks = vector[shared_ptr[vector[vector[cnp.float64_t]]]]( self.chunks_n_threads ) - self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]]( + self.neigh_indices_chunks = vector[shared_ptr[vector[vector[cnp.intp_t]]]]( self.chunks_n_threads ) # Temporary datastructures which will be coerced to numpy arrays just before # PairwiseDistancesRadiusNeighborhood.compute returns and will then be freed. - self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) + self.neigh_distances = make_shared[vector[vector[cnp.float64_t]]](self.n_samples_X) + self.neigh_indices = make_shared[vector[vector[cnp.intp_t]]](self.n_samples_X) cdef void _compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: cdef: - ITYPE_t i, j - DTYPE_t r_dist_i_j + cnp.intp_t i, j + cnp.float64_t r_dist_i_j for i in range(X_start, X_end): for j in range(Y_start, Y_end): @@ -198,9 +197,9 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): cdef void _parallel_on_X_init_chunk( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the # thread vectors' pointers to the main vectors'. @final cdef void _parallel_on_X_prange_iter_finalize( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: cdef: - ITYPE_t idx, jdx + cnp.intp_t idx, jdx # Sorting neighbors for each query vector of X if self.sort_results: @@ -231,24 +230,24 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): self, ) nogil: cdef: - ITYPE_t thread_num + cnp.intp_t thread_num # As chunks of X are shared across threads, so must the datastructures be to avoid race # conditions: each thread has its own vectors of n_samples_X vectors which are # then merged back in the main n_samples_X vectors. for thread_num in range(self.chunks_n_threads): - self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) + self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[cnp.float64_t]]](self.n_samples_X) + self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[cnp.intp_t]]](self.n_samples_X) @final cdef void _merge_vectors( self, - ITYPE_t idx, - ITYPE_t num_threads, + cnp.intp_t idx, + cnp.intp_t num_threads, ) nogil: cdef: - ITYPE_t thread_num - ITYPE_t idx_n_elements = 0 - ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() + cnp.intp_t thread_num + cnp.intp_t idx_n_elements = 0 + cnp.intp_t last_element_idx = deref(self.neigh_indices)[idx].size() # Resizing buffers only once for the given number of elements.
for thread_num in range(num_threads): @@ -277,7 +276,7 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): self, ) nogil: cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + cnp.intp_t idx, jdx, thread_num, idx_n_element, idx_current with nogil, parallel(num_threads=self.effective_n_threads): # Merge vectors used in threads into the main ones. @@ -304,7 +303,7 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): cdef void compute_exact_distances(self) nogil: """Convert rank-preserving distances to pairwise distances in parallel.""" cdef: - ITYPE_t i, j + cnp.intp_t i, j for i in prange(self.n_samples_X, nogil=True, schedule='static', num_threads=self.effective_n_threads): @@ -329,7 +328,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR self, X, Y, - DTYPE_t radius, + cnp.float64_t radius, bint use_squared_distances=False, chunk_size=None, strategy=None, @@ -360,7 +359,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair cdef: DenseDenseDatasetsPair datasets_pair = self.datasets_pair - ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + cnp.intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk self.gemm_term_computer = GEMMTermComputer64( datasets_pair.X, @@ -392,7 +391,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR @final cdef void _parallel_on_X_parallel_init( self, - ITYPE_t thread_num, + cnp.intp_t thread_num, ) nogil: PairwiseDistancesRadiusNeighborhood64._parallel_on_X_parallel_init(self, thread_num) self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) @@ -400,9 +399,9 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR @final cdef void _parallel_on_X_init_chunk( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: PairwiseDistancesRadiusNeighborhood64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) @@ -410,11 +409,11 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR @final cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: PairwiseDistancesRadiusNeighborhood64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, @@ -430,16 +429,16 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR cdef void _parallel_on_Y_init( self, ) nogil: - cdef ITYPE_t thread_num + cdef cnp.intp_t thread_num PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() @final cdef void _parallel_on_Y_parallel_init( self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, + cnp.intp_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, ) nogil: PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) @@ -447,11 +446,11 @@ cdef class 
FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR @final cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( self, @@ -471,18 +470,18 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR @final cdef void _compute_and_reduce_distances_on_chunks( self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, + cnp.intp_t X_start, + cnp.intp_t X_end, + cnp.intp_t Y_start, + cnp.intp_t Y_end, + cnp.intp_t thread_num, ) nogil: cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + cnp.intp_t i, j + cnp.float64_t squared_dist_i_j + cnp.intp_t n_X = X_end - X_start + cnp.intp_t n_Y = Y_end - Y_start + cnp.float64_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( X_start, X_end, Y_start, Y_end, thread_num ) diff --git a/sklearn/utils/_vector_sentinel.pxd b/sklearn/utils/_vector_sentinel.pxd index b3d9a3ff32613..ff8d46ae428c3 100644 --- a/sklearn/utils/_vector_sentinel.pxd +++ b/sklearn/utils/_vector_sentinel.pxd @@ -1,12 +1,11 @@ cimport numpy as cnp from libcpp.vector cimport vector -from ..utils._typedefs cimport ITYPE_t, DTYPE_t, INT32TYPE_t, INT64TYPE_t ctypedef fused vector_typed: - vector[DTYPE_t] - vector[ITYPE_t] - vector[INT32TYPE_t] - vector[INT64TYPE_t] + vector[cnp.float64_t] + vector[cnp.intp_t] + vector[cnp.int32_t] + vector[cnp.int64_t] cdef cnp.ndarray vector_to_nd_array(vector_typed * vect_ptr) diff --git a/sklearn/utils/_vector_sentinel.pyx b/sklearn/utils/_vector_sentinel.pyx index 45c48de9dac68..95c95f192d7eb 100644 --- a/sklearn/utils/_vector_sentinel.pyx +++ b/sklearn/utils/_vector_sentinel.pyx @@ -2,17 +2,16 @@ from cython.operator cimport dereference as deref from cpython.ref cimport Py_INCREF cimport numpy as cnp -from ._typedefs cimport DTYPECODE, ITYPECODE, INT32TYPECODE, INT64TYPECODE cnp.import_array() cdef StdVectorSentinel _create_sentinel(vector_typed * vect_ptr): - if vector_typed is vector[DTYPE_t]: + if vector_typed is vector[cnp.float64_t]: return StdVectorSentinelFloat64.create_for(vect_ptr) - elif vector_typed is vector[INT32TYPE_t]: + elif vector_typed is vector[cnp.int32_t]: return StdVectorSentinelInt32.create_for(vect_ptr) - elif vector_typed is vector[INT64TYPE_t]: + elif vector_typed is vector[cnp.int64_t]: return StdVectorSentinelInt64.create_for(vect_ptr) else: return StdVectorSentinelIntP.create_for(vect_ptr) @@ -33,10 +32,10 @@ cdef class StdVectorSentinel: cdef class StdVectorSentinelFloat64(StdVectorSentinel): - cdef vector[DTYPE_t] vec + cdef vector[cnp.float64_t] vec @staticmethod - cdef StdVectorSentinel create_for(vector[DTYPE_t] * vect_ptr): + cdef StdVectorSentinel create_for(vector[cnp.float64_t] * vect_ptr): # This initializes the object directly without calling __init__ # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa cdef StdVectorSentinelFloat64 sentinel = StdVectorSentinelFloat64.__new__(StdVectorSentinelFloat64) @@ -47,14 +46,14 @@ cdef class 
StdVectorSentinelFloat64(StdVectorSentinel): return self.vec.data() cdef int get_typenum(self): - return DTYPECODE + return cnp.NPY_FLOAT64 cdef class StdVectorSentinelIntP(StdVectorSentinel): - cdef vector[ITYPE_t] vec + cdef vector[cnp.intp_t] vec @staticmethod - cdef StdVectorSentinel create_for(vector[ITYPE_t] * vect_ptr): + cdef StdVectorSentinel create_for(vector[cnp.intp_t] * vect_ptr): # This initializes the object directly without calling __init__ # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa cdef StdVectorSentinelIntP sentinel = StdVectorSentinelIntP.__new__(StdVectorSentinelIntP) @@ -65,14 +64,14 @@ cdef class StdVectorSentinelIntP(StdVectorSentinel): return self.vec.data() cdef int get_typenum(self): - return ITYPECODE + return cnp.NPY_INTP cdef class StdVectorSentinelInt32(StdVectorSentinel): - cdef vector[INT32TYPE_t] vec + cdef vector[cnp.int32_t] vec @staticmethod - cdef StdVectorSentinel create_for(vector[INT32TYPE_t] * vect_ptr): + cdef StdVectorSentinel create_for(vector[cnp.int32_t] * vect_ptr): # This initializes the object directly without calling __init__ # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa cdef StdVectorSentinelInt32 sentinel = StdVectorSentinelInt32.__new__(StdVectorSentinelInt32) @@ -83,14 +82,14 @@ cdef class StdVectorSentinelInt32(StdVectorSentinel): return self.vec.data() cdef int get_typenum(self): - return INT32TYPECODE + return cnp.NPY_INT32 cdef class StdVectorSentinelInt64(StdVectorSentinel): - cdef vector[INT64TYPE_t] vec + cdef vector[cnp.int64_t] vec @staticmethod - cdef StdVectorSentinel create_for(vector[INT64TYPE_t] * vect_ptr): + cdef StdVectorSentinel create_for(vector[cnp.int64_t] * vect_ptr): # This initializes the object directly without calling __init__ # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa cdef StdVectorSentinelInt64 sentinel = StdVectorSentinelInt64.__new__(StdVectorSentinelInt64) @@ -101,7 +100,7 @@ cdef class StdVectorSentinelInt64(StdVectorSentinel): return self.vec.data() cdef int get_typenum(self): - return INT64TYPECODE + return cnp.NPY_INT64 cdef cnp.ndarray vector_to_nd_array(vector_typed * vect_ptr): From f9bb33e1d639bf77226b8c08cbf76164a13f824a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 10 Aug 2022 08:20:07 -0400 Subject: [PATCH 2/6] MNT Adds more metadata into pyproject.toml --- pyproject.toml | 74 ++++++++++++++++++- setup.py | 45 ----------- sklearn/_min_dependencies.py | 2 +- ...ies_readme.py => test_min_dependencies.py} | 26 ++++++- 4 files changed, 98 insertions(+), 49 deletions(-) rename sklearn/tests/{test_min_dependencies_readme.py => test_min_dependencies.py} (60%) diff --git a/pyproject.toml b/pyproject.toml index 9b38a78966358..79327a211aa09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "setuptools<60.0", "wheel", - "Cython>=0.28.5", + "Cython>=0.29.24", # use oldest-supported-numpy which provides the oldest numpy version with # wheels on PyPI @@ -13,6 +13,78 @@ requires = [ "scipy>=1.3.2", ] +build-backend = "setuptools.build_meta" + +[project] +name = "scikit-learn" +description = "A set of python modules for machine learning and data mining" +readme = "README.rst" +license = {text = "BSD 3-Clause License"} +classifiers = [ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: C", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Development Status :: 5 - Production/Stable", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +maintainers = [ + {name = "Andreas Mueller", email = "amueller@ais.uni-bonn.de"}, +] +requires-python = ">=3.8" +dependencies = [ + "numpy>=1.19.2; platform_python_implementation=='PyPy'", + "numpy>=1.17.3; platform_python_implementation!='PyPy'", + "scipy>=1.3.2", + "threadpoolctl>=2.0.0", + "joblib>=1.0.0", +] + +dynamic = ["version"] + +[project.urls] +"Bug Tracker" = "https://github.com/scikit-learn/scikit-learn/issues" +Documentation = "https://scikit-learn.org/stable/documentation.html" +"Source Code" = "https://github.com/scikit-learn/scikit-learn" + +[project.optional-dependencies] +tests = [ + "matplotlib>=3.1.2", + "scikit-image>=0.16.2", + "pandas>=1.0.5", + "pytest>=5.0.1", + "pytest-cov>=2.9.0", + "flake8>=3.8.2", + "black>=22.3.0", + "mypy>=0.961", + "pyamg>=4.0.0", + "numpydoc>=1.2.0", +] +docs = [ + "matplotlib>=3.1.2", + "scikit-image>=0.16.2", + "pandas>=1.0.5", + "seaborn>=0.9.0", + "memory_profiler>=0.57.0", + "sphinx>=4.0.1", + "sphinx-gallery>=0.7.0", + "numpydoc>=1.2.0", + "Pillow>=7.1.2", + "sphinx-prompt>=1.3.0", + "sphinxext-opengraph>=0.4.2", +] [tool.black] line-length = 88 diff --git a/setup.py b/setup.py index 2ecc5ba0bcc2e..5a29e4a434bad 100755 --- a/setup.py +++ b/setup.py @@ -32,20 +32,8 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" -with open("README.rst") as f: - LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" URL = "http://scikit-learn.org" DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" -LICENSE = "new BSD" -PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", - "Documentation": 
"https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", -} # We can actually import a restricted version of sklearn that # does not need the compiled code @@ -76,10 +64,6 @@ extra_setuptools_args = dict( zip_safe=False, # the package can run out of an .egg file include_package_data=True, - extras_require={ - key: min_deps.tag_to_packages[key] - for key in ["examples", "docs", "tests", "benchmark"] - }, ) else: extra_setuptools_args = dict() @@ -230,7 +214,6 @@ def check_package_status(package, min_version): def setup_package(): - # TODO: Require Python 3.8 for PyPy when PyPy3.8 is ready # https://github.com/conda-forge/conda-forge-pinning-feedstock/issues/2089 if platform.python_implementation() == "PyPy": @@ -241,39 +224,11 @@ def setup_package(): required_python_version = (3, 8) metadata = dict( - name=DISTNAME, - maintainer=MAINTAINER, - maintainer_email=MAINTAINER_EMAIL, - description=DESCRIPTION, - license=LICENSE, url=URL, download_url=DOWNLOAD_URL, - project_urls=PROJECT_URLS, version=VERSION, - long_description=LONG_DESCRIPTION, - classifiers=[ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Programming Language :: C", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Development Status :: 5 - Production/Stable", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - ], cmdclass=cmdclass, python_requires=python_requires, - install_requires=min_deps.tag_to_packages["install"], package_data={"": ["*.pxd"]}, **extra_setuptools_args, ) diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 3e28d6bc7dc98..b962e5638643d 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -29,7 +29,7 @@ "scipy": (SCIPY_MIN_VERSION, "build, install"), "joblib": (JOBLIB_MIN_VERSION, "install"), "threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"), - "cython": (CYTHON_MIN_VERSION, "build"), + "Cython": (CYTHON_MIN_VERSION, "build"), "matplotlib": ("3.1.2", "benchmark, docs, examples, tests"), "scikit-image": ("0.16.2", "docs, examples, tests"), "pandas": ("1.0.5", "benchmark, docs, examples, tests"), diff --git a/sklearn/tests/test_min_dependencies_readme.py b/sklearn/tests/test_min_dependencies.py similarity index 60% rename from sklearn/tests/test_min_dependencies_readme.py rename to sklearn/tests/test_min_dependencies.py index 8b2b548c5bf42..5a03b392e86b9 100644 --- a/sklearn/tests/test_min_dependencies_readme.py +++ b/sklearn/tests/test_min_dependencies.py @@ -1,4 +1,4 @@ -"""Tests for the minimum dependencies in the README.rst file.""" +"""Tests for the minimum dependencies in the README.rst file and pyproject.toml.""" import os @@ -8,7 +8,7 @@ import pytest import sklearn -from sklearn._min_dependencies import dependent_packages +from sklearn._min_dependencies import dependent_packages, tag_to_packages from sklearn.utils.fixes import parse_version @@ -50,3 +50,25 @@ def test_min_dependencies_readme(): min_version = parse_version(dependent_packages[package][0]) assert 
version == min_version, f"{package} has a mismatched version" + + +def test_min_dependencies_pyproject_toml(): + """Verify that pyproject.toml is consistent with _min_dependencies.""" + root_path = Path(sklearn.__path__[0]).parents[0] + pyproject_toml = root_path / "pyproject.toml" + + if not pyproject_toml.exists(): + # Skip the test if the pyproject.toml file is not available. + # For instance, when installing scikit-learn from wheels + pytest.skip("The pyproject.toml does not exist") + + toml_content = pyproject_toml.read_text() + + for tag, constraints in tag_to_packages.items(): + if tag == "maintenance": + # maintenance does not need to be in pyproject.toml + continue + for constraint in constraints: + assert ( + constraint in toml_content + ), f"{constraint} should be in pyproject.toml" From aaf7231b479fd46b1c9625f309a583a072fcdc47 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 10 Aug 2022 09:34:01 -0400 Subject: [PATCH 3/6] Revert "MNT Adds more metadata into pyproject.toml" This reverts commit f9bb33e1d639bf77226b8c08cbf76164a13f824a. --- pyproject.toml | 74 +------------------ setup.py | 45 +++++++++++ sklearn/_min_dependencies.py | 2 +- ...ies.py => test_min_dependencies_readme.py} | 26 +------ 4 files changed, 49 insertions(+), 98 deletions(-) rename sklearn/tests/{test_min_dependencies.py => test_min_dependencies_readme.py} (60%) diff --git a/pyproject.toml b/pyproject.toml index 79327a211aa09..9b38a78966358 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "setuptools<60.0", "wheel", - "Cython>=0.29.24", + "Cython>=0.28.5", # use oldest-supported-numpy which provides the oldest numpy version with # wheels on PyPI @@ -13,78 +13,6 @@ requires = [ "scipy>=1.3.2", ] -build-backend = "setuptools.build_meta" - -[project] -name = "scikit-learn" -description = "A set of python modules for machine learning and data mining" -readme = "README.rst" -license = {text = "BSD 3-Clause License"} -classifiers = [ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Programming Language :: C", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Development Status :: 5 - Production/Stable", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -maintainers = [ - {name = "Andreas Mueller", email = "amueller@ais.uni-bonn.de"}, -] -requires-python = ">=3.8" -dependencies = [ - "numpy>=1.19.2; platform_python_implementation=='PyPy'", - "numpy>=1.17.3; platform_python_implementation!='PyPy'", - "scipy>=1.3.2", - "threadpoolctl>=2.0.0", - "joblib>=1.0.0", -] - -dynamic = ["version"] - -[project.urls] -"Bug Tracker" = "https://github.com/scikit-learn/scikit-learn/issues" -Documentation = "https://scikit-learn.org/stable/documentation.html" -"Source Code" = "https://github.com/scikit-learn/scikit-learn" - -[project.optional-dependencies] -tests = [ - "matplotlib>=3.1.2", - "scikit-image>=0.16.2", - "pandas>=1.0.5", - "pytest>=5.0.1", - "pytest-cov>=2.9.0", - "flake8>=3.8.2", - "black>=22.3.0", - "mypy>=0.961", - "pyamg>=4.0.0", - "numpydoc>=1.2.0", -] -docs 
= [ - "matplotlib>=3.1.2", - "scikit-image>=0.16.2", - "pandas>=1.0.5", - "seaborn>=0.9.0", - "memory_profiler>=0.57.0", - "sphinx>=4.0.1", - "sphinx-gallery>=0.7.0", - "numpydoc>=1.2.0", - "Pillow>=7.1.2", - "sphinx-prompt>=1.3.0", - "sphinxext-opengraph>=0.4.2", -] [tool.black] line-length = 88 diff --git a/setup.py b/setup.py index 5a29e4a434bad..2ecc5ba0bcc2e 100755 --- a/setup.py +++ b/setup.py @@ -32,8 +32,20 @@ builtins.__SKLEARN_SETUP__ = True +DISTNAME = "scikit-learn" +DESCRIPTION = "A set of python modules for machine learning and data mining" +with open("README.rst") as f: + LONG_DESCRIPTION = f.read() +MAINTAINER = "Andreas Mueller" +MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" URL = "http://scikit-learn.org" DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +LICENSE = "new BSD" +PROJECT_URLS = { + "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Documentation": "https://scikit-learn.org/stable/documentation.html", + "Source Code": "https://github.com/scikit-learn/scikit-learn", +} # We can actually import a restricted version of sklearn that # does not need the compiled code @@ -64,6 +76,10 @@ extra_setuptools_args = dict( zip_safe=False, # the package can run out of an .egg file include_package_data=True, + extras_require={ + key: min_deps.tag_to_packages[key] + for key in ["examples", "docs", "tests", "benchmark"] + }, ) else: extra_setuptools_args = dict() @@ -214,6 +230,7 @@ def check_package_status(package, min_version): def setup_package(): + # TODO: Require Python 3.8 for PyPy when PyPy3.8 is ready # https://github.com/conda-forge/conda-forge-pinning-feedstock/issues/2089 if platform.python_implementation() == "PyPy": @@ -224,11 +241,39 @@ def setup_package(): required_python_version = (3, 8) metadata = dict( + name=DISTNAME, + maintainer=MAINTAINER, + maintainer_email=MAINTAINER_EMAIL, + description=DESCRIPTION, + license=LICENSE, url=URL, download_url=DOWNLOAD_URL, + project_urls=PROJECT_URLS, version=VERSION, + long_description=LONG_DESCRIPTION, + classifiers=[ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: C", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Development Status :: 5 - Production/Stable", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + ], cmdclass=cmdclass, python_requires=python_requires, + install_requires=min_deps.tag_to_packages["install"], package_data={"": ["*.pxd"]}, **extra_setuptools_args, ) diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index b962e5638643d..3e28d6bc7dc98 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -29,7 +29,7 @@ "scipy": (SCIPY_MIN_VERSION, "build, install"), "joblib": (JOBLIB_MIN_VERSION, "install"), "threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"), - "Cython": (CYTHON_MIN_VERSION, "build"), + "cython": (CYTHON_MIN_VERSION, "build"), "matplotlib": ("3.1.2", "benchmark, docs, examples, tests"), "scikit-image": ("0.16.2", "docs, examples, tests"), "pandas": ("1.0.5", 
"benchmark, docs, examples, tests"), diff --git a/sklearn/tests/test_min_dependencies.py b/sklearn/tests/test_min_dependencies_readme.py similarity index 60% rename from sklearn/tests/test_min_dependencies.py rename to sklearn/tests/test_min_dependencies_readme.py index 5a03b392e86b9..8b2b548c5bf42 100644 --- a/sklearn/tests/test_min_dependencies.py +++ b/sklearn/tests/test_min_dependencies_readme.py @@ -1,4 +1,4 @@ -"""Tests for the minimum dependencies in the README.rst file and pyproject.toml.""" +"""Tests for the minimum dependencies in the README.rst file.""" import os @@ -8,7 +8,7 @@ import pytest import sklearn -from sklearn._min_dependencies import dependent_packages, tag_to_packages +from sklearn._min_dependencies import dependent_packages from sklearn.utils.fixes import parse_version @@ -50,25 +50,3 @@ def test_min_dependencies_readme(): min_version = parse_version(dependent_packages[package][0]) assert version == min_version, f"{package} has a mismatched version" - - -def test_min_dependencies_pyproject_toml(): - """Verify that pyproject.toml is consistent with _min_dependencies.""" - root_path = Path(sklearn.__path__[0]).parents[0] - pyproject_toml = root_path / "pyproject.toml" - - if not pyproject_toml.exists(): - # Skip the test if the pyproject.toml file is not available. - # For instance, when installing scikit-learn from wheels - pytest.skip("The pyproject.toml does not exist") - - toml_content = pyproject_toml.read_text() - - for tag, constraints in tag_to_packages.items(): - if tag == "maintenance": - # maintenance does not need to be in pyproject.toml - continue - for constraint in constraints: - assert ( - constraint in toml_content - ), f"{constraint} should be in pyproject.toml" From 2b5edc89b141e3c2bdd992a735b4e80ba8e83299 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 10 Aug 2022 10:31:19 -0400 Subject: [PATCH 4/6] FIX Fixes merge errors --- .../_argkmin.pxd.tp | 15 ++++++------- .../_base.pxd.tp | 2 +- .../_base.pyx.tp | 22 +++++++++---------- .../_datasets_pair.pxd.tp | 13 +++++------ .../_datasets_pair.pyx.tp | 2 +- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp index 4a7d4db953391..780a934c061d6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -15,7 +15,6 @@ implementation_specific_values = [ }} cimport numpy as cnp -from ...utils._typedefs cimport ITYPE_t, DTYPE_t cnp.import_array() @@ -28,22 +27,22 @@ cdef class PairwiseDistancesArgKmin{{name_suffix}}(PairwiseDistancesReduction{{n """{{name_suffix}}bit implementation of PairwiseDistancesArgKmin.""" cdef: - ITYPE_t k + cnp.intp_t k - ITYPE_t[:, ::1] argkmin_indices - DTYPE_t[:, ::1] argkmin_distances + cnp.intp_t[:, ::1] argkmin_indices + cnp.float64_t[:, ::1] argkmin_distances # Used as array of pointers to private datastructures used in threads. 
- DTYPE_t ** heaps_r_distances_chunks - ITYPE_t ** heaps_indices_chunks + cnp.float64_t ** heaps_r_distances_chunks + cnp.intp_t ** heaps_indices_chunks cdef class FastEuclideanPairwiseDistancesArgKmin{{name_suffix}}(PairwiseDistancesArgKmin{{name_suffix}}): """EuclideanDistance-specialized {{name_suffix}}bit implementation for PairwiseDistancesArgKmin.""" cdef: GEMMTermComputer{{name_suffix}} gemm_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared + const cnp.float64_t[::1] X_norm_squared + const cnp.float64_t[::1] Y_norm_squared bint use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 5e5ff737caa78..cc26694621c0b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -8,7 +8,7 @@ implementation_specific_values = [ # We also use the float64 dtype and C-type names as defined in # `sklearn.utils._typedefs` to maintain consistency. # - ('64', 'DTYPE_t', 'DTYPE'), + ('64', 'cnp.float64_t', 'np.float64'), ('32', 'cnp.float32_t', 'np.float32') ] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 342ad900d89c1..2b91c387b702e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -8,7 +8,7 @@ implementation_specific_values = [ # We also use the float64 dtype and C-type names as defined in # `sklearn.utils._typedefs` to maintain consistency. # - ('64', 'DTYPE_t', 'DTYPE'), + ('64', 'cnp.float64_t', 'np.float64'), ('32', 'cnp.float32_t', 'np.float32') ] @@ -60,9 +60,9 @@ cpdef cnp.float64_t[::1] _sqeuclidean_row_norms64( return squared_row_norms -cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( +cpdef cnp.float64_t[::1] _sqeuclidean_row_norms32( const cnp.float32_t[:, ::1] X, - ITYPE_t num_threads, + cnp.intp_t num_threads, ): """Compute the squared euclidean norm of the rows of X in parallel. @@ -74,15 +74,15 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( # const qualifier. 
# See: https://github.com/scipy/scipy/issues/14262 cnp.float32_t * X_ptr = &X[0, 0] - ITYPE_t i = 0, j = 0 - ITYPE_t thread_num - ITYPE_t n = X.shape[0] - ITYPE_t d = X.shape[1] - DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) + cnp.intp_t i = 0, j = 0 + cnp.intp_t thread_num + cnp.intp_t n = X.shape[0] + cnp.intp_t d = X.shape[1] + cnp.float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64) # To upcast the i-th row of X from 32bit to 64bit - vector[vector[DTYPE_t]] X_i_upcast = vector[vector[DTYPE_t]]( - num_threads, vector[DTYPE_t](d) + vector[vector[cnp.float64_t]] X_i_upcast = vector[vector[cnp.float64_t]]( + num_threads, vector[cnp.float64_t](d) ) with nogil, parallel(num_threads=num_threads): @@ -90,7 +90,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( for i in prange(n, schedule='static'): # Upcasting the i-th row of X from 32bit to 64bit for j in range(d): - X_i_upcast[thread_num][j] = deref(X_ptr + i * d + j) + X_i_upcast[thread_num][j] = deref(X_ptr + i * d + j) squared_row_norms[i] = _dot( d, X_i_upcast[thread_num].data(), 1, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index d10f8f493e5f0..702ef33bd5a68 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -8,14 +8,13 @@ implementation_specific_values = [ # We also use the float64 dtype and C-type names as defined in # `sklearn.utils._typedefs` to maintain consistency. # - ('64', 'DistanceMetric', 'DTYPE_t', 'DTYPE'), + ('64', 'DistanceMetric', 'cnp.float64_t', 'np.float64'), ('32', 'DistanceMetric32', 'cnp.float32_t', 'np.float32') ] }} cimport numpy as cnp -from ...utils._typedefs cimport DTYPE_t, ITYPE_t from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32 {{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -24,18 +23,18 @@ from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32 cdef class DatasetsPair{{name_suffix}}: cdef {{DistanceMetric}} distance_metric - cdef ITYPE_t n_samples_X(self) nogil + cdef cnp.intp_t n_samples_X(self) nogil - cdef ITYPE_t n_samples_Y(self) nogil + cdef cnp.intp_t n_samples_Y(self) nogil - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:, ::1] X const {{INPUT_DTYPE_t}}[:, ::1] Y - ITYPE_t d + cnp.intp_t d {{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 18b2f48be9e26..8d60ee763ef5b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -8,7 +8,7 @@ implementation_specific_values = [ # We also use the float64 dtype and C-type names as defined in # `sklearn.utils._typedefs` to maintain consistency. # - ('64', 'DistanceMetric', 'DTYPE_t', 'DTYPE'), + ('64', 'DistanceMetric', 'cnp.float64_t', 'np.float64'), ('32', 'DistanceMetric32', 'cnp.float32_t', 'np.float32') ] From c357102841fdf4af9e4edce48dc267564bb73fe6 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 31 Aug 2022 13:40:34 -0400 Subject: [PATCH 5/6] FIX Corrects wrong dtype --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index f285dc1df5bc4..d13896d0fc2fa 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -147,7 +147,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): # Main heaps which will be returned as results by `ArgKmin{{name_suffix}}.compute`. self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=np.intp) - self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.intp) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.float64) def __dealloc__(self): if self.heaps_indices_chunks is not NULL: From c39616629c65c8d60d59b8498a72a4ae22099bea Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 21 Sep 2022 17:49:19 -0700 Subject: [PATCH 6/6] FIX Fixes merge errors --- .../_datasets_pair.pxd.tp | 2 +- .../_datasets_pair.pyx.tp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index 53df33caa04f2..18e1a784b978d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -23,7 +23,7 @@ from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32 cdef class DatasetsPair{{name_suffix}}: cdef: {{DistanceMetric}} distance_metric - np.intp_t n_features + cnp.intp_t n_features cdef cnp.intp_t n_samples_X(self) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 2acac87ef97e7..ff7e23610da56 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -193,11 +193,11 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.Y.shape[0] @final - cdef DTYPE_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.n_features) @final - cdef DTYPE_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.n_features) @@ -233,7 +233,7 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.Y_indptr.shape[0] - 1 @final - cdef DTYPE_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], x1_indices=self.X_indices, @@ -247,7 +247,7 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): ) @final - cdef DTYPE_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.dist_csr( x1_data=&self.X_data[0], x1_indices=self.X_indices, @@ 
-326,7 +326,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.n_Y @final - cdef DTYPE_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], x1_indices=self.X_indices, @@ -342,7 +342,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): ) @final - cdef DTYPE_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: return self.distance_metric.dist_csr( x1_data=&self.X_data[0], @@ -392,12 +392,12 @@ cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.datasets_pair.n_samples_X() @final - cdef DTYPE_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t surrogate_dist(self, cnp.intp_t i, cnp.intp_t j) nogil: # Swapping arguments on the same interface return self.datasets_pair.surrogate_dist(j, i) @final - cdef DTYPE_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: + cdef cnp.float64_t dist(self, cnp.intp_t i, cnp.intp_t j) nogil: # Swapping arguments on the same interface return self.datasets_pair.dist(j, i)
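
A note on the revert in patch 3/6: the dependency metadata moves back from pyproject.toml into setup.py, where extras_require is rebuilt from the tag-to-package mapping in sklearn/_min_dependencies.py. The following is a minimal, self-contained Python sketch of that inversion; the packages and version pins below are illustrative stand-ins, not the project's actual pin set.

    from collections import defaultdict

    # Each package maps to (min_version, comma-separated tags), mirroring the
    # shape of sklearn._min_dependencies.dependent_packages.
    dependent_packages = {
        "numpy": ("1.17.3", "build, install"),
        "pandas": ("1.0.5", "benchmark, docs, examples, tests"),
        "pytest": ("5.0.1", "tests"),
    }

    # Invert the mapping: tag -> list of "package>=min_version" constraints.
    tag_to_packages = defaultdict(list)
    for package, (min_version, tags) in dependent_packages.items():
        for tag in tags.split(", "):
            tag_to_packages[tag].append(f"{package}>={min_version}")

    # setup.py then exposes the tags as pip extras, enabling e.g.
    # `pip install scikit-learn[tests]`.
    extras_require = {key: tag_to_packages[key] for key in ["tests", "docs"]}
    print(extras_require)
    # {'tests': ['pandas>=1.0.5', 'pytest>=5.0.1'], 'docs': ['pandas>=1.0.5']}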
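
In patch 4/6, the per-thread buffers heaps_r_distances_chunks and heaps_indices_chunks hold, for each chunk of query rows, the k closest candidates seen so far. As a rough pure-Python analogue of that per-row reduction (heapq standing in for the project's heap_push and simultaneous_sort routines, with made-up input data):

    import heapq

    def argkmin_one_row(distances, k):
        # Max-heap via negated distances: the root is the worst of the
        # current k best, so a new candidate enters only if it beats it.
        heap = []
        for j, d in enumerate(distances):
            if len(heap) < k:
                heapq.heappush(heap, (-d, j))
            elif d < -heap[0][0]:
                heapq.heapreplace(heap, (-d, j))
        # Final ascending sort, analogous to simultaneous_sort.
        best = sorted((-neg_d, j) for neg_d, j in heap)
        return [j for _, j in best], [d for d, _ in best]

    indices, dists = argkmin_one_row([0.9, 0.1, 0.5, 0.3], k=2)
    print(indices, dists)  # [1, 3] [0.1, 0.3]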
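
Also in patch 4/6, _sqeuclidean_row_norms32 keeps its float64 return type while accepting float32 input: each row is upcast to 64 bit before the dot product so the accumulation runs in double precision. A NumPy sketch of the same per-row upcast (the random matrix is example data only):

    import numpy as np

    rng = np.random.default_rng(0)
    X32 = rng.standard_normal((5, 3)).astype(np.float32)

    squared_row_norms = np.empty(X32.shape[0], dtype=np.float64)
    for i in range(X32.shape[0]):
        row64 = X32[i].astype(np.float64)  # per-row upcast, as in the Cython loop
        squared_row_norms[i] = row64 @ row64

    # Vectorised reference for the loop above.
    ref = np.einsum("ij,ij->i", X32.astype(np.float64), X32.astype(np.float64))
    assert np.allclose(squared_row_norms, ref)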
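
Patch 5/6 is the substantive catch of the series: argkmin_distances had been allocated with dtype=np.intp, but the .pxd declares the field as cnp.float64_t[:, ::1], so the buffer would be rejected at assignment, and the DBL_MAX sentinel cannot survive an integer cast in any case. A small NumPy demonstration (shapes arbitrary; recent NumPy versions may warn about the invalid cast):

    import numpy as np

    DBL_MAX = np.finfo(np.float64).max  # sentinel pushed onto the heaps

    # The merge error: an intp buffer silently corrupts the sentinel and
    # cannot back a cnp.float64_t[:, ::1] memoryview.
    bad = np.full((2, 3), DBL_MAX, dtype=np.intp)
    assert bad[0, 0] != DBL_MAX

    # The fix: allocate with the dtype the Cython declaration expects.
    good = np.full((2, 3), DBL_MAX, dtype=np.float64)
    assert good[0, 0] == DBL_MAX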
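
Finally, patch 6/6 re-types dist and surrogate_dist across the DatasetsPair variants. For the Euclidean metric, the surrogate ("rdist") is the squared distance: cheaper to evaluate and monotonic in the true distance, so it preserves nearest-neighbor ordering, which is all the argkmin reduction needs. A toy Python version of that contract (vectors made up for illustration):

    import numpy as np

    def dist(x, y):
        return float(np.sqrt(((x - y) ** 2).sum()))

    def surrogate_dist(x, y):
        # Squared Euclidean: skips the sqrt but orders pairs identically.
        return float(((x - y) ** 2).sum())

    x = np.array([0.0, 0.0])
    y = np.array([3.0, 4.0])
    z = np.array([1.0, 1.0])
    assert (surrogate_dist(x, z) < surrogate_dist(x, y)) == (dist(x, z) < dist(x, y))

This is also why DenseSparseDatasetsPair can simply forward both methods to its wrapped pair with i and j swapped, as the hunks above show: the metric is symmetric, so the ordering contract is unaffected.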