From e8d4c6f7460027ae8c71ab41616270a6a44e0f90 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Dec 2022 11:25:25 +0100 Subject: [PATCH 01/18] MAINT Move sorting utilities to `sklearn.utils._sorting` --- sklearn/tree/_splitter.pyx | 121 +++------------------------------- sklearn/utils/_sorting.pxd | 25 ++++++- sklearn/utils/_sorting.pyx | 129 +++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 114 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 9f4301cd99dc0..b2ebde6b3d07e 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,6 +21,14 @@ import numpy as np from scipy.sparse import csc_matrix +from ..utils._sorting cimport ( + sort, + swap, + median3, + introsort, + heapsort +) + from ._utils cimport log from ._utils cimport rand_int from ._utils cimport rand_uniform @@ -413,119 +421,6 @@ cdef inline int node_split_best( return 0 -# Sort n-element arrays pointed to by feature_values and samples, simultaneously, -# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) nogil: - if n == 0: - return - cdef int maxd = 2 * log(n) - introsort(feature_values, samples, n, maxd) - - -cdef inline void swap(DTYPE_t* feature_values, SIZE_t* samples, - SIZE_t i, SIZE_t j) nogil: - # Helper for sort - feature_values[i], feature_values[j] = feature_values[j], feature_values[i] - samples[i], samples[j] = samples[j], samples[i] - - -cdef inline DTYPE_t median3(DTYPE_t* feature_values, SIZE_t n) nogil: - # Median of three pivot selection, after Bentley and McIlroy (1993). - # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef DTYPE_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] - if a < b: - if b < c: - return b - elif a < c: - return c - else: - return a - elif b < c: - if a < c: - return a - else: - return c - else: - return b - - -# Introsort with median of 3 pivot selection and 3-way partition function -# (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(DTYPE_t* feature_values, SIZE_t *samples, - SIZE_t n, int maxd) nogil: - cdef DTYPE_t pivot - cdef SIZE_t i, l, r - - while n > 1: - if maxd <= 0: # max depth limit exceeded ("gone quadratic") - heapsort(feature_values, samples, n) - return - maxd -= 1 - - pivot = median3(feature_values, n) - - # Three-way partition. - i = l = 0 - r = n - while i < r: - if feature_values[i] < pivot: - swap(feature_values, samples, i, l) - i += 1 - l += 1 - elif feature_values[i] > pivot: - r -= 1 - swap(feature_values, samples, i, r) - else: - i += 1 - - introsort(feature_values, samples, l, maxd) - feature_values += r - samples += r - n -= r - - -cdef inline void sift_down(DTYPE_t* feature_values, SIZE_t* samples, - SIZE_t start, SIZE_t end) nogil: - # Restore heap order in feature_values[start:end] by moving the max element to start. - cdef SIZE_t child, maxind, root - - root = start - while True: - child = root * 2 + 1 - - # find max of root, left child, right child - maxind = root - if child < end and feature_values[maxind] < feature_values[child]: - maxind = child - if child + 1 < end and feature_values[maxind] < feature_values[child + 1]: - maxind = child + 1 - - if maxind == root: - break - else: - swap(feature_values, samples, root, maxind) - root = maxind - - -cdef void heapsort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) nogil: - cdef SIZE_t start, end - - # heapify - start = (n - 2) / 2 - end = n - while True: - sift_down(feature_values, samples, start, end) - if start == 0: - break - start -= 1 - - # sort by shrinking the heap, putting the max element immediately after it - end = n - 1 - while end > 0: - swap(feature_values, samples, 0, end) - sift_down(feature_values, samples, 0, end) - end = end - 1 - cdef inline int node_split_random( Splitter splitter, Partitioner partitioner, diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index 412d67c479fac..41195fd676856 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -1,4 +1,5 @@ -from ._typedefs cimport DTYPE_t, ITYPE_t +cimport numpy as cnp +from ._typedefs cimport ITYPE_t from cython cimport floating @@ -7,3 +8,25 @@ cdef int simultaneous_sort( ITYPE_t *idx, ITYPE_t size, ) nogil + +cdef void sort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil + +cdef void swap( + cnp.npy_float32* Xf, + cnp.npy_intp* samples, + cnp.npy_intp i, + cnp.npy_intp j, +) nogil + +cdef cnp.npy_float32 median3(cnp.npy_float32* Xf, cnp.npy_intp n) nogil + +cdef void introsort(cnp.npy_float32* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil + +cdef void sift_down( + cnp.npy_float32* Xf, + cnp.npy_intp* samples, + cnp.npy_intp start, + cnp.npy_intp end, +) nogil + +cdef void heapsort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index 367448b5cb91b..de9131672ea7f 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -1,4 +1,12 @@ +cimport numpy as cnp + from cython cimport floating +from libc.math cimport log as ln + +# TODO: Factor code also present in `tree._utils` or use `libc.math.log2` directly +cdef inline double log(double x) nogil: + return ln(x) / ln(2.0) + cdef inline void dual_swap( floating* darr, @@ -91,3 +99,124 @@ cdef int simultaneous_sort( indices + pivot_idx + 1, size - pivot_idx - 1) return 0 + + +# Sort n-element arrays pointed to by Xf and samples, simultaneously, +# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). +cdef inline void sort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: + if n == 0: + return + cdef int maxd = 2 * log(n) + introsort(Xf, samples, n, maxd) + + +cdef inline void swap( + cnp.npy_float32* Xf, + cnp.npy_intp* samples, + cnp.npy_intp i, + cnp.npy_intp j, +) nogil: + # Helper for sort + Xf[i], Xf[j] = Xf[j], Xf[i] + samples[i], samples[j] = samples[j], samples[i] + + +cdef inline cnp.npy_float32 median3(cnp.npy_float32* Xf, cnp.npy_intp n) nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef cnp.npy_float32 a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + + +# Introsort with median of 3 pivot selection and 3-way partition function +# (robust to repeated elements, e.g. lots of zero features). +cdef void introsort(cnp.npy_float32* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil: + cdef cnp.npy_float32 pivot + cdef cnp.npy_intp i, l, r + + while n > 1: + if maxd <= 0: # max depth limit exceeded ("gone quadratic") + heapsort(Xf, samples, n) + return + maxd -= 1 + + pivot = median3(Xf, n) + + # Three-way partition. + i = l = 0 + r = n + while i < r: + if Xf[i] < pivot: + swap(Xf, samples, i, l) + i += 1 + l += 1 + elif Xf[i] > pivot: + r -= 1 + swap(Xf, samples, i, r) + else: + i += 1 + + introsort(Xf, samples, l, maxd) + Xf += r + samples += r + n -= r + + +cdef inline void sift_down( + cnp.npy_float32* Xf, + cnp.npy_intp* samples, + cnp.npy_intp start, + cnp.npy_intp end, +) nogil: + # Restore heap order in Xf[start:end] by moving the max element to start. + cdef cnp.npy_intp child, maxind, root + + root = start + while True: + child = root * 2 + 1 + + # find max of root, left child, right child + maxind = root + if child < end and Xf[maxind] < Xf[child]: + maxind = child + if child + 1 < end and Xf[maxind] < Xf[child + 1]: + maxind = child + 1 + + if maxind == root: + break + else: + swap(Xf, samples, root, maxind) + root = maxind + + +cdef void heapsort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: + cdef cnp.npy_intp start, end + + # heapify + start = (n - 2) / 2 + end = n + while True: + sift_down(Xf, samples, start, end) + if start == 0: + break + start -= 1 + + # sort by shrinking the heap, putting the max element immediately after it + end = n - 1 + while end > 0: + swap(Xf, samples, 0, end) + sift_down(Xf, samples, 0, end) + end = end - 1 From 42367fc2e8eef7249860d2914b1a7b0ce2d5bfa9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Dec 2022 11:57:53 +0100 Subject: [PATCH 02/18] Use floating for sorting routines --- sklearn/utils/_sorting.pxd | 12 ++++++------ sklearn/utils/_sorting.pyx | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index 41195fd676856..da9d95a49eabe 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -9,24 +9,24 @@ cdef int simultaneous_sort( ITYPE_t size, ) nogil -cdef void sort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil +cdef void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil cdef void swap( - cnp.npy_float32* Xf, + floating* Xf, cnp.npy_intp* samples, cnp.npy_intp i, cnp.npy_intp j, ) nogil -cdef cnp.npy_float32 median3(cnp.npy_float32* Xf, cnp.npy_intp n) nogil +cdef floating median3(floating* Xf, cnp.npy_intp n) nogil -cdef void introsort(cnp.npy_float32* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil +cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil cdef void sift_down( - cnp.npy_float32* Xf, + floating* Xf, cnp.npy_intp* samples, cnp.npy_intp start, cnp.npy_intp end, ) nogil -cdef void heapsort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil +cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index de9131672ea7f..bc2a723c4a49b 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -103,7 +103,7 @@ cdef int simultaneous_sort( # Sort n-element arrays pointed to by Xf and samples, simultaneously, # by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: +cdef inline void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: if n == 0: return cdef int maxd = 2 * log(n) @@ -111,7 +111,7 @@ cdef inline void sort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n cdef inline void swap( - cnp.npy_float32* Xf, + floating* Xf, cnp.npy_intp* samples, cnp.npy_intp i, cnp.npy_intp j, @@ -121,10 +121,10 @@ cdef inline void swap( samples[i], samples[j] = samples[j], samples[i] -cdef inline cnp.npy_float32 median3(cnp.npy_float32* Xf, cnp.npy_intp n) nogil: +cdef inline floating median3(floating* Xf, cnp.npy_intp n) nogil: # Median of three pivot selection, after Bentley and McIlroy (1993). # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef cnp.npy_float32 a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] + cdef floating a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] if a < b: if b < c: return b @@ -143,8 +143,8 @@ cdef inline cnp.npy_float32 median3(cnp.npy_float32* Xf, cnp.npy_intp n) nogil: # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(cnp.npy_float32* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil: - cdef cnp.npy_float32 pivot +cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil: + cdef floating pivot cdef cnp.npy_intp i, l, r while n > 1: @@ -176,7 +176,7 @@ cdef void introsort(cnp.npy_float32* Xf, cnp.npy_intp *samples, cnp.npy_intp n, cdef inline void sift_down( - cnp.npy_float32* Xf, + floating* Xf, cnp.npy_intp* samples, cnp.npy_intp start, cnp.npy_intp end, @@ -202,7 +202,7 @@ cdef inline void sift_down( root = maxind -cdef void heapsort(cnp.npy_float32* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: +cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: cdef cnp.npy_intp start, end # heapify From f5b593c1a2f87e1d1054383acab44b81cf23a5f2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Dec 2022 14:08:43 +0100 Subject: [PATCH 03/18] Replace `simultaneous_sort` with `sort` in concrete `PairwiseDistancesReductions` --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 6 +++--- .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index b8afe5c3cd5f8..7d41dd84e0fb9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -6,7 +6,7 @@ from cython cimport final from cython.parallel cimport parallel, prange from ...utils._heap cimport heap_push -from ...utils._sorting cimport simultaneous_sort +from ...utils._sorting cimport sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np @@ -194,7 +194,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. for idx in range(X_end - X_start): - simultaneous_sort( + sort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k @@ -278,7 +278,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main in ascending order w.r.t the distances. # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( + sort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], self.k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index b3f20cac3ea08..788d52e7cff5e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -8,7 +8,7 @@ from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from ...utils._sorting cimport simultaneous_sort +from ...utils._sorting cimport sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._vector_sentinel cimport vector_to_nd_array @@ -221,7 +221,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): - simultaneous_sort( + sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() @@ -292,7 +292,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sort in parallel in ascending order w.r.t the distances if requested. if self.sort_results: for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( + sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() From a8fcdb935661b82884f59374eda261790e9679af Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 2 Dec 2022 14:12:08 +0100 Subject: [PATCH 04/18] Only expose `sort` and `simultaneous_sort` --- sklearn/tree/_splitter.pyx | 8 +------- sklearn/utils/_sorting.pxd | 20 -------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b2ebde6b3d07e..9023657c29022 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,13 +21,7 @@ import numpy as np from scipy.sparse import csc_matrix -from ..utils._sorting cimport ( - sort, - swap, - median3, - introsort, - heapsort -) +from ..utils._sorting cimport sort from ._utils cimport log from ._utils cimport rand_int diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index da9d95a49eabe..2847f5a1c1325 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -10,23 +10,3 @@ cdef int simultaneous_sort( ) nogil cdef void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil - -cdef void swap( - floating* Xf, - cnp.npy_intp* samples, - cnp.npy_intp i, - cnp.npy_intp j, -) nogil - -cdef floating median3(floating* Xf, cnp.npy_intp n) nogil - -cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil - -cdef void sift_down( - floating* Xf, - cnp.npy_intp* samples, - cnp.npy_intp start, - cnp.npy_intp end, -) nogil - -cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil From de756be8bb7d3c0e37776c7ac18f706ab2d597f9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 8 Dec 2022 15:58:31 +0100 Subject: [PATCH 05/18] Use cnp.intp_t directly and remove duplicated logic --- sklearn/utils/_sorting.pxd | 7 ++-- sklearn/utils/_sorting.pyx | 84 +++++++++++++++----------------------- 2 files changed, 37 insertions(+), 54 deletions(-) diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index 2847f5a1c1325..6070a543e0e08 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -1,12 +1,11 @@ cimport numpy as cnp -from ._typedefs cimport ITYPE_t from cython cimport floating cdef int simultaneous_sort( floating *dist, - ITYPE_t *idx, - ITYPE_t size, + cnp.intp_t *idx, + cnp.intp_t size, ) nogil -cdef void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil +cdef void sort(floating* Xf, cnp.intp_t* samples, cnp.intp_t n) nogil diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index bc2a723c4a49b..b1ce9e5b1c5a3 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -8,26 +8,21 @@ cdef inline double log(double x) nogil: return ln(x) / ln(2.0) -cdef inline void dual_swap( - floating* darr, - ITYPE_t *iarr, - ITYPE_t a, - ITYPE_t b, +cdef inline void simultaneous_swap( + floating* Xf, + cnp.intp_t* samples, + cnp.intp_t i, + cnp.intp_t j, ) nogil: - """Swap the values at index a and b of both darr and iarr""" - cdef floating dtmp = darr[a] - darr[a] = darr[b] - darr[b] = dtmp - - cdef ITYPE_t itmp = iarr[a] - iarr[a] = iarr[b] - iarr[b] = itmp + # Helper for sort + Xf[i], Xf[j] = Xf[j], Xf[i] + samples[i], samples[j] = samples[j], samples[i] cdef int simultaneous_sort( floating* values, - ITYPE_t* indices, - ITYPE_t size, + cnp.intp_t* indices, + cnp.intp_t size, ) nogil: """ Perform a recursive quicksort on the values array as to sort them ascendingly. @@ -50,7 +45,7 @@ cdef int simultaneous_sort( # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) # currently used. cdef: - ITYPE_t pivot_idx, i, store_idx + cnp.intp_t pivot_idx, i, store_idx floating pivot_val # in the small-array case, do things efficiently @@ -58,14 +53,14 @@ cdef int simultaneous_sort( pass elif size == 2: if values[0] > values[1]: - dual_swap(values, indices, 0, 1) + simultaneous_swap(values, indices, 0, 1) elif size == 3: if values[0] > values[1]: - dual_swap(values, indices, 0, 1) + simultaneous_swap(values, indices, 0, 1) if values[1] > values[2]: - dual_swap(values, indices, 1, 2) + simultaneous_swap(values, indices, 1, 2) if values[0] > values[1]: - dual_swap(values, indices, 0, 1) + simultaneous_swap(values, indices, 0, 1) else: # Determine the pivot using the median-of-three rule. # The smallest of the three is moved to the beginning of the array, @@ -73,11 +68,11 @@ cdef int simultaneous_sort( # is moved to the pivot index. pivot_idx = size // 2 if values[0] > values[size - 1]: - dual_swap(values, indices, 0, size - 1) + simultaneous_swap(values, indices, 0, size - 1) if values[size - 1] > values[pivot_idx]: - dual_swap(values, indices, size - 1, pivot_idx) + simultaneous_swap(values, indices, size - 1, pivot_idx) if values[0] > values[size - 1]: - dual_swap(values, indices, 0, size - 1) + simultaneous_swap(values, indices, 0, size - 1) pivot_val = values[size - 1] # Partition indices about pivot. At the end of this operation, @@ -86,9 +81,9 @@ cdef int simultaneous_sort( store_idx = 0 for i in range(size - 1): if values[i] < pivot_val: - dual_swap(values, indices, i, store_idx) + simultaneous_swap(values, indices, i, store_idx) store_idx += 1 - dual_swap(values, indices, store_idx, size - 1) + simultaneous_swap(values, indices, store_idx, size - 1) pivot_idx = store_idx # Recursively sort each side of the pivot @@ -103,25 +98,14 @@ cdef int simultaneous_sort( # Sort n-element arrays pointed to by Xf and samples, simultaneously, # by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: +cdef inline void sort(floating* Xf, cnp.intp_t* samples, cnp.intp_t n) nogil: if n == 0: return cdef int maxd = 2 * log(n) introsort(Xf, samples, n, maxd) -cdef inline void swap( - floating* Xf, - cnp.npy_intp* samples, - cnp.npy_intp i, - cnp.npy_intp j, -) nogil: - # Helper for sort - Xf[i], Xf[j] = Xf[j], Xf[i] - samples[i], samples[j] = samples[j], samples[i] - - -cdef inline floating median3(floating* Xf, cnp.npy_intp n) nogil: +cdef inline floating median3(floating* Xf, cnp.intp_t n) nogil: # Median of three pivot selection, after Bentley and McIlroy (1993). # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. cdef floating a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] @@ -143,9 +127,9 @@ cdef inline floating median3(floating* Xf, cnp.npy_intp n) nogil: # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil: +cdef void introsort(floating* Xf, cnp.intp_t *samples, cnp.intp_t n, int maxd) nogil: cdef floating pivot - cdef cnp.npy_intp i, l, r + cdef cnp.intp_t i, l, r while n > 1: if maxd <= 0: # max depth limit exceeded ("gone quadratic") @@ -160,12 +144,12 @@ cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int max r = n while i < r: if Xf[i] < pivot: - swap(Xf, samples, i, l) + simultaneous_swap(Xf, samples, i, l) i += 1 l += 1 elif Xf[i] > pivot: r -= 1 - swap(Xf, samples, i, r) + simultaneous_swap(Xf, samples, i, r) else: i += 1 @@ -177,12 +161,12 @@ cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int max cdef inline void sift_down( floating* Xf, - cnp.npy_intp* samples, - cnp.npy_intp start, - cnp.npy_intp end, + cnp.intp_t* samples, + cnp.intp_t start, + cnp.intp_t end, ) nogil: # Restore heap order in Xf[start:end] by moving the max element to start. - cdef cnp.npy_intp child, maxind, root + cdef cnp.intp_t child, maxind, root root = start while True: @@ -198,12 +182,12 @@ cdef inline void sift_down( if maxind == root: break else: - swap(Xf, samples, root, maxind) + simultaneous_swap(Xf, samples, root, maxind) root = maxind -cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: - cdef cnp.npy_intp start, end +cdef void heapsort(floating* Xf, cnp.intp_t* samples, cnp.intp_t n) nogil: + cdef cnp.intp_t start, end # heapify start = (n - 2) / 2 @@ -217,6 +201,6 @@ cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil: # sort by shrinking the heap, putting the max element immediately after it end = n - 1 while end > 0: - swap(Xf, samples, 0, end) + simultaneous_swap(Xf, samples, 0, end) sift_down(Xf, samples, 0, end) end = end - 1 From 831ba2ae01f175ba2a1205e5c7408108a963a170 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 27 Jan 2023 11:22:11 +0100 Subject: [PATCH 06/18] Propagate right sorting interfaces --- .../_argkmin.pyx.tp | 6 +- .../_radius_neighbors.pyx.tp | 6 +- sklearn/neighbors/_binary_tree.pxi | 6 +- sklearn/tree/_splitter.pyx | 2 +- sklearn/utils/_sorting.pxd | 12 +- sklearn/utils/_sorting.pyx | 220 ++++++++++-------- 6 files changed, 136 insertions(+), 116 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 7d41dd84e0fb9..d85534e5513d8 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -6,7 +6,7 @@ from cython cimport final from cython.parallel cimport parallel, prange from ...utils._heap cimport heap_push -from ...utils._sorting cimport sort +from ...utils._sorting cimport simultaneous_quick_sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np @@ -194,7 +194,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. for idx in range(X_end - X_start): - sort( + simultaneous_quick_sort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k @@ -278,7 +278,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main in ascending order w.r.t the distances. # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): - sort( + simultaneous_quick_sort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], self.k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 788d52e7cff5e..3b39626ff3bf4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -8,7 +8,7 @@ from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from ...utils._sorting cimport sort +from ...utils._sorting cimport simultaneous_quick_sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._vector_sentinel cimport vector_to_nd_array @@ -221,7 +221,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): - sort( + simultaneous_quick_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() @@ -292,7 +292,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sort in parallel in ascending order w.r.t the distances if requested. if self.sort_results: for idx in prange(self.n_samples_X, schedule='static'): - sort( + simultaneous_quick_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 00b5b3c2758d3..13e492f89bfd0 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -164,7 +164,7 @@ from ..utils import check_array from ..utils._typedefs cimport DTYPE_t, ITYPE_t from ..utils._typedefs import DTYPE, ITYPE from ..utils._heap cimport heap_push -from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort +from ..utils._sorting cimport simultaneous_quick_sort as _simultaneous_sort # TODO: use cnp.PyArray_ENABLEFLAGS when Cython>=3.0 is used. cdef extern from "numpy/arrayobject.h": @@ -561,8 +561,8 @@ cdef class NeighborsHeap: cdef ITYPE_t row for row in range(self.distances.shape[0]): _simultaneous_sort( - dist=&self.distances[row, 0], - idx=&self.indices[row, 0], + values=&self.distances[row, 0], + indices=&self.indices[row, 0], size=self.distances.shape[1], ) return 0 diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 9023657c29022..9cf65852b2e62 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,7 +21,7 @@ import numpy as np from scipy.sparse import csc_matrix -from ..utils._sorting cimport sort +from ..utils._sorting cimport simultaneous_introsort as sort from ._utils cimport log from ._utils cimport rand_int diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index 6070a543e0e08..96ec1d8375865 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -2,10 +2,14 @@ cimport numpy as cnp from cython cimport floating -cdef int simultaneous_sort( - floating *dist, - cnp.intp_t *idx, +cdef int simultaneous_quick_sort( + floating* values, + cnp.intp_t* indices, cnp.intp_t size, ) nogil -cdef void sort(floating* Xf, cnp.intp_t* samples, cnp.intp_t n) nogil +cdef void simultaneous_introsort( + floating* values, + cnp.intp_t* indices, + cnp.intp_t size, +) nogil diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index b1ce9e5b1c5a3..30963e37dfb3a 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -3,23 +3,93 @@ cimport numpy as cnp from cython cimport floating from libc.math cimport log as ln +# TODO: In order to support discrete distance metrics, we need to have a +# simultaneous sort which breaks ties on indices when distances are identical. +# The best might be using a std::stable_sort and a Comparator which might need +# an Array of Structures (AoS) instead of the Structure of Arrays (SoA) +# currently used. + +# Utilities functions + # TODO: Factor code also present in `tree._utils` or use `libc.math.log2` directly cdef inline double log(double x) nogil: return ln(x) / ln(2.0) - -cdef inline void simultaneous_swap( - floating* Xf, - cnp.intp_t* samples, +cdef inline void _simultaneous_swap( + floating* values, + cnp.intp_t* indices, cnp.intp_t i, cnp.intp_t j, ) nogil: # Helper for sort - Xf[i], Xf[j] = Xf[j], Xf[i] - samples[i], samples[j] = samples[j], samples[i] + values[i], values[j] = values[j], values[i] + indices[i], indices[j] = indices[j], indices[i] + +cdef inline floating _median3( + floating* values, + cnp.intp_t size, +) nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef floating a = values[0], b = values[size / 2], c = values[size - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + +cdef inline void _sift_down( + floating* values, + cnp.intp_t* indices, + cnp.intp_t start, + cnp.intp_t end, +) nogil: + # Restore heap order in values[start:end] by moving the max element to start. + cdef cnp.intp_t child, maxind, root + + root = start + while True: + child = root * 2 + 1 + + # find max of root, left child, right child + maxind = root + if child < end and values[maxind] < values[child]: + maxind = child + if child + 1 < end and values[maxind] < values[child + 1]: + maxind = child + 1 + + if maxind == root: + break + else: + _simultaneous_swap(values, indices, root, maxind) + root = maxind -cdef int simultaneous_sort( +# Sorting functions + +cdef inline void simultaneous_introsort( + floating* values, + cnp.intp_t* indices, + cnp.intp_t size, +) nogil: + # Sort a Structure of Arrays pointed consisting of arrays of values and indices, + # simultaneously, based on the values. Algorithm: Introsort (Musser, SP&E, 1997). + if size == 0: + return + cdef int maxd = 2 * log(size) + _simultaneous_introsort(values, indices, size, maxd) + + +cdef int simultaneous_quick_sort( floating* values, cnp.intp_t* indices, cnp.intp_t size, @@ -39,11 +109,6 @@ cdef int simultaneous_sort( Arrays are manipulated via a pointer to there first element and their size as to ease the processing of dynamically allocated buffers. """ - # TODO: In order to support discrete distance metrics, we need to have a - # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::stable_sort and a Comparator which might need - # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) - # currently used. cdef: cnp.intp_t pivot_idx, i, store_idx floating pivot_val @@ -53,14 +118,14 @@ cdef int simultaneous_sort( pass elif size == 2: if values[0] > values[1]: - simultaneous_swap(values, indices, 0, 1) + _simultaneous_swap(values, indices, 0, 1) elif size == 3: if values[0] > values[1]: - simultaneous_swap(values, indices, 0, 1) + _simultaneous_swap(values, indices, 0, 1) if values[1] > values[2]: - simultaneous_swap(values, indices, 1, 2) + _simultaneous_swap(values, indices, 1, 2) if values[0] > values[1]: - simultaneous_swap(values, indices, 0, 1) + _simultaneous_swap(values, indices, 0, 1) else: # Determine the pivot using the median-of-three rule. # The smallest of the three is moved to the beginning of the array, @@ -68,11 +133,11 @@ cdef int simultaneous_sort( # is moved to the pivot index. pivot_idx = size // 2 if values[0] > values[size - 1]: - simultaneous_swap(values, indices, 0, size - 1) + _simultaneous_swap(values, indices, 0, size - 1) if values[size - 1] > values[pivot_idx]: - simultaneous_swap(values, indices, size - 1, pivot_idx) + _simultaneous_swap(values, indices, size - 1, pivot_idx) if values[0] > values[size - 1]: - simultaneous_swap(values, indices, 0, size - 1) + _simultaneous_swap(values, indices, 0, size - 1) pivot_val = values[size - 1] # Partition indices about pivot. At the end of this operation, @@ -81,126 +146,77 @@ cdef int simultaneous_sort( store_idx = 0 for i in range(size - 1): if values[i] < pivot_val: - simultaneous_swap(values, indices, i, store_idx) + _simultaneous_swap(values, indices, i, store_idx) store_idx += 1 - simultaneous_swap(values, indices, store_idx, size - 1) + _simultaneous_swap(values, indices, store_idx, size - 1) pivot_idx = store_idx # Recursively sort each side of the pivot if pivot_idx > 1: - simultaneous_sort(values, indices, pivot_idx) + simultaneous_quick_sort(values, indices, pivot_idx) if pivot_idx + 2 < size: - simultaneous_sort(values + pivot_idx + 1, - indices + pivot_idx + 1, - size - pivot_idx - 1) + simultaneous_quick_sort(values + pivot_idx + 1, + indices + pivot_idx + 1, + size - pivot_idx - 1) return 0 - -# Sort n-element arrays pointed to by Xf and samples, simultaneously, -# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(floating* Xf, cnp.intp_t* samples, cnp.intp_t n) nogil: - if n == 0: - return - cdef int maxd = 2 * log(n) - introsort(Xf, samples, n, maxd) - - -cdef inline floating median3(floating* Xf, cnp.intp_t n) nogil: - # Median of three pivot selection, after Bentley and McIlroy (1993). - # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef floating a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] - if a < b: - if b < c: - return b - elif a < c: - return c - else: - return a - elif b < c: - if a < c: - return a - else: - return c - else: - return b - - # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(floating* Xf, cnp.intp_t *samples, cnp.intp_t n, int maxd) nogil: +cdef void _simultaneous_introsort( + floating* values, + cnp.intp_t* indices, + cnp.intp_t size, + int maxd, +) nogil: cdef floating pivot cdef cnp.intp_t i, l, r - while n > 1: + while size > 1: if maxd <= 0: # max depth limit exceeded ("gone quadratic") - heapsort(Xf, samples, n) + simultaneous_heapsort(values, indices, size) return maxd -= 1 - pivot = median3(Xf, n) + pivot = _median3(values, size) # Three-way partition. i = l = 0 - r = n + r = size while i < r: - if Xf[i] < pivot: - simultaneous_swap(Xf, samples, i, l) + if values[i] < pivot: + _simultaneous_swap(values, indices, i, l) i += 1 l += 1 - elif Xf[i] > pivot: + elif values[i] > pivot: r -= 1 - simultaneous_swap(Xf, samples, i, r) + _simultaneous_swap(values, indices, i, r) else: i += 1 - introsort(Xf, samples, l, maxd) - Xf += r - samples += r - n -= r + _simultaneous_introsort(values, indices, l, maxd) + values += r + indices += r + size -= r - -cdef inline void sift_down( - floating* Xf, - cnp.intp_t* samples, - cnp.intp_t start, - cnp.intp_t end, +cdef void simultaneous_heapsort( + floating* values, + cnp.intp_t* indices, + cnp.intp_t size, ) nogil: - # Restore heap order in Xf[start:end] by moving the max element to start. - cdef cnp.intp_t child, maxind, root - - root = start - while True: - child = root * 2 + 1 - - # find max of root, left child, right child - maxind = root - if child < end and Xf[maxind] < Xf[child]: - maxind = child - if child + 1 < end and Xf[maxind] < Xf[child + 1]: - maxind = child + 1 - - if maxind == root: - break - else: - simultaneous_swap(Xf, samples, root, maxind) - root = maxind - - -cdef void heapsort(floating* Xf, cnp.intp_t* samples, cnp.intp_t n) nogil: cdef cnp.intp_t start, end # heapify - start = (n - 2) / 2 - end = n + start = (size - 2) / 2 + end = size while True: - sift_down(Xf, samples, start, end) + _sift_down(values, indices, start, end) if start == 0: break start -= 1 # sort by shrinking the heap, putting the max element immediately after it - end = n - 1 + end = size - 1 while end > 0: - simultaneous_swap(Xf, samples, 0, end) - sift_down(Xf, samples, 0, end) + _simultaneous_swap(values, indices, 0, end) + _sift_down(values, indices, 0, end) end = end - 1 From 4d81dd1e52ec04fc5401cb765023b261aae60db9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Feb 2023 17:14:00 +0100 Subject: [PATCH 07/18] MAINT Make signature of sorting utilities uniform --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 6 +++--- .../_radius_neighbors.pyx.tp | 6 +++--- sklearn/utils/_sorting.pxd | 8 +++++++- sklearn/utils/_sorting.pyx | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index d85534e5513d8..8c970ce4437ba 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -6,7 +6,7 @@ from cython cimport final from cython.parallel cimport parallel, prange from ...utils._heap cimport heap_push -from ...utils._sorting cimport simultaneous_quick_sort +from ...utils._sorting cimport simultaneous_quick_sort as _simultaneous_sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np @@ -194,7 +194,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. for idx in range(X_end - X_start): - simultaneous_quick_sort( + _simultaneous_sort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k @@ -278,7 +278,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main in ascending order w.r.t the distances. # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_quick_sort( + _simultaneous_sort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], self.k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 3b39626ff3bf4..b2cc2e246e70d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -8,7 +8,7 @@ from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from ...utils._sorting cimport simultaneous_quick_sort +from ...utils._sorting cimport simultaneous_quick_sort as _simultaneous_sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._vector_sentinel cimport vector_to_nd_array @@ -221,7 +221,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): - simultaneous_quick_sort( + _simultaneous_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() @@ -292,7 +292,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sort in parallel in ascending order w.r.t the distances if requested. if self.sort_results: for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_quick_sort( + _simultaneous_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index 96ec1d8375865..114591baa2b03 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -2,7 +2,7 @@ cimport numpy as cnp from cython cimport floating -cdef int simultaneous_quick_sort( +cdef void simultaneous_quick_sort( floating* values, cnp.intp_t* indices, cnp.intp_t size, @@ -13,3 +13,9 @@ cdef void simultaneous_introsort( cnp.intp_t* indices, cnp.intp_t size, ) nogil + +cdef void simultaneous_heapsort( + floating* values, + cnp.intp_t* indices, + cnp.intp_t size, +) nogil diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index 30963e37dfb3a..65e024d66e3fa 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -89,7 +89,7 @@ cdef inline void simultaneous_introsort( _simultaneous_introsort(values, indices, size, maxd) -cdef int simultaneous_quick_sort( +cdef void simultaneous_quick_sort( floating* values, cnp.intp_t* indices, cnp.intp_t size, @@ -158,7 +158,7 @@ cdef int simultaneous_quick_sort( simultaneous_quick_sort(values + pivot_idx + 1, indices + pivot_idx + 1, size - pivot_idx - 1) - return 0 + # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). From 8176a54d037b53e9ab851ab38ca4e977f3d00450 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 14 Feb 2023 17:55:21 +0100 Subject: [PATCH 08/18] TST Add sanity check test --- .../neighbors/tests/test_neighbors_tree.py | 26 ----------------- sklearn/utils/_sorting.pyx | 21 ++++++++++++++ sklearn/utils/tests/test_sorting.py | 28 +++++++++++++++++++ 3 files changed, 49 insertions(+), 26 deletions(-) create mode 100644 sklearn/utils/tests/test_sorting.py diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index 85d578c271faa..b7f63b694d252 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -13,13 +13,11 @@ DTYPE, ITYPE, NeighborsHeap as NeighborsHeapBT, - simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( KDTree, NeighborsHeap as NeighborsHeapKDT, - simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, ) @@ -190,30 +188,6 @@ def test_node_heap(nodeheap_sort, n_nodes=50): assert_array_almost_equal(vals[i1], vals2) -@pytest.mark.parametrize( - "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt] -) -def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201): - rng = check_random_state(0) - dist = rng.random_sample((n_rows, n_pts)).astype(DTYPE, copy=False) - ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(ITYPE, copy=False) - - dist2 = dist.copy() - ind2 = ind.copy() - - # simultaneous sort rows using function - simultaneous_sort(dist, ind) - - # simultaneous sort rows using numpy - i = np.argsort(dist2, axis=1) - row_ind = np.arange(n_rows)[:, None] - dist2 = dist2[row_ind, i] - ind2 = ind2[row_ind, i] - - assert_array_almost_equal(dist, dist2) - assert_array_almost_equal(ind, ind2) - - @pytest.mark.parametrize("Cls", [KDTree, BallTree]) def test_gaussian_kde(Cls, n_samples=1000): # Compare gaussian KDE results to scipy.stats.gaussian_kde diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index 65e024d66e3fa..b3835e06f359c 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -15,6 +15,27 @@ from libc.math cimport log as ln cdef inline double log(double x) nogil: return ln(x) / ln(2.0) +def _simultaneous_sort( + floating[:] values, + cnp.intp_t[:] indices, + kind=None, +): + """Interface for testing purposes.""" + cdef cnp.intp_t size = indices.shape[0] + + if kind is None: + kind = "introsort" + + if kind == "introsort": + return simultaneous_introsort(&values[0], &indices[0], size) + + if kind == "quicksort": + return simultaneous_quick_sort(&values[0], &indices[0], size) + + if kind == "heapsort": + return simultaneous_heapsort(&values[0], &indices[0], size) + + cdef inline void _simultaneous_swap( floating* values, cnp.intp_t* indices, diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py new file mode 100644 index 0000000000000..7d05ce92e323e --- /dev/null +++ b/sklearn/utils/tests/test_sorting.py @@ -0,0 +1,28 @@ +import pytest +import numpy as np +from numpy.testing import assert_array_almost_equal + +from sklearn.utils._sorting import _simultaneous_sort + +from sklearn.utils import check_random_state + + +@pytest.mark.parametrize("kind", ["introsort", "heapsort", "quick_sort"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_simultaneous_sort(kind, dtype, n_pts=201): + # Sort sanity check + rng = check_random_state(0) + values = rng.random_sample(n_pts).astype(dtype, copy=False) + indices = np.arange(n_pts).astype(np.int64, copy=False) + + values_2 = values.copy() + indices_2 = indices.copy() + + _simultaneous_sort(values, indices, kind=kind) + + sorted_indices = np.argsort(values_2) + values_2 = values_2[sorted_indices] + indices_2 = indices_2[sorted_indices] + + assert_array_almost_equal(values, values_2) + assert_array_almost_equal(indices, indices_2) From a73cb9b41f94ab506f9641ee015bd56e98a620d7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Mar 2023 10:29:22 +0100 Subject: [PATCH 09/18] Correct _simultaneous_sort --- sklearn/utils/_sorting.pyx | 3 ++- sklearn/utils/tests/test_sorting.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index f1e011445a302..35a3d4884b790 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -29,12 +29,13 @@ def _simultaneous_sort( if kind == "introsort": return simultaneous_introsort(&values[0], &indices[0], size) - if kind == "quicksort": + if kind == "quick_sort": return simultaneous_quick_sort(&values[0], &indices[0], size) if kind == "heapsort": return simultaneous_heapsort(&values[0], &indices[0], size) + raise ValueError(f"Currently kind='{kind}', but kind must be in ('introsort', 'quick_sort', 'heapsort').") cdef inline void _simultaneous_swap( floating* values, diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py index 7d05ce92e323e..cff77b24ad4f2 100644 --- a/sklearn/utils/tests/test_sorting.py +++ b/sklearn/utils/tests/test_sorting.py @@ -7,6 +7,15 @@ from sklearn.utils import check_random_state +def test_simultaneous_sort_wrong_usage(): + rng = check_random_state(0) + values = rng.random_sample(10).astype(np.float64, copy=False) + indices = np.arange(10).astype(np.int64, copy=False) + + with pytest.raises(ValueError, match="Currently kind='not_existent'"): + _simultaneous_sort(values, indices, kind="not_existent") + + @pytest.mark.parametrize("kind", ["introsort", "heapsort", "quick_sort"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_simultaneous_sort(kind, dtype, n_pts=201): From 16d579f4997fbc7d3a69d0bf71253b6bf3db7472 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Mar 2023 10:32:19 +0100 Subject: [PATCH 10/18] Rename aliases --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 6 +++--- .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index f1b2d1f9f9a3b..ff10e889c1888 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -6,7 +6,7 @@ from cython cimport final from cython.parallel cimport parallel, prange from ...utils._heap cimport heap_push -from ...utils._sorting cimport simultaneous_quick_sort as _simultaneous_sort +from ...utils._sorting cimport simultaneous_quick_sort as simultaneous_sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t import numpy as np @@ -194,7 +194,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. for idx in range(X_end - X_start): - _simultaneous_sort( + simultaneous_sort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k @@ -278,7 +278,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main in ascending order w.r.t the distances. # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): - _simultaneous_sort( + simultaneous_sort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], self.k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 564dd4ddbe2ce..52c35903e42fa 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -8,7 +8,7 @@ from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from ...utils._sorting cimport simultaneous_quick_sort as _simultaneous_sort +from ...utils._sorting cimport simultaneous_quick_sort as simultaneous_sort from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._vector_sentinel cimport vector_to_nd_array @@ -221,7 +221,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): - _simultaneous_sort( + simultaneous_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() @@ -292,7 +292,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sort in parallel in ascending order w.r.t the distances if requested. if self.sort_results: for idx in prange(self.n_samples_X, schedule='static'): - _simultaneous_sort( + simultaneous_sort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() From f6c5f4b2baf56114ea60e882dc1241ff6b056be8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Mar 2023 10:39:07 +0100 Subject: [PATCH 11/18] Complete TODO comment --- sklearn/utils/_sorting.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index 35a3d4884b790..df6233215dc0c 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -7,7 +7,8 @@ from libc.math cimport log as ln # simultaneous sort which breaks ties on indices when distances are identical. # The best might be using a std::stable_sort and a Comparator which might need # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) -# currently used. +# currently used. Alternatively, we can find a stable algorithm for SoA and +# adapt it so that it is simultaneous. # Utilities functions From 83a82aa5074d016f26a82db49db0236a1f0920a0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Mar 2023 13:43:22 +0100 Subject: [PATCH 12/18] TST Use np.intp over np.in64 for portability --- sklearn/utils/tests/test_sorting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py index cff77b24ad4f2..c468e8e8c9167 100644 --- a/sklearn/utils/tests/test_sorting.py +++ b/sklearn/utils/tests/test_sorting.py @@ -10,7 +10,7 @@ def test_simultaneous_sort_wrong_usage(): rng = check_random_state(0) values = rng.random_sample(10).astype(np.float64, copy=False) - indices = np.arange(10).astype(np.int64, copy=False) + indices = np.arange(10).astype(np.intp, copy=False) with pytest.raises(ValueError, match="Currently kind='not_existent'"): _simultaneous_sort(values, indices, kind="not_existent") @@ -22,7 +22,7 @@ def test_simultaneous_sort(kind, dtype, n_pts=201): # Sort sanity check rng = check_random_state(0) values = rng.random_sample(n_pts).astype(dtype, copy=False) - indices = np.arange(n_pts).astype(np.int64, copy=False) + indices = np.arange(n_pts).astype(np.intp, copy=False) values_2 = values.copy() indices_2 = indices.copy() From f347471a844c7bdf49c36f71c0f6b463794bcb2a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Mar 2023 19:48:51 +0100 Subject: [PATCH 13/18] Work-around Cython type inference limitations --- sklearn/tree/_splitter.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b7459e6663a07..598d560632297 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,7 +21,9 @@ import numpy as np from scipy.sparse import csc_matrix +# TODO: when Cython>=3.0 is used, remove the casts in call to sort. from ..utils._sorting cimport simultaneous_introsort as sort +from ..utils._typedefs cimport intp_t from ._utils cimport log from ._utils cimport rand_int @@ -631,7 +633,7 @@ cdef class DensePartitioner: # effectively. for i in range(self.start, self.end): feature_values[i] = X[samples[i], current_feature] - sort(&feature_values[self.start], &samples[self.start], self.end - self.start) + sort(&feature_values[self.start], &samples[self.start], self.end - self.start) cdef inline void find_min_max( self, @@ -790,9 +792,9 @@ cdef class SparsePartitioner: self.extract_nnz(current_feature) # Sort the positive and negative parts of `feature_values` - sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start) + sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start) if self.start_positive < self.end: - sort(&feature_values[self.start_positive], &samples[self.start_positive], + sort(&feature_values[self.start_positive], &samples[self.start_positive], self.end - self.start_positive) # Update index_to_samples to take into account the sort From a1d7194cbd87db68d63b7d73bd799592b8bf3b8d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Mar 2023 10:23:57 +0100 Subject: [PATCH 14/18] DOC Document _simultaneous_sort --- sklearn/utils/_sorting.pyx | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index ff73e0e5fc5c3..edfa181c11ef9 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -17,11 +17,31 @@ cdef inline double log(double x) nogil: return ln(x) / ln(2.0) def _simultaneous_sort( - floating[:] values, - intp_t[:] indices, + floating[::1] values, + intp_t[::1] indices, kind=None, ): - """Interface for testing purposes.""" + """Interface to simultaneous sorting algorithms. + + `values` and `indices` are sorted simultaneously based on increasing + order of elements in `values`. + + This interface exposes Cython implementations but is only meant to be + used for testing purposes. + + Parameters + ---------- + values : ndarray + 1-D Array of floating values to sort. + + indices : ndarray + Associated 1-D array of values' indices to sort. + + kind : str, default=None + Kind of the sorting algorithm to use. + Valid values for `kind` are in {'introsort', 'quick_sort', 'heapsort'}. + If None, 'introsort' is used. + """ cdef intp_t size = indices.shape[0] if kind is None: From d3f33c6bd6bb914fadd82dfe520a8db20fb14129 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 24 Mar 2023 09:40:26 +0100 Subject: [PATCH 15/18] DOC Rename quick_sort to quicksort --- .../_pairwise_distances_reduction/_argkmin.pyx.tp | 2 +- .../_radius_neighbors.pyx.tp | 2 +- sklearn/neighbors/_binary_tree.pxi | 2 +- sklearn/utils/_sorting.pxd | 2 +- sklearn/utils/_sorting.pyx | 14 +++++++------- sklearn/utils/tests/test_sorting.py | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index f311784c58963..3014d5e8665ba 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -4,7 +4,7 @@ from cython cimport final from cython.parallel cimport parallel, prange from ...utils._heap cimport heap_push -from ...utils._sorting cimport simultaneous_quick_sort as simultaneous_sort +from ...utils._sorting cimport simultaneous_quicksort as simultaneous_sort from ...utils._typedefs cimport intp_t, float64_t import numpy as np diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 2b26d817f3810..3e0744c795e6c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -8,7 +8,7 @@ from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from ...utils._sorting cimport simultaneous_quick_sort as simultaneous_sort +from ...utils._sorting cimport simultaneous_quicksort as simultaneous_sort from ...utils._typedefs cimport intp_t, float64_t from ...utils._vector_sentinel cimport vector_to_nd_array diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 14106ee2afc1e..196e7a9ff47c7 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -163,7 +163,7 @@ from ._partition_nodes cimport partition_node_indices from ..utils import check_array from ..utils._typedefs cimport float64_t, intp_t from ..utils._heap cimport heap_push -from ..utils._sorting cimport simultaneous_quick_sort as _simultaneous_sort +from ..utils._sorting cimport simultaneous_quicksort as _simultaneous_sort cnp.import_array() diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd index 44f91bc5a06ae..92dacd5e0d82c 100644 --- a/sklearn/utils/_sorting.pxd +++ b/sklearn/utils/_sorting.pxd @@ -2,7 +2,7 @@ from cython cimport floating from ._typedefs cimport intp_t -cdef void simultaneous_quick_sort( +cdef void simultaneous_quicksort( floating* values, intp_t* indices, intp_t size, diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index edfa181c11ef9..d21c64b460fd0 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -39,7 +39,7 @@ def _simultaneous_sort( kind : str, default=None Kind of the sorting algorithm to use. - Valid values for `kind` are in {'introsort', 'quick_sort', 'heapsort'}. + Valid values for `kind` are in {'introsort', 'quicksort', 'heapsort'}. If None, 'introsort' is used. """ cdef intp_t size = indices.shape[0] @@ -50,13 +50,13 @@ def _simultaneous_sort( if kind == "introsort": return simultaneous_introsort(&values[0], &indices[0], size) - if kind == "quick_sort": - return simultaneous_quick_sort(&values[0], &indices[0], size) + if kind == "quicksort": + return simultaneous_quicksort(&values[0], &indices[0], size) if kind == "heapsort": return simultaneous_heapsort(&values[0], &indices[0], size) - raise ValueError(f"Currently kind='{kind}', but kind must be in ('introsort', 'quick_sort', 'heapsort').") + raise ValueError(f"Currently kind='{kind}', but kind must be in ('introsort', 'quicksort', 'heapsort').") cdef inline void _simultaneous_swap( floating* values, @@ -132,7 +132,7 @@ cdef inline void simultaneous_introsort( _simultaneous_introsort(values, indices, size, maxd) -cdef void simultaneous_quick_sort( +cdef void simultaneous_quicksort( floating* values, intp_t* indices, intp_t size, @@ -196,9 +196,9 @@ cdef void simultaneous_quick_sort( # Recursively sort each side of the pivot if pivot_idx > 1: - simultaneous_quick_sort(values, indices, pivot_idx) + simultaneous_quicksort(values, indices, pivot_idx) if pivot_idx + 2 < size: - simultaneous_quick_sort(values + pivot_idx + 1, + simultaneous_quicksort(values + pivot_idx + 1, indices + pivot_idx + 1, size - pivot_idx - 1) diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py index c468e8e8c9167..61ba9f82f569b 100644 --- a/sklearn/utils/tests/test_sorting.py +++ b/sklearn/utils/tests/test_sorting.py @@ -16,7 +16,7 @@ def test_simultaneous_sort_wrong_usage(): _simultaneous_sort(values, indices, kind="not_existent") -@pytest.mark.parametrize("kind", ["introsort", "heapsort", "quick_sort"]) +@pytest.mark.parametrize("kind", ["introsort", "heapsort", "quicksort"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_simultaneous_sort(kind, dtype, n_pts=201): # Sort sanity check From b25345a520924c3df9c0dec61ee588bb5524d2fe Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 24 Mar 2023 09:44:22 +0100 Subject: [PATCH 16/18] TST Update tests Co-authored-by: Olivier Grisel --- sklearn/utils/tests/test_sorting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py index 61ba9f82f569b..e5488080bed9c 100644 --- a/sklearn/utils/tests/test_sorting.py +++ b/sklearn/utils/tests/test_sorting.py @@ -12,15 +12,15 @@ def test_simultaneous_sort_wrong_usage(): values = rng.random_sample(10).astype(np.float64, copy=False) indices = np.arange(10).astype(np.intp, copy=False) - with pytest.raises(ValueError, match="Currently kind='not_existent'"): - _simultaneous_sort(values, indices, kind="not_existent") + with pytest.raises(ValueError, match="Currently kind='nonexistent'"): + _simultaneous_sort(values, indices, kind="nonexistent") @pytest.mark.parametrize("kind", ["introsort", "heapsort", "quicksort"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_simultaneous_sort(kind, dtype, n_pts=201): +def test_simultaneous_sort(kind, dtype, global_random_seed, n_pts=201): # Sort sanity check - rng = check_random_state(0) + rng = check_random_state(global_random_seed) values = rng.random_sample(n_pts).astype(dtype, copy=False) indices = np.arange(n_pts).astype(np.intp, copy=False) From 3f65032e6a3eee7a365e513af8860aa4c1556146 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 24 Mar 2023 17:40:28 +0100 Subject: [PATCH 17/18] Use libc.math.log2 directly --- sklearn/utils/_sorting.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx index d21c64b460fd0..0e04b2efa9ea5 100644 --- a/sklearn/utils/_sorting.pyx +++ b/sklearn/utils/_sorting.pyx @@ -1,5 +1,5 @@ from cython cimport floating -from libc.math cimport log as ln +from libc.math cimport log2 from ._typedefs cimport intp_t @@ -12,9 +12,6 @@ from ._typedefs cimport intp_t # Utilities functions -# TODO: Factor code also present in `tree._utils` or use `libc.math.log2` directly -cdef inline double log(double x) nogil: - return ln(x) / ln(2.0) def _simultaneous_sort( floating[::1] values, @@ -128,7 +125,7 @@ cdef inline void simultaneous_introsort( # simultaneously, based on the values. Algorithm: Introsort (Musser, SP&E, 1997). if size == 0: return - cdef int maxd = 2 * log(size) + cdef int maxd = 2 * log2(size) _simultaneous_introsort(values, indices, size, maxd) From f7bd05d270b8fbf650a13becfaab014dab727193 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 24 Mar 2023 17:44:41 +0100 Subject: [PATCH 18/18] Do not use aliases Co-authored-by: jeremiedbb --- .../_argkmin.pyx.tp | 6 +++--- .../_radius_neighbors.pyx.tp | 6 +++--- sklearn/neighbors/_binary_tree.pxi | 21 ++++++++++++------- sklearn/tree/_splitter.pyx | 2 +- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 3014d5e8665ba..f2d122c6f1826 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -4,7 +4,7 @@ from cython cimport final from cython.parallel cimport parallel, prange from ...utils._heap cimport heap_push -from ...utils._sorting cimport simultaneous_quicksort as simultaneous_sort +from ...utils._sorting cimport simultaneous_quicksort from ...utils._typedefs cimport intp_t, float64_t import numpy as np @@ -184,7 +184,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. for idx in range(X_end - X_start): - simultaneous_sort( + simultaneous_quicksort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, self.heaps_indices_chunks[thread_num] + idx * self.k, self.k @@ -268,7 +268,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): # Sorting the main in ascending order w.r.t the distances. # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( + simultaneous_quicksort( &self.argkmin_distances[idx, 0], &self.argkmin_indices[idx, 0], self.k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 3e0744c795e6c..2d0f8819c4998 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -8,7 +8,7 @@ from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from ...utils._sorting cimport simultaneous_quicksort as simultaneous_sort +from ...utils._sorting cimport simultaneous_quicksort from ...utils._typedefs cimport intp_t, float64_t from ...utils._vector_sentinel cimport vector_to_nd_array @@ -218,7 +218,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): - simultaneous_sort( + simultaneous_quicksort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() @@ -289,7 +289,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Sort in parallel in ascending order w.r.t the distances if requested. if self.sort_results: for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( + simultaneous_quicksort( deref(self.neigh_distances)[idx].data(), deref(self.neigh_indices)[idx].data(), deref(self.neigh_indices)[idx].size() diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 196e7a9ff47c7..7078ea629a4d1 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -163,7 +163,7 @@ from ._partition_nodes cimport partition_node_indices from ..utils import check_array from ..utils._typedefs cimport float64_t, intp_t from ..utils._heap cimport heap_push -from ..utils._sorting cimport simultaneous_quicksort as _simultaneous_sort +from ..utils._sorting cimport simultaneous_quicksort cnp.import_array() @@ -561,7 +561,7 @@ cdef class NeighborsHeap: """simultaneously sort the distances and indices""" cdef intp_t row for row in range(self.distances.shape[0]): - _simultaneous_sort( + simultaneous_quicksort( values=&self.distances[row, 0], indices=&self.indices[row, 0], size=self.distances.shape[1], @@ -1305,8 +1305,11 @@ cdef class BinaryTree: continue if sort_results: - _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0], - counts[i]) + simultaneous_quicksort( + &dist_arr_i[0], + &idx_arr_i[0], + counts[i], + ) # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy() indices[i] = malloc(counts[i] * sizeof(intp_t)) @@ -2388,15 +2391,17 @@ def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays This python wrapper exists primarily to enable unit testing - of the _simultaneous_sort C routine. + of the simultaneous_quicksort C routine. """ assert distances.shape[0] == indices.shape[0] assert distances.shape[1] == indices.shape[1] cdef intp_t row for row in range(distances.shape[0]): - _simultaneous_sort(&distances[row, 0], - &indices[row, 0], - distances.shape[1]) + simultaneous_quicksort( + &distances[row, 0], + &indices[row, 0], + distances.shape[1], + ) def nodeheap_sort(float64_t[::1] vals): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 598d560632297..9c1d8d369e5ce 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,8 +21,8 @@ import numpy as np from scipy.sparse import csc_matrix -# TODO: when Cython>=3.0 is used, remove the casts in call to sort. from ..utils._sorting cimport simultaneous_introsort as sort +# TODO: when Cython>=3.0 is used, remove the casts in call to sort. from ..utils._typedefs cimport intp_t from ._utils cimport log