diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 6542bc680c58c..b3f02cd74d7b5 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -142,7 +142,6 @@ # BinaryTree tree2, ITYPE_t i_node2): # """Compute the maximum distance between two nodes""" -cimport cython cimport numpy as np from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax @@ -151,21 +150,21 @@ from libc.string cimport memcpy import numpy as np import warnings -from ..utils import check_array - -from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from sklearn.utils._typedefs import DTYPE, ITYPE from ..metrics._dist_metrics cimport ( DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, - euclidean_rdist_to_dist, ) from ._partition_nodes cimport partition_node_indices +from ..utils import check_array +from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs import DTYPE, ITYPE +from ..utils._heap cimport simultaneous_sort as _simultaneous_sort, heap_push + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -231,7 +230,7 @@ leaf_size : positive int, default=40 the case that ``n_samples < leaf_size``. metric : str or DistanceMetric object - the distance metric to use for the tree. Default='minkowski' + The distance metric to use for the tree. Default='minkowski' with p=2 (that is, a euclidean metric). See the documentation of the DistanceMetric class for a list of available metrics. {binary_tree}.valid_metrics gives a list of the metrics which @@ -494,27 +493,6 @@ def kernel_norm(h, d, kernel, return_log=False): return np.exp(result) -###################################################################### -# Tree Utility Routines -cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2): - """swap the values at index i1 and i2 of arr""" - cdef DITYPE_t tmp = arr[i1] - arr[i1] = arr[i2] - arr[i2] = tmp - - -cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """swap the values at inex i1 and i2 of both darr and iarr""" - cdef DTYPE_t dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp - - cdef class NeighborsHeap: """A max-heap structure to keep track of distances/indices of neighbors @@ -569,52 +547,11 @@ cdef class NeighborsHeap: cdef int _push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val) nogil except -1: """push (val, i_val) into the given row""" - cdef ITYPE_t i, ic1, ic2, i_swap - cdef ITYPE_t size = self.distances.shape[1] - cdef DTYPE_t* dist_arr = &self.distances[row, 0] - cdef ITYPE_t* ind_arr = &self.indices[row, 0] - - # check if val should be in heap - if val >= dist_arr[0]: - return 0 - - # insert val at position zero - dist_arr[0] = val - ind_arr[0] = i_val - - # descend the heap, swapping values until the max heap criterion is met - i = 0 - while True: - ic1 = 2 * i + 1 - ic2 = ic1 + 1 - - if ic1 >= size: - break - elif ic2 >= size: - if dist_arr[ic1] > val: - i_swap = ic1 - else: - break - elif dist_arr[ic1] >= dist_arr[ic2]: - if val < dist_arr[ic1]: - i_swap = ic1 - else: - break - else: - if val < dist_arr[ic2]: - i_swap = ic2 - else: - break - - dist_arr[i] = dist_arr[i_swap] - ind_arr[i] = ind_arr[i_swap] - - i = i_swap - - dist_arr[i] = val - ind_arr[i] = i_val - - return 0 + cdef: + ITYPE_t size = self.distances.shape[1] + DTYPE_t* dist_arr = &self.distances[row, 0] + ITYPE_t* ind_arr = &self.indices[row, 0] + return heap_push(dist_arr, ind_arr, size, val, i_val) cdef int _sort(self) except -1: """simultaneously sort the distances and indices""" @@ -627,68 +564,6 @@ cdef class NeighborsHeap: distances.shape[1]) return 0 - -cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, - ITYPE_t size) nogil except -1: - """ - Perform a recursive quicksort on the dist array, simultaneously - performing the same swaps on the idx array. The equivalent in - numpy (though quite a bit slower) is - - def simultaneous_sort(dist, idx): - i = np.argsort(dist) - return dist[i], idx[i] - """ - cdef ITYPE_t pivot_idx, i, store_idx - cdef DTYPE_t pivot_val - - # in the small-array case, do things efficiently - if size <= 1: - pass - elif size == 2: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - elif size == 3: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - if dist[1] > dist[2]: - dual_swap(dist, idx, 1, 2) - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - else: - # Determine the pivot using the median-of-three rule. - # The smallest of the three is moved to the beginning of the array, - # the middle (the pivot value) is moved to the end, and the largest - # is moved to the pivot index. - pivot_idx = size / 2 - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - if dist[size - 1] > dist[pivot_idx]: - dual_swap(dist, idx, size - 1, pivot_idx) - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - pivot_val = dist[size - 1] - - # partition indices about pivot. At the end of this operation, - # pivot_idx will contain the pivot value, everything to the left - # will be smaller, and everything to the right will be larger. - store_idx = 0 - for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) - store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) - pivot_idx = store_idx - - # recursively sort each side of the pivot - if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) - if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, - size - pivot_idx - 1) - return 0 - #------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 41ccff25a260e..d5046afd2da2a 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -85,3 +85,20 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) + + +def test_bad_pyfunc_metric(): + def wrong_returned_value(x, y): + return "1" + + def one_arg_func(x): + return 1.0 # pragma: no cover + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors and return a float." + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_returned_value) + + msg = "takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=one_arg_func) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd new file mode 100644 index 0000000000000..c760d900a1c3b --- /dev/null +++ b/sklearn/utils/_heap.pxd @@ -0,0 +1,19 @@ +# Heap routines, used in various Cython implementations. + +from cython cimport floating + +from ._typedefs cimport ITYPE_t + +cdef int simultaneous_sort( + floating* dist, + ITYPE_t* idx, + ITYPE_t size +) nogil + +cdef int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx new file mode 100644 index 0000000000000..03ab33d6e065b --- /dev/null +++ b/sklearn/utils/_heap.pyx @@ -0,0 +1,172 @@ +from cython cimport floating, integral, numeric + +from ._typedefs cimport ITYPE_t + +cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, + ITYPE_t a, ITYPE_t b) nogil: + """Swap the values at index a and b of both darr and iarr""" + cdef floating dtmp = darr[a] + darr[a] = darr[b] + darr[b] = dtmp + + cdef ITYPE_t itmp = iarr[a] + iarr[a] = iarr[b] + iarr[b] = itmp + +cdef int simultaneous_sort( + floating* values, + ITYPE_t* indices, + ITYPE_t size +) nogil: + """ + Perform a recursive quicksort on the values array as to sort them ascendingly. + This simultaneously performs the swaps on both the values and the indices arrays. + + The numpy equivalent is: + + def simultaneous_sort(dist, idx): + i = np.argsort(dist) + return dist[i], idx[i] + + Notes + ----- + Arrays are manipulated via a pointer to there first element and their size + as to ease the processing of dynamically allocated buffers. + """ + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using a std::stable_sort and a Comparator which might need + # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) + # currently used. + cdef: + ITYPE_t pivot_idx, i, store_idx + floating pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + elif size == 3: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + if values[1] > values[2]: + dual_swap(values, indices, 1, 2) + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size // 2 + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + if values[size - 1] > values[pivot_idx]: + dual_swap(values, indices, size - 1, pivot_idx) + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + pivot_val = values[size - 1] + + # Partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if values[i] < pivot_val: + dual_swap(values, indices, i, store_idx) + store_idx += 1 + dual_swap(values, indices, store_idx, size - 1) + pivot_idx = store_idx + + # Recursively sort each side of the pivot + if pivot_idx > 1: + simultaneous_sort(values, indices, pivot_idx) + if pivot_idx + 2 < size: + simultaneous_sort(values + pivot_idx + 1, + indices + pivot_idx + 1, + size - pivot_idx - 1) + return 0 + + +cdef inline int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil: + """Push a tuple (val, val_idx) onto a fixed-size max-heap. + + The max-heap is represented as a Structure of Arrays where: + - values is the array containing the data to construct the heap with + - indices is the array containing the indices (meta-data) of each value + + Notes + ----- + Arrays are manipulated via a pointer to there first element and their size + as to ease the processing of dynamically allocated buffers. + + For instance, in pseudo-code: + + values = [1.2, 0.4, 0.1], + indices = [42, 1, 5], + heap_push( + values=values, + indices=indices, + size=3, + val=0.2, + val_idx=4, + ) + + will modify values and indices inplace, giving at the end of the call: + + values == [0.4, 0.2, 0.1] + indices == [1, 4, 5] + + """ + cdef: + ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx + + # Check if val should be in heap + if val >= values[0]: + return 0 + + # Insert val at position zero + values[0] = val + indices[0] = val_idx + + # Descend the heap, swapping values until the max heap criterion is met + current_idx = 0 + while True: + left_child_idx = 2 * current_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= size: + break + elif right_child_idx >= size: + if values[left_child_idx] > val: + swap_idx = left_child_idx + else: + break + elif values[left_child_idx] >= values[right_child_idx]: + if val < values[left_child_idx]: + swap_idx = left_child_idx + else: + break + else: + if val < values[right_child_idx]: + swap_idx = right_child_idx + else: + break + + values[current_idx] = values[swap_idx] + indices[current_idx] = indices[swap_idx] + + current_idx = swap_idx + + values[current_idx] = val + indices[current_idx] = val_idx + + return 0 diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index ed78ecc5db76f..e3ceab6c52bbf 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -95,6 +95,12 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_heap", + sources=["_heap.pyx"], + libraries=libraries, + ) + config.add_subpackage("tests") return config