8000 MAINT refactor heap routines into utils by jjerphan · Pull Request #21987 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

MAINT refactor heap routines into utils #21987

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Dec 21, 2021
147 changes: 11 additions & 136 deletions sklearn/neighbors/_binary_tree.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@
# BinaryTree tree2, ITYPE_t i_node2):
# """Compute the maximum distance between two nodes"""

cimport cython
cimport numpy as np
from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma
from libc.math cimport fmin, fmax
Expand All @@ -151,21 +150,21 @@ from libc.string cimport memcpy

import numpy as np
import warnings
from ..utils import check_array

from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
from sklearn.utils._typedefs import DTYPE, ITYPE

from ..metrics._dist_metrics cimport (
DistanceMetric,
euclidean_dist,
euclidean_rdist,
euclidean_dist_to_rdist,
euclidean_rdist_to_dist,
)

from ._partition_nodes cimport partition_node_indices

from ..utils import check_array
from ..utils._typedefs cimport DTYPE_t, ITYPE_t
from ..utils._typedefs import DTYPE, ITYPE
from ..utils._heap cimport simultaneous_sort as _simultaneous_sort, heap_push

cdef extern from "numpy/arrayobject.h":
void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)

Expand Down Expand Up @@ -231,7 +230,7 @@ leaf_size : positive int, default=40
the case that ``n_samples < leaf_size``.

metric : str or DistanceMetric object
the distance metric to use for the tree. Default='minkowski'
The distance metric to use for the tree. Default='minkowski'
with p=2 (that is, a euclidean metric). See the documentation
of the DistanceMetric class for a list of available metrics.
{binary_tree}.valid_metrics gives a list of the metrics which
Expand Down Expand Up @@ -494,27 +493,6 @@ def kernel_norm(h, d, kernel, return_log=False):
return np.exp(result)


######################################################################
# Tree Utility Routines
cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2):
"""swap the values at index i1 and i2 of arr"""
cdef DITYPE_t tmp = arr[i1]
arr[i1] = arr[i2]
arr[i2] = tmp


cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr,
ITYPE_t i1, ITYPE_t i2) nogil:
"""swap the values at inex i1 and i2 of both darr and iarr"""
cdef DTYPE_t dtmp = darr[i1]
darr[i1] = darr[i2]
darr[i2] = dtmp

cdef ITYPE_t itmp = iarr[i1]
iarr[i1] = iarr[i2]
iarr[i2] = itmp


cdef class NeighborsHeap:
"""A max-heap structure to keep track of distances/indices of neighbors

Expand Down Expand Up @@ -569,52 +547,11 @@ cdef class NeighborsHeap:
cdef int _push(self, ITYPE_t row, DTYPE_t val,
ITYPE_t i_val) nogil except -1:
"""push (val, i_val) into the given row"""
cdef ITYPE_t i, ic1, ic2, i_swap
cdef ITYPE_t size = self.distances.shape[1]
cdef DTYPE_t* dist_arr = &self.distances[row, 0]
cdef ITYPE_t* ind_arr = &self.indices[row, 0]

# check if val should be in heap
if val >= dist_arr[0]:
return 0

# insert val at position zero
dist_arr[0] = val
ind_arr[0] = i_val

# descend the heap, swapping values until the max heap criterion is met
i = 0
while True:
ic1 = 2 * i + 1
ic2 = ic1 + 1

if ic1 >= size:
break
elif ic2 >= size:
if dist_arr[ic1] > val:
i_swap = ic1
else:
break
elif dist_arr[ic1] >= dist_arr[ic2]:
if val < dist_arr[ic1]:
i_swap = ic1
else:
break
else:
if val < dist_arr[ic2]:
i_swap = ic2
else:
break

dist_arr[i] = dist_arr[i_swap]
ind_arr[i] = ind_arr[i_swap]

i = i_swap

dist_arr[i] = val
ind_arr[i] = i_val

return 0
cdef:
ITYPE_t size = self.distances.shape[1]
DTYPE_t* dist_arr = &self.distances[row, 0]
ITYPE_t* ind_arr = &self.indices[row, 0]
return heap_push(dist_arr, ind_arr, size, val, i_val)

cdef int _sort(self) except -1:
"""simultaneously sort the distances and indices"""
Expand All @@ -627,68 +564,6 @@ cdef class NeighborsHeap:
distances.shape[1])
return 0


cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx,
ITYPE_t size) nogil except -1:
"""
Perform a recursive quicksort on the dist array, simultaneously
performing the same swaps o 10000 n the idx array. The equivalent in
numpy (though quite a bit slower) is

def simultaneous_sort(dist, idx):
i = np.argsort(dist)
return dist[i], idx[i]
"""
cdef ITYPE_t pivot_idx, i, store_idx
cdef DTYPE_t pivot_val

# in the small-array case, do things efficiently
if size <= 1:
pass
elif size == 2:
if dist[0] > dist[1]:
dual_swap(dist, idx, 0, 1)
elif size == 3:
if dist[0] > dist[1]:
dual_swap(dist, idx, 0, 1)
if dist[1] > dist[2]:
dual_swap(dist, idx, 1, 2)
if dist[0] > dist[1]:
dual_swap(dist, idx, 0, 1)
else:
# Determine the pivot using the median-of-three rule.
# The smallest of the three is moved to the beginning of the array,
# the middle (the pivot value) is moved to the end, and the largest
# is moved to the pivot index.
pivot_idx = size / 2
if dist[0] > dist[size - 1]:
dual_swap(dist, idx, 0, size - 1)
if dist[size - 1] > dist[pivot_idx]:
dual_swap(dist, idx, size - 1, pivot_idx)
if dist[0] > dist[size - 1]:
dual_swap(dist, idx, 0, size - 1)
pivot_val = dist[size - 1]

# partition indices about pivot. At the end of this operation,
# pivot_idx will contain the pivot value, everything to the left
# will be smaller, and everything to the right will be larger.
store_idx = 0
for i in range(size - 1):
if dist[i] < pivot_val:
dual_swap(dist, idx, i, store_idx)
store_idx += 1
dual_swap(dist, idx, store_idx, size - 1)
pivot_idx = store_idx

# recursively sort each side of the pivot
if pivot_idx > 1:
_simultaneous_sort(dist, idx, pivot_idx)
if pivot_idx + 2 < size:
_simultaneous_sort(dist + pivot_idx + 1,
idx + pivot_idx + 1,
size - pivot_idx - 1)
return 0

#------------------------------------------------------------
# find_node_split_dim:
# this computes the equivalent of
Expand Down
17 changes: 17 additions & 0 deletions sklearn/neighbors/tests/test_ball_tree.py
67F4
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,20 @@ def test_array_object_type():
X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
with pytest.raises(ValueError, match="setting an array element with a sequence"):
BallTree(X)


def test_bad_pyfunc_metric():
def wrong_returned_value(x, y):
return "1"

def one_arg_func(x):
return 1.0 # pragma: no cover

X = np.ones((5, 2))
msg = "Custom distance function must accept two vectors and return a float."
with pytest.raises(TypeError, match=msg):
BallTree(X, metric=wrong_returned_value)

msg = "takes 1 positional argument but 2 were given"
with pytest.raises(TypeError, match=msg):
BallTree(X, metric=one_arg_func)
19 changes: 19 additions & 0 deletions sklearn/utils/_heap.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Heap routines, used in various Cython implementations.

from cython cimport floating

from ._typedefs cimport ITYPE_t

cdef int simultaneous_sort(
floating* dist,
ITYPE_t* idx,
ITYPE_t size
) nogil

cdef int heap_push(
floating* values,
ITYPE_t* indices,
ITYPE_t size,
floating val,
ITYPE_t val_idx,
) nogil
Loading
0