Adding base criterion · scikit-learn/scikit-learn@63c62c6 · GitHub
[go: up one dir, main page]

Skip to content

Commit 63c62c6

Browse files
committed
Adding base criterion
1 parent 98a3fdf commit 63c62c6

File tree

4 files changed

+1197
-49
lines changed

4 files changed

+1197
-49
lines changed
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
cimport numpy as cnp
2+
3+
from libcpp.memory cimport shared_ptr
4+
from libcpp.vector cimport vector
5+
from cython cimport final
6+
7+
from ...utils._typedefs cimport ITYPE_t, DTYPE_t
8+
9+
cnp.import_array()
10+
11+
######################
12+
## std::vector to np.ndarray coercion
13+
# As type covariance is not supported for C++ containers via Cython,
14+
# we need to redefine fused types.
15+
# Fused type over a flat std::vector whose elements are either ITYPE_t or
# DTYPE_t. Declared explicitly because Cython does not support type
# covariance for C++ containers.
ctypedef fused vector_DITYPE_t:
    vector[ITYPE_t]
    vector[DTYPE_t]
18+
19+
20+
# Fused type over a std::vector of std::vectors whose innermost elements are
# either ITYPE_t or DTYPE_t. Declared explicitly because Cython does not
# support type covariance for C++ containers.
ctypedef fused vector_vector_DITYPE_t:
    vector[vector[ITYPE_t]]
    vector[vector[DTYPE_t]]
23+
24+
# Declaration only (no body is visible here).
# Takes a shared pointer to a vector of vectors (either element type of the
# fused vector_vector_DITYPE_t) and returns a 1-D ndarray of Python objects.
cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
    shared_ptr[vector_vector_DITYPE_t] vecs,
)
27+
28+
#####################
29+
30+
from ._base cimport BaseDistanceReducer64
31+
from ._gemm_term_computer cimport GEMMTermComputer64
32+
33+
cdef class RadiusNeighbors64(BaseDistanceReducer64):
    """
    64bit implementation of BaseDistanceReducer64 for the
    `RadiusNeighbors` reduction.
    """

    cdef:
        DTYPE_t radius

        # DistanceMetric64 works with rank-preserving surrogate distances
        # (computed via rdist), which are proxies requiring fewer
        # computations. r_radius is the equivalent of the radius in that
        # surrogate space, so that it can be compared against the vectors'
        # surrogate distances.
        DTYPE_t r_radius

        # Neighbors indices and distances are returned as np.ndarrays of
        # np.ndarrays.
        #
        # Resizable buffers are wanted during the computation and are wrapped
        # into numpy arrays at the end; std::vector is a handy container for
        # working efficiently with such buffers.
        #
        # A std::vector's buffer address is reachable via std::vector::data,
        # but the buffer cannot be stolen: its lifetime is tied to its
        # std::vector and it is deallocated when the std::vector is.
        #
        # To work around this, the std::vectors are dynamically allocated and
        # encapsulated in a StdVectorSentinel, which is responsible for
        # freeing them when the associated np.ndarray is freed.
        #
        # Shared pointers (shared_ptr) are used for safer memory management.
        # Unique pointers (unique_ptr) cannot be used because the
        # datastructures are shared across threads for parallel_on_X;
        # see _parallel_on_X_init_chunk.
        shared_ptr[vector[vector[ITYPE_t]]] neigh_indices
        shared_ptr[vector[vector[DTYPE_t]]] neigh_distances

        # Arrays of pointers to the private datastructures used in threads.
        vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks
        vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks

        bint sort_results

    # Declaration only (no body is visible here); callable without the GIL.
    # NOTE(review): presumably merges the per-thread chunk vectors for sample
    # `idx` across `num_threads` threads -- confirm against the implementation.
    @final
    cdef void _merge_vectors(
        self,
        ITYPE_t idx,
        ITYPE_t num_threads,
    ) nogil
81+
82+
83+
cdef class EuclideanRadiusNeighbors64(RadiusNeighbors64):
    """EuclideanDistance-specialized 64bit implementation for RadiusNeighbors64."""

    cdef:
        # Helper declared in ._gemm_term_computer.
        # NOTE(review): presumably computes the GEMM (matrix-product) term of
        # the Euclidean distance expansion -- confirm against that module.
        GEMMTermComputer64 gemm_term_computer

        # Read-only, C-contiguous 1-D memoryviews of DTYPE_t.
        # NOTE(review): names suggest the squared norms of the rows of X and
        # Y -- confirm against the implementation.
        const DTYPE_t[::1] X_norm_squared
        const DTYPE_t[::1] Y_norm_squared

        # NOTE(review): presumably selects squared distances over true
        # Euclidean distances -- confirm against the implementation.
        bint use_squared_distances
91+
92+
from ._base cimport BaseDistanceReducer32
93+
from ._gemm_term_computer cimport GEMMTermComputer32
94+
95+
cdef class RadiusNeighbors32(BaseDistanceReducer32):
    """
    32bit implementation of BaseDistanceReducer32 for the
    `RadiusNeighbors` reduction.
    """

    cdef:
        DTYPE_t radius

        # DistanceMetric32 works with rank-preserving surrogate distances
        # (computed via rdist), which are proxies requiring fewer
        # computations. r_radius is the equivalent of the radius in that
        # surrogate space, so that it can be compared against the vectors'
        # surrogate distances.
        DTYPE_t r_radius

        # Neighbors indices and distances are returned as np.ndarrays of
        # np.ndarrays.
        #
        # Resizable buffers are wanted during the computation and are wrapped
        # into numpy arrays at the end; std::vector is a handy container for
        # working efficiently with such buffers.
        #
        # A std::vector's buffer address is reachable via std::vector::data,
        # but the buffer cannot be stolen: its lifetime is tied to its
        # std::vector and it is deallocated when the std::vector is.
        #
        # To work around this, the std::vectors are dynamically allocated and
        # encapsulated in a StdVectorSentinel, which is responsible for
        # freeing them when the associated np.ndarray is freed.
        #
        # Shared pointers (shared_ptr) are used for safer memory management.
        # Unique pointers (unique_ptr) cannot be used because the
        # datastructures are shared across threads for parallel_on_X;
        # see _parallel_on_X_init_chunk.
        shared_ptr[vector[vector[ITYPE_t]]] neigh_indices
        shared_ptr[vector[vector[DTYPE_t]]] neigh_distances

        # Arrays of pointers to the private datastructures used in threads.
        vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks
        vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks

        bint sort_results

    # Declaration only (no body is visible here); callable without the GIL.
    # NOTE(review): presumably merges the per-thread chunk vectors for sample
    # `idx` across `num_threads` threads -- confirm against the implementation.
    @final
    cdef void _merge_vectors(
        self,
        ITYPE_t idx,
        ITYPE_t num_threads,
    ) nogil
143+
144+
145+
cdef class EuclideanRadiusNeighbors32(RadiusNeighbors32):
    """EuclideanDistance-specialized 32bit implementation for RadiusNeighbors32."""

    cdef:
        # Helper declared in ._gemm_term_computer.
        # NOTE(review): presumably computes the GEMM (matrix-product) term of
        # the Euclidean distance expansion -- confirm against that module.
        GEMMTermComputer32 gemm_term_computer

        # Read-only, C-contiguous 1-D memoryviews of DTYPE_t.
        # NOTE(review): names suggest the squared norms of the rows of X and
        # Y -- confirm against the implementation.
        const DTYPE_t[::1] X_norm_squared
        const DTYPE_t[::1] Y_norm_squared

        # NOTE(review): presumably selects squared distances over true
        # Euclidean distances -- confirm against the implementation.
        bint use_squared_distances

0 commit comments

Comments
 (0)
0