Commit 584d413 · scikit-learn/scikit-learn
MAINT Replace cnp.ndarray with memory views in sklearn.tree._tree (where possible) (#25540)
1 parent aae5c83 commit 584d413

3 files changed, +101 -80 lines changed

setup.py

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@
     "sklearn.svm._libsvm_sparse",
     "sklearn.svm._newrand",
     "sklearn.tree._splitter",
+    "sklearn.tree._tree",
     "sklearn.tree._utils",
     "sklearn.utils._cython_blas",
     "sklearn.utils._fast_dict",

sklearn/tree/_tree.pxd

Lines changed: 14 additions & 3 deletions
@@ -99,6 +99,17 @@ cdef class TreeBuilder:
     cdef SIZE_t max_depth               # Maximal tree depth
     cdef double min_impurity_decrease   # Impurity threshold for early stopping
 
-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=*)
-    cdef _check_input(self, object X, cnp.ndarray y, cnp.ndarray sample_weight)
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=*,
+    )
+
+    cdef _check_input(
+        self,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight,
+    )
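In these declarations, const DOUBLE_t[:, ::1] y requests a 2-D view whose last axis is contiguous (C layout), while const DOUBLE_t[:] sample_weight fixes only the element type and dimensionality; const marks both as read-only on the Cython side. The =* marker is the usual .pxd convention for "this argument has a default value, defined in the .pyx implementation". A hypothetical declaration/implementation pair, purely for illustration (Accumulator is not a scikit-learn class):

    # acc.pxd -- sketch: `=*` only records that a default exists.
    cdef class Accumulator:
        cpdef double total(self, const double[:, ::1] y, const double[:] w=*)

    # acc.pyx -- sketch: the actual default lives with the implementation.
    cdef class Accumulator:
        cpdef double total(self, const double[:, ::1] y, const double[:] w=None):
            cdef Py_ssize_t i
            cdef double s = 0.0
            for i in range(y.shape[0]):
                s += y[i, 0] if w is None else w[i] * y[i, 0]
            return s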

sklearn/tree/_tree.pyx

Lines changed: 86 additions & 77 deletions
@@ -86,13 +86,22 @@ NODE_DTYPE = np.asarray(<Node[:1]>(&dummy)).dtype
 cdef class TreeBuilder:
     """Interface for different tree building strategies."""
 
-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=None):
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=None,
+    ):
         """Build a decision tree from the training set (X, y)."""
         pass
 
-    cdef inline _check_input(self, object X, cnp.ndarray y,
-                             cnp.ndarray sample_weight):
+    cdef inline _check_input(
+        self,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight,
+    ):
         """Check input dtype, layout and format"""
         if issparse(X):
             X = X.tocsc()
@@ -109,12 +118,15 @@ cdef class TreeBuilder:
             # since we have to copy we will make it fortran for efficiency
             X = np.asfortranarray(X, dtype=DTYPE)
 
-        if y.dtype != DOUBLE or not y.flags.contiguous:
+        # TODO: This check for y seems to be redundant, as it is also
+        # present in the BaseDecisionTree's fit method, and therefore
+        # can be removed.
+        if y.base.dtype != DOUBLE or not y.base.flags.contiguous:
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         if (sample_weight is not None and
-            (sample_weight.dtype != DOUBLE or
-             not sample_weight.flags.contiguous)):
+            (sample_weight.base.dtype != DOUBLE or
+             not sample_weight.base.flags.contiguous)):
             sample_weight = np.asarray(sample_weight, dtype=DOUBLE,
                                        order="C")
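_check_input still validates dtype and layout, but a typed memory view has no .dtype or .flags of its own, so the checks go through .base, the object the view was created from, which here is the NumPy array supplied by the caller. An illustrative sketch of what .base exposes when a view is built directly over an ndarray (dtype_of_backing is an invented name); np.asarray(view) would be the more general spelling if the backing object were not guaranteed to be an ndarray:

    # describe.pyx -- illustrative sketch only.
    def dtype_of_backing(const double[:] w):
        # The view exposes shape and strides but no dtype/flags; when it was
        # created directly from a NumPy array, w.base is that very array.
        backing = w.base
        return backing.dtype, backing.flags.c_contiguous

Called from Python with NumPy imported, dtype_of_backing(np.arange(3.0)) returns (dtype('float64'), True).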

@@ -144,8 +156,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         self.max_depth = max_depth
         self.min_impurity_decrease = min_impurity_decrease
 
-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=None):
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=None,
+    ):
         """Build a decision tree from the training set (X, y)."""
 
         # check input
@@ -335,8 +352,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
 
-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=None):
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=None,
+    ):
         """Build a decision tree from the training set (X, y)."""
 
         # check input
@@ -608,6 +630,8 @@ cdef class Tree:
         def __get__(self):
             return self._get_value_ndarray()[:self.node_count]
 
+    # TODO: Convert n_classes to cython.integral memory view once
+    # https://github.com/cython/cython/issues/5243 is fixed
     def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs):
         """Constructor."""
         cdef SIZE_t dummy = 0
@@ -683,9 +707,12 @@ cdef class Tree:
         self.capacity = node_ndarray.shape[0]
         if self._resize_c(self.capacity) != 0:
             raise MemoryError("resizing tree to %d" % self.capacity)
-        nodes = memcpy(self.nodes, (<cnp.ndarray> node_ndarray).data,
+
+        cdef Node[::1] node_memory_view = node_ndarray
+        cdef DOUBLE_t[:, :, ::1] value_memory_view = value_ndarray
+        nodes = memcpy(self.nodes, &node_memory_view[0],
                        self.capacity * sizeof(Node))
-        value = memcpy(self.value, (<cnp.ndarray> value_ndarray).data,
+        value = memcpy(self.value, &value_memory_view[0, 0, 0],
                        self.capacity * self.value_stride * sizeof(double))
 
     cdef int _resize(self, SIZE_t capacity) nogil except -1:
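Where the old code pulled a raw pointer out of (<cnp.ndarray> arr).data, the new code takes the address of the first element of a C-contiguous view, &node_memory_view[0] and &value_memory_view[0, 0, 0]; the ::1 axes in the declarations are what guarantee a single flat block of memory behind each view. A hypothetical sketch of the same move (doubled_copy is not scikit-learn code):

    # copyout.pyx -- illustrative sketch only: memcpy out of a contiguous view.
    from libc.stdlib cimport malloc, free
    from libc.string cimport memcpy

    def doubled_copy(const double[::1] src):
        cdef Py_ssize_t n = src.shape[0]
        if n == 0:
            return []
        cdef double* buf = <double*> malloc(n * sizeof(double))
        if buf == NULL:
            raise MemoryError()
        # &src[0] plays the role the old `.data` pointer used to play.
        memcpy(buf, &src[0], n * sizeof(double))
        try:
            return [2.0 * buf[i] for i in range(n)]
        finally:
            free(buf)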
@@ -804,8 +831,7 @@ cdef class Tree:
         cdef SIZE_t n_samples = X.shape[0]
 
         # Initialize output
-        cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp)
-        cdef SIZE_t* out_ptr = <SIZE_t*> out.data
+        cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp)
 
         # Initialize auxiliary data-structure
         cdef Node* node = NULL
@@ -822,9 +848,9 @@ cdef class Tree:
                 else:
                     node = &self.nodes[node.right_child]
 
-                out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset
+                out[i] = <SIZE_t>(node - self.nodes)  # node offset
 
-        return out
+        return np.asarray(out)
 
     cdef inline cnp.ndarray _apply_sparse_csr(self, object X):
         """Finds the terminal region (=leaf node) for each sample in sparse X.
@@ -838,21 +864,15 @@ cdef class Tree:
             raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)
 
         # Extract input
-        cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr
-
-        cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
-        cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
-        cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data
+        cdef const DTYPE_t[:] X_data = X.data
+        cdef const INT32_t[:] X_indices = X.indices
+        cdef const INT32_t[:] X_indptr = X.indptr
 
         cdef SIZE_t n_samples = X.shape[0]
         cdef SIZE_t n_features = X.shape[1]
 
         # Initialize output
-        cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,),
-                                                        dtype=np.intp)
-        cdef SIZE_t* out_ptr = <SIZE_t*> out.data
+        cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp)
 
         # Initialize auxiliary data-structure
         cdef DTYPE_t feature_value = 0.
@@ -893,13 +913,13 @@ cdef class Tree:
                 else:
                     node = &self.nodes[node.right_child]
 
-                out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset
+                out[i] = <SIZE_t>(node - self.nodes)  # node offset
 
         # Free auxiliary arrays
         free(X_sample)
         free(feature_to_sample)
 
-        return out
+        return np.asarray(out)
 
     cpdef object decision_path(self, object X):
         """Finds the decision path (=node) for each sample in X."""
@@ -924,13 +944,10 @@ cdef class Tree:
         cdef SIZE_t n_samples = X.shape[0]
 
         # Initialize output
-        cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp)
-        cdef SIZE_t* indptr_ptr = <SIZE_t*> indptr.data
-
-        cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples *
-                                                    (1 + self.max_depth),
-                                                    dtype=np.intp)
-        cdef SIZE_t* indices_ptr = <SIZE_t*> indices.data
+        cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp)
+        cdef SIZE_t[:] indices = np.zeros(
+            n_samples * (1 + self.max_depth), dtype=np.intp
+        )
 
         # Initialize auxiliary data-structure
         cdef Node* node = NULL
@@ -939,26 +956,25 @@ cdef class Tree:
         with nogil:
             for i in range(n_samples):
                 node = self.nodes
-                indptr_ptr[i + 1] = indptr_ptr[i]
+                indptr[i + 1] = indptr[i]
 
                 # Add all external nodes
                 while node.left_child != _TREE_LEAF:
                     # ... and node.right_child != _TREE_LEAF:
-                    indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                    indptr_ptr[i + 1] += 1
+                    indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                    indptr[i + 1] += 1
 
                     if X_ndarray[i, node.feature] <= node.threshold:
                         node = &self.nodes[node.left_child]
                     else:
                         node = &self.nodes[node.right_child]
 
                 # Add the leave node
-                indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                indptr_ptr[i + 1] += 1
+                indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                indptr[i + 1] += 1
 
         indices = indices[:indptr[n_samples]]
-        cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices),
-                                                dtype=np.intp)
+        cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp)
         out = csr_matrix((data, indices, indptr),
                          shape=(n_samples, self.node_count))
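Note that indices = indices[:indptr[n_samples]] now slices a memory view: the result is a narrower view over the same buffer, not a copy, and handing data, indices and indptr straight to csr_matrix works because NumPy and SciPy read the views through the buffer protocol. A hypothetical sketch (double_head is not scikit-learn code):

    # slices.pyx -- illustrative sketch only: view slices share their buffer.
    import numpy as np

    def double_head(double[:] values, Py_ssize_t n):
        cdef double[:] head = values[:n]   # a narrower view, not a copy
        cdef Py_ssize_t i
        for i in range(head.shape[0]):
            head[i] *= 2.0                 # visible through `values` as well
        # The returned ndarray wraps the same memory via the buffer protocol.
        return np.asarray(head)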

@@ -976,25 +992,18 @@ cdef class Tree:
             raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)
 
         # Extract input
-        cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr
-
-        cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
-        cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
-        cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data
+        cdef const DTYPE_t[:] X_data = X.data
+        cdef const INT32_t[:] X_indices = X.indices
+        cdef const INT32_t[:] X_indptr = X.indptr
 
         cdef SIZE_t n_samples = X.shape[0]
         cdef SIZE_t n_features = X.shape[1]
 
         # Initialize output
-        cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp)
-        cdef SIZE_t* indptr_ptr = <SIZE_t*> indptr.data
-
-        cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples *
-                                                    (1 + self.max_depth),
-                                                    dtype=np.intp)
-        cdef SIZE_t* indices_ptr = <SIZE_t*> indices.data
+        cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp)
+        cdef SIZE_t[:] indices = np.zeros(
+            n_samples * (1 + self.max_depth), dtype=np.intp
+        )
 
         # Initialize auxiliary data-structure
         cdef DTYPE_t feature_value = 0.
@@ -1016,7 +1025,7 @@ cdef class Tree:
 
             for i in range(n_samples):
                 node = self.nodes
-                indptr_ptr[i + 1] = indptr_ptr[i]
+                indptr[i + 1] = indptr[i]
 
                 for k in range(X_indptr[i], X_indptr[i + 1]):
                     feature_to_sample[X_indices[k]] = i
@@ -1026,8 +1035,8 @@ cdef class Tree:
                 while node.left_child != _TREE_LEAF:
                     # ... and node.right_child != _TREE_LEAF:
 
-                    indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                    indptr_ptr[i + 1] += 1
+                    indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                    indptr[i + 1] += 1
 
                     if feature_to_sample[node.feature] == i:
                         feature_value = X_sample[node.feature]
@@ -1041,16 +1050,15 @@ cdef class Tree:
                         node = &self.nodes[node.right_child]
 
                 # Add the leave node
-                indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                indptr_ptr[i + 1] += 1
+                indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                indptr[i + 1] += 1
 
         # Free auxiliary arrays
         free(X_sample)
         free(feature_to_sample)
 
         indices = indices[:indptr[n_samples]]
-        cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices),
-                                                dtype=np.intp)
+        cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp)
         out = csr_matrix((data, indices, indptr),
                          shape=(n_samples, self.node_count))
@@ -1093,9 +1101,7 @@ cdef class Tree:
 
         cdef double normalizer = 0.
 
-        cdef cnp.ndarray[cnp.float64_t, ndim=1] importances
-        importances = np.zeros((self.n_features,))
-        cdef DOUBLE_t* importance_data = <DOUBLE_t*>importances.data
+        cdef cnp.float64_t[:] importances = np.zeros(self.n_features)
 
         with nogil:
             while node != end_node:
@@ -1104,22 +1110,24 @@ cdef class Tree:
                     left = &nodes[node.left_child]
                     right = &nodes[node.right_child]
 
-                    importance_data[node.feature] += (
+                    importances[node.feature] += (
                         node.weighted_n_node_samples * node.impurity -
                         left.weighted_n_node_samples * left.impurity -
                         right.weighted_n_node_samples * right.impurity)
                 node += 1
 
-        importances /= nodes[0].weighted_n_node_samples
+        for i in range(self.n_features):
+            importances[i] /= nodes[0].weighted_n_node_samples
 
         if normalize:
             normalizer = np.sum(importances)
 
             if normalizer > 0.0:
                 # Avoid dividing by zero (e.g., when root is pure)
-                importances /= normalizer
+                for i in range(self.n_features):
+                    importances[i] /= normalizer
 
-        return importances
+        return np.asarray(importances)
 
     cdef cnp.ndarray _get_value_ndarray(self):
         """Wraps value as a 3-d NumPy array.
@@ -1154,7 +1162,7 @@ cdef class Tree:
         arr = PyArray_NewFromDescr(<PyTypeObject *> cnp.ndarray,
                                    <cnp.dtype> NODE_DTYPE, 1, shape,
                                    strides, <void*> self.nodes,
-                                   cnp.NPY_DEFAULT, None)
+                                   cnp.NPY_ARRAY_DEFAULT, None)
         Py_INCREF(self)
         if PyArray_SetBaseObject(arr, <PyObject*> self) < 0:
             raise ValueError("Can't initialize array.")
@@ -1686,18 +1694,19 @@ def ccp_pruning_path(Tree orig_tree):
 
     cdef:
         UINT32_t total_items = path_finder.count
-        cnp.ndarray ccp_alphas = np.empty(shape=total_items,
-                                          dtype=np.float64)
-        cnp.ndarray impurities = np.empty(shape=total_items,
-                                          dtype=np.float64)
+        DOUBLE_t[:] ccp_alphas = np.empty(shape=total_items, dtype=np.float64)
+        DOUBLE_t[:] impurities = np.empty(shape=total_items, dtype=np.float64)
         UINT32_t count = 0
 
     while count < total_items:
         ccp_alphas[count] = path_finder.ccp_alphas[count]
         impurities[count] = path_finder.impurities[count]
         count += 1
 
-    return {'ccp_alphas': ccp_alphas, 'impurities': impurities}
+    return {
+        'ccp_alphas': np.asarray(ccp_alphas),
+        'impurities': np.asarray(impurities),
+    }
 
 
 cdef struct BuildPrunedRecord:
