@@ -86,13 +86,22 @@ NODE_DTYPE = np.asarray(<Node[:1]>(&dummy)).dtype
cdef class TreeBuilder:
    """Interface for different tree building strategies."""

-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=None):
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=None,
+    ):
        """Build a decision tree from the training set (X, y)."""
        pass

-    cdef inline _check_input(self, object X, cnp.ndarray y,
-                             cnp.ndarray sample_weight):
+    cdef inline _check_input(
+        self,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight,
+    ):
        """Check input dtype, layout and format"""
        if issparse(X):
            X = X.tocsc()
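
Note: the `const` qualifier matters in these signatures; a non-const typed memoryview requires a writable buffer, while `const DOUBLE_t[:, ::1]` also accepts read-only input. A minimal NumPy-level sketch of the kind of input this now admits (array contents are illustrative only):

```python
import numpy as np

# A C-contiguous float64 target array marked read-only, e.g. one backed
# by a memory-mapped file or a joblib cache.
y = np.zeros((10, 3), dtype=np.float64)
y.setflags(write=False)

print(y.flags["C_CONTIGUOUS"], y.flags["WRITEABLE"])  # True False
print(y.sum())  # reading is all a `const DOUBLE_t[:, ::1]` view needs

try:
    y[0, 0] = 1.0  # writing is rejected, which the const view never attempts
except ValueError as exc:
    print("read-only:", exc)
```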
@@ -109,12 +118,15 @@ cdef class TreeBuilder:
            # since we have to copy we will make it fortran for efficiency
            X = np.asfortranarray(X, dtype=DTYPE)

-        if y.dtype != DOUBLE or not y.flags.contiguous:
+        # TODO: This check for y seems to be redundant, as it is also
+        # present in the BaseDecisionTree's fit method, and therefore
+        # can be removed.
+        if y.base.dtype != DOUBLE or not y.base.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if (sample_weight is not None and
-            (sample_weight.dtype != DOUBLE or
-             not sample_weight.flags.contiguous)):
+            (sample_weight.base.dtype != DOUBLE or
+             not sample_weight.base.flags.contiguous)):
            sample_weight = np.asarray(sample_weight, dtype=DOUBLE,
                                       order="C")

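Because `y` and `sample_weight` are now memoryviews, their NumPy attributes are reached through `.base` (the array that backs the view), hence `y.base.dtype` and `y.base.flags` above. The check itself is ordinary NumPy; a hypothetical Python-level equivalent (the helper name is mine, not part of the diff):

```python
import numpy as np

DOUBLE = np.float64

def ensure_c_contiguous_double(y):
    """Hypothetical helper mirroring the dtype/layout check above."""
    # np.ascontiguousarray is a no-op (no copy) when the array is already
    # float64 and C-contiguous.
    if y.dtype != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)
    return y

a = np.asfortranarray(np.arange(6, dtype=np.float32).reshape(2, 3))
b = ensure_c_contiguous_double(a)
print(b.dtype, b.flags["C_CONTIGUOUS"])  # float64 True
```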
@@ -144,8 +156,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
        self.max_depth = max_depth
        self.min_impurity_decrease = min_impurity_decrease

-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=None):
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=None,
+    ):
        """Build a decision tree from the training set (X, y)."""

        # check input
@@ -335,8 +352,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease

-    cpdef build(self, Tree tree, object X, cnp.ndarray y,
-                cnp.ndarray sample_weight=None):
+    cpdef build(
+        self,
+        Tree tree,
+        object X,
+        const DOUBLE_t[:, ::1] y,
+        const DOUBLE_t[:] sample_weight=None,
+    ):
        """Build a decision tree from the training set (X, y)."""

        # check input
@@ -608,6 +630,8 @@ cdef class Tree:
        def __get__(self):
            return self._get_value_ndarray()[:self.node_count]

+    # TODO: Convert n_classes to cython.integral memory view once
+    # https://github.com/cython/cython/issues/5243 is fixed
    def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs):
        """Constructor."""
        cdef SIZE_t dummy = 0
@@ -683,9 +707,12 @@ cdef class Tree:
        self.capacity = node_ndarray.shape[0]
        if self._resize_c(self.capacity) != 0:
            raise MemoryError("resizing tree to %d" % self.capacity)
-        nodes = memcpy(self.nodes, (<cnp.ndarray> node_ndarray).data,
+
+        cdef Node[::1] node_memory_view = node_ndarray
+        cdef DOUBLE_t[:, :, ::1] value_memory_view = value_ndarray
+        nodes = memcpy(self.nodes, &node_memory_view[0],
                       self.capacity * sizeof(Node))
-        value = memcpy(self.value, (<cnp.ndarray> value_ndarray).data,
+        value = memcpy(self.value, &value_memory_view[0, 0, 0],
                       self.capacity * self.value_stride * sizeof(double))

    cdef int _resize(self, SIZE_t capacity) nogil except -1:
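
`&node_memory_view[0]` takes the address of the first element of the typed memoryview, replacing the cast through `(<cnp.ndarray> ...).data`; the `memcpy` itself is still a raw byte copy of the node structs. A rough NumPy analogue of that byte-for-byte copy, using a simplified stand-in for the node dtype (the real `NODE_DTYPE` has more fields):

```python
import numpy as np

# Simplified stand-in for the Node struct; NODE_DTYPE also carries
# feature, impurity, n_node_samples, weighted_n_node_samples, ...
node_dtype = np.dtype([("left_child", np.intp),
                       ("right_child", np.intp),
                       ("threshold", np.float64)])

src = np.zeros(4, dtype=node_dtype)
src["threshold"] = [0.1, 0.2, 0.3, 0.4]

# memcpy(dst, &view[0], capacity * sizeof(Node)) copies raw bytes; the
# NumPy equivalent is a round-trip through the underlying buffer.
dst = np.frombuffer(src.tobytes(), dtype=node_dtype)
print(np.array_equal(dst["threshold"], src["threshold"]))  # True
```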
@@ -804,8 +831,7 @@ cdef class Tree:
        cdef SIZE_t n_samples = X.shape[0]

        # Initialize output
-        cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp)
-        cdef SIZE_t* out_ptr = <SIZE_t*> out.data
+        cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp)

        # Initialize auxiliary data-structure
        cdef Node* node = NULL
@@ -822,9 +848,9 @@ cdef class Tree:
                    else:
                        node = &self.nodes[node.right_child]

-                out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset
+                out[i] = <SIZE_t>(node - self.nodes)  # node offset

-        return out
+        return np.asarray(out)

    cdef inline cnp.ndarray _apply_sparse_csr(self, object X):
        """Finds the terminal region (=leaf node) for each sample in sparse X.
@@ -838,21 +864,15 @@ cdef class Tree:
            raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

        # Extract input
-        cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr
-
-        cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
-        cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
-        cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data
+        cdef const DTYPE_t[:] X_data = X.data
+        cdef const INT32_t[:] X_indices = X.indices
+        cdef const INT32_t[:] X_indptr = X.indptr

        cdef SIZE_t n_samples = X.shape[0]
        cdef SIZE_t n_features = X.shape[1]

        # Initialize output
-        cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,),
-                                                        dtype=np.intp)
-        cdef SIZE_t* out_ptr = <SIZE_t*> out.data
+        cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp)

        # Initialize auxiliary data-structure
        cdef DTYPE_t feature_value = 0.
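
The sparse path now reads the CSR components through `const` memoryviews typed as float32 data with int32 `indices`/`indptr`, which is what `scipy.sparse.csr_matrix` produces for float32 input of moderate size (int64 indices only appear for very large matrices). A quick check of those component dtypes, with an illustrative matrix:

```python
import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[0.0, 1.5], [2.0, 0.0]], dtype=np.float32))
print(X.data.dtype)     # float32
print(X.indices.dtype)  # int32
print(X.indptr.dtype)   # int32
```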
@@ -893,13 +913,13 @@ cdef class Tree:
                    else:
                        node = &self.nodes[node.right_child]

-                out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset
+                out[i] = <SIZE_t>(node - self.nodes)  # node offset

            # Free auxiliary arrays
            free(X_sample)
            free(feature_to_sample)

-        return out
+        return np.asarray(out)

    cpdef object decision_path(self, object X):
        """Finds the decision path (=node) for each sample in X."""
@@ -924,13 +944,10 @@ cdef class Tree:
        cdef SIZE_t n_samples = X.shape[0]

        # Initialize output
-        cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp)
-        cdef SIZE_t* indptr_ptr = <SIZE_t*> indptr.data
-
-        cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples *
-                                                    (1 + self.max_depth),
-                                                    dtype=np.intp)
-        cdef SIZE_t* indices_ptr = <SIZE_t*> indices.data
+        cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp)
+        cdef SIZE_t[:] indices = np.zeros(
+            n_samples * (1 + self.max_depth), dtype=np.intp
+        )

        # Initialize auxiliary data-structure
        cdef Node* node = NULL
@@ -939,26 +956,25 @@ cdef class Tree:
        with nogil:
            for i in range(n_samples):
                node = self.nodes
-                indptr_ptr[i + 1] = indptr_ptr[i]
+                indptr[i + 1] = indptr[i]

                # Add all external nodes
                while node.left_child != _TREE_LEAF:
                    # ... and node.right_child != _TREE_LEAF:
-                    indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                    indptr_ptr[i + 1] += 1
+                    indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                    indptr[i + 1] += 1

                    if X_ndarray[i, node.feature] <= node.threshold:
                        node = &self.nodes[node.left_child]
                    else:
                        node = &self.nodes[node.right_child]

                # Add the leave node
-                indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                indptr_ptr[i + 1] += 1
+                indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                indptr[i + 1] += 1

        indices = indices[:indptr[n_samples]]
-        cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices),
-                                                dtype=np.intp)
+        cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp)
        out = csr_matrix((data, indices, indptr),
                         shape=(n_samples, self.node_count))

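`decision_path` assembles its result as a CSR indicator matrix straight from `data`, `indices`, and `indptr`: one row per sample, one column per tree node, with a 1 wherever the sample visited that node. A tiny standalone example of that constructor (node indices made up for illustration):

```python
import numpy as np
from scipy.sparse import csr_matrix

# Two samples: the first visits nodes 0, 1, 3; the second visits 0, 2.
indices = np.array([0, 1, 3, 0, 2], dtype=np.intp)
indptr = np.array([0, 3, 5], dtype=np.intp)
data = np.ones(len(indices), dtype=np.intp)

path = csr_matrix((data, indices, indptr), shape=(2, 4))
print(path.toarray())
# [[1 1 0 1]
#  [1 0 1 0]]
```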
@@ -976,25 +992,18 @@ cdef class Tree:
            raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

        # Extract input
-        cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
-        cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr
-
-        cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
-        cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
-        cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data
+        cdef const DTYPE_t[:] X_data = X.data
+        cdef const INT32_t[:] X_indices = X.indices
+        cdef const INT32_t[:] X_indptr = X.indptr

        cdef SIZE_t n_samples = X.shape[0]
        cdef SIZE_t n_features = X.shape[1]

        # Initialize output
-        cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp)
-        cdef SIZE_t* indptr_ptr = <SIZE_t*> indptr.data
-
-        cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples *
-                                                    (1 + self.max_depth),
-                                                    dtype=np.intp)
-        cdef SIZE_t* indices_ptr = <SIZE_t*> indices.data
+        cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp)
+        cdef SIZE_t[:] indices = np.zeros(
+            n_samples * (1 + self.max_depth), dtype=np.intp
+        )

        # Initialize auxiliary data-structure
        cdef DTYPE_t feature_value = 0.
@@ -1016,7 +1025,7 @@ cdef class Tree:

            for i in range(n_samples):
                node = self.nodes
-                indptr_ptr[i + 1] = indptr_ptr[i]
+                indptr[i + 1] = indptr[i]

                for k in range(X_indptr[i], X_indptr[i + 1]):
                    feature_to_sample[X_indices[k]] = i
@@ -1026,8 +1035,8 @@ cdef class Tree:
                while node.left_child != _TREE_LEAF:
                    # ... and node.right_child != _TREE_LEAF:

-                    indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                    indptr_ptr[i + 1] += 1
+                    indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                    indptr[i + 1] += 1

                    if feature_to_sample[node.feature] == i:
                        feature_value = X_sample[node.feature]
@@ -1041,16 +1050,15 @@ cdef class Tree:
                        node = &self.nodes[node.right_child]

                # Add the leave node
-                indices_ptr[indptr_ptr[i + 1]] = <SIZE_t>(node - self.nodes)
-                indptr_ptr[i + 1] += 1
+                indices[indptr[i + 1]] = <SIZE_t>(node - self.nodes)
+                indptr[i + 1] += 1

            # Free auxiliary arrays
            free(X_sample)
            free(feature_to_sample)

        indices = indices[:indptr[n_samples]]
-        cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices),
-                                                dtype=np.intp)
+        cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp)
        out = csr_matrix((data, indices, indptr),
                         shape=(n_samples, self.node_count))

@@ -1093,9 +1101,7 @@ cdef class Tree:

        cdef double normalizer = 0.

-        cdef cnp.ndarray[cnp.float64_t, ndim=1] importances
-        importances = np.zeros((self.n_features,))
-        cdef DOUBLE_t* importance_data = <DOUBLE_t*>importances.data
+        cdef cnp.float64_t[:] importances = np.zeros(self.n_features)

        with nogil:
            while node != end_node:
@@ -1104,22 +1110,24 @@ cdef class Tree:
                    left = &nodes[node.left_child]
                    right = &nodes[node.right_child]

-                    importance_data[node.feature] += (
+                    importances[node.feature] += (
                        node.weighted_n_node_samples * node.impurity -
                        left.weighted_n_node_samples * left.impurity -
                        right.weighted_n_node_samples * right.impurity)
                node += 1

-        importances /= nodes[0].weighted_n_node_samples
+        for i in range(self.n_features):
+            importances[i] /= nodes[0].weighted_n_node_samples

        if normalize:
            normalizer = np.sum(importances)

            if normalizer > 0.0:
                # Avoid dividing by zero (e.g., when root is pure)
-                importances /= normalizer
+                for i in range(self.n_features):
+                    importances[i] /= normalizer

-        return importances
+        return np.asarray(importances)

    cdef cnp.ndarray _get_value_ndarray(self):
        """Wraps value as a 3-d NumPy array.
@@ -1154,7 +1162,7 @@ cdef class Tree:
        arr = PyArray_NewFromDescr(<PyTypeObject *> cnp.ndarray,
                                   <cnp.dtype> NODE_DTYPE, 1, shape,
                                   strides, <void*> self.nodes,
-                                   cnp.NPY_DEFAULT, None)
+                                   cnp.NPY_ARRAY_DEFAULT, None)
        Py_INCREF(self)
        if PyArray_SetBaseObject(arr, <PyObject*> self) < 0:
            raise ValueError("Can't initialize array.")
@@ -1686,18 +1694,19 @@ def ccp_pruning_path(Tree orig_tree):

    cdef:
        UINT32_t total_items = path_finder.count
-        cnp.ndarray ccp_alphas = np.empty(shape=total_items,
-                                          dtype=np.float64)
-        cnp.ndarray impurities = np.empty(shape=total_items,
-                                          dtype=np.float64)
+        DOUBLE_t[:] ccp_alphas = np.empty(shape=total_items, dtype=np.float64)
+        DOUBLE_t[:] impurities = np.empty(shape=total_items, dtype=np.float64)
        UINT32_t count = 0

    while count < total_items:
        ccp_alphas[count] = path_finder.ccp_alphas[count]
        impurities[count] = path_finder.impurities[count]
        count += 1

-    return {'ccp_alphas': ccp_alphas, 'impurities': impurities}
+    return {
+        'ccp_alphas': np.asarray(ccp_alphas),
+        'impurities': np.asarray(impurities),
+    }


cdef struct BuildPrunedRecord:
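
Wrapping the pruning-path buffers in `np.asarray` keeps the public behaviour the same: callers still receive plain float64 ndarrays. At the estimator level this surfaces through `cost_complexity_pruning_path`, for example:

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X, y)

# Both entries are ordinary float64 ndarrays, alphas in increasing order.
print(type(path.ccp_alphas), path.ccp_alphas.dtype)
print(np.all(np.diff(path.ccp_alphas) >= 0))  # True
```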