diff --git a/setup.py b/setup.py index f252e1ec03ad8..dc2edc6a8127b 100755 --- a/setup.py +++ b/setup.py @@ -111,6 +111,7 @@ "sklearn.svm._libsvm_sparse", "sklearn.svm._newrand", "sklearn.tree._splitter", + "sklearn.tree._tree", "sklearn.tree._utils", "sklearn.utils._cython_blas", "sklearn.utils._fast_dict", diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 11c848881f6d3..3e60e91d6940a 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -99,6 +99,17 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=*) - cdef _check_input(self, object X, cnp.ndarray y, cnp.ndarray sample_weight) + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=*, + ) + + cdef _check_input( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + ) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 730b2c1c3c9c6..72e98b33b847f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -86,13 +86,22 @@ NODE_DTYPE = np.asarray((&dummy)).dtype cdef class TreeBuilder: """Interface for different tree building strategies.""" - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=None): + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + ): """Build a decision tree from the training set (X, y).""" pass - cdef inline _check_input(self, object X, cnp.ndarray y, - cnp.ndarray sample_weight): + cdef inline _check_input( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + ): """Check input dtype, layout and format""" if issparse(X): X = X.tocsc() @@ -109,12 +118,15 @@ cdef class TreeBuilder: # since we have to copy we will make it fortran for efficiency X = np.asfortranarray(X, dtype=DTYPE) - if y.dtype != DOUBLE or not y.flags.contiguous: + # TODO: This check for y seems to be redundant, as it is also + # present in the BaseDecisionTree's fit method, and therefore + # can be removed. + if y.base.dtype != DOUBLE or not y.base.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if (sample_weight is not None and - (sample_weight.dtype != DOUBLE or - not sample_weight.flags.contiguous)): + (sample_weight.base.dtype != DOUBLE or + not sample_weight.base.flags.contiguous)): sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C") @@ -144,8 +156,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=None): + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + ): """Build a decision tree from the training set (X, y).""" # check input @@ -335,8 +352,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease - cpdef build(self, Tree tree, object X, cnp.ndarray y, - cnp.ndarray sample_weight=None): + cpdef build( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + ): """Build a decision tree from the training set (X, y).""" # check input @@ -608,6 +630,8 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): """Constructor.""" cdef SIZE_t dummy = 0 @@ -683,9 +707,12 @@ cdef class Tree: self.capacity = node_ndarray.shape[0] if self._resize_c(self.capacity) != 0: raise MemoryError("resizing tree to %d" % self.capacity) - nodes = memcpy(self.nodes, ( node_ndarray).data, + + cdef Node[::1] node_memory_view = node_ndarray + cdef DOUBLE_t[:, :, ::1] value_memory_view = value_ndarray + nodes = memcpy(self.nodes, &node_memory_view[0], self.capacity * sizeof(Node)) - value = memcpy(self.value, ( value_ndarray).data, + value = memcpy(self.value, &value_memory_view[0, 0, 0], self.capacity * self.value_stride * sizeof(double)) cdef int _resize(self, SIZE_t capacity) nogil except -1: @@ -804,8 +831,7 @@ cdef class Tree: cdef SIZE_t n_samples = X.shape[0] # Initialize output - cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp) - cdef SIZE_t* out_ptr = out.data + cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp) # Initialize auxiliary data-structure cdef Node* node = NULL @@ -822,9 +848,9 @@ cdef class Tree: else: node = &self.nodes[node.right_child] - out_ptr[i] = (node - self.nodes) # node offset + out[i] = (node - self.nodes) # node offset - return out + return np.asarray(out) cdef inline cnp.ndarray _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. @@ -838,21 +864,15 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr - - cdef DTYPE_t* X_data = X_data_ndarray.data - cdef INT32_t* X_indices = X_indices_ndarray.data - cdef INT32_t* X_indptr = X_indptr_ndarray.data + cdef const DTYPE_t[:] X_data = X.data + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] # Initialize output - cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,), - dtype=np.intp) - cdef SIZE_t* out_ptr = out.data + cdef SIZE_t[:] out = np.zeros(n_samples, dtype=np.intp) # Initialize auxiliary data-structure cdef DTYPE_t feature_value = 0. @@ -893,13 +913,13 @@ cdef class Tree: else: node = &self.nodes[node.right_child] - out_ptr[i] = (node - self.nodes) # node offset + out[i] = (node - self.nodes) # node offset # Free auxiliary arrays free(X_sample) free(feature_to_sample) - return out + return np.asarray(out) cpdef object decision_path(self, object X): """Finds the decision path (=node) for each sample in X.""" @@ -924,13 +944,10 @@ cdef class Tree: cdef SIZE_t n_samples = X.shape[0] # Initialize output - cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) - cdef SIZE_t* indptr_ptr = indptr.data - - cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * - (1 + self.max_depth), - dtype=np.intp) - cdef SIZE_t* indices_ptr = indices.data + cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t[:] indices = np.zeros( + n_samples * (1 + self.max_depth), dtype=np.intp + ) # Initialize auxiliary data-structure cdef Node* node = NULL @@ -939,13 +956,13 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes - indptr_ptr[i + 1] = indptr_ptr[i] + indptr[i + 1] = indptr[i] # Add all external nodes while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 if X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] @@ -953,12 +970,11 @@ cdef class Tree: node = &self.nodes[node.right_child] # Add the leave node - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 indices = indices[:indptr[n_samples]] - cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), - dtype=np.intp) + cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp) out = csr_matrix((data, indices, indptr), shape=(n_samples, self.node_count)) @@ -976,25 +992,18 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices - cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr - - cdef DTYPE_t* X_data = X_data_ndarray.data - cdef INT32_t* X_indices = X_indices_ndarray.data - cdef INT32_t* X_indptr = X_indptr_ndarray.data + cdef const DTYPE_t[:] X_data = X.data + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] # Initialize output - cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) - cdef SIZE_t* indptr_ptr = indptr.data - - cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * - (1 + self.max_depth), - dtype=np.intp) - cdef SIZE_t* indices_ptr = indices.data + cdef SIZE_t[:] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t[:] indices = np.zeros( + n_samples * (1 + self.max_depth), dtype=np.intp + ) # Initialize auxiliary data-structure cdef DTYPE_t feature_value = 0. @@ -1016,7 +1025,7 @@ cdef class Tree: for i in range(n_samples): node = self.nodes - indptr_ptr[i + 1] = indptr_ptr[i] + indptr[i + 1] = indptr[i] for k in range(X_indptr[i], X_indptr[i + 1]): feature_to_sample[X_indices[k]] = i @@ -1026,8 +1035,8 @@ cdef class Tree: while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] @@ -1041,16 +1050,15 @@ cdef class Tree: node = &self.nodes[node.right_child] # Add the leave node - indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) - indptr_ptr[i + 1] += 1 + indices[indptr[i + 1]] = (node - self.nodes) + indptr[i + 1] += 1 # Free auxiliary arrays free(X_sample) free(feature_to_sample) indices = indices[:indptr[n_samples]] - cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), - dtype=np.intp) + cdef SIZE_t[:] data = np.ones(shape=len(indices), dtype=np.intp) out = csr_matrix((data, indices, indptr), shape=(n_samples, self.node_count)) @@ -1093,9 +1101,7 @@ cdef class Tree: cdef double normalizer = 0. - cdef cnp.ndarray[cnp.float64_t, ndim=1] importances - importances = np.zeros((self.n_features,)) - cdef DOUBLE_t* importance_data = importances.data + cdef cnp.float64_t[:] importances = np.zeros(self.n_features) with nogil: while node != end_node: @@ -1104,22 +1110,24 @@ cdef class Tree: left = &nodes[node.left_child] right = &nodes[node.right_child] - importance_data[node.feature] += ( + importances[node.feature] += ( node.weighted_n_node_samples * node.impurity - left.weighted_n_node_samples * left.impurity - right.weighted_n_node_samples * right.impurity) node += 1 - importances /= nodes[0].weighted_n_node_samples + for i in range(self.n_features): + importances[i] /= nodes[0].weighted_n_node_samples if normalize: normalizer = np.sum(importances) if normalizer > 0.0: # Avoid dividing by zero (e.g., when root is pure) - importances /= normalizer + for i in range(self.n_features): + importances[i] /= normalizer - return importances + return np.asarray(importances) cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as a 3-d NumPy array. @@ -1154,7 +1162,7 @@ cdef class Tree: arr = PyArray_NewFromDescr( cnp.ndarray, NODE_DTYPE, 1, shape, strides, self.nodes, - cnp.NPY_DEFAULT, None) + cnp.NPY_ARRAY_DEFAULT, None) Py_INCREF(self) if PyArray_SetBaseObject(arr, self) < 0: raise ValueError("Can't initialize array.") @@ -1686,10 +1694,8 @@ def ccp_pruning_path(Tree orig_tree): cdef: UINT32_t total_items = path_finder.count - cnp.ndarray ccp_alphas = np.empty(shape=total_items, - dtype=np.float64) - cnp.ndarray impurities = np.empty(shape=total_items, - dtype=np.float64) + DOUBLE_t[:] ccp_alphas = np.empty(shape=total_items, dtype=np.float64) + DOUBLE_t[:] impurities = np.empty(shape=total_items, dtype=np.float64) UINT32_t count = 0 while count < total_items: @@ -1697,7 +1703,10 @@ def ccp_pruning_path(Tree orig_tree): impurities[count] = path_finder.impurities[count] count += 1 - return {'ccp_alphas': ccp_alphas, 'impurities': impurities} + return { + 'ccp_alphas': np.asarray(ccp_alphas), + 'impurities': np.asarray(impurities), + } cdef struct BuildPrunedRecord: