diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py
index c336d1e5f8805..877fcdb09fe68 100644
--- a/asv_benchmarks/benchmarks/ensemble.py
+++ b/asv_benchmarks/benchmarks/ensemble.py
@@ -2,6 +2,7 @@
     GradientBoostingClassifier,
     HistGradientBoostingClassifier,
     RandomForestClassifier,
+    RandomForestRegressor,
 )
 
 from .common import Benchmark, Estimator, Predictor
@@ -9,8 +10,50 @@
     _20newsgroups_highdim_dataset,
     _20newsgroups_lowdim_dataset,
     _synth_classification_dataset,
+    _synth_regression_dataset,
+    _synth_regression_sparse_dataset,
 )
-from .utils import make_gen_classif_scorers
+from .utils import make_gen_classif_scorers, make_gen_reg_scorers
+
+
+class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
+    """
+    Benchmarks for RandomForestRegressor.
+    """
+
+    param_names = ["representation", "n_jobs"]
+    params = (["dense", "sparse"], Benchmark.n_jobs_vals)
+
+    def setup_cache(self):
+        super().setup_cache()
+
+    def make_data(self, params):
+        representation, n_jobs = params
+
+        if representation == "sparse":
+            data = _synth_regression_sparse_dataset()
+        else:
+            data = _synth_regression_dataset()
+
+        return data
+
+    def make_estimator(self, params):
+        representation, n_jobs = params
+
+        n_estimators = 500 if Benchmark.data_size == "large" else 100
+
+        estimator = RandomForestRegressor(
+            n_estimators=n_estimators,
+            min_samples_split=10,
+            max_features="log2",
+            n_jobs=n_jobs,
+            random_state=0,
+        )
+
+        return estimator
+
+    def make_scorers(self):
+        make_gen_reg_scorers(self)
 
 
 class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index fe1b239cdeb32..0f16f10538a62 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -6,6 +6,7 @@
 # Jacob Schreiber
 # Adam Li
 # Jong Shin
+# Samuel Carliles
 #
 # License: BSD 3 clause
 
@@ -14,9 +15,49 @@ from libcpp.vector cimport vector
 
 from ._criterion cimport BaseCriterion, Criterion
 from ._tree cimport ParentInfo
+
 from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint32_t
 
+
+# NICE IDEAS THAT DON'T APPEAR POSSIBLE
+# - accessing elements of a memory view of cython extension types in a nogil block/function
+# - storing cython extension types in cpp vectors
+#
+# These don't work despite the fact that we can access scalar extension-type
+# properties in such a context (as node_split_best does with Criterion and
+# Partitioner, for instance), and that we can access the elements of a memory
+# view of primitive types in such a context.
+#
+# SO WHERE DOES THAT LEAVE US
+# - we can transform these into cpp vectors of structs, and with some minor
+#   casting irritations everything else works ok
+ctypedef void* SplitConditionEnv
+ctypedef bint (*SplitConditionFunction)(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionEnv split_condition_env
+) noexcept nogil
+
+cdef struct SplitConditionClosure:
+    SplitConditionFunction f
+    SplitConditionEnv e
+
+cdef class SplitCondition:
+    cdef SplitConditionClosure c
+
+cdef class MinSamplesLeafCondition(SplitCondition):
+    pass
+
+cdef class MinWeightLeafCondition(SplitCondition):
+    pass
+
+cdef class MonotonicConstraintCondition(SplitCondition):
+    pass
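A note on the closure pattern declared above: since extension types cannot be stored in C++ vectors or reached through memoryviews in nogil code, each SplitCondition extension type owns a plain SplitConditionClosure struct (function pointer plus void* environment) that can be copied into a vector and invoked without the GIL. The following is a minimal self-contained sketch of the same pattern, compiled as C++ like _splitter.pyx; the Example* names are illustrative and not part of this PR.

    from libcpp.vector cimport vector

    ctypedef void* ExampleEnv
    ctypedef bint (*ExampleFunction)(int value, ExampleEnv env) noexcept nogil

    cdef struct ExampleClosure:
        ExampleFunction f
        ExampleEnv e

    cdef bint is_positive(int value, ExampleEnv env) noexcept nogil:
        return value > 0

    cdef class ExampleCondition:
        cdef ExampleClosure c
        def __cinit__(self):
            self.c.f = is_positive
            self.c.e = NULL  # this predicate needs no environment

    def check(int value):
        # Copy the plain struct out of the extension type while holding
        # the GIL; the vector can then be scanned in a nogil block.
        cdef vector[ExampleClosure] conditions
        cdef ExampleCondition cond = ExampleCondition()
        cdef size_t i
        cdef bint ok = True
        conditions.push_back(cond.c)
        with nogil:
            for i in range(conditions.size()):
                if not conditions[i].f(value, conditions[i].e):
                    ok = False
                    break
        return ok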
+
+
 cdef struct SplitRecord:
     # Data to track sample split
     intp_t feature              # Which feature to split on.
@@ -30,6 +71,13 @@ cdef struct SplitRecord:
     unsigned char missing_go_to_left  # Controls if missing values go to the left node.
     intp_t n_missing            # Number of missing values for the feature being split on
 
+ctypedef void* SplitRecordFactoryEnv
+ctypedef SplitRecord* (*SplitRecordFactory)(SplitRecordFactoryEnv env) except NULL nogil
+
+cdef struct SplitRecordFactoryClosure:
+    SplitRecordFactory f
+    SplitRecordFactoryEnv e
+
 cdef class BaseSplitter:
     """Abstract interface for splitter."""
 
@@ -59,6 +107,8 @@ cdef class BaseSplitter:
 
     cdef const float64_t[:] sample_weight
 
+    cdef SplitRecordFactoryClosure split_record_factory
+
    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
@@ -90,6 +140,7 @@ cdef class BaseSplitter:
     cdef void node_value(self, float64_t* dest) noexcept nogil
     cdef float64_t node_impurity(self) noexcept nogil
     cdef intp_t pointer_size(self) noexcept nogil
+    cdef SplitRecord* create_split_record(self) except NULL nogil
 
 cdef class Splitter(BaseSplitter):
     """Base class for supervised splitters."""
@@ -105,6 +156,13 @@ cdef class Splitter(BaseSplitter):
     cdef const int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst
 
+    cdef SplitCondition min_samples_leaf_condition
+    cdef SplitCondition min_weight_leaf_condition
+    cdef SplitCondition monotonic_constraint_condition
+
+    cdef vector[SplitConditionClosure] presplit_conditions
+    cdef vector[SplitConditionClosure] postsplit_conditions
+
     cdef int init(
         self,
         object X,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index d3c8fa1f98e83..66776e8bc5b38 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -19,7 +19,8 @@ from cython cimport final
 
 from libc.math cimport isnan
-from libc.stdlib cimport qsort
+from libc.stdint cimport uintptr_t
+from libc.stdlib cimport qsort, free, malloc
 from libc.string cimport memcpy
 
 from ._criterion cimport Criterion
@@ -42,6 +43,155 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7
 # in SparsePartitioner
 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 
+
+cdef bint min_sample_leaf_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionEnv split_condition_env
+) noexcept nogil:
+    cdef intp_t min_samples_leaf = splitter.min_samples_leaf
+    cdef intp_t end_non_missing = splitter.end - n_missing
+    cdef intp_t n_left, n_right
+
+    if missing_go_to_left:
+        n_left = current_split.pos - splitter.start + n_missing
+        n_right = end_non_missing - current_split.pos
+    else:
+        n_left = current_split.pos - splitter.start
+        n_right = end_non_missing - current_split.pos + n_missing
+
+    # Reject if min_samples_leaf is not guaranteed
+    if n_left < min_samples_leaf or n_right < min_samples_leaf:
+        return False
+
+    return True
+
+cdef class MinSamplesLeafCondition(SplitCondition):
+    def __cinit__(self):
+        self.c.f = min_sample_leaf_condition
+        self.c.e = NULL  # min_samples_leaf is stored in splitter, which is already passed to f
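The count arithmetic above shifts the n_missing samples (kept at the tail of the node's range) to whichever side the missing values are routed. A plain-Python rendering of the same arithmetic, for intuition:

    # `start`/`end` delimit the node's samples, `pos` is the proposed
    # split point among the non-missing samples, and the n_missing
    # samples sit at the end of the range.
    def leaf_counts(start, end, pos, n_missing, missing_go_to_left):
        end_non_missing = end - n_missing
        if missing_go_to_left:
            n_left = pos - start + n_missing
            n_right = end_non_missing - pos
        else:
            n_left = pos - start
            n_right = end_non_missing - pos + n_missing
        return n_left, n_right

    # With 10 samples, 2 of them missing, split at pos=4:
    assert leaf_counts(0, 10, 4, 2, True) == (6, 4)
    assert leaf_counts(0, 10, 4, 2, False) == (4, 6)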
+
+cdef bint min_weight_leaf_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionEnv split_condition_env
+) noexcept nogil:
+    cdef float64_t min_weight_leaf = splitter.min_weight_leaf
+
+    # Reject if min_weight_leaf is not satisfied
+    if ((splitter.criterion.weighted_n_left < min_weight_leaf) or
+            (splitter.criterion.weighted_n_right < min_weight_leaf)):
+        return False
+
+    return True
+
+cdef class MinWeightLeafCondition(SplitCondition):
+    def __cinit__(self):
+        self.c.f = min_weight_leaf_condition
+        self.c.e = NULL  # min_weight_leaf is stored in splitter, which is already passed to f
+
+cdef bint monotonic_constraint_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionEnv split_condition_env
+) noexcept nogil:
+    if (
+        splitter.with_monotonic_cst and
+        splitter.monotonic_cst[current_split.feature] != 0 and
+        not splitter.criterion.check_monotonicity(
+            splitter.monotonic_cst[current_split.feature],
+            lower_bound,
+            upper_bound,
+        )
+    ):
+        return False
+
+    return True
+
+cdef class MonotonicConstraintCondition(SplitCondition):
+    def __cinit__(self):
+        self.c.f = monotonic_constraint_condition
+        self.c.e = NULL
+
+# cdef struct HasDataEnv:
+#     int min_samples
+
+# cdef bint has_data_condition(
+#     Splitter splitter,
+#     SplitRecord* current_split,
+#     intp_t n_missing,
+#     bint missing_go_to_left,
+#     float64_t lower_bound,
+#     float64_t upper_bound,
+#     SplitConditionEnv split_condition_env
+# ) noexcept nogil:
+#     cdef HasDataEnv* e = <HasDataEnv*>split_condition_env
+#     return splitter.n_samples >= e.min_samples
+
+# cdef class HasDataCondition(SplitCondition):
+#     def __cinit__(self, int min_samples):
+#         self.c.f = has_data_condition
+#         self.c.e = malloc(sizeof(HasDataEnv))
+#         (<HasDataEnv*>self.c.e).min_samples = min_samples
+
+#     def __dealloc__(self):
+#         if self.c.e is not NULL:
+#             free(self.c.e)
+
+#         super.__dealloc__(self)
+
+# cdef struct AlphaRegularityEnv:
+#     float64_t alpha
+
+# cdef bint alpha_regularity_condition(
+#     Splitter splitter,
+#     SplitRecord* current_split,
+#     intp_t n_missing,
+#     bint missing_go_to_left,
+#     float64_t lower_bound,
+#     float64_t upper_bound,
+#     SplitConditionEnv split_condition_env
+# ) noexcept nogil:
+#     cdef AlphaRegularityEnv* e = <AlphaRegularityEnv*>split_condition_env
+
+#     return True

+# cdef class AlphaRegularityCondition(SplitCondition):
+#     def __cinit__(self, float64_t alpha):
+#         self.c.f = alpha_regularity_condition
+#         self.c.e = malloc(sizeof(AlphaRegularityEnv))
+#         (<AlphaRegularityEnv*>self.c.e).alpha = alpha
+
+#     def __dealloc__(self):
+#         if self.c.e is not NULL:
+#             free(self.c.e)
+
+#         super.__dealloc__(self)
+
+
+# from ._tree cimport Tree
+# cdef class FooTree(Tree):
+#     cdef Splitter splitter
+
+#     def __init__(self):
+#         self.splitter = Splitter(
+#             presplit_conditions = [HasDataCondition(10)],
+#             postsplit_conditions = [AlphaRegularityCondition(0.1)],
+#         )
+
 
 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
     self.impurity_left = INFINITY
     self.impurity_right = INFINITY
@@ -52,6 +202,9 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil
     self.missing_go_to_left = False
     self.n_missing = 0
 
+cdef SplitRecord* _base_split_record_factory(SplitRecordFactoryEnv env) except NULL nogil:
+    return <SplitRecord*>malloc(sizeof(SplitRecord))
+
 cdef class BaseSplitter:
     """This is an abstract interface for splitters.
@@ -136,6 +289,9 @@ cdef class BaseSplitter:
         `SplitRecord`.
         """
         return sizeof(SplitRecord)
+
+    cdef SplitRecord* create_split_record(self) except NULL nogil:
+        return self.split_record_factory.f(self.split_record_factory.e)
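The factory closure exists so that a splitter subclass can hand out records larger than sizeof(SplitRecord) without the builder knowing. A hypothetical sketch of such an override; ExtendedSplitRecord and _extended_split_record_factory are illustrative names, not part of this PR:

    cdef struct ExtendedSplitRecord:
        SplitRecord base        # must come first so a downcast to SplitRecord* is safe
        float64_t extra_score   # hypothetical extra per-split state

    cdef SplitRecord* _extended_split_record_factory(
        SplitRecordFactoryEnv env
    ) except NULL nogil:
        # Allocate the wider struct but hand it out as SplitRecord*;
        # code that installed this factory can cast it back.
        return <SplitRecord*>malloc(sizeof(ExtendedSplitRecord))

    # A subclass would then install it in __cinit__, e.g.:
    #     self.split_record_factory.f = _extended_split_record_factory
    #     self.split_record_factory.e = NULL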
""" return sizeof(SplitRecord) + + cdef SplitRecord* create_split_record(self) except NULL nogil: + return self.split_record_factory.f(self.split_record_factory.e) cdef class Splitter(BaseSplitter): """Abstract interface for supervised splitters.""" @@ -148,6 +304,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const int8_t[:] monotonic_cst, + SplitCondition[:] presplit_conditions = None, + SplitCondition[:] postsplit_conditions = None, *argv ): """ @@ -188,6 +346,42 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None + self.min_samples_leaf_condition = MinSamplesLeafCondition() + self.min_weight_leaf_condition = MinWeightLeafCondition() + + self.presplit_conditions.resize( + (len(presplit_conditions) if presplit_conditions is not None else 0) + + (2 if self.with_monotonic_cst else 1) + ) + self.postsplit_conditions.resize( + (len(postsplit_conditions) if postsplit_conditions is not None else 0) + + (2 if self.with_monotonic_cst else 1) + ) + + cdef int offset = 0 + self.presplit_conditions[offset] = self.min_samples_leaf_condition.c + self.postsplit_conditions[offset] = self.min_weight_leaf_condition.c + offset += 1 + + if(self.with_monotonic_cst): + self.monotonic_constraint_condition = MonotonicConstraintCondition() + self.presplit_conditions[offset] = self.monotonic_constraint_condition.c + self.postsplit_conditions[offset] = self.monotonic_constraint_condition.c + offset += 1 + + cdef int i + if presplit_conditions is not None: + for i in range(len(presplit_conditions)): + self.presplit_conditions[i + offset] = presplit_conditions[i].c + + if postsplit_conditions is not None: + for i in range(len(postsplit_conditions)): + self.postsplit_conditions[i + offset] = postsplit_conditions[i].c + + self.split_record_factory.f = _base_split_record_factory + self.split_record_factory.e = NULL + + def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -485,6 +679,8 @@ cdef inline intp_t node_split_best( # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants + cdef bint conditions_hold = True + _init_split(&best_split, end) partitioner.init_node_split(start, end) @@ -579,46 +775,71 @@ cdef inline intp_t node_split_best( current_split.pos = p - # Reject if monotonicity constraints are not satisfied - if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): - continue - - # Reject if min_samples_leaf is not guaranteed - if missing_go_to_left: - n_left = current_split.pos - splitter.start + n_missing - n_right = end_non_missing - current_split.pos - else: - n_left = current_split.pos - splitter.start - n_right = end_non_missing - current_split.pos + n_missing - if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: + # # Reject if monotonicity constraints are not satisfied + # if ( + # with_monotonic_cst and + # monotonic_cst[current_split.feature] != 0 and + # not criterion.check_monotonicity( + # monotonic_cst[current_split.feature], + # lower_bound, + # upper_bound, + # ) + # ): + # continue + + # # Reject if min_samples_leaf is not guaranteed + # if missing_go_to_left: + # n_left = current_split.pos - splitter.start + n_missing + # n_right = end_non_missing - current_split.pos + # else: + # n_left = current_split.pos - splitter.start + # 
@@ -579,46 +775,71 @@ cdef inline intp_t node_split_best(
 
                 current_split.pos = p
 
-                # Reject if monotonicity constraints are not satisfied
-                if (
-                    with_monotonic_cst and
-                    monotonic_cst[current_split.feature] != 0 and
-                    not criterion.check_monotonicity(
-                        monotonic_cst[current_split.feature],
-                        lower_bound,
-                        upper_bound,
-                    )
-                ):
-                    continue
-
-                # Reject if min_samples_leaf is not guaranteed
-                if missing_go_to_left:
-                    n_left = current_split.pos - splitter.start + n_missing
-                    n_right = end_non_missing - current_split.pos
-                else:
-                    n_left = current_split.pos - splitter.start
-                    n_right = end_non_missing - current_split.pos + n_missing
-
-                if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
+                # # Reject if monotonicity constraints are not satisfied
+                # if (
+                #     with_monotonic_cst and
+                #     monotonic_cst[current_split.feature] != 0 and
+                #     not criterion.check_monotonicity(
+                #         monotonic_cst[current_split.feature],
+                #         lower_bound,
+                #         upper_bound,
+                #     )
+                # ):
+                #     continue
+
+                # # Reject if min_samples_leaf is not guaranteed
+                # if missing_go_to_left:
+                #     n_left = current_split.pos - splitter.start + n_missing
+                #     n_right = end_non_missing - current_split.pos
+                # else:
+                #     n_left = current_split.pos - splitter.start
+                #     n_right = end_non_missing - current_split.pos + n_missing
+
+                conditions_hold = True
+                for condition in splitter.presplit_conditions:
+                    if not condition.f(
+                        splitter, &current_split, n_missing, missing_go_to_left,
+                        lower_bound, upper_bound, condition.e
+                    ):
+                        conditions_hold = False
+                        break
+
+                if not conditions_hold:
                     continue
 
+                # if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
+                #     continue
+
                 criterion.update(current_split.pos)
 
-                # Reject if monotonicity constraints are not satisfied
-                if (
-                    with_monotonic_cst and
-                    monotonic_cst[current_split.feature] != 0 and
-                    not criterion.check_monotonicity(
-                        monotonic_cst[current_split.feature],
-                        lower_bound,
-                        upper_bound,
-                    )
-                ):
-                    continue
-
-                # Reject if min_weight_leaf is not satisfied
-                if splitter.check_postsplit_conditions() == 1:
+                # # Reject if monotonicity constraints are not satisfied
+                # if (
+                #     with_monotonic_cst and
+                #     monotonic_cst[current_split.feature] != 0 and
+                #     not criterion.check_monotonicity(
+                #         monotonic_cst[current_split.feature],
+                #         lower_bound,
+                #         upper_bound,
+                #     )
+                # ):
+                #     continue
+
+                conditions_hold = True
+                for condition in splitter.postsplit_conditions:
+                    if not condition.f(
+                        splitter, &current_split, n_missing, missing_go_to_left,
+                        lower_bound, upper_bound, condition.e
+                    ):
+                        conditions_hold = False
+                        break
+
+                if not conditions_hold:
                     continue
-
+
+                # # Reject if min_weight_leaf is not satisfied
+                # if splitter.check_postsplit_conditions() == 1:
+                #     continue
+
                 current_proxy_improvement = criterion.proxy_impurity_improvement()
 
                 if current_proxy_improvement > best_proxy_improvement:
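Semantically, the two loops above are a short-circuiting logical AND over the registered predicates: a candidate split survives only if every condition returns True. A Python model, with a note on why the pre/post distinction matters:

    def passes(conditions, *args):
        # Mirrors the nogil loops: stop at the first failing condition.
        return all(cond(*args) for cond in conditions)

    # Pre-split conditions run before criterion.update(pos), so they may
    # only rely on sample counts and positions; post-split conditions run
    # after it, when statistics such as criterion.weighted_n_left and
    # criterion.weighted_n_right reflect the candidate split.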
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd
index 2267b4306e261..930a21ad05783 100644
--- a/sklearn/tree/_tree.pxd
+++ b/sklearn/tree/_tree.pxd
@@ -43,6 +43,67 @@ cdef struct ParentInfo:
     float64_t impurity          # the impurity of the parent
     intp_t n_constant_features  # the number of constant features found in parent
 
+# A record on the stack for depth-first tree growing
+cdef struct StackRecord:
+    intp_t start
+    intp_t end
+    intp_t depth
+    intp_t parent
+    bint is_left
+    float64_t impurity
+    intp_t n_constant_features
+    float64_t lower_bound
+    float64_t upper_bound
+
+cdef extern from "<stack>" namespace "std" nogil:
+    cdef cppclass stack[T]:
+        ctypedef T value_type
+        stack() except +
+        bint empty()
+        void pop()
+        void push(T&) except +  # Raise c++ exception for bad_alloc -> MemoryError
+        T& top()
+
+cdef struct BuildEnv:
+    # Parameters
+    intp_t max_depth
+    intp_t min_samples_leaf
+    float64_t min_weight_leaf
+    intp_t min_samples_split
+    float64_t min_impurity_decrease
+
+    unsigned char store_leaf_values
+
+    # Initial capacity
+    intp_t init_capacity
+    bint first
+
+    intp_t start
+    intp_t end
+    intp_t depth
+    intp_t parent
+    bint is_left
+    intp_t n_node_samples
+    float64_t weighted_n_node_samples
+    intp_t node_id
+    float64_t right_child_min, left_child_min, right_child_max, left_child_max
+
+    SplitRecord* split
+
+    float64_t middle_value
+    bint is_leaf
+    intp_t max_depth_seen
+
+    intp_t rc
+
+    stack[StackRecord] builder_stack
+    stack[StackRecord] update_stack
+    stack[StackRecord]* target_stack
+    StackRecord stack_record
+
+    ParentInfo parent_record
+
 
 cdef class BaseTree:
 
     # Inner structures: values are stored separately from node structure,
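BuildEnv bundles what used to be some twenty locals of build() so that the loop body can move into _build_body and run over either stack. The traversal itself is the usual explicit-stack depth-first scheme; a self-contained Python model:

    # No recursion: the right child is pushed before the left, so the left
    # child (top of the LIFO stack) is expanded first.
    def build(n_samples, min_samples_split=2):
        stack = [(0, n_samples, 0)]           # (start, end, depth), like StackRecord
        order = []
        while stack:
            start, end, depth = stack.pop()
            order.append((start, end, depth))
            if end - start < min_samples_split:
                continue                      # leaf
            mid = (start + end) // 2          # stand-in for the best split position
            stack.append((mid, end, depth + 1))    # right child first
            stack.append((start, mid, depth + 1))  # left child on top
        return order

    # The left subtree of [0, 4) is fully expanded before the right subtree:
    assert build(4)[:3] == [(0, 4, 0), (0, 2, 1), (0, 1, 2)]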
diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index 418eae57e4995..6e5ad54848b3c 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -48,14 +48,6 @@ cdef extern from "numpy/arrayobject.h":
                                 void* data, intp_t flags, object obj)
     intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj)
 
-cdef extern from "<stack>" namespace "std" nogil:
-    cdef cppclass stack[T]:
-        ctypedef T value_type
-        stack() except +
-        bint empty()
-        void pop()
-        void push(T&) except +  # Raise c++ exception for bad_alloc -> MemoryError
-        T& top()
 
 # =============================================================================
 # Types and constants
@@ -161,19 +153,6 @@ cdef class TreeBuilder:
 
 # Depth first builder ---------------------------------------------------------
 
-# A record on the stack for depth-first tree growing
-cdef struct StackRecord:
-    intp_t start
-    intp_t end
-    intp_t depth
-    intp_t parent
-    bint is_left
-    float64_t impurity
-    intp_t n_constant_features
-    float64_t lower_bound
-    float64_t upper_bound
-
-
 cdef class DepthFirstTreeBuilder(TreeBuilder):
     """Build a decision tree in depth-first fashion."""
 
@@ -272,6 +251,141 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         # convert dict to numpy array and store value
         self.initial_roots = np.array(list(false_roots.items()))
 
+    cdef void _build_body(self, Tree tree, Splitter splitter, BuildEnv* e, bint update) noexcept nogil:
+        while not e.target_stack.empty():
+            e.stack_record = e.target_stack.top()
+            e.target_stack.pop()
+
+            e.start = e.stack_record.start
+            e.end = e.stack_record.end
+            e.depth = e.stack_record.depth
+            e.parent = e.stack_record.parent
+            e.is_left = e.stack_record.is_left
+            e.parent_record.impurity = e.stack_record.impurity
+            e.parent_record.n_constant_features = e.stack_record.n_constant_features
+            e.parent_record.lower_bound = e.stack_record.lower_bound
+            e.parent_record.upper_bound = e.stack_record.upper_bound
+
+            e.n_node_samples = e.end - e.start
+            splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples)
+
+            e.is_leaf = (e.depth >= e.max_depth or
+                         e.n_node_samples < e.min_samples_split or
+                         e.n_node_samples < 2 * e.min_samples_leaf or
+                         e.weighted_n_node_samples < 2 * e.min_weight_leaf)
+
+            if e.first:
+                e.parent_record.impurity = splitter.node_impurity()
+                e.first = 0
+
+            # impurity == 0 with tolerance due to rounding errors
+            e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON
+
+            if not e.is_leaf:
+                splitter.node_split(
+                    &e.parent_record,
+                    e.split,
+                )
+
+                # If EPSILON=0 in the below comparison, float precision
+                # issues stop splitting, producing trees that are
+                # dissimilar to v0.18
+                e.is_leaf = (e.is_leaf or e.split.pos >= e.end or
+                             (e.split.improvement + EPSILON <
+                              e.min_impurity_decrease))
+
+            if update == 1:
+                e.node_id = tree._update_node(
+                    e.parent, e.is_left, e.is_leaf, e.split,
+                    e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples,
+                    e.split.missing_go_to_left
+                )
+            else:
+                e.node_id = tree._add_node(
+                    e.parent, e.is_left, e.is_leaf, e.split,
+                    e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples,
+                    e.split.missing_go_to_left
+                )
+
+            if e.node_id == INTPTR_MAX:
+                e.rc = -1
+                break
+
+            # Store value for all nodes, to facilitate tree/model
+            # inspection and interpretation
+            splitter.node_value(tree.value + e.node_id * tree.value_stride)
+            if splitter.with_monotonic_cst:
+                splitter.clip_node_value(
+                    tree.value + e.node_id * tree.value_stride,
+                    e.parent_record.lower_bound,
+                    e.parent_record.upper_bound
+                )
+
+            if not e.is_leaf:
+                if (
+                    not splitter.with_monotonic_cst or
+                    splitter.monotonic_cst[e.split.feature] == 0
+                ):
+                    # Split on a feature with no monotonicity constraint
+
+                    # Current bounds must always be propagated to both children.
+                    # If a monotonic constraint is active, bounds are used in
+                    # node value clipping.
+                    e.left_child_min = e.right_child_min = e.parent_record.lower_bound
+                    e.left_child_max = e.right_child_max = e.parent_record.upper_bound
+                elif splitter.monotonic_cst[e.split.feature] == 1:
+                    # Split on a feature with monotonic increase constraint
+                    e.left_child_min = e.parent_record.lower_bound
+                    e.right_child_max = e.parent_record.upper_bound
+
+                    # Lower bound for right child and upper bound for left child
+                    # are set to the same value.
+                    e.middle_value = splitter.criterion.middle_value()
+                    e.right_child_min = e.middle_value
+                    e.left_child_max = e.middle_value
+                else:  # i.e. splitter.monotonic_cst[e.split.feature] == -1
+                    # Split on a feature with monotonic decrease constraint
+                    e.right_child_min = e.parent_record.lower_bound
+                    e.left_child_max = e.parent_record.upper_bound
+
+                    # Lower bound for left child and upper bound for right child
+                    # are set to the same value.
+                    e.middle_value = splitter.criterion.middle_value()
+                    e.left_child_min = e.middle_value
+                    e.right_child_max = e.middle_value
+
+                # Push right child on stack
+                e.builder_stack.push({
+                    "start": e.split.pos,
+                    "end": e.end,
+                    "depth": e.depth + 1,
+                    "parent": e.node_id,
+                    "is_left": 0,
+                    "impurity": e.split.impurity_right,
+                    "n_constant_features": e.parent_record.n_constant_features,
+                    "lower_bound": e.right_child_min,
+                    "upper_bound": e.right_child_max,
+                })
+
+                # Push left child on stack
+                e.builder_stack.push({
+                    "start": e.start,
+                    "end": e.split.pos,
+                    "depth": e.depth + 1,
+                    "parent": e.node_id,
+                    "is_left": 1,
+                    "impurity": e.split.impurity_left,
+                    "n_constant_features": e.parent_record.n_constant_features,
+                    "lower_bound": e.left_child_min,
+                    "upper_bound": e.left_child_max,
+                })
+            elif e.store_leaf_values and e.is_leaf:
+                # copy leaf values to leaf_values array
+                splitter.node_samples(tree.value_samples[e.node_id])
+
+            if e.depth > e.max_depth_seen:
+                e.max_depth_seen = e.depth
+
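_build_body is shared by both passes of build(); the update flag only selects tree._update_node (re-fitting a node that already exists) over tree._add_node (growing a new one). A Python model of how build() drives it, with update_node/add_node as stand-ins for the Tree methods:

    def drain(stack, tree, update):
        while stack:
            work = stack.pop()
            if update:
                node_id = tree.update_node(work)  # overwrite an existing node
            else:
                node_id = tree.add_node(work)     # append a fresh node
            # split and push children, as in _build_body

    def run(tree, update_stack, builder_stack):
        drain(update_stack, tree, update=True)    # re-fit previously grown roots
        drain(builder_stack, tree, update=False)  # then grow new nodes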
     cpdef build(
         self,
         Tree tree,
@@ -285,31 +399,31 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
 
         # check input
         X, y, sample_weight = self._check_input(X, y, sample_weight)
 
-        # Parameters
         cdef Splitter splitter = self.splitter
-        cdef intp_t max_depth = self.max_depth
-        cdef intp_t min_samples_leaf = self.min_samples_leaf
-        cdef float64_t min_weight_leaf = self.min_weight_leaf
-        cdef intp_t min_samples_split = self.min_samples_split
-        cdef float64_t min_impurity_decrease = self.min_impurity_decrease
-
-        cdef unsigned char store_leaf_values = self.store_leaf_values
         cdef cnp.ndarray initial_roots = self.initial_roots
 
+        cdef BuildEnv e
+        e.max_depth = self.max_depth
+        e.min_samples_leaf = self.min_samples_leaf
+        e.min_weight_leaf = self.min_weight_leaf
+        e.min_samples_split = self.min_samples_split
+        e.min_impurity_decrease = self.min_impurity_decrease
+
+        e.store_leaf_values = self.store_leaf_values
+
         # Initial capacity
-        cdef intp_t init_capacity
-        cdef bint first = 0
+        e.first = 0
         if initial_roots is None:
             # Recursive partition (without actual recursion)
             splitter.init(X, y, sample_weight, missing_values_in_feature_mask)
 
             if tree.max_depth <= 10:
-                init_capacity = (2 ** (tree.max_depth + 1)) - 1
+                e.init_capacity = (2 ** (tree.max_depth + 1)) - 1
             else:
-                init_capacity = 2047
+                e.init_capacity = 2047
 
-            tree._resize(init_capacity)
-            first = 1
+            tree._resize(e.init_capacity)
+            e.first = 1
         else:
             # convert numpy array back to dict
             false_roots = {}
@@ -319,39 +433,24 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
             # reset the root array
             self.initial_roots = None
 
-        cdef intp_t start = 0
-        cdef intp_t end = 0
-        cdef intp_t depth
-        cdef intp_t parent
-        cdef bint is_left
-        cdef intp_t n_node_samples = splitter.n_samples
-        cdef float64_t weighted_n_node_samples
-        cdef intp_t node_id
-        cdef float64_t right_child_min, left_child_min, right_child_max, left_child_max
+        e.start = 0
+        e.end = 0
+        e.n_node_samples = splitter.n_samples
+        e.split = self.splitter.create_split_record()
 
-        cdef SplitRecord split
-        cdef SplitRecord* split_ptr = <SplitRecord *>malloc(splitter.pointer_size())
+        e.max_depth_seen = -1 if e.first else tree.max_depth
 
-        cdef float64_t middle_value
-        cdef bint is_leaf
-        cdef intp_t max_depth_seen = -1 if first else tree.max_depth
+        e.rc = 0
 
-        cdef intp_t rc = 0
+        _init_parent_record(&e.parent_record)
 
-        cdef stack[StackRecord] builder_stack
-        cdef stack[StackRecord] update_stack
-        cdef StackRecord stack_record
-
-        cdef ParentInfo parent_record
-        _init_parent_record(&parent_record)
-
-        if not first:
+        if not e.first:
             # push reached leaf nodes onto stack
             for key, value in reversed(sorted(false_roots.items())):
-                end += value[0]
-                update_stack.push({
-                    "start": start,
-                    "end": end,
+                e.end += value[0]
+                e.update_stack.push({
+                    "start": e.start,
+                    "end": e.end,
                     "depth": value[1],
                     "parent": key[0],
                     "is_left": key[1],
@@ -360,12 +459,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
                     "lower_bound": -INFINITY,
                     "upper_bound": INFINITY,
                 })
-                start += value[0]
+                e.start += value[0]
         else:
             # push root node onto stack
-            builder_stack.push({
+            e.builder_stack.push({
                 "start": 0,
-                "end": n_node_samples,
+                "end": e.n_node_samples,
                 "depth": 0,
                 "parent": _TREE_UNDEFINED,
                 "is_left": 0,
@@ -376,275 +475,22 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
             })
 
         with nogil:
-            while not update_stack.empty():
-                stack_record = update_stack.top()
-                update_stack.pop()
-
-                start = stack_record.start
-                end = stack_record.end
-                depth = stack_record.depth
-                parent = stack_record.parent
-                is_left = stack_record.is_left
-                parent_record.impurity = stack_record.impurity
-                parent_record.n_constant_features = stack_record.n_constant_features
-                parent_record.lower_bound = stack_record.lower_bound
-                parent_record.upper_bound = stack_record.upper_bound
-
-                n_node_samples = end - start
-                splitter.node_reset(start, end, &weighted_n_node_samples)
-
-                is_leaf = (depth >= max_depth or
-                           n_node_samples < min_samples_split or
-                           n_node_samples < 2 * min_samples_leaf or
-                           weighted_n_node_samples < 2 * min_weight_leaf)
-
-                if first:
-                    parent_record.impurity = splitter.node_impurity()
-                    first = 0
-
-                # impurity == 0 with tolerance due to rounding errors
-                is_leaf = is_leaf or parent_record.impurity <= EPSILON
-
-                if not is_leaf:
-                    splitter.node_split(
-                        &parent_record,
-                        split_ptr,
-                    )
+            e.target_stack = &e.update_stack
+            self._build_body(tree, splitter, &e, 1)
 
-                    # assign local copy of SplitRecord to assign
-                    # pos, improvement, and impurity scores
-                    split = deref(split_ptr)
-
-                    # If EPSILON=0 in the below comparison, float precision
-                    # issues stop splitting, producing trees that are
-                    # dissimilar to v0.18
-                    is_leaf = (is_leaf or split.pos >= end or
-                               (split.improvement + EPSILON <
-                                min_impurity_decrease))
-
-                node_id = tree._update_node(parent, is_left, is_leaf, split_ptr,
-                                            parent_record.impurity,
-                                            n_node_samples, weighted_n_node_samples,
-                                            split.missing_go_to_left)
-
-                if node_id == INTPTR_MAX:
-                    rc = -1
-                    break
-
-                # Store value for all nodes, to facilitate tree/model
-                # inspection and interpretation
-                splitter.node_value(tree.value + node_id * tree.value_stride)
-                if splitter.with_monotonic_cst:
-                    splitter.clip_node_value(
-                        tree.value + node_id * tree.value_stride,
-                        parent_record.lower_bound,
-                        parent_record.upper_bound
-                    )
+            e.target_stack = &e.builder_stack
+            self._build_body(tree, splitter, &e, 0)
 
-                if not is_leaf:
-                    if (
-                        not splitter.with_monotonic_cst or
-                        splitter.monotonic_cst[split.feature] == 0
-                    ):
-                        # Split on a feature with no monotonicity constraint
+            if e.rc >= 0:
+                e.rc = tree._resize_c(tree.node_count)
 
-                        # Current bounds must always be propagated to both children.
-                        # If a monotonic constraint is active, bounds are used in
-                        # node value clipping.
-                        left_child_min = right_child_min = parent_record.lower_bound
-                        left_child_max = right_child_max = parent_record.upper_bound
-                    elif splitter.monotonic_cst[split.feature] == 1:
-                        # Split on a feature with monotonic increase constraint
+            if e.rc >= 0:
+                tree.max_depth = e.max_depth_seen
 
-                        left_child_min = parent_record.lower_bound
-                        right_child_max = parent_record.upper_bound
-
-                        # Lower bound for right child and upper bound for left child
-                        # are set to the same value.
-                        middle_value = splitter.criterion.middle_value()
-                        right_child_min = middle_value
-                        left_child_max = middle_value
-                    else:  # i.e. splitter.monotonic_cst[split.feature] == -1
-                        # Split on a feature with monotonic decrease constraint
-                        right_child_min = parent_record.lower_bound
-                        left_child_max = parent_record.upper_bound
-
-                        # Lower bound for left child and upper bound for right child
-                        # are set to the same value.
-                        middle_value = splitter.criterion.middle_value()
-                        left_child_min = middle_value
-                        right_child_max = middle_value
-
-                    # Push right child on stack
-                    builder_stack.push({
-                        "start": split.pos,
-                        "end": end,
-                        "depth": depth + 1,
-                        "parent": node_id,
-                        "is_left": 0,
-                        "impurity": split.impurity_right,
-                        "n_constant_features": parent_record.n_constant_features,
-                        "lower_bound": right_child_min,
-                        "upper_bound": right_child_max,
-                    })
-
-                    # Push left child on stack
-                    builder_stack.push({
-                        "start": start,
-                        "end": split.pos,
-                        "depth": depth + 1,
-                        "parent": node_id,
-                        "is_left": 1,
-                        "impurity": split.impurity_left,
-                        "n_constant_features": parent_record.n_constant_features,
-                        "lower_bound": left_child_min,
-                        "upper_bound": left_child_max,
-                    })
-                elif store_leaf_values and is_leaf:
-                    # copy leaf values to leaf_values array
-                    splitter.node_samples(tree.value_samples[node_id])
-
-                if depth > max_depth_seen:
-                    max_depth_seen = depth
-
-            while not builder_stack.empty():
-                stack_record = builder_stack.top()
-                builder_stack.pop()
-
-                start = stack_record.start
-                end = stack_record.end
-                depth = stack_record.depth
-                parent = stack_record.parent
-                is_left = stack_record.is_left
-                parent_record.impurity = stack_record.impurity
-                parent_record.n_constant_features = stack_record.n_constant_features
-                parent_record.lower_bound = stack_record.lower_bound
-                parent_record.upper_bound = stack_record.upper_bound
-
-                n_node_samples = end - start
-                splitter.node_reset(start, end, &weighted_n_node_samples)
-
-                is_leaf = (depth >= max_depth or
-                           n_node_samples < min_samples_split or
-                           n_node_samples < 2 * min_samples_leaf or
-                           weighted_n_node_samples < 2 * min_weight_leaf)
-
-                if first:
-                    parent_record.impurity = splitter.node_impurity()
-                    first=0
-
-                # impurity == 0 with tolerance due to rounding errors
-                is_leaf = is_leaf or parent_record.impurity <= EPSILON
-
-                if not is_leaf:
-                    splitter.node_split(
-                        &parent_record,
-                        split_ptr,
-                    )
-
-                    # assign local copy of SplitRecord to assign
-                    # pos, improvement, and impurity scores
-                    split = deref(split_ptr)
-
-                    # If EPSILON=0 in the below comparison, float precision
-                    # issues stop splitting, producing trees that are
-                    # dissimilar to v0.18
-                    is_leaf = (is_leaf or split.pos >= end or
-                               (split.improvement + EPSILON <
-                                min_impurity_decrease))
-
-                node_id = tree._add_node(parent, is_left, is_leaf, split_ptr,
-                                         parent_record.impurity, n_node_samples,
-                                         weighted_n_node_samples, split.missing_go_to_left)
-
-                if node_id == INTPTR_MAX:
-                    rc = -1
-                    break
-
-                # Store value for all nodes, to facilitate tree/model
-                # inspection and interpretation
-                splitter.node_value(tree.value + node_id * tree.value_stride)
-                if splitter.with_monotonic_cst:
-                    splitter.clip_node_value(
-                        tree.value + node_id * tree.value_stride,
-                        parent_record.lower_bound,
-                        parent_record.upper_bound
-                    )
-
-                if not is_leaf:
-                    if (
-                        not splitter.with_monotonic_cst or
-                        splitter.monotonic_cst[split.feature] == 0
-                    ):
-                        # Split on a feature with no monotonicity constraint
-
-                        # Current bounds must always be propagated to both children.
-                        # If a monotonic constraint is active, bounds are used in
-                        # node value clipping.
-                        left_child_min = right_child_min = parent_record.lower_bound
-                        left_child_max = right_child_max = parent_record.upper_bound
-                    elif splitter.monotonic_cst[split.feature] == 1:
-                        # Split on a feature with monotonic increase constraint
-                        left_child_min = parent_record.lower_bound
-                        right_child_max = parent_record.upper_bound
-
-                        # Lower bound for right child and upper bound for left child
-                        # are set to the same value.
-                        middle_value = splitter.criterion.middle_value()
-                        right_child_min = middle_value
-                        left_child_max = middle_value
-                    else:  # i.e. splitter.monotonic_cst[split.feature] == -1
-                        # Split on a feature with monotonic decrease constraint
-                        right_child_min = parent_record.lower_bound
-                        left_child_max = parent_record.upper_bound
-
-                        # Lower bound for left child and upper bound for right child
-                        # are set to the same value.
-                        middle_value = splitter.criterion.middle_value()
-                        left_child_min = middle_value
-                        right_child_max = middle_value
-
-                    # Push right child on stack
-                    builder_stack.push({
-                        "start": split.pos,
-                        "end": end,
-                        "depth": depth + 1,
-                        "parent": node_id,
-                        "is_left": 0,
-                        "impurity": split.impurity_right,
-                        "n_constant_features": parent_record.n_constant_features,
-                        "lower_bound": right_child_min,
-                        "upper_bound": right_child_max,
-                    })
-
-                    # Push left child on stack
-                    builder_stack.push({
-                        "start": start,
-                        "end": split.pos,
-                        "depth": depth + 1,
-                        "parent": node_id,
-                        "is_left": 1,
-                        "impurity": split.impurity_left,
-                        "n_constant_features": parent_record.n_constant_features,
-                        "lower_bound": left_child_min,
-                        "upper_bound": left_child_max,
-                    })
-                elif store_leaf_values and is_leaf:
-                    # copy leaf values to leaf_values array
-                    splitter.node_samples(tree.value_samples[node_id])
-
-                if depth > max_depth_seen:
-                    max_depth_seen = depth
-
-            if rc >= 0:
-                rc = tree._resize_c(tree.node_count)
-
-            if rc >= 0:
-                tree.max_depth = max_depth_seen
 
         # free the memory created for the SplitRecord pointer
-        free(split_ptr)
+        free(e.split)
 
-        if rc == -1:
+        if e.rc == -1:
             raise MemoryError()
 
 
 # Best first builder ----------------------------------------------------------