From 03e1d77d22e398d3e8485ef9ef66fc1cc133cdd9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 22 Nov 2022 21:28:33 -0500 Subject: [PATCH 1/8] Initial commit --- sklearn/tree/_criterion.pxd | 4 ++-- sklearn/tree/_criterion.pyx | 36 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index ee09c307aac79..a6fde757c5412 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -24,7 +24,7 @@ cdef class Criterion: cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights - cdef SIZE_t* samples # Sample indices in X, y + cdef const SIZE_t[:] samples # Sample indices in X, y cdef SIZE_t start # samples[start:pos] are the samples in the left node cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end @@ -46,7 +46,7 @@ cdef class Criterion: const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - SIZE_t* samples, + const SIZE_t[:] samples, SIZE_t start, SIZE_t end ) nogil except -1 diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 4527789cfcc0c..b1d2d4f04396a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -46,7 +46,7 @@ cdef class Criterion: const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - SIZE_t* samples, + const SIZE_t[:] samples, SIZE_t start, SIZE_t end, ) nogil except -1: @@ -64,9 +64,9 @@ cdef class Criterion: The weight of each sample stored as a Cython memoryview. weighted_n_samples : double The total weight of the samples being considered - samples : array-like, dtype=SIZE_t + samples : ndarray, dtype=SIZE_t Indices of the samples in X and y, where samples[start:end] - correspond to the samples in this node + correspond to the samples in this node stored as a Cython memoryview. start : SIZE_t The first sample to be used on this node end : SIZE_t @@ -214,7 +214,6 @@ cdef class ClassificationCriterion(Criterion): n_classes : numpy.ndarray, dtype=SIZE_t The number of unique classes in each target """ - self.samples = NULL self.start = 0 self.pos = 0 self.end = 0 @@ -255,7 +254,7 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - SIZE_t* samples, + const SIZE_t[:] samples, SIZE_t start, SIZE_t end ) nogil except -1: @@ -275,8 +274,9 @@ cdef class ClassificationCriterion(Criterion): The weight of each sample stored as a Cython memoryview. weighted_n_samples : double The total weight of all samples - samples : array-like, dtype=SIZE_t - A mask on the samples, showing which ones we want to use + samples : ndarray, dtype=SIZE_t + A mask on the samples, showing which ones we want to use stored + as a Cython memoryview. start : SIZE_t The first sample to use in the mask end : SIZE_t @@ -368,7 +368,7 @@ cdef class ClassificationCriterion(Criterion): cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end - cdef SIZE_t* samples = self.samples + cdef const SIZE_t[:] samples = self.samples cdef const DOUBLE_t[:] sample_weight = self.sample_weight cdef SIZE_t i @@ -623,7 +623,6 @@ cdef class RegressionCriterion(Criterion): The total number of samples to fit on """ # Default values - self.samples = NULL self.start = 0 self.pos = 0 self.end = 0 @@ -649,7 +648,7 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - SIZE_t* samples, + const SIZE_t[:] samples, SIZE_t start, SIZE_t end, ) nogil except -1: @@ -720,7 +719,7 @@ cdef class RegressionCriterion(Criterion): cdef int update(self, SIZE_t new_pos) nogil except -1: """Updated statistics by moving samples[pos:new_pos] to the left.""" cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples + cdef const SIZE_t[:] samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end @@ -845,7 +844,7 @@ cdef class MSE(RegressionCriterion): impurity the right child (samples[pos:end]). """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples + cdef const SIZE_t[:] samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start @@ -904,7 +903,6 @@ cdef class MAE(RegressionCriterion): The total number of samples to fit on """ # Default values - self.samples = NULL self.start = 0 self.pos = 0 self.end = 0 @@ -930,7 +928,7 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - SIZE_t* samples, + const SIZE_t[:] samples, SIZE_t start, SIZE_t end, ) nogil except -1: @@ -1049,7 +1047,7 @@ cdef class MAE(RegressionCriterion): or 0 otherwise. """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples + cdef const SIZE_t[:] samples = self.samples cdef void** left_child = self.left_child.data cdef void** right_child = self.right_child.data @@ -1113,7 +1111,7 @@ cdef class MAE(RegressionCriterion): better. """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples + cdef const SIZE_t[:] samples = self.samples cdef SIZE_t i, p, k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t impurity = 0.0 @@ -1137,7 +1135,7 @@ cdef class MAE(RegressionCriterion): impurity the right child (samples[pos:end]). """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples + cdef const SIZE_t[:] samples = self.samples cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos @@ -1335,10 +1333,12 @@ cdef class Poisson(RegressionCriterion): """ cdef const DOUBLE_t[:, ::1] y = self.y cdef const DOUBLE_t[:] sample_weight = self.sample_weight + cdef const SIZE_t[:] samples = self.samples cdef DOUBLE_t y_mean = 0. cdef DOUBLE_t poisson_loss = 0. cdef DOUBLE_t w = 1.0 + cdef SIZE_t i, k, p cdef SIZE_t n_outputs = self.n_outputs for k in range(n_outputs): @@ -1353,7 +1353,7 @@ cdef class Poisson(RegressionCriterion): y_mean = y_sum[k] / weight_sum for p in range(start, end): - i = self.samples[p] + i = samples[p] if sample_weight is not None: w = sample_weight[i] From 4a2979b54151ebda51553f0a1d2df2c13756c96f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 22 Nov 2022 21:31:39 -0500 Subject: [PATCH 2/8] Fix pip install --- sklearn/tree/_splitter.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 47b97e30fb131..355a5da8427be 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -182,7 +182,7 @@ cdef class Splitter: self.criterion.init(self.y, self.sample_weight, self.weighted_n_samples, - &self.samples[0], + self.samples, start, end) From 1b4bd2e8e518ff934502a7fd51618a1d4ed06e80 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 23 Nov 2022 15:14:04 -0500 Subject: [PATCH 3/8] Address comments about naming --- sklearn/tree/_criterion.pxd | 4 +- sklearn/tree/_criterion.pyx | 153 ++++++++++++++++++------------------ 2 files changed, 80 insertions(+), 77 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index a6fde757c5412..3c6217a8c272c 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -24,7 +24,7 @@ cdef class Criterion: cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights - cdef const SIZE_t[:] samples # Sample indices in X, y + cdef const SIZE_t[:] sample_indices # Sample indices in X, y cdef SIZE_t start # samples[start:pos] are the samples in the left node cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end @@ -46,7 +46,7 @@ cdef class Criterion: const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] samples, + const SIZE_t[:] sample_indices, SIZE_t start, SIZE_t end ) nogil except -1 diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index b1d2d4f04396a..b63cfd8ef7db8 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -46,7 +46,7 @@ cdef class Criterion: const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] samples, + const SIZE_t[:] sample_indices, SIZE_t start, SIZE_t end, ) nogil except -1: @@ -64,9 +64,9 @@ cdef class Criterion: The weight of each sample stored as a Cython memoryview. weighted_n_samples : double The total weight of the samples being considered - samples : ndarray, dtype=SIZE_t - Indices of the samples in X and y, where samples[start:end] - correspond to the samples in this node stored as a Cython memoryview. + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. start : SIZE_t The first sample to be used on this node end : SIZE_t @@ -90,16 +90,16 @@ cdef class Criterion: pass cdef int update(self, SIZE_t new_pos) nogil except -1: - """Updated statistics by moving samples[pos:new_pos] to the left child. + """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - This updates the collected statistics by moving samples[pos:new_pos] + This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. Parameters ---------- new_pos : SIZE_t - New starting index position of the samples in the right child + New starting index position of the sample_indices in the right child """ pass @@ -107,7 +107,7 @@ cdef class Criterion: """Placeholder for calculating the impurity of the node. Placeholder for a method which will evaluate the impurity of - the current node, i.e. the impurity of samples[start:end]. This is the + the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the better. """ @@ -118,8 +118,8 @@ cdef class Criterion: """Placeholder for calculating the impurity of children. Placeholder for a method which evaluates the impurity in - children nodes, i.e. the impurity of samples[start:pos] + the impurity - of samples[pos:end]. + children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity + of sample_indices[pos:end]. Parameters ---------- @@ -136,7 +136,7 @@ cdef class Criterion: """Placeholder for storing the node value. Placeholder for a method which will compute the node value - of samples[start:end] and save the value into dest. + of sample_indices[start:end] and save the value into dest. Parameters ---------- @@ -254,14 +254,14 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] samples, + const SIZE_t[:] sample_indices, SIZE_t start, SIZE_t end ) nogil except -1: """Initialize the criterion. - This initializes the criterion at node samples[start:end] and children - samples[start:start] and samples[start:end]. + This initializes the criterion at node sample_indices[start:end] and children + sample_indices[start:start] and sample_indices[start:end]. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -274,9 +274,9 @@ cdef class ClassificationCriterion(Criterion): The weight of each sample stored as a Cython memoryview. weighted_n_samples : double The total weight of all samples - samples : ndarray, dtype=SIZE_t - A mask on the samples, showing which ones we want to use stored - as a Cython memoryview. + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. start : SIZE_t The first sample to use in the mask end : SIZE_t @@ -284,7 +284,7 @@ cdef class ClassificationCriterion(Criterion): """ self.y = y self.sample_weight = sample_weight - self.samples = samples + self.sample_indices = sample_indices self.start = start self.end = end self.n_node_samples = end - start @@ -301,7 +301,7 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = samples[p] + i = sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. @@ -354,7 +354,7 @@ cdef class ClassificationCriterion(Criterion): return 0 cdef int update(self, SIZE_t new_pos) nogil except -1: - """Updated statistics by moving samples[pos:new_pos] to the left child. + """Updated statistics by moving sample_indices[pos:new_pos] to the left child. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -362,13 +362,13 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- new_pos : SIZE_t - The new ending position for which to move samples from the right + The new ending position for which to move sample_indices from the right child to the left child. """ cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef const DOUBLE_t[:] sample_weight = self.sample_weight cdef SIZE_t i @@ -386,7 +386,7 @@ cdef class ClassificationCriterion(Criterion): # of computations, i.e. from pos to new_pos or from end to new_po. if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -400,7 +400,7 @@ cdef class ClassificationCriterion(Criterion): self.reverse_reset() for p in range(end - 1, new_pos - 1, -1): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -427,7 +427,7 @@ cdef class ClassificationCriterion(Criterion): pass cdef void node_value(self, double* dest) nogil: - """Compute the node value of samples[start:end] and save it into dest. + """Compute the node value of sample_indices[start:end] and save it into dest. Parameters ---------- @@ -461,7 +461,7 @@ cdef class Entropy(ClassificationCriterion): """Evaluate the impurity of the current node. Evaluate the cross-entropy criterion as impurity of the current node, - i.e. the impurity of samples[start:end]. The smaller the impurity the + i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. """ cdef double entropy = 0.0 @@ -482,8 +482,8 @@ cdef class Entropy(ClassificationCriterion): double* impurity_right) nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (samples[start:pos]) and the - impurity the right child (samples[pos:end]). + i.e. the impurity of the left child (sample_indices[start:pos]) and the + impurity the right child (sample_indices[pos:end]). Parameters ---------- @@ -535,7 +535,7 @@ cdef class Gini(ClassificationCriterion): """Evaluate the impurity of the current node. Evaluate the Gini criterion as impurity of the current node, - i.e. the impurity of samples[start:end]. The smaller the impurity the + i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. """ cdef double gini = 0.0 @@ -560,8 +560,8 @@ cdef class Gini(ClassificationCriterion): double* impurity_right) nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (samples[start:pos]) and the - impurity the right child (samples[pos:end]) using the Gini index. + i.e. the impurity of the left child (sample_indices[start:pos]) and the + impurity the right child (sample_indices[pos:end]) using the Gini index. Parameters ---------- @@ -648,19 +648,19 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] samples, + const SIZE_t[:] sample_indices, SIZE_t start, SIZE_t end, ) nogil except -1: """Initialize the criterion. - This initializes the criterion at node samples[start:end] and children - samples[start:start] and samples[start:end]. + This initializes the criterion at node sample_indices[start:end] and children + sample_indices[start:start] and sample_indices[start:end]. """ # Initialize fields self.y = y self.sample_weight = sample_weight - self.samples = samples + self.sample_indices = sample_indices self.start = start self.end = end self.n_node_samples = end - start @@ -677,7 +677,7 @@ cdef class RegressionCriterion(Criterion): memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -717,9 +717,9 @@ cdef class RegressionCriterion(Criterion): return 0 cdef int update(self, SIZE_t new_pos) nogil except -1: - """Updated statistics by moving samples[pos:new_pos] to the left.""" + """Updated statistics by moving sample_indices[pos:new_pos] to the left.""" cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end @@ -737,7 +737,7 @@ cdef class RegressionCriterion(Criterion): # of computations, i.e. from pos to new_pos or from end to new_pos. if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -750,7 +750,7 @@ cdef class RegressionCriterion(Criterion): self.reverse_reset() for p in range(end - 1, new_pos - 1, -1): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -776,7 +776,7 @@ cdef class RegressionCriterion(Criterion): pass cdef void node_value(self, double* dest) nogil: - """Compute the node value of samples[start:end] into dest.""" + """Compute the node value of sample_indices[start:end] into dest.""" cdef SIZE_t k for k in range(self.n_outputs): @@ -793,7 +793,7 @@ cdef class MSE(RegressionCriterion): """Evaluate the impurity of the current node. Evaluate the MSE criterion as impurity of the current node, - i.e. the impurity of samples[start:end]. The smaller the impurity the + i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. """ cdef double impurity @@ -840,11 +840,11 @@ cdef class MSE(RegressionCriterion): double* impurity_right) nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (samples[start:pos]) and the - impurity the right child (samples[pos:end]). + i.e. the impurity of the left child (sample_indices[start:pos]) and the + impurity the right child (sample_indices[pos:end]). """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start @@ -859,7 +859,7 @@ cdef class MSE(RegressionCriterion): cdef DOUBLE_t w = 1.0 for p in range(start, pos): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -928,14 +928,14 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] samples, + const SIZE_t[:] sample_indices, SIZE_t start, SIZE_t end, ) nogil except -1: """Initialize the criterion. - This initializes the criterion at node samples[start:end] and children - samples[start:start] and samples[start:end]. + This initializes the criterion at node sample_indices[start:end] and children + sample_indices[start:start] and sample_indices[start:end]. """ cdef SIZE_t i, p, k cdef DOUBLE_t w = 1.0 @@ -943,7 +943,7 @@ cdef class MAE(RegressionCriterion): # Initialize fields self.y = y self.sample_weight = sample_weight - self.samples = samples + self.sample_indices = sample_indices self.start = start self.end = end self.n_node_samples = end - start @@ -961,7 +961,7 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -1041,13 +1041,13 @@ cdef class MAE(RegressionCriterion): return 0 cdef int update(self, SIZE_t new_pos) nogil except -1: - """Updated statistics by moving samples[pos:new_pos] to the left. + """Updated statistics by moving sample_indices[pos:new_pos] to the left. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef void** left_child = self.left_child.data cdef void** right_child = self.right_child.data @@ -1064,7 +1064,7 @@ cdef class MAE(RegressionCriterion): # computations, i.e. from pos to new_pos or from end to new_pos. if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -1080,7 +1080,7 @@ cdef class MAE(RegressionCriterion): self.reverse_reset() for p in range(end - 1, new_pos - 1, -1): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -1098,7 +1098,7 @@ cdef class MAE(RegressionCriterion): return 0 cdef void node_value(self, double* dest) nogil: - """Computes the node value of samples[start:end] into dest.""" + """Computes the node value of sample_indices[start:end] into dest.""" cdef SIZE_t k for k in range(self.n_outputs): dest[k] = self.node_medians[k] @@ -1107,18 +1107,18 @@ cdef class MAE(RegressionCriterion): """Evaluate the impurity of the current node. Evaluate the MAE criterion as impurity of the current node, - i.e. the impurity of samples[start:end]. The smaller the impurity the + i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef SIZE_t i, p, k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t impurity = 0.0 for k in range(self.n_outputs): for p in range(self.start, self.end): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -1131,11 +1131,11 @@ cdef class MAE(RegressionCriterion): double* p_impurity_right) nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (samples[start:pos]) and the - impurity the right child (samples[pos:end]). + i.e. the impurity of the left child (sample_indices[start:pos]) and the + impurity the right child (sample_indices[pos:end]). """ cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos @@ -1153,7 +1153,7 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): median = ( left_child[k]).get_median() for p in range(start, pos): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -1165,7 +1165,7 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): median = ( right_child[k]).get_median() for p in range(pos, end): - i = samples[p] + i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] @@ -1255,7 +1255,7 @@ cdef class Poisson(RegressionCriterion): """Evaluate the impurity of the current node. Evaluate the Poisson criterion as impurity of the current node, - i.e. the impurity of samples[start:end]. The smaller the impurity the + i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. """ return self.poisson_loss(self.start, self.end, self.sum_total, @@ -1311,8 +1311,8 @@ cdef class Poisson(RegressionCriterion): double* impurity_right) nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (samples[start:pos]) and the - impurity of the right child (samples[pos:end]) for Poisson. + i.e. the impurity of the left child (sample_indices[start:pos]) and the + impurity of the right child (sample_indices[pos:end]) for Poisson. """ cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos @@ -1333,15 +1333,15 @@ cdef class Poisson(RegressionCriterion): """ cdef const DOUBLE_t[:, ::1] y = self.y cdef const DOUBLE_t[:] sample_weight = self.sample_weight - cdef const SIZE_t[:] samples = self.samples + cdef const SIZE_t[:] sample_indices = self.sample_indices cdef DOUBLE_t y_mean = 0. cdef DOUBLE_t poisson_loss = 0. cdef DOUBLE_t w = 1.0 - cdef SIZE_t i, k, p + cdef SIZE_t samples_idx, output_idx, index_pointer cdef SIZE_t n_outputs = self.n_outputs - for k in range(n_outputs): + for output_idx in range(n_outputs): if y_sum[k] <= EPSILON: # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential @@ -1350,13 +1350,16 @@ cdef class Poisson(RegressionCriterion): # y_sum <= EPSILON. return INFINITY - y_mean = y_sum[k] / weight_sum + y_mean = y_sum[output_idx] / weight_sum - for p in range(start, end): - i = samples[p] + for index_pointer in range(start, end): + samples_idx = sample_indices[index_pointer] if sample_weight is not None: - w = sample_weight[i] + w = sample_weight[samples_idx] - poisson_loss += w * xlogy(y[i, k], y[i, k] / y_mean) + poisson_loss += w * xlogy( + y[samples_idx, output_idx], + y[samples_idx, output_idx] / y_mean + ) return poisson_loss / (weight_sum * n_outputs) From 7722824616672ff687353414cca11ad15b107e8d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 23 Nov 2022 15:15:34 -0500 Subject: [PATCH 4/8] Fix indentation --- sklearn/tree/_splitter.pyx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 355a5da8427be..7b87ef170010e 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -179,12 +179,14 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init(self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end) + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples, + start, + end + ) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 From b47f79a907b11cf857f458c37be6ef56a496a9a8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 23 Nov 2022 15:17:46 -0500 Subject: [PATCH 5/8] All fixed --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index b63cfd8ef7db8..0a133a41e06e9 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1342,7 +1342,7 @@ cdef class Poisson(RegressionCriterion): cdef SIZE_t n_outputs = self.n_outputs for output_idx in range(n_outputs): - if y_sum[k] <= EPSILON: + if y_sum[output_idx] <= EPSILON: # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential # floating point rounding error. From d223ff7d36d4ff5f7e2e47d23b2d8c87f8a56802 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Wed, 30 Nov 2022 22:29:53 -0500 Subject: [PATCH 6/8] revert variable naming as suggested by @jjerphan --- sklearn/tree/_criterion.pyx | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0a133a41e06e9..a0b5d54cb866d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1338,11 +1338,10 @@ cdef class Poisson(RegressionCriterion): cdef DOUBLE_t y_mean = 0. cdef DOUBLE_t poisson_loss = 0. cdef DOUBLE_t w = 1.0 - cdef SIZE_t samples_idx, output_idx, index_pointer cdef SIZE_t n_outputs = self.n_outputs - for output_idx in range(n_outputs): - if y_sum[output_idx] <= EPSILON: + for k in range(n_outputs): + if y_sum[k] <= EPSILON: # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential # floating point rounding error. @@ -1352,14 +1351,11 @@ cdef class Poisson(RegressionCriterion): y_mean = y_sum[output_idx] / weight_sum - for index_pointer in range(start, end): - samples_idx = sample_indices[index_pointer] + for p in range(start, end): + i = sample_indices[p] if sample_weight is not None: - w = sample_weight[samples_idx] + w = sample_weight[i] - poisson_loss += w * xlogy( - y[samples_idx, output_idx], - y[samples_idx, output_idx] / y_mean - ) + poisson_loss += w * xlogy(y[i, k], y[i, k] / y_mean) return poisson_loss / (weight_sum * n_outputs) From 5a13731f42c9e65a750f88a5d656fbfd4c2f19fd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 30 Nov 2022 22:56:53 -0500 Subject: [PATCH 7/8] Update sklearn/tree/_criterion.pyx --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index a0b5d54cb866d..55f63086cdd9f 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1349,7 +1349,7 @@ cdef class Poisson(RegressionCriterion): # y_sum <= EPSILON. return INFINITY - y_mean = y_sum[output_idx] / weight_sum + y_mean = y_sum[k] / weight_sum for p in range(start, end): i = sample_indices[p] From 02d56c2f1d275b00009dfa7f839838fdbd77c223 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 1 Dec 2022 00:13:28 -0500 Subject: [PATCH 8/8] Add definitions --- sklearn/tree/_criterion.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 55f63086cdd9f..e9f32b6e06ef9 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1338,6 +1338,7 @@ cdef class Poisson(RegressionCriterion): cdef DOUBLE_t y_mean = 0. cdef DOUBLE_t poisson_loss = 0. cdef DOUBLE_t w = 1.0 + cdef SIZE_t i, k, p cdef SIZE_t n_outputs = self.n_outputs for k in range(n_outputs):