From 03e1d77d22e398d3e8485ef9ef66fc1cc133cdd9 Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Tue, 22 Nov 2022 21:28:33 -0500
Subject: [PATCH 1/8] Initial commit

---
 sklearn/tree/_criterion.pxd |  4 ++--
 sklearn/tree/_criterion.pyx | 36 ++++++++++++++++++------------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd
index ee09c307aac79..a6fde757c5412 100644
--- a/sklearn/tree/_criterion.pxd
+++ b/sklearn/tree/_criterion.pxd
@@ -24,7 +24,7 @@ cdef class Criterion:
     cdef const DOUBLE_t[:, ::1] y        # Values of y
     cdef const DOUBLE_t[:] sample_weight # Sample weights
 
-    cdef SIZE_t* samples                 # Sample indices in X, y
+    cdef const SIZE_t[:] samples         # Sample indices in X, y
     cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
     cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
     cdef SIZE_t end
@@ -46,7 +46,7 @@ cdef class Criterion:
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        SIZE_t* samples,
+        const SIZE_t[:] samples,
         SIZE_t start,
         SIZE_t end
     ) nogil except -1
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 4527789cfcc0c..b1d2d4f04396a 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -46,7 +46,7 @@ cdef class Criterion:
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        SIZE_t* samples,
+        const SIZE_t[:] samples,
         SIZE_t start,
         SIZE_t end,
     ) nogil except -1:
@@ -64,9 +64,9 @@ cdef class Criterion:
             The weight of each sample stored as a Cython memoryview.
         weighted_n_samples : double
             The total weight of the samples being considered
-        samples : array-like, dtype=SIZE_t
+        samples : ndarray, dtype=SIZE_t
             Indices of the samples in X and y, where samples[start:end]
-            correspond to the samples in this node
+            correspond to the samples in this node stored as a Cython memoryview.
         start : SIZE_t
             The first sample to be used on this node
         end : SIZE_t
@@ -214,7 +214,6 @@ cdef class ClassificationCriterion(Criterion):
         n_classes : numpy.ndarray, dtype=SIZE_t
             The number of unique classes in each target
         """
-        self.samples = NULL
         self.start = 0
         self.pos = 0
         self.end = 0
@@ -255,7 +254,7 @@ cdef class ClassificationCriterion(Criterion):
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        SIZE_t* samples,
+        const SIZE_t[:] samples,
         SIZE_t start,
         SIZE_t end
     ) nogil except -1:
@@ -275,8 +274,9 @@ cdef class ClassificationCriterion(Criterion):
             The weight of each sample stored as a Cython memoryview.
         weighted_n_samples : double
             The total weight of all samples
-        samples : array-like, dtype=SIZE_t
-            A mask on the samples, showing which ones we want to use
+        samples : ndarray, dtype=SIZE_t
+            A mask on the samples, showing which ones we want to use stored
+            as a Cython memoryview.
         start : SIZE_t
             The first sample to use in the mask
         end : SIZE_t
@@ -368,7 +368,7 @@ cdef class ClassificationCriterion(Criterion):
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
 
-        cdef SIZE_t* samples = self.samples
+        cdef const SIZE_t[:] samples = self.samples
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
 
         cdef SIZE_t i
@@ -623,7 +623,6 @@ cdef class RegressionCriterion(Criterion):
             The total number of samples to fit on
         """
         # Default values
-        self.samples = NULL
         self.start = 0
         self.pos = 0
         self.end = 0
@@ -649,7 +648,7 @@ cdef class RegressionCriterion(Criterion):
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        SIZE_t* samples,
+        const SIZE_t[:] samples,
         SIZE_t start,
         SIZE_t end,
     ) nogil except -1:
@@ -720,7 +719,7 @@ cdef class RegressionCriterion(Criterion):
     cdef int update(self, SIZE_t new_pos) nogil except -1:
         """Updated statistics by moving samples[pos:new_pos] to the left."""
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef SIZE_t* samples = self.samples
+        cdef const SIZE_t[:] samples = self.samples
 
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
@@ -845,7 +844,7 @@ cdef class MSE(RegressionCriterion):
         impurity the right child (samples[pos:end]).
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef SIZE_t* samples = self.samples
+        cdef const SIZE_t[:] samples = self.samples
         cdef SIZE_t pos = self.pos
         cdef SIZE_t start = self.start
 
@@ -904,7 +903,6 @@ cdef class MAE(RegressionCriterion):
             The total number of samples to fit on
         """
         # Default values
-        self.samples = NULL
         self.start = 0
         self.pos = 0
         self.end = 0
@@ -930,7 +928,7 @@ cdef class MAE(RegressionCriterion):
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        SIZE_t* samples,
+        const SIZE_t[:] samples,
         SIZE_t start,
         SIZE_t end,
     ) nogil except -1:
@@ -1049,7 +1047,7 @@ cdef class MAE(RegressionCriterion):
         or 0 otherwise.
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef SIZE_t* samples = self.samples
+        cdef const SIZE_t[:] samples = self.samples
 
         cdef void** left_child = <void**> self.left_child.data
         cdef void** right_child = <void**> self.right_child.data
@@ -1113,7 +1111,7 @@ cdef class MAE(RegressionCriterion):
         better.
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef SIZE_t* samples = self.samples
+        cdef const SIZE_t[:] samples = self.samples
         cdef SIZE_t i, p, k
         cdef DOUBLE_t w = 1.0
         cdef DOUBLE_t impurity = 0.0
@@ -1137,7 +1135,7 @@ cdef class MAE(RegressionCriterion):
         impurity the right child (samples[pos:end]).
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef SIZE_t* samples = self.samples
+        cdef const SIZE_t[:] samples = self.samples
 
         cdef SIZE_t start = self.start
         cdef SIZE_t pos = self.pos
@@ -1335,10 +1333,12 @@ cdef class Poisson(RegressionCriterion):
         """
         cdef const DOUBLE_t[:, ::1] y = self.y
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
+        cdef const SIZE_t[:] samples = self.samples
 
         cdef DOUBLE_t y_mean = 0.
         cdef DOUBLE_t poisson_loss = 0.
         cdef DOUBLE_t w = 1.0
+        cdef SIZE_t i, k, p
         cdef SIZE_t n_outputs = self.n_outputs
 
         for k in range(n_outputs):
@@ -1353,7 +1353,7 @@ cdef class Poisson(RegressionCriterion):
             y_mean = y_sum[k] / weight_sum
 
             for p in range(start, end):
-                i = self.samples[p]
+                i = samples[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]

From 4a2979b54151ebda51553f0a1d2df2c13756c96f Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Tue, 22 Nov 2022 21:31:39 -0500
Subject: [PATCH 2/8] Fix pip install

---
 sklearn/tree/_splitter.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 47b97e30fb131..355a5da8427be 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -182,7 +182,7 @@ cdef class Splitter:
         self.criterion.init(self.y,
                             self.sample_weight,
                             self.weighted_n_samples,
-                            &self.samples[0],
+                            self.samples,
                             start,
                             end)
 

From 1b4bd2e8e518ff934502a7fd51618a1d4ed06e80 Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Wed, 23 Nov 2022 15:14:04 -0500
Subject: [PATCH 3/8] Address comments about naming

---
 sklearn/tree/_criterion.pxd |   4 +-
 sklearn/tree/_criterion.pyx | 153 ++++++++++++++++++------------------
 2 files changed, 80 insertions(+), 77 deletions(-)

diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd
index a6fde757c5412..3c6217a8c272c 100644
--- a/sklearn/tree/_criterion.pxd
+++ b/sklearn/tree/_criterion.pxd
@@ -24,7 +24,7 @@ cdef class Criterion:
     cdef const DOUBLE_t[:, ::1] y        # Values of y
     cdef const DOUBLE_t[:] sample_weight # Sample weights
 
-    cdef const SIZE_t[:] samples         # Sample indices in X, y
+    cdef const SIZE_t[:] sample_indices  # Sample indices in X, y
     cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
     cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
     cdef SIZE_t end
@@ -46,7 +46,7 @@ cdef class Criterion:
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        const SIZE_t[:] samples,
+        const SIZE_t[:] sample_indices,
         SIZE_t start,
         SIZE_t end
     ) nogil except -1
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index b1d2d4f04396a..b63cfd8ef7db8 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -46,7 +46,7 @@ cdef class Criterion:
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        const SIZE_t[:] samples,
+        const SIZE_t[:] sample_indices,
         SIZE_t start,
         SIZE_t end,
     ) nogil except -1:
@@ -64,9 +64,9 @@ cdef class Criterion:
             The weight of each sample stored as a Cython memoryview.
         weighted_n_samples : double
             The total weight of the samples being considered
-        samples : ndarray, dtype=SIZE_t
-            Indices of the samples in X and y, where samples[start:end]
-            correspond to the samples in this node stored as a Cython memoryview.
+        sample_indices : ndarray, dtype=SIZE_t
+            A mask on the samples. Indices of the samples in X and y we want to use,
+            where sample_indices[start:end] correspond to the samples in this node.
         start : SIZE_t
             The first sample to be used on this node
         end : SIZE_t
@@ -90,16 +90,16 @@ cdef class Criterion:
         pass
 
     cdef int update(self, SIZE_t new_pos) nogil except -1:
-        """Updated statistics by moving samples[pos:new_pos] to the left child.
+        """Updated statistics by moving sample_indices[pos:new_pos] to the left child.
 
-        This updates the collected statistics by moving samples[pos:new_pos]
+        This updates the collected statistics by moving sample_indices[pos:new_pos]
         from the right child to the left child. It must be implemented by
         the subclass.
 
         Parameters
         ----------
         new_pos : SIZE_t
-            New starting index position of the samples in the right child
+            New starting index position of the sample_indices in the right child
         """
         pass
 
@@ -107,7 +107,7 @@ cdef class Criterion:
         """Placeholder for calculating the impurity of the node.
 
         Placeholder for a method which will evaluate the impurity of
-        the current node, i.e. the impurity of samples[start:end]. This is the
+        the current node, i.e. the impurity of sample_indices[start:end]. This is the
         primary function of the criterion class. The smaller the impurity the
         better.
         """
@@ -118,8 +118,8 @@ cdef class Criterion:
         """Placeholder for calculating the impurity of children.
 
         Placeholder for a method which evaluates the impurity in
-        children nodes, i.e. the impurity of samples[start:pos] + the impurity
-        of samples[pos:end].
+        children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity
+        of sample_indices[pos:end].
 
         Parameters
         ----------
@@ -136,7 +136,7 @@ cdef class Criterion:
         """Placeholder for storing the node value.
 
         Placeholder for a method which will compute the node value
-        of samples[start:end] and save the value into dest.
+        of sample_indices[start:end] and save the value into dest.
 
         Parameters
         ----------
@@ -254,14 +254,14 @@ cdef class ClassificationCriterion(Criterion):
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        const SIZE_t[:] samples,
+        const SIZE_t[:] sample_indices,
         SIZE_t start,
         SIZE_t end
     ) nogil except -1:
         """Initialize the criterion.
 
-        This initializes the criterion at node samples[start:end] and children
-        samples[start:start] and samples[start:end].
+        This initializes the criterion at node sample_indices[start:end] and children
+        sample_indices[start:start] and sample_indices[start:end].
 
         Returns -1 in case of failure to allocate memory (and raise MemoryError)
         or 0 otherwise.
@@ -274,9 +274,9 @@ cdef class ClassificationCriterion(Criterion):
             The weight of each sample stored as a Cython memoryview.
         weighted_n_samples : double
             The total weight of all samples
-        samples : ndarray, dtype=SIZE_t
-            A mask on the samples, showing which ones we want to use stored
-            as a Cython memoryview.
+        sample_indices : ndarray, dtype=SIZE_t
+            A mask on the samples. Indices of the samples in X and y we want to use,
+            where sample_indices[start:end] correspond to the samples in this node.
         start : SIZE_t
             The first sample to use in the mask
         end : SIZE_t
@@ -284,7 +284,7 @@ cdef class ClassificationCriterion(Criterion):
         """
         self.y = y
         self.sample_weight = sample_weight
-        self.samples = samples
+        self.sample_indices = sample_indices
         self.start = start
         self.end = end
         self.n_node_samples = end - start
@@ -301,7 +301,7 @@ cdef class ClassificationCriterion(Criterion):
             memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double))
 
         for p in range(start, end):
-            i = samples[p]
+            i = sample_indices[p]
 
             # w is originally set to be 1.0, meaning that if no sample weights
             # are given, the default weight of each sample is 1.0.
@@ -354,7 +354,7 @@ cdef class ClassificationCriterion(Criterion):
         return 0
 
     cdef int update(self, SIZE_t new_pos) nogil except -1:
-        """Updated statistics by moving samples[pos:new_pos] to the left child.
+        """Updated statistics by moving sample_indices[pos:new_pos] to the left child.
 
         Returns -1 in case of failure to allocate memory (and raise MemoryError)
         or 0 otherwise.
@@ -362,13 +362,13 @@ cdef class ClassificationCriterion(Criterion):
         Parameters
         ----------
         new_pos : SIZE_t
-            The new ending position for which to move samples from the right
+            The new ending position for which to move sample_indices from the right
             child to the left child.
         """
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
 
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
 
         cdef SIZE_t i
@@ -386,7 +386,7 @@ cdef class ClassificationCriterion(Criterion):
         # of computations, i.e. from pos to new_pos or from end to new_po.
         if (new_pos - pos) <= (end - new_pos):
             for p in range(pos, new_pos):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -400,7 +400,7 @@ cdef class ClassificationCriterion(Criterion):
             self.reverse_reset()
 
             for p in range(end - 1, new_pos - 1, -1):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -427,7 +427,7 @@ cdef class ClassificationCriterion(Criterion):
         pass
 
     cdef void node_value(self, double* dest) nogil:
-        """Compute the node value of samples[start:end] and save it into dest.
+        """Compute the node value of sample_indices[start:end] and save it into dest.
 
         Parameters
         ----------
@@ -461,7 +461,7 @@ cdef class Entropy(ClassificationCriterion):
         """Evaluate the impurity of the current node.
 
         Evaluate the cross-entropy criterion as impurity of the current node,
-        i.e. the impurity of samples[start:end]. The smaller the impurity the
+        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
         better.
         """
         cdef double entropy = 0.0
@@ -482,8 +482,8 @@ cdef class Entropy(ClassificationCriterion):
                                 double* impurity_right) nogil:
         """Evaluate the impurity in children nodes.
 
-        i.e. the impurity of the left child (samples[start:pos]) and the
-        impurity the right child (samples[pos:end]).
+        i.e. the impurity of the left child (sample_indices[start:pos]) and the
+        impurity the right child (sample_indices[pos:end]).
 
         Parameters
         ----------
@@ -535,7 +535,7 @@ cdef class Gini(ClassificationCriterion):
         """Evaluate the impurity of the current node.
 
         Evaluate the Gini criterion as impurity of the current node,
-        i.e. the impurity of samples[start:end]. The smaller the impurity the
+        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
         better.
         """
         cdef double gini = 0.0
@@ -560,8 +560,8 @@ cdef class Gini(ClassificationCriterion):
                                 double* impurity_right) nogil:
         """Evaluate the impurity in children nodes.
 
-        i.e. the impurity of the left child (samples[start:pos]) and the
-        impurity the right child (samples[pos:end]) using the Gini index.
+        i.e. the impurity of the left child (sample_indices[start:pos]) and the
+        impurity the right child (sample_indices[pos:end]) using the Gini index.
 
         Parameters
         ----------
@@ -648,19 +648,19 @@ cdef class RegressionCriterion(Criterion):
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        const SIZE_t[:] samples,
+        const SIZE_t[:] sample_indices,
         SIZE_t start,
         SIZE_t end,
     ) nogil except -1:
         """Initialize the criterion.
 
-        This initializes the criterion at node samples[start:end] and children
-        samples[start:start] and samples[start:end].
+        This initializes the criterion at node sample_indices[start:end] and children
+        sample_indices[start:start] and sample_indices[start:end].
         """
         # Initialize fields
         self.y = y
         self.sample_weight = sample_weight
-        self.samples = samples
+        self.sample_indices = sample_indices
         self.start = start
         self.end = end
         self.n_node_samples = end - start
@@ -677,7 +677,7 @@ cdef class RegressionCriterion(Criterion):
         memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double))
 
         for p in range(start, end):
-            i = samples[p]
+            i = sample_indices[p]
 
             if sample_weight is not None:
                 w = sample_weight[i]
@@ -717,9 +717,9 @@ cdef class RegressionCriterion(Criterion):
         return 0
 
     cdef int update(self, SIZE_t new_pos) nogil except -1:
-        """Updated statistics by moving samples[pos:new_pos] to the left."""
+        """Updated statistics by moving sample_indices[pos:new_pos] to the left."""
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
 
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
@@ -737,7 +737,7 @@ cdef class RegressionCriterion(Criterion):
         # of computations, i.e. from pos to new_pos or from end to new_pos.
         if (new_pos - pos) <= (end - new_pos):
             for p in range(pos, new_pos):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -750,7 +750,7 @@ cdef class RegressionCriterion(Criterion):
             self.reverse_reset()
 
             for p in range(end - 1, new_pos - 1, -1):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -776,7 +776,7 @@ cdef class RegressionCriterion(Criterion):
         pass
 
     cdef void node_value(self, double* dest) nogil:
-        """Compute the node value of samples[start:end] into dest."""
+        """Compute the node value of sample_indices[start:end] into dest."""
         cdef SIZE_t k
 
         for k in range(self.n_outputs):
@@ -793,7 +793,7 @@ cdef class MSE(RegressionCriterion):
         """Evaluate the impurity of the current node.
 
         Evaluate the MSE criterion as impurity of the current node,
-        i.e. the impurity of samples[start:end]. The smaller the impurity the
+        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
         better.
         """
         cdef double impurity
@@ -840,11 +840,11 @@ cdef class MSE(RegressionCriterion):
                                 double* impurity_right) nogil:
         """Evaluate the impurity in children nodes.
 
-        i.e. the impurity of the left child (samples[start:pos]) and the
-        impurity the right child (samples[pos:end]).
+        i.e. the impurity of the left child (sample_indices[start:pos]) and the
+        impurity the right child (sample_indices[pos:end]).
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
         cdef SIZE_t pos = self.pos
         cdef SIZE_t start = self.start
 
@@ -859,7 +859,7 @@ cdef class MSE(RegressionCriterion):
         cdef DOUBLE_t w = 1.0
 
         for p in range(start, pos):
-            i = samples[p]
+            i = sample_indices[p]
 
             if sample_weight is not None:
                 w = sample_weight[i]
@@ -928,14 +928,14 @@ cdef class MAE(RegressionCriterion):
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
         double weighted_n_samples,
-        const SIZE_t[:] samples,
+        const SIZE_t[:] sample_indices,
         SIZE_t start,
         SIZE_t end,
     ) nogil except -1:
         """Initialize the criterion.
 
-        This initializes the criterion at node samples[start:end] and children
-        samples[start:start] and samples[start:end].
+        This initializes the criterion at node sample_indices[start:end] and children
+        sample_indices[start:start] and sample_indices[start:end].
         """
         cdef SIZE_t i, p, k
         cdef DOUBLE_t w = 1.0
@@ -943,7 +943,7 @@ cdef class MAE(RegressionCriterion):
         # Initialize fields
         self.y = y
         self.sample_weight = sample_weight
-        self.samples = samples
+        self.sample_indices = sample_indices
         self.start = start
         self.end = end
         self.n_node_samples = end - start
@@ -961,7 +961,7 @@ cdef class MAE(RegressionCriterion):
             (<WeightedMedianCalculator> right_child[k]).reset()
 
         for p in range(start, end):
-            i = samples[p]
+            i = sample_indices[p]
 
             if sample_weight is not None:
                 w = sample_weight[i]
@@ -1041,13 +1041,13 @@ cdef class MAE(RegressionCriterion):
         return 0
 
     cdef int update(self, SIZE_t new_pos) nogil except -1:
-        """Updated statistics by moving samples[pos:new_pos] to the left.
+        """Updated statistics by moving sample_indices[pos:new_pos] to the left.
 
         Returns -1 in case of failure to allocate memory (and raise MemoryError)
         or 0 otherwise.
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
 
         cdef void** left_child = <void**> self.left_child.data
         cdef void** right_child = <void**> self.right_child.data
@@ -1064,7 +1064,7 @@ cdef class MAE(RegressionCriterion):
         # computations, i.e. from pos to new_pos or from end to new_pos.
         if (new_pos - pos) <= (end - new_pos):
             for p in range(pos, new_pos):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -1080,7 +1080,7 @@ cdef class MAE(RegressionCriterion):
             self.reverse_reset()
 
             for p in range(end - 1, new_pos - 1, -1):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -1098,7 +1098,7 @@ cdef class MAE(RegressionCriterion):
         return 0
 
     cdef void node_value(self, double* dest) nogil:
-        """Computes the node value of samples[start:end] into dest."""
+        """Computes the node value of sample_indices[start:end] into dest."""
         cdef SIZE_t k
         for k in range(self.n_outputs):
             dest[k] = <double> self.node_medians[k]
@@ -1107,18 +1107,18 @@ cdef class MAE(RegressionCriterion):
         """Evaluate the impurity of the current node.
 
         Evaluate the MAE criterion as impurity of the current node,
-        i.e. the impurity of samples[start:end]. The smaller the impurity the
+        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
         better.
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
         cdef SIZE_t i, p, k
         cdef DOUBLE_t w = 1.0
         cdef DOUBLE_t impurity = 0.0
 
         for k in range(self.n_outputs):
             for p in range(self.start, self.end):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -1131,11 +1131,11 @@ cdef class MAE(RegressionCriterion):
                                 double* p_impurity_right) nogil:
         """Evaluate the impurity in children nodes.
 
-        i.e. the impurity of the left child (samples[start:pos]) and the
-        impurity the right child (samples[pos:end]).
+        i.e. the impurity of the left child (sample_indices[start:pos]) and the
+        impurity the right child (sample_indices[pos:end]).
         """
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
 
         cdef SIZE_t start = self.start
         cdef SIZE_t pos = self.pos
@@ -1153,7 +1153,7 @@ cdef class MAE(RegressionCriterion):
         for k in range(self.n_outputs):
             median = (<WeightedMedianCalculator> left_child[k]).get_median()
             for p in range(start, pos):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -1165,7 +1165,7 @@ cdef class MAE(RegressionCriterion):
         for k in range(self.n_outputs):
             median = (<WeightedMedianCalculator> right_child[k]).get_median()
             for p in range(pos, end):
-                i = samples[p]
+                i = sample_indices[p]
 
                 if sample_weight is not None:
                     w = sample_weight[i]
@@ -1255,7 +1255,7 @@ cdef class Poisson(RegressionCriterion):
         """Evaluate the impurity of the current node.
 
         Evaluate the Poisson criterion as impurity of the current node,
-        i.e. the impurity of samples[start:end]. The smaller the impurity the
+        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
         better.
         """
         return self.poisson_loss(self.start, self.end, self.sum_total,
@@ -1311,8 +1311,8 @@ cdef class Poisson(RegressionCriterion):
                                 double* impurity_right) nogil:
         """Evaluate the impurity in children nodes.
 
-        i.e. the impurity of the left child (samples[start:pos]) and the
-        impurity of the right child (samples[pos:end]) for Poisson.
+        i.e. the impurity of the left child (sample_indices[start:pos]) and the
+        impurity of the right child (sample_indices[pos:end]) for Poisson.
         """
         cdef SIZE_t start = self.start
         cdef SIZE_t pos = self.pos
@@ -1333,15 +1333,15 @@ cdef class Poisson(RegressionCriterion):
         """
         cdef const DOUBLE_t[:, ::1] y = self.y
         cdef const DOUBLE_t[:] sample_weight = self.sample_weight
-        cdef const SIZE_t[:] samples = self.samples
+        cdef const SIZE_t[:] sample_indices = self.sample_indices
 
         cdef DOUBLE_t y_mean = 0.
         cdef DOUBLE_t poisson_loss = 0.
         cdef DOUBLE_t w = 1.0
-        cdef SIZE_t i, k, p
+        cdef SIZE_t samples_idx, output_idx, index_pointer
         cdef SIZE_t n_outputs = self.n_outputs
 
-        for k in range(n_outputs):
+        for output_idx in range(n_outputs):
             if y_sum[k] <= EPSILON:
                 # y_sum could be computed from the subtraction
                 # sum_right = sum_total - sum_left leading to a potential
@@ -1350,13 +1350,16 @@ cdef class Poisson(RegressionCriterion):
                 # y_sum <= EPSILON.
                 return INFINITY
 
-            y_mean = y_sum[k] / weight_sum
+            y_mean = y_sum[output_idx] / weight_sum
 
-            for p in range(start, end):
-                i = samples[p]
+            for index_pointer in range(start, end):
+                samples_idx = sample_indices[index_pointer]
 
                 if sample_weight is not None:
-                    w = sample_weight[i]
+                    w = sample_weight[samples_idx]
 
-                poisson_loss += w * xlogy(y[i, k], y[i, k] / y_mean)
+                poisson_loss += w * xlogy(
+                    y[samples_idx, output_idx],
+                    y[samples_idx, output_idx] / y_mean
+                )
         return poisson_loss / (weight_sum * n_outputs)

From 7722824616672ff687353414cca11ad15b107e8d Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Wed, 23 Nov 2022 15:15:34 -0500
Subject: [PATCH 4/8] Fix indentation

---
 sklearn/tree/_splitter.pyx | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 355a5da8427be..7b87ef170010e 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -179,12 +179,14 @@ cdef class Splitter:
         self.start = start
         self.end = end
 
-        self.criterion.init(self.y,
-                            self.sample_weight,
-                            self.weighted_n_samples,
-                            self.samples,
-                            start,
-                            end)
+        self.criterion.init(
+            self.y,
+            self.sample_weight,
+            self.weighted_n_samples,
+            self.samples,
+            start,
+            end
+        )
 
         weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
         return 0

From b47f79a907b11cf857f458c37be6ef56a496a9a8 Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Wed, 23 Nov 2022 15:17:46 -0500
Subject: [PATCH 5/8] All fixed

---
 sklearn/tree/_criterion.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index b63cfd8ef7db8..0a133a41e06e9 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -1342,7 +1342,7 @@ cdef class Poisson(RegressionCriterion):
         cdef SIZE_t n_outputs = self.n_outputs
 
         for output_idx in range(n_outputs):
-            if y_sum[k] <= EPSILON:
+            if y_sum[output_idx] <= EPSILON:
                 # y_sum could be computed from the subtraction
                 # sum_right = sum_total - sum_left leading to a potential
                 # floating point rounding error.

From d223ff7d36d4ff5f7e2e47d23b2d8c87f8a56802 Mon Sep 17 00:00:00 2001
From: Jong Shin <jshin.m@gmail.com>
Date: Wed, 30 Nov 2022 22:29:53 -0500
Subject: [PATCH 6/8] revert variable naming as suggested by @jjerphan

---
 sklearn/tree/_criterion.pyx | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 0a133a41e06e9..a0b5d54cb866d 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -1338,11 +1338,10 @@ cdef class Poisson(RegressionCriterion):
         cdef DOUBLE_t y_mean = 0.
         cdef DOUBLE_t poisson_loss = 0.
         cdef DOUBLE_t w = 1.0
-        cdef SIZE_t samples_idx, output_idx, index_pointer
         cdef SIZE_t n_outputs = self.n_outputs
 
-        for output_idx in range(n_outputs):
-            if y_sum[output_idx] <= EPSILON:
+        for k in range(n_outputs):
+            if y_sum[k] <= EPSILON:
                 # y_sum could be computed from the subtraction
                 # sum_right = sum_total - sum_left leading to a potential
                 # floating point rounding error.
@@ -1352,14 +1351,11 @@ cdef class Poisson(RegressionCriterion):
 
             y_mean = y_sum[output_idx] / weight_sum
 
-            for index_pointer in range(start, end):
-                samples_idx = sample_indices[index_pointer]
+            for p in range(start, end):
+                i = sample_indices[p]
 
                 if sample_weight is not None:
-                    w = sample_weight[samples_idx]
+                    w = sample_weight[i]
 
-                poisson_loss += w * xlogy(
-                    y[samples_idx, output_idx],
-                    y[samples_idx, output_idx] / y_mean
-                )
+                poisson_loss += w * xlogy(y[i, k], y[i, k] / y_mean)
         return poisson_loss / (weight_sum * n_outputs)

From 5a13731f42c9e65a750f88a5d656fbfd4c2f19fd Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Wed, 30 Nov 2022 22:56:53 -0500
Subject: [PATCH 7/8] Update sklearn/tree/_criterion.pyx

---
 sklearn/tree/_criterion.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index a0b5d54cb866d..55f63086cdd9f 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -1349,7 +1349,7 @@ cdef class Poisson(RegressionCriterion):
                 # y_sum <= EPSILON.
                 return INFINITY
 
-            y_mean = y_sum[output_idx] / weight_sum
+            y_mean = y_sum[k] / weight_sum
 
             for p in range(start, end):
                 i = sample_indices[p]

From 02d56c2f1d275b00009dfa7f839838fdbd77c223 Mon Sep 17 00:00:00 2001
From: Adam Li <adam2392@gmail.com>
Date: Thu, 1 Dec 2022 00:13:28 -0500
Subject: [PATCH 8/8] Add definitions

---
 sklearn/tree/_criterion.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 55f63086cdd9f..e9f32b6e06ef9 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -1338,6 +1338,7 @@ cdef class Poisson(RegressionCriterion):
         cdef DOUBLE_t y_mean = 0.
         cdef DOUBLE_t poisson_loss = 0.
         cdef DOUBLE_t w = 1.0
+        cdef SIZE_t i, k, p
         cdef SIZE_t n_outputs = self.n_outputs
 
         for k in range(n_outputs):