finallyyy did ittttttttt · scikit-learn/scikit-learn@29ef725 · GitHub

Commit 29ef725

finallyyy did ittttttttt
1 parent 0b22005 commit 29ef725

3 files changed: +31 -116 lines changed


sklearn/tree/_criterion.pyx

Lines changed: 25 additions & 111 deletions
@@ -81,9 +81,6 @@ cdef class Criterion:
         n_missing: SIZE_t
             The number of missing-valued samples in this node. Alternatively,
             this is the size of the missing_samples array.
-        missing_samples: array-like, dtype=DOUBLE_t
-            Indices of the samples in X and y, where samples[start:end]
-            correspond to the non-missing valued samples alone.
         """

         pass
@@ -328,8 +325,8 @@ cdef class ClassificationCriterion(Criterion):
     cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
                    DOUBLE_t* sample_weight, double weighted_n_samples,
                    SIZE_t* samples, SIZE_t start, SIZE_t end,
-                   SIZE_t* missing_samples, SIZE_t n_missing,
-                   SIZE_t missing_direction) nogil:
+                   SIZE_t* missing_samples, SIZE_t n_missing,
+                   SIZE_t missing_direction=MISSING_DIR_LEFT) nogil:
         """Initialize the criterion at node samples[start:end] and
         children samples[start:start] and samples[start:end].

@@ -428,21 +425,19 @@ cdef class ClassificationCriterion(Criterion):

         self.pos = self.start

-        self.weighted_n_left = 0
         # SELFNOTE should we put the missing_samples too on the right to simplify calculations (missing_direction=1)??
-        self.weighted_n_right = self.weighted_n_node_samples
-
         # SELFNOTE The case of wighted_n_missing being 0, will be taken care
         # SELFNOTE at the time of building missing_samples array
-        if self.n_missing != 0:
-            # Move missing samples left
-            if self.missing_direction == 0:
-                self.weighted_n_left = self.weighted_n_node_missing
-                self.weighted_n_right = self.weighted_n_node_nonmissing
-            # Move missing samples right
-            elif self.missing_direction == 1:
-                self.weighted_n_right = self.weighted_n_node_missing
-                self.weighted_n_left = self.weighted_n_node_nonmissing
+        # SELFNOTE should we put the missing_samples too on the left to simplify calculations (missing_direction=0)??
+        if self.n_missing <= 0:
+            self.weighted_n_left = self.weighted_n_node_samples
+            self.weighted_n_right = 0.0
+        elif self.missing_direction == MISSING_DIR_LEFT:
+            self.weighted_n_left = self.weighted_n_node_missing
+            self.weighted_n_right = self.weighted_n_node_nonmissing
+        else:
+            self.weighted_n_left = self.weighted_n_node_nonmissing
+            self.weighted_n_right = self.weighted_n_node_missing

         cdef double* sum_total = self.sum_total
         cdef double* sum_left = self.sum_left
@@ -464,13 +459,16 @@ cdef class ClassificationCriterion(Criterion):
         """Reset the criterion at pos=end."""
         self.pos = self.end

-        # SELFNOTE should we put the missing_samples too on the left to simplify calculations (missing_direction=0)??
-        self.weighted_n_left = self.weighted_n_node_samples
-        self.weighted_n_right = 0.0
-
-        if self.n_missing > 0:
-            # SELFNOTE we are leaving missing samples untouched
-            self.missing_direction = 0
+        # SELFNOTE should we put the missing_samples too on the right to simplify calculations (missing_direction=0)??
+        if self.n_missing <= 0:
+            self.weighted_n_left = 0.0
+            self.weighted_n_right = self.weighted_n_node_samples
+        elif self.missing_direction == MISSING_DIR_LEFT:
+            self.weighted_n_left = self.weighted_n_node_missing
+            self.weighted_n_right = self.weighted_n_node_nonmissing
+        else:
+            self.weighted_n_left = self.weighted_n_node_nonmissing
+            self.weighted_n_right = self.weighted_n_node_missing

         cdef double* sum_total = self.sum_total
         cdef double* sum_left = self.sum_left
@@ -487,87 +485,6 @@ cdef class ClassificationCriterion(Criterion):
             sum_left += self.sum_stride
             sum_right += self.sum_stride

-    cdef void update(self, SIZE_t new_pos) nogil:
-        """Updated statistics by moving samples[pos:new_pos] to the left child.
-
-        Parameters
-        ----------
-        new_pos: SIZE_t
-            The new ending position for which to move samples from the right
-            child to the left child.
-        """
-        cdef DOUBLE_t* y = self.y
-        cdef SIZE_t pos = self.pos
-        cdef SIZE_t end = self.end
-
-        cdef double* sum_left = self.sum_left
-        cdef double* sum_right = self.sum_right
-        cdef double* sum_total = self.sum_total
-
-        cdef SIZE_t* n_classes = self.n_classes
-        cdef SIZE_t* samples = self.samples
-        cdef DOUBLE_t* sample_weight = self.sample_weight
-
-        cdef SIZE_t i
-        cdef SIZE_t p
-        cdef SIZE_t k
-        cdef SIZE_t c
-        cdef SIZE_t label_index
-        cdef DOUBLE_t w = 1.0
-
-        # Update statistics up to new_pos
-        #
-        # Given that
-        #   sum_left[x] + sum_right[x] = sum_total[x]
-        # and that sum_total is known, we are going to update
-        # sum_left from the direction that require the least amount
-        # of computations, i.e. from pos to new_pos or from end to new_pos.
-
-        if (new_pos - pos) <= (end - new_pos):
-            for p in range(pos, new_pos):
-                i = samples[p]
-
-                if sample_weight != NULL:
-                    w = sample_weight[i]
-
-                for k in range(self.n_outputs):
-                    label_index = (k * self.sum_stride +
-                                   <SIZE_t> y[i * self.y_stride + k])
-                    sum_left[label_index] += w
-
-                self.weighted_n_left += w
-
-        else:
-            self.reverse_reset()
-
-            for p in range(end - 1, new_pos - 1, -1):
-                i = samples[p]
-
-                if sample_weight != NULL:
-                    w = sample_weight[i]
-
-                for k in range(self.n_outputs):
-                    label_index = (k * self.sum_stride +
-                                   <SIZE_t> y[i * self.y_stride + k])
-                    sum_left[label_index] -= w
-
-                self.weighted_n_left -= w
-
-        # Update right part statistics
-        self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left
-        for k in range(self.n_outputs):
-            for c in range(n_classes[k]):
-                sum_right[c] = sum_total[c] - sum_left[c]
-
-            sum_right += self.sum_stride
-            sum_left += self.sum_stride
-            sum_total += self.sum_stride
-
-        self.pos = new_pos
-
-    # SELFNOTE overloading works in cython
-    # SELFNOTE we need a separate function to avoid adding overheads when n_missing=0
-    # SELFNOTE dont set a default value for missing_direction, it will conflict with the function above
     cdef void update(self, SIZE_t new_pos, SIZE_t missing_direction) nogil:
         """Updated statistics by moving samples[pos:new_pos] to the left child.

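The removed single-argument update() leans on the identity sum_left[x] + sum_right[x] = sum_total[x]: only sum_left is maintained incrementally, walking in from whichever end needs fewer samples, and sum_right is derived by subtraction. A minimal single-output NumPy sketch of that idea (array shapes, names and the helper itself are illustrative, not the library's API):

import numpy as np

def move_to_left(y, w, sum_total, sum_left, pos, new_pos, end):
    """Advance the split point from pos to new_pos for a single output.

    y[p] is the class of the p-th node sample, w[p] its weight; sum_* are
    per-class weighted counts.  Only sum_left is updated incrementally.
    """
    if (new_pos - pos) <= (end - new_pos):
        # Fewer samples in [pos, new_pos): add them to the left child.
        for p in range(pos, new_pos):
            sum_left[y[p]] += w[p]
    else:
        # Fewer samples in [new_pos, end): start from "everything left"
        # (the reverse_reset state) and subtract what stays on the right.
        sum_left[:] = sum_total
        for p in range(end - 1, new_pos - 1, -1):
            sum_left[y[p]] -= w[p]
    sum_right = sum_total - sum_left   # the identity does the rest
    return sum_left, sum_right

y = np.array([0, 1, 1, 0, 1]); w = np.ones(5)
sum_total = np.array([2.0, 3.0]); sum_left = np.zeros(2)
print(move_to_left(y, w, sum_total, sum_left, 0, 4, 5))   # cheaper from the right end: ([2., 2.], [0., 1.])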
@@ -584,11 +501,6 @@ cdef class ClassificationCriterion(Criterion):
            1 - To send the missing values right
            2 - To ignore the missing values
         """
-        # SELFNOTE Splitter will filter this case of n_missing = 0
-        # SELFNOTE remove
-        # if self.n_missing <= 0:
-        #     return self.update(new_pos)
-
         cdef DOUBLE_t* y = self.y
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
@@ -600,7 +512,6 @@ cdef class ClassificationCriterion(Criterion):

         cdef SIZE_t* n_classes = self.n_classes
         cdef SIZE_t* samples = self.samples
-        cdef SIZE_t* missing_samples = self.samples
         cdef DOUBLE_t* sample_weight = self.sample_weight

         cdef SIZE_t i
@@ -648,6 +559,9 @@ cdef class ClassificationCriterion(Criterion):

                 self.weighted_n_left -= w

+        if n_missing > 0 and self.missing_direction == MISSING_DIR_LEFT:
+            self.weighted_n_left += self.weighted_n_node_missing
+
         # Update right part statistics
         self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left
         for k in range(self.n_outputs):
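With the missing-valued block pinned to one side, the sweep over non-missing samples only needs a constant-time correction at the end, as in the hunk above. A hedged sketch of that final bookkeeping step (the names mirror the diff, but the standalone function is hypothetical, not the commit's code):

def finish_update(weighted_n_left_nonmissing, weighted_n_node_samples,
                  weighted_n_node_missing, n_missing, missing_direction,
                  MISSING_DIR_LEFT=0):
    """Fold the whole missing-valued block onto the left side when that is its
    assigned direction, then derive the right-hand weight from the node total."""
    weighted_n_left = weighted_n_left_nonmissing
    if n_missing > 0 and missing_direction == MISSING_DIR_LEFT:
        weighted_n_left += weighted_n_node_missing
    weighted_n_right = weighted_n_node_samples - weighted_n_left
    return weighted_n_left, weighted_n_right

# 6.0 weight of non-missing samples already moved left, node total 13.0,
# of which 3.0 is missing-valued and routed left:
print(finish_update(6.0, 13.0, 3.0, n_missing=2, missing_direction=0))  # (9.0, 4.0)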

sklearn/tree/_splitter.pyx

Lines changed: 5 additions & 4 deletions
@@ -180,8 +180,7 @@ cdef class Splitter:
         # typically is very less compared to normal size
         for i in range(n_features):
             n_missing[i] = np.count_nonzero(missing_mask[:, i])
-        max_missing = int_max(
-            max_missing)
+        max_missing = int_max(max_missing)

         # Create a new array which will be used to store nonzero weighted
         # missing-valued samples from the feature of interest
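The per-feature counting that this hunk tidies up can be pictured with a plain NumPy toy. The mask values below are invented; only np.count_nonzero comes from the hunk, and the running-maximum line stands in for the int_max(...) bookkeeping as an assumption about the surrounding code.

import numpy as np

# Toy mask: 5 samples x 3 features, True marks a missing value.
missing_mask = np.array([[True,  False, False],
                         [False, False, True ],
                         [True,  False, False],
                         [False, False, True ],
                         [False, False, True ]])

n_features = missing_mask.shape[1]
n_missing = np.array([np.count_nonzero(missing_mask[:, i]) for i in range(n_features)])
max_missing = int(n_missing.max())   # stands in for the int_max(...) running maximum

print(n_missing)    # [2 0 3]
print(max_missing)  # 3 -> upper bound on the missing_samples buffer any single feature needs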
@@ -259,6 +258,7 @@ cdef class Splitter:
         self.start = start
         self.end = end
         self.n_missing = n_missing
+        self.missing_direction = MISSING_DIR_RIGHT

         self.criterion.init(self.y,
                             self.y_stride,
@@ -268,7 +268,8 @@ cdef class Splitter:
                             start,
                             end,
                             self.missing_samples,
-                            n_missing)
+                            n_missing,
+                            self.missing_direction)

         weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples

@@ -583,7 +584,7 @@ cdef class BestSplitter(BaseDenseSplitter):

                 if p < end:
                     # Send missing-valued samples both ways and get the best
-                    for missing_direction in range(1)
+                    for missing_direction in range(1):
                         # Reject if min_samples_leaf is not guaranteed
                         if (((p - start + n_missing * (1 - missing_direction)) < min_samples_leaf) or
                                 ((end - p + n_missing * missing_direction) < min_samples_leaf)):
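To see what the guard above does with the missing count, here is a small worked Python check. The wrapper function is illustrative only; the expressions inside mirror the condition in the hunk, with missing_direction 0 taken to mean left and 1 to mean right.

def violates_min_samples_leaf(p, start, end, n_missing, missing_direction, min_samples_leaf):
    # Non-missing samples split at position p; the n_missing block counts toward one leaf.
    n_left = (p - start) + n_missing * (1 - missing_direction)
    n_right = (end - p) + n_missing * missing_direction
    return n_left < min_samples_leaf or n_right < min_samples_leaf

# 4 non-missing samples go left, 5 go right, plus 3 missing-valued samples:
print(violates_min_samples_leaf(4, 0, 9, 3, missing_direction=1, min_samples_leaf=5))  # True  (left leaf has only 4)
print(violates_min_samples_leaf(4, 0, 9, 3, missing_direction=0, min_samples_leaf=5))  # False (7 left, 5 right)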

sklearn/tree/_tree.pyx

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

         cdef bint include_missing = <bint> self.allow_missing
         cdef bint include_missing_r = MISSING_DIR_UNDEFINED
-        cdef bint include_missing_l = MISSING_DIR_UNDEFINED
+        cdef bint include_missing_l = MISSING_DIR_UNDEFINED

         cdef SplitRecord split
         cdef SIZE_t node_id

0 commit comments