scikit-learn
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
Lines changed: 62 additions & 22 deletions
diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
Lines changed: 30 additions & 12 deletions
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
Lines changed: 42 additions & 6 deletions
@@ -28,6 +28,9 @@ from ._utils cimport log
 from ._utils cimport safe_realloc
 from ._utils cimport sizet_ptr_to_ndarray
 
+# Direction codes (0 = left, 1 = right).
+# NOTE(review): presumably these are the values passed as `direction` to
+# Criterion.move_missing for the missing-valued samples -- confirm at the
+# call sites.
+cdef int LEFT = 0
+cdef int RIGHT = 1
+
 cdef class Criterion:
     """Interface for impurity criteria.
 
@@ -49,8 +52,9 @@ cdef class Criterion:
         pass
 
     cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
-                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
-                   SIZE_t end) nogil:
+                   double weighted_n_samples, SIZE_t* samples,
+                   SIZE_t start, SIZE_t end,
+                   SIZE_t start_missing, SIZE_t end_missing) nogil:
         """Placeholder for a method which will initialize the criterion.
 
         Parameters
@@ -66,11 +70,17 @@ cdef class Criterion:
             The total weight of the samples being considered
         samples: array-like, dtype=SIZE_t
             Indices of the samples in X and y, where samples[start:end]
-            correspond to the samples in this node
+            correspond to the non-missing samples and
+            samples[start_missing:end_missing] correspond to the missing valued
+            samples in this node
         start: SIZE_t
-            The first sample to be used on this node
+            The first non-missing-valued sample to be used on this node
         end: SIZE_t
-            The last sample used on this node
+            The last non-missing-valued sample used on this node
+        start_missing: SIZE_t
+            The first missing-valued sample to be used on this node
+        end_missing: SIZE_t
+            The last missing-valued sample used on this node
 
         """
 
@@ -106,6 +116,23 @@ cdef class Criterion:
 
         pass
 
+    cdef void move_missing(self, bint direction) nogil:
+        """Placeholder for moving the missing-valued samples to one child.
+
+        Implementations update the collected statistics by moving the
+        missing-valued samples (samples[start_missing:end_missing]) to the
+        direction specified. This base implementation does nothing.
+
+        Parameters
+        ----------
+        direction: bint
+            0 (false) to move the missing-valued samples left.
+            1 (true) to move the missing-valued samples right.
+
+        """
+
+        pass
+
     cdef double node_impurity(self) nogil:
         """Placeholder for calculating the impurity of the node.
 
@@ -198,9 +225,9 @@ cdef class Criterion:
         self.children_impurity(&impurity_left, &impurity_right)
 
         return ((self.weighted_n_node_samples / self.weighted_n_samples) *
-                (impurity - (self.weighted_n_right / 
+                (impurity - (self.weighted_n_right /
                              self.weighted_n_node_samples * impurity_right)
-                          - (self.weighted_n_left / 
+                          - (self.weighted_n_left /
                              self.weighted_n_node_samples * impurity_left)))
 
 
@@ -227,11 +254,18 @@ cdef class ClassificationCriterion(Criterion):
         self.sample_weight = NULL
 
         self.samples = NULL
-        self.start = 0
-        self.pos = 0
-        self.end = 0
+        self.start_nonmissing = 0
+        self.pos_nonmissing = 0
+        self.end_nonmissing = 0
+
+        self.start_missing = 0
+        self.end_missing = 0
 
         self.n_outputs = n_outputs
+
+        self.n_missing = 0
+        self.n_nonmissing = 0
+
         self.n_node_samples = 0
         self.weighted_n_node_samples = 0.0
         self.weighted_n_left = 0.0
@@ -263,7 +297,7 @@ cdef class ClassificationCriterion(Criterion):
         self.sum_left = <double*> calloc(n_elements, sizeof(double))
         self.sum_right = <double*> calloc(n_elements, sizeof(double))
 
-        if (self.sum_total == NULL or 
+        if (self.sum_total == NULL or
                 self.sum_left == NULL or
                 self.sum_right == NULL):
             raise MemoryError()
@@ -281,7 +315,8 @@ cdef class ClassificationCriterion(Criterion):
 
     cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
                    DOUBLE_t* sample_weight, double weighted_n_samples,
-                   SIZE_t* samples, SIZE_t start, SIZE_t end) nogil:
+                   SIZE_t* samples,
+                   SIZE_t start, SIZE_t end) nogil:
         """Initialize the criterion at node samples[start:end] and
         children samples[start:start] and samples[start:end].
 
@@ -298,10 +333,14 @@ cdef class ClassificationCriterion(Criterion):
             The total weight of all samples
         samples: array-like, dtype=SIZE_t
             A mask on the samples, showing which ones we want to use
-        start: SIZE_t
-            The first sample to use in the mask
-        end: SIZE_t
-            The last sample to use in the mask
+        start_nonmissing: SIZE_t
+            The first non-missing-valued sample to be used on this node
+        end_nonmissing: SIZE_t
+            The last non-missing-valued sample used on this node
+        start_missing: SIZE_t
+            The first missing-valued sample to be used on this node
+        end_missing: SIZE_t
+            The last missing-valued sample used on this node
         """
 
         self.y = y
@@ -328,7 +367,8 @@ cdef class ClassificationCriterion(Criterion):
             memset(sum_total + offset, 0, n_classes[k] * sizeof(double))
             offset += self.sum_stride
 
-        for p in range(start, end):
+        for p in (range(start_nonmissing, end_nonmissing) +
+                  range(start_missing, end_missing)):
             i = samples[p]
 
             # w is originally set to be 1.0, meaning that if no sample weights
@@ -722,7 +762,7 @@ cdef class RegressionCriterion(Criterion):
         self.sum_left = <double*> calloc(n_outputs, sizeof(double))
         self.sum_right = <double*> calloc(n_outputs, sizeof(double))
 
-        if (self.sum_total == NULL or 
+        if (self.sum_total == NULL or
                 self.sum_left == NULL or
                 self.sum_right == NULL):
             raise MemoryError()
@@ -847,7 +887,7 @@ cdef class RegressionCriterion(Criterion):
 
                 self.weighted_n_left -= w
 
-        self.weighted_n_right = (self.weighted_n_node_samples - 
+        self.weighted_n_right = (self.weighted_n_node_samples -
                                  self.weighted_n_left)
         for k in range(self.n_outputs):
             sum_right[k] = sum_total[k] - sum_left[k]
@@ -957,7 +997,7 @@ cdef class MSE(RegressionCriterion):
 
         for k in range(self.n_outputs):
             impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0
-            impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 
+            impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0
 
         impurity_left[0] /= self.n_outputs
         impurity_right[0] /= self.n_outputs
@@ -1019,6 +1059,6 @@ cdef class FriedmanMSE(MSE):
         diff = (self.weighted_n_right * total_sum_left -
                 self.weighted_n_left * total_sum_right) / self.n_outputs
 
-        return (diff * diff / (self.weighted_n_left * self.weighted_n_right * 
+        return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
                                self.weighted_n_node_samples))
-                               
+
@@ -22,14 +22,15 @@ ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
 
 cdef struct SplitRecord:
     # Data to track sample split
-    SIZE_t feature         # Which feature to split on.
-    SIZE_t pos             # Split samples array at the given position,
-                           # i.e. count of samples below threshold for feature.
-                           # pos is >= end if the node is a leaf.
-    double threshold       # Threshold to split at.
-    double improvement     # Impurity improvement given parent node.
-    double impurity_left   # Impurity of the left split.
-    double impurity_right  # Impurity of the right split.
+    SIZE_t feature            # Which feature to split on.
+    SIZE_t pos                # Split samples array at the given position,
+                              # i.e. count of samples below threshold for feature.
+                              # pos is >= end if the node is a leaf.
+    double threshold          # Threshold to split at.
+    double improvement        # Impurity improvement given parent node.
+    double impurity_left      # Impurity of the left split.
+    double impurity_right     # Impurity of the right split.
+    double send_missing_left  # Whether to send the missing values left/right
 
 cdef class Splitter:
     # The splitter searches in the input space for a feature and a threshold
@@ -47,15 +48,28 @@ cdef class Splitter:
     cdef UINT32_t rand_r_state           # sklearn_rand_r random number state
 
     cdef SIZE_t* samples                 # Sample indices in X, y
+    cdef SIZE_t* missing_samples         # Sample indices with missing values
+
     cdef SIZE_t n_samples                # X.shape[0]
+    # TODO: decide whether an n_missing counter is needed here.
+
     cdef double weighted_n_samples       # Weighted number of samples
+
     cdef SIZE_t* features                # Feature indices in X
     cdef SIZE_t* constant_features       # Constant features indices
     cdef SIZE_t n_features               # X.shape[1]
-    cdef DTYPE_t* feature_values         # temp. array holding feature values
+    cdef DTYPE_t* feature_values         # temp. array holding non-missing
+                                         # feature values
 
     cdef SIZE_t start                    # Start position for the current node
-    cdef SIZE_t end                      # End position for the current node
+                                         # for the non missing values
+    cdef SIZE_t end                      # End pos for the current node
+                                         # for the non missing values
+
+    cdef SIZE_t start_missing            # Start position for the current node
+                                         # for the missing values
+    cdef SIZE_t end_missing              # End pos for the current node
+                                         # for the missing values
 
     cdef bint presort                    # Whether to use presorting, only
                                          # allowed on dense data
@@ -69,6 +83,9 @@ cdef class Splitter:
     # `node_split` reorganizes the node samples `samples[start:end]` in two
     # subsets `samples[start:pos]` and `samples[pos:end]`.
 
+    # The indices of the samples with missing values for a node are grouped
+    # into the `missing_samples` array.
+
     # The 1-d  `features` array of size n_features contains the features
     # indices and allows fast sampling without replacement of features.
 
@@ -83,7 +100,8 @@ cdef class Splitter:
     # Methods
     cdef void init(self, object X, np.ndarray y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=*) except *
+                   np.ndarray X_idx_sorted=*,
+                   object missing_samples) except *
 
     cdef void node_reset(self, SIZE_t start, SIZE_t end,
                          double* weighted_n_node_samples) nogil
@@ -95,4 +113,4 @@ cdef class Splitter:
 
     cdef void node_value(self, double* dest) nogil
 
-    cdef double node_impurity(self) nogil
+    cdef double node_impurity(self) nogil
@@ -35,6 +35,7 @@ from ._utils cimport RAND_R_MAX
 from ._utils cimport safe_realloc
 
 cdef double INFINITY = np.inf
+# Integer min/max helpers.
+# `Splitter.init` calls `int_max` when computing the largest per-feature
+# count of missing values, but only `int_min` was defined; both are provided
+# so that call resolves. `int_min` is kept unchanged for existing users.
+cdef inline int int_min(int a, int b): return a if a <= b else b
+cdef inline int int_max(int a, int b): return a if a >= b else b
 
 # Mitigate precision differences between 32 bit and 64 bit
 cdef DTYPE_t FEATURE_THRESHOLD = 1e-7
@@ -50,6 +51,7 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
     self.feature = 0
     self.threshold = 0.
     self.improvement = -INFINITY
+    self.send_missing_left = 0
 
 cdef class Splitter:
     """Abstract splitter class.
@@ -60,7 +62,7 @@ cdef class Splitter:
 
     def __cinit__(self, Criterion criterion, SIZE_t max_features,
                   SIZE_t min_samples_leaf, double min_weight_leaf,
-                  object random_state, bint presort):
+                  object random_state, bint presort, bint handle_missing):
         """
         Parameters
         ----------
@@ -82,12 +84,29 @@ cdef class Splitter:
 
         random_state: object
             The user inputted random state to be used for pseudo-randomness
+
+        presort : bool, optional (default=False)
+            Whether to presort the data to speed up the finding of best splits
+            in fitting.
+
+        handle_missing : bool, optional (default=False)
+            Whether to handle missing values.
         """
 
         self.criterion = criterion
 
         self.samples = NULL
+        self.n_nonmissing_samples = 0
+
+        self.handle_missing = handle_missing  # SELFNOTE XXX
+
+        self.missing_samples = NULL  # SELFNOTE XXX 
+        self.missing_mask = NULL
+
+        self.n_missing_samples = 0   # Do I need this??
+
         self.n_samples = 0
+
         self.features = NULL
         self.n_features = 0
         self.feature_values = NULL
@@ -106,6 +125,7 @@ cdef class Splitter:
         """Destructor."""
 
         free(self.samples)
+        free(self.missing_samples)
         free(self.features)
         free(self.constant_features)
         free(self.feature_values)
@@ -120,7 +140,8 @@ cdef class Splitter:
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray missing_mask=None) except *:
         """Initialize the splitter.
 
         Take in the input data X, the target Y, and optional sample weights.
@@ -137,12 +158,24 @@ cdef class Splitter:
             The weights of the samples, where higher weighted samples are fit
             closer than lower weight samples. If not provided, all samples
             are assumed to have uniform weight.
-        """
 
+        missing_mask: numpy.ndarray, dtype=bool (optional)
+            The mask to specify the locations of missing values in the dataset.
+        """
         self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
         cdef SIZE_t n_samples = X.shape[0]
 
-        # Create a new array which will be used to store nonzero
+        if self.handle_missing:
+            cdef SIZE_t* max_missing = 0
+
+            for i in range(X.shape[1]):
+                max_missing = int_max(np.count_nonzero(missing_mask[:, i]),
+                                      max_missing)
+
+            cdef SIZE_t* missing_samples = safe_realloc(
+                &self.missing_samples, max_missing)
+
+        # Create a new array which will be used to store nonzero weighted
         # samples from the feature of interest
         cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)
 
@@ -261,7 +294,8 @@ cdef class BaseDenseSplitter(Splitter):
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   object missing_mask) except *:
         """Initialize the splitter."""
 
         # Call parent init
@@ -302,6 +336,8 @@ cdef class BestSplitter(BaseDenseSplitter):
         cdef SIZE_t* samples = self.samples
         cdef SIZE_t start = self.start
         cdef SIZE_t end = self.end
+        cdef SIZE_t start_missing = self.start_missing
+        cdef SIZE_t end_missing = self.end_missing
 
         cdef SIZE_t* features = self.features
         cdef SIZE_t* constant_features = self.constant_features
@@ -404,7 +440,7 @@ cdef class BestSplitter(BaseDenseSplitter):
                     p = start
                     feature_idx_offset = self.X_idx_sorted_stride * current.feature
 
-                    for i in range(self.n_total_samples): 
+                    for i in range(self.n_total_samples):
                         j = X_idx_sorted[i + feature_idx_offset]
                         if sample_mask[j] == 1:
                             samples[p] = j