[MRG] Native support for missing values in GBDTs by NicolasHug · Pull Request #13911 · scikit-learn/scikit-learn

Merged · 103 commits · Aug 21, 2019

Commits
e279161
Added NaN support in mapper
NicolasHug May 20, 2019
91105a6
pep
NicolasHug May 20, 2019
000ab9a
WIP
NicolasHug May 20, 2019
66c2502
some more
NicolasHug May 20, 2019
810b7b0
WIP
NicolasHug May 21, 2019
5fd59cb
WIP
NicolasHug May 21, 2019
670566b
bug fix
NicolasHug May 21, 2019
e338e0a
basic tests
NicolasHug May 21, 2019
d288518
some doc
NicolasHug May 21, 2019
2d1659b
avoid some interactions
NicolasHug May 21, 2019
f2a83a0
Added tag
NicolasHug May 21, 2019
cd1de3c
better test
NicolasHug May 22, 2019
5cd8e59
decent test + fix bug
NicolasHug May 22, 2019
af1558a
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 23, 2019
d6b73ed
add missing_fraction param to benchmark
NicolasHug May 23, 2019
5e06fa7
bin training and validation data separately
NicolasHug May 23, 2019
1a34856
shorter test
NicolasHug May 23, 2019
aae10a2
Map missing values to first bin instead of last
NicolasHug May 23, 2019
35eda6e
pep8
NicolasHug May 23, 2019
1fa9b26
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 23, 2019
1f63282
Added whats new entry
NicolasHug May 23, 2019
e3d34a9
avoid some python interactions
NicolasHug May 23, 2019
542cb25
make predict_binned work
NicolasHug May 23, 2019
bf822b4
fixed bug due to offset in bin_thresholds_ attribute
NicolasHug May 23, 2019
112b400
more sensible binning strat
NicolasHug May 23, 2019
21a3ee3
typo
NicolasHug May 23, 2019
28c15b2
user name
NicolasHug May 23, 2019
5a5f39d
Add small test
NicolasHug May 24, 2019
a4da8d0
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 24, 2019
b07fed9
convert to fortran array in tests
NicolasHug May 24, 2019
b78e96b
some doc
NicolasHug May 24, 2019
a9f878c
Added function test
NicolasHug May 24, 2019
71b64e8
pep8
NicolasHug May 24, 2019
0af212f
Merge branch 'master' of github.com:scikit-learn/scikit-learn into bi…
NicolasHug May 25, 2019
2c2373e
Bin validation data using binmaper of training data
NicolasHug May 26, 2019
0e8edd1
Merge branch 'bin_train_val_separately' into missing_value_gbdt
NicolasHug May 26, 2019
deda348
Allocate first bin for missing entries based on the whole data, not just
NicolasHug May 27, 2019
e8fcc31
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 28, 2019
1a471ce
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 28, 2019
1f78807
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 29, 2019
3fed0ab
Addressed Thomas' comments
NicolasHug May 30, 2019
3e7bb7d
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug May 30, 2019
7ad5bce
Update sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py
NicolasHug May 30, 2019
4a9dc3a
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 3, 2019
489b861
Merge branch 'missing_value_gbdt' of github.com:NicolasHug/scikit-lea…
NicolasHug Jun 3, 2019
e83b39e
Addressed Guillaume's comments
NicolasHug Jun 3, 2019
c80b250
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 15, 2019
d2de00b
always allocate first bin for missing values
NicolasHug Jun 15, 2019
26b66ab
reduce diff
NicolasHug Jun 15, 2019
f370a71
minor more consistent test
NicolasHug Jun 18, 2019
ec57171
typo
NicolasHug Jun 18, 2019
beae859
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 21, 2019
e82a5f4
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 26, 2019
92f3e28
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 27, 2019
2dfaad8
WIP
NicolasHug Jun 27, 2019
457e720
some doc
NicolasHug Jun 28, 2019
af5ef38
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 28, 2019
45c5068
reduce diff
NicolasHug Jun 28, 2019
5a8fbe5
pep8
NicolasHug Jun 28, 2019
bc9c0df
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jun 28, 2019
889835a
minor
NicolasHug Jun 28, 2019
8995db4
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jul 1, 2019
8d5e36e
remove prints
NicolasHug Jul 1, 2019
d28ab14
towards nan only splits
adrinjalali Jul 5, 2019
48fa149
don't check right to left on split_on_nan
adrinjalali Jul 11, 2019
76e18f8
cleaups
adrinjalali Jul 11, 2019
eb0f7e6
format and comment
adrinjalali Jul 11, 2019
e0abc50
Fixed bug + added more tests
NicolasHug Jul 12, 2019
77846a3
refactor tests
NicolasHug Jul 12, 2019
14d444f
put back n_threads to max value
NicolasHug Jul 12, 2019
8fb80fd
minor changes
NicolasHug Jul 12, 2019
0440398
minor cleaning
NicolasHug Jul 12, 2019
0fde968
Support splitting on nans
NicolasHug Jul 12, 2019
5301c5a
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jul 12, 2019
210f90f
Merge branch 'missing_value_gbdt' of github.com:NicolasHug/scikit-lea…
NicolasHug Jul 12, 2019
4b0176a
Add (failing) test that checks equivalence with min max imputation
ogrisel Jul 12, 2019
d38881c
Decrease the likelihood of ties when training the trees
ogrisel Jul 15, 2019
f5e8e45
More robust test
ogrisel Jul 15, 2019
a0963fb
Fix pytest parametrization
ogrisel Jul 15, 2019
d0be6cb
Check bin thresholds in test
ogrisel Jul 15, 2019
9c9d7e5
Try to make the test even easier to see if the Linux 32bit build woul…
ogrisel Jul 15, 2019
191cfc6
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jul 16, 2019
5fc7453
Merge branch 'missing_value_gbdt' of github.com:NicolasHug/scikit-lea…
NicolasHug Jul 16, 2019
75dc126
Don't check last non-missing bin if there's no nan
NicolasHug Jul 17, 2019
3b2075c
Improve min-max imputation test
ogrisel Jul 19, 2019
a66103c
FIX: _find_best_bin_to_split_right_to_left is still required even whe…
ogrisel Jul 19, 2019
0bda5d1
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Jul 19, 2019
49140c2
comments
NicolasHug Jul 19, 2019
e39f48e
remove split_on_nan
NicolasHug Jul 19, 2019
f89c1c5
ooops deleted useless files
NicolasHug Jul 19, 2019
299d3e0
Got rid of individual checks in predictor code
NicolasHug Jul 22, 2019
9540f99
can also remove special case in binning code
NicolasHug Jul 23, 2019
cb3936d
minor typos + more consistent test
NicolasHug Jul 23, 2019
c8f6409
renamed types -> common
NicolasHug Jul 23, 2019
6f0e191
1e300 -> almost inf
NicolasHug Jul 23, 2019
a56db0b
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
NicolasHug Aug 5, 2019
c112335
added user guide section on missing values
NicolasHug Aug 5, 2019
0c5dc90
Merge branch 'master' into missing_value_gbdt
ogrisel Aug 19, 2019
ef5cce2
Merge branch 'master' of github.com:scikit-learn/scikit-learn into mi…
ogrisel Aug 20, 2019
3b0c2ba
Addressed Olivier's comment + updated whatsnew
NicolasHug Aug 20, 2019
7c868ae
addressed comments
NicolasHug Aug 20, 2019
876f538
Fix doctest formatting
ogrisel Aug 21, 2019
601dc22
Fix nan predictive doctest
ogrisel Aug 21, 2019
Files changed
7 changes: 7 additions & 0 deletions benchmarks/bench_hist_gradient_boosting.py
@@ -2,6 +2,7 @@
import argparse

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting # noqa
@@ -25,6 +26,7 @@
parser.add_argument('--learning-rate', type=float, default=.1)
parser.add_argument('--problem', type=str, default='classification',
choices=['classification', 'regression'])
parser.add_argument('--missing-fraction', type=float, default=0)
parser.add_argument('--n-classes', type=int, default=2)
parser.add_argument('--n-samples-max', type=int, default=int(1e6))
parser.add_argument('--n-features', type=int, default=20)
@@ -52,6 +54,11 @@ def get_estimator_and_data():


X, y, Estimator = get_estimator_and_data()
if args.missing_fraction:
mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype(
np.bool)
X[mask] = np.nan

X_train_, X_test_, y_train_, y_test_ = train_test_split(
X, y, test_size=0.5, random_state=0)

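The new --missing-fraction flag can be exercised directly from the command line. A typical invocation (the flag comes from the diff above; the other values are arbitrary):

    python benchmarks/bench_hist_gradient_boosting.py --problem regression --missing-fraction 0.2

With a non-zero fraction, the script replaces that proportion of the entries of X with NaN before the train/test split, so both fitting and prediction exercise the new missing-value code paths.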
45 changes: 42 additions & 3 deletions doc/modules/ensemble.rst
@@ -864,7 +864,7 @@ Usage
Most of the parameters are unchanged from
:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`.
One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and
controls the number of iterations of the boosting process:
controls the number of iterations of the boosting process::

>>> from sklearn.experimental import enable_hist_gradient_boosting
>>> from sklearn.ensemble import HistGradientBoostingClassifier
@@ -873,10 +873,10 @@ controls the number of iterations of the boosting process:
>>> X, y = make_hastie_10_2(random_state=0)
>>> X_train, X_test = X[:2000], X[2000:]
>>> y_train, y_test = y[:2000], y[2000:]
>>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train)

>>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train)
>>> clf.score(X_test, y_test)
0.8998
0.8965

The size of the trees can be controlled through the ``max_leaf_nodes``,
``max_depth``, and ``min_samples_leaf`` parameters.
@@ -895,6 +895,45 @@ using an arbitrary :term:`scorer`, or just the training or validation loss. By
default, early-stopping is performed using the default :term:`scorer` of
the estimator on a validation set.

Missing values support
----------------------

:class:`HistGradientBoostingClassifier` and
:class:`HistGradientBoostingRegressor` have built-in support for missing
values (NaNs).

During training, the tree grower learns at each split point whether samples
with missing values should go to the left or right child, based on the
potential gain. When predicting, samples with missing values are assigned to
the left or right child accordingly::

>>> from sklearn.experimental import enable_hist_gradient_boosting # noqa
>>> from sklearn.ensemble import HistGradientBoostingClassifier
>>> import numpy as np

>>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
>>> y = [0, 0, 1, 1]

>>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
>>> gbdt.predict(X)
array([0, 0, 1, 1])

When the missingness pattern is predictive, the splits can be done on
whether the feature value is missing or not::

>>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1)
>>> y = [0, 1, 0, 0, 1]
>>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1,
... max_depth=2,
... learning_rate=1,
... max_iter=1).fit(X, y)
>>> gbdt.predict(X)
array([0, 1, 0, 0, 1])

If no missing values were encountered for a given feature during training,
then samples with missing values are mapped to whichever child has the most
samples.

Low-level parallelism
---------------------

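To complement the new user-guide section above, here is a small hedged sketch (not part of the PR; the toy data is arbitrary and the exact output depends on the fitted trees) probing the last rule documented there: when a feature contains no missing values during fit, a NaN seen at predict time is sent to the child with the most samples.

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    # No NaN in this feature at fit time.
    X = np.array([0., 1., 2., 3.]).reshape(-1, 1)
    y = [0, 0, 1, 1]
    gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)

    # A NaN at predict time is routed to the child with the most samples,
    # so the prediction is still well defined.
    print(gbdt.predict(np.array([[np.nan]])))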
46 changes: 30 additions & 16 deletions doc/whats_new/v0.22.rst
@@ -23,10 +23,11 @@ random sampling procedures.
- :class:`decomposition.SparseCoder` with `algorithm='lasso_lars'` |Fix|
- :class:`decomposition.SparsePCA` where `normalize_components` has no effect
due to deprecation.

- :class:`linear_model.Ridge` when `X` is sparse. |Fix|

- :class:`cluster.KMeans` when `n_jobs=1`. |Fix|
- :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` |Fix|, |Feature|,
|Enhancement|.

Details are listed in the changelog below.

@@ -112,24 +113,31 @@ Changelog
:mod:`sklearn.ensemble`
.......................

- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` have an additional
parameter called `warm_start` that enables warm starting. :pr:`14012` by
:user:`Johann Faouzi <johannfaouzi>`.

- |Fix| :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` now bin the training and
validation data separately to avoid any data leak. :pr:`13933` by
`Nicolas Hug`_.
- Many improvements were made to
:class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor`:

- |MajorFeature| Estimators now natively support dense data with missing
values both for training and predicting. They also support infinite
values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_
and `Olivier Grisel`_.
- |Feature| Estimators now have an additional `warm_start` parameter that
enables warm starting. :pr:`14012` by :user:`Johann Faouzi <johannfaouzi>`.
- |Enhancement| for :class:`ensemble.HistGradientBoostingClassifier` the
training loss or score is now monitored on a class-wise stratified
subsample to preserve the class balance of the original training set.
:pr:`14194` by :user:`Johann Faouzi <johannfaouzi>`.
- |Feature| :func:`inspection.partial_dependence` and
:func:`inspection.plot_partial_dependence` now support the fast 'recursion'
method for both estimators. :pr:`13769` by `Nicolas Hug`_.
- |Fix| Estimators now bin the training and validation data separately to
avoid any data leak. :pr:`13933` by `Nicolas Hug`_.

Note that pickles from 0.21 will not work in 0.22.

- |Fix| :func:`ensemble.VotingClassifier.predict_proba` will no longer be
present when `voting='hard'`. :pr:`14287` by `Thomas Fan`_.

- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` the training
loss or score is now monitored on a class-wise stratified subsample to
preserve the class balance of the original training set. :pr:`14194`
by :user:`Johann Faouzi <johannfaouzi>`.

- |Fix| Run by default
:func:`utils.estimator_checks.check_estimator` on both
:class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. It
@@ -182,6 +190,12 @@ Changelog
measure the importance of each feature in an arbitrary trained model with
respect to a given scoring function. :issue:`13146` by `Thomas Fan`_.

- |Feature| :func:`inspection.partial_dependence` and
:func:`inspection.plot_partial_dependence` now support the fast 'recursion'
method for :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor`. :pr:`13769` by
`Nicolas Hug`_.

:mod:`sklearn.linear_model`
...........................

19 changes: 12 additions & 7 deletions sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -12,11 +12,14 @@ import numpy as np
cimport numpy as np
from numpy.math cimport INFINITY
from cython.parallel import prange
from libc.math cimport isnan

from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C
from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C

cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds,
X_BINNED_DTYPE_C [::1, :] binned):
def _map_to_bins(const X_DTYPE_C [:, :] data,
list binning_thresholds,
const unsigned char missing_values_bin_idx,
X_BINNED_DTYPE_C [::1, :] binned):
"""Bin numerical values to discrete integer-coded levels.

Parameters
@@ -35,11 +38,13 @@
for feature_idx in range(data.shape[1]):
_map_num_col_to_bins(data[:, feature_idx],
binning_thresholds[feature_idx],
missing_values_bin_idx,
binned[:, feature_idx])


cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
const X_DTYPE_C [:] binning_thresholds,
const unsigned char missing_values_bin_idx,
X_BINNED_DTYPE_C [:] binned):
"""Binary search to find the bin index for each value in the data."""
cdef:
@@ -49,11 +54,11 @@ cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
int middle

for i in prange(data.shape[0], schedule='static', nogil=True):
if data[i] == INFINITY:
# Special case for +inf.
# -inf is handled properly by binary search.
binned[i] = binning_thresholds.shape[0]

if isnan(data[i]):
binned[i] = missing_values_bin_idx
else:
# for known values, use binary search
left, right = 0, binning_thresholds.shape[0]
while left < right:
middle = (right + left - 1) // 2
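For readers less familiar with Cython, here is a minimal NumPy sketch of the kernel above (hypothetical helper name; the real kernel runs in parallel under nogil and writes into a preallocated Fortran-ordered array):

    import numpy as np

    def map_num_col_to_bins_py(data, binning_thresholds, missing_values_bin_idx):
        """Pure-Python sketch of _map_num_col_to_bins (assumed semantics)."""
        binned = np.empty(data.shape[0], dtype=np.uint8)
        for i in range(data.shape[0]):
            if np.isnan(data[i]):
                # Missing values get their own dedicated bin.
                binned[i] = missing_values_bin_idx
            else:
                # Same result as the binary search in the Cython loop:
                # index of the first threshold that is >= data[i].
                binned[i] = np.searchsorted(binning_thresholds, data[i],
                                            side='left')
        return binned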
sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx
@@ -10,8 +10,8 @@ from cython.parallel import prange
import numpy as np
cimport numpy as np

from .types import Y_DTYPE
from .types cimport Y_DTYPE_C
from .common import Y_DTYPE
from .common cimport Y_DTYPE_C


def _update_raw_predictions(
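Only the import is touched in this file (the `types` module was renamed to `common`), but for context, a hedged sketch of what `_update_raw_predictions` does (attribute names are hypothetical):

    def update_raw_predictions_py(raw_predictions, grower):
        # After a tree is grown, add each sample's leaf value to its raw
        # prediction, reusing the sample partition computed while growing.
        for leaf in grower.finalized_leaves:        # hypothetical attribute
            for sample_idx in leaf.sample_indices:  # samples in this leaf
                raw_predictions[sample_idx] += leaf.value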
4 changes: 2 additions & 2 deletions sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
@@ -12,8 +12,8 @@

from libc.math cimport exp

from .types cimport Y_DTYPE_C
from .types cimport G_H_DTYPE_C
from .common cimport Y_DTYPE_C
from .common cimport G_H_DTYPE_C


def _update_gradients_least_squares(
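This file again only sees the `types` → `common` rename. As a reminder of what `_update_gradients_least_squares` computes, a NumPy sketch (assumed semantics):

    import numpy as np

    def update_gradients_least_squares_py(gradients, y_true, raw_predictions):
        # For the least-squares loss l(y, p) = 0.5 * (y - p) ** 2, the
        # gradient w.r.t. the raw prediction p is (p - y); the hessians
        # are constant (1) and need no update.
        gradients[:] = raw_predictions - y_true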
40 changes: 26 additions & 14 deletions sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -7,15 +7,16 @@

cimport cython
from cython.parallel import prange
from libc.math cimport isnan
import numpy as np
cimport numpy as np
from numpy.math cimport INFINITY

from .types cimport X_DTYPE_C
from .types cimport Y_DTYPE_C
from .types import Y_DTYPE
from .types cimport X_BINNED_DTYPE_C
from .types cimport node_struct
from .common cimport X_DTYPE_C
from .common cimport Y_DTYPE_C
from .common import Y_DTYPE
from .common cimport X_BINNED_DTYPE_C
from .common cimport node_struct

def _predict_from_numeric_data(
@@ -43,10 +44,12 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data(
while True:
if node.is_leaf:
return node.value
if numeric_data[row, node.feature_idx] == INFINITY:
# if data is +inf we always go to the right child, even when the
# threhsold is +inf
node = nodes[node.right]

if isnan(numeric_data[row, node.feature_idx]):
if node.missing_go_to_left:
node = nodes[node.left]
else:
node = nodes[node.right]
else:
if numeric_data[row, node.feature_idx] <= node.threshold:
node = nodes[node.left]
@@ -57,19 +60,22 @@
def _predict_from_binned_data(
node_struct [:] nodes,
const X_BINNED_DTYPE_C [:, :] binned_data,
const unsigned char missing_values_bin_idx,
Y_DTYPE_C [:] out):

cdef:
int i

for i in prange(binned_data.shape[0], schedule='static', nogil=True):
out[i] = _predict_one_from_binned_data(nodes, binned_data, i)
out[i] = _predict_one_from_binned_data(nodes, binned_data, i,
missing_values_bin_idx)


cdef inline Y_DTYPE_C _predict_one_from_binned_data(
node_struct [:] nodes,
const X_BINNED_DTYPE_C [:, :] binned_data,
const int row) nogil:
const int row,
const unsigned char missing_values_bin_idx) nogil:
# Need to pass the whole array and the row index, else prange won't work.
# See issue Cython #2798

@@ -79,10 +85,16 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data(
while True:
if node.is_leaf:
return node.value
if binned_data[row, node.feature_idx] <= node.bin_threshold:
node = nodes[node.left]
if binned_data[row, node.feature_idx] == missing_values_bin_idx:
if node.missing_go_to_left:
node = nodes[node.left]
else:
node = nodes[node.right]
else:
node = nodes[node.right]
if binned_data[row, node.feature_idx] <= node.bin_threshold:
node = nodes[node.left]
else:
node = nodes[node.right]

def _compute_partial_dependence(
node_struct [:] nodes,
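Putting the traversal logic together, a compact Python sketch of how a single sample is routed through a tree (node fields mirror the node_struct used above; this is an illustration, not the PR's code):

    import math

    def predict_one_py(nodes, x):
        """Sketch of _predict_one_from_numeric_data for one sample x."""
        node = nodes[0]  # root
        while not node["is_leaf"]:
            value = x[node["feature_idx"]]
            if math.isnan(value):
                # Missing values follow the direction learned at fit time.
                child = "left" if node["missing_go_to_left"] else "right"
            elif value <= node["threshold"]:
                child = "left"
            else:
                child = "right"
            node = nodes[node[child]]
        return node["value"]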