ENH Support sample weights in HGBT by adrinjalali · Pull Request #14696 · scikit-learn/scikit-learn · GitHub

ENH Support sample weights in HGBT #14696


Merged: 79 commits, Feb 24, 2020
Commits (79)
376c477
check consistent lengths
adrinjalali Aug 20, 2019
f9e0a1b
changes to loss and gradient_boosting.py
adrinjalali Aug 20, 2019
3793661
pep8
adrinjalali Aug 21, 2019
e02126c
merge upstream/master
adrinjalali Aug 21, 2019
4e686d4
revert loss.py to not take into account sample weights, the caller ha…
adrinjalali Aug 30, 2019
8cd484c
merge upstream/master
adrinjalali Aug 30, 2019
a9c30d5
gb tests pass, loss has a different average method
adrinjalali Aug 30, 2019
f09beb6
loss handles sample weight
adrinjalali Aug 30, 2019
c922d30
more fixes for tests
adrinjalali Aug 30, 2019
39282a0
pep8
adrinjalali Aug 30, 2019
934323e
fix constant hessian and sample weight
adrinjalali Aug 31, 2019
54d3c27
fix classification losses, and test
adrinjalali Aug 31, 2019
df28919
fix the test
adrinjalali Aug 31, 2019
56b385a
minor fix
adrinjalali Sep 1, 2019
434d0d0
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Sep 3, 2019
66f7ad3
address comments, move sample_weight to cython
adrinjalali Sep 3, 2019
a1440bb
change loss API
adrinjalali Sep 4, 2019
34ad5a5
_loss perf improvement
adrinjalali Sep 4, 2019
892a5b5
adding more tests
adrinjalali Sep 6, 2019
bd9ae1d
merge upstream/master
adrinjalali Sep 14, 2019
78b8f64
fixing more of the sample weight for LAD
adrinjalali Sep 17, 2019
11a07dd
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Sep 18, 2019
eb3f8f1
fix LAD sample weight API
adrinjalali Sep 18, 2019
cccde96
apply Guillaume's suggestions
adrinjalali Sep 18, 2019
7500c78
almost weighted binning
adrinjalali Sep 18, 2019
ccc527b
weighted quantiles
adrinjalali Sep 19, 2019
868a2f0
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Sep 19, 2019
bbe5fce
merge upstream/master
adrinjalali Sep 26, 2019
35cf4b6
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Oct 1, 2019
a2f79b2
add loss tests, and fixes
adrinjalali Oct 1, 2019
0ff3b72
fix missing arg
adrinjalali Oct 2, 2019
ed24433
add comment
adrinjalali Oct 2, 2019
05140c4
Merge branch 'master' into pr-14696
NicolasHug Oct 16, 2019
7656657
fix missing loss param in test
NicolasHug Oct 17, 2019
ec570ee
Merge branch 'master' of github.com:scikit-learn/scikit-learn into pr…
NicolasHug Oct 17, 2019
44b5d1c
factorized tests and used reasonable values for raw_predictions for g…
NicolasHug Oct 17, 2019
728b32e
Added test for init_gradient_and_hessians
NicolasHug Oct 17, 2019
3a0b62f
fix typo in test
NicolasHug Oct 17, 2019
9e0e7b6
Merge branch 'master' of github.com:scikit-learn/scikit-learn into pr…
NicolasHug Oct 18, 2019
cb8b94c
Added test for sum_hessians in histogram
NicolasHug Oct 18, 2019
90550af
make test pass, but need to unit test binning
NicolasHug Oct 18, 2019
885c0b9
Merge branch 'master' of github.com:scikit-learn/scikit-learn into pr…
NicolasHug Oct 22, 2019
b233c63
fix test
NicolasHug Oct 22, 2019
964c68a
Added some tests for binning (failing)
NicolasHug Oct 22, 2019
8e4d207
Merge branch 'master' of github.com:scikit-learn/scikit-learn into pr…
NicolasHug Oct 28, 2019
0931920
WIP
NicolasHug Oct 28, 2019
0616163
Slight test refactoring
NicolasHug Oct 28, 2019
9987ff7
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Oct 30, 2019
6116a0d
whats_new
adrinjalali Oct 30, 2019
9ff57a6
add content to user guide
adrinjalali Oct 30, 2019
2ffee14
remove todo, fix pep8
adrinjalali Oct 31, 2019
ec8b37d
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Nov 1, 2019
d6e0482
pdp notimplementederror
adrinjalali Nov 1, 2019
942d542
apply Nicolas's suggestions
adrinjalali Nov 4, 2019
5d16e27
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Nov 4, 2019
45b360c
fix
adrinjalali Nov 4, 2019
fc2ad10
revert return_ones
adrinjalali Nov 4, 2019
ac3df84
add sw to the benchmark
adrinjalali Nov 5, 2019
dbbe1ee
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Nov 5, 2019
577a4f3
check -> test in test
adrinjalali Nov 5, 2019
916eaa5
typo
adrinjalali Nov 5, 2019
d0775e4
merge upstream/master
adrinjalali Nov 6, 2019
aba5160
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Nov 11, 2019
1938175
pass sample_weight to loss's init, and set hessians_are_constant there
adrinjalali Nov 11, 2019
469b6d9
simply hessians_are_constant
adrinjalali Nov 11, 2019
30964d8
merge upstream/master
adrinjalali Jan 6, 2020
b759142
pass tests after merge
adrinjalali Jan 6, 2020
76dc710
sample with replacement before binning
adrinjalali Jan 6, 2020
9bc22d9
pass ints to choice and don't always subsample
adrinjalali Jan 14, 2020
df55f08
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Jan 14, 2020
a4877f9
fix local var
adrinjalali Jan 14, 2020
d5f98d7
Revert "fix local var"
adrinjalali Feb 11, 2020
cfaa057
Revert "pass ints to choice and don't always subsample"
adrinjalali Feb 11, 2020
f5960f2
Revert "sample with replacement before binning"
adrinjalali Feb 11, 2020
c9030bc
merge upstream/master
adrinjalali Feb 11, 2020
13d120a
CLN Address comments
thomasjpfan Feb 22, 2020
dcccf01
address Thomas's comments
adrinjalali Feb 24, 2020
1c81ebd
Merge branch 'hgbt/sample_weights' of github.com:adrinjalali/scikit-l…
adrinjalali Feb 24, 2020
47b11d6
Merge remote-tracking branch 'upstream/master' into hgbt/sample_weights
adrinjalali Feb 24, 2020
31 changes: 25 additions & 6 deletions benchmarks/bench_hist_gradient_boosting.py
@@ -32,6 +32,9 @@
parser.add_argument('--n-samples-max', type=int, default=int(1e6))
parser.add_argument('--n-features', type=int, default=20)
parser.add_argument('--max-bins', type=int, default=255)
parser.add_argument('--random-sample-weights', action="store_true",
default=False,
help="generate and use random sample weights")
args = parser.parse_args()

n_leaf_nodes = args.n_leaf_nodes
@@ -46,6 +49,7 @@ def get_estimator_and_data():
n_features=args.n_features,
n_classes=args.n_classes,
n_clusters_per_class=1,
n_informative=args.n_classes,
random_state=0)
return X, y, HistGradientBoostingClassifier
elif args.problem == 'regression':
@@ -60,15 +64,30 @@ def get_estimator_and_data():
np.bool)
X[mask] = np.nan

X_train_, X_test_, y_train_, y_test_ = train_test_split(
X, y, test_size=0.5, random_state=0)
if args.random_sample_weights:
sample_weight = np.random.rand(len(X)) * 10
else:
sample_weight = None

if sample_weight is not None:
(X_train_, X_test_, y_train_, y_test_,
sample_weight_train_, _) = train_test_split(
X, y, sample_weight, test_size=0.5, random_state=0)
else:
X_train_, X_test_, y_train_, y_test_ = train_test_split(
X, y, test_size=0.5, random_state=0)
sample_weight_train_ = None


def one_run(n_samples):
X_train = X_train_[:n_samples]
X_test = X_test_[:n_samples]
y_train = y_train_[:n_samples]
y_test = y_test_[:n_samples]
if sample_weight is not None:
sample_weight_train = sample_weight_train_[:n_samples]
else:
sample_weight_train = None
assert X_train.shape[0] == n_samples
assert X_test.shape[0] == n_samples
print("Data size: %d samples train, %d samples test."
@@ -93,7 +112,7 @@ def one_run(n_samples):
if loss == 'default':
loss = 'least_squares'
est.set_params(loss=loss)
est.fit(X_train, y_train)
est.fit(X_train, y_train, sample_weight=sample_weight_train)
sklearn_fit_duration = time() - tic
tic = time()
sklearn_score = est.score(X_test, y_test)
@@ -110,7 +129,7 @@ def one_run(n_samples):
lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')

tic = time()
lightgbm_est.fit(X_train, y_train)
lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train)
lightgbm_fit_duration = time() - tic
tic = time()
lightgbm_score = lightgbm_est.score(X_test, y_test)
@@ -127,7 +146,7 @@ def one_run(n_samples):
xgb_est = get_equivalent_estimator(est, lib='xgboost')

tic = time()
xgb_est.fit(X_train, y_train)
xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train)
xgb_fit_duration = time() - tic
tic = time()
xgb_score = xgb_est.score(X_test, y_test)
@@ -144,7 +163,7 @@ def one_run(n_samples):
cat_est = get_equivalent_estimator(est, lib='catboost')

tic = time()
cat_est.fit(X_train, y_train)
cat_est.fit(X_train, y_train, sample_weight=sample_weight_train)
cat_fit_duration = time() - tic
tic = time()
cat_score = cat_est.score(X_test, y_test)
36 changes: 34 additions & 2 deletions doc/modules/ensemble.rst
@@ -856,8 +856,7 @@ leverage integer-based data structures (histograms) instead of relying on
sorted continuous values when building the trees. The API of these
estimators is slightly different, and some of the features from
:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`
are not yet supported: in particular sample weights, and some loss
functions.
are not yet supported, for instance some loss functions.

These estimators are still **experimental**: their predictions
and their API might change without any deprecation cycle. To use them, you
@@ -957,6 +956,39 @@ If no missing values were encountered for a given feature during training,
then samples with missing values are mapped to whichever child has the most
samples.

Sample weight support
---------------------

:class:`HistGradientBoostingClassifier` and
:class:`HistGradientBoostingRegressor` support sample weights during
:term:`fit`.

The following toy example demonstrates how the model ignores the samples with
zero sample weights:

>>> from sklearn.experimental import enable_hist_gradient_boosting  # noqa
>>> from sklearn.ensemble import HistGradientBoostingClassifier
>>> X = [[1, 0],
... [1, 0],
... [1, 0],
... [0, 1]]
>>> y = [0, 0, 1, 0]
>>> # ignore the first 2 training samples by setting their weight to 0
>>> sample_weight = [0, 0, 1, 1]
>>> gb = HistGradientBoostingClassifier(min_samples_leaf=1)
>>> gb.fit(X, y, sample_weight=sample_weight)
HistGradientBoostingClassifier(...)
>>> gb.predict([[1, 0]])
array([1])
>>> gb.predict_proba([[1, 0]])[0, 1]
0.99...

As you can see, the `[1, 0]` is comfortably classified as `1` since the first
two samples are ignored due to their sample weights.

[Review comment] Member: Would the probability show more comfort?

gb.predict_proba([[1, 0]])[0, 1]
# 0.99...

Implementation detail: taking sample weights into account amounts to
multiplying the gradients (and the hessians) by the sample weights. Note that
the binning stage (specifically the quantiles computation) does not take the
weights into account.
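
As a rough editorial sketch (mirroring the `_update_gradients_hessians_least_squares`
kernel shown further down in this PR), the least-squares case of this weighting
amounts to:

>>> import numpy as np
>>> raw_predictions = np.array([0.5, 2.0, -1.0])
>>> y_true = np.array([0.0, 1.0, -1.0])
>>> sample_weight = np.array([1.0, 2.0, 0.5])
>>> # gradient of the unweighted loss, scaled by the per-sample weight
>>> gradients = (raw_predictions - y_true) * sample_weight
>>> # the unweighted hessian is the constant 1, so the weighted one is the weight itself
>>> hessians = np.ones_like(y_true) * sample_weight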

Low-level parallelism
---------------------

4 changes: 2 additions & 2 deletions doc/whats_new/v0.22.rst
@@ -401,11 +401,11 @@ Changelog
<glemaitre>` and :user:`Caio Oliveira <caioaao>` and :pr:`15138` by
:user:`Jon Cusick <jcusick13>`..

- Many improvements were made to
- |MajorFeature| Many improvements were made to
:class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor`:

- |MajorFeature| Estimators now natively support dense data with missing
- |Feature| Estimators now natively support dense data with missing
values both for training and predicting. They also support infinite
values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_
and `Olivier Grisel`_.
4 changes: 4 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -142,6 +142,10 @@ Changelog
:mod:`sklearn.ensemble`
.......................

- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` now support
:term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_.

- |API| Added boolean `verbose` flag to classes:
:class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`.
:pr:`15991` by :user:`Sam Bail <spbail>`,
96 changes: 80 additions & 16 deletions sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
@@ -27,9 +27,51 @@ def _update_gradients_least_squares(

n_samples = raw_predictions.shape[0]
for i in prange(n_samples, schedule='static', nogil=True):
# Note: a more correct expression is 2 * (raw_predictions - y_true)
# but since we use 1 for the constant hessian value (and not 2) this
# is strictly equivalent for the leaves values.
gradients[i] = raw_predictions[i] - y_true[i]
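
# Editorial aside, a quick check of the note above: assuming a leaf value of
# -sum(gradients) / sum(hessians) (ignoring shrinkage and L2 regularization),
# using g = (p - y) with h = 1 gives the same leaf value as the textbook
# g = 2 * (p - y) with h = 2, since the factor of 2 cancels.
import numpy as np
p = np.array([0.3, -1.2, 0.7])   # made-up raw predictions of samples in a leaf
y = np.array([0.0, -1.0, 1.0])   # their targets
leaf_halved = -(p - y).sum() / len(p)                  # g = p - y,     h = 1
leaf_textbook = -(2 * (p - y)).sum() / (2 * len(p))    # g = 2*(p - y), h = 2
assert np.isclose(leaf_halved, leaf_textbook)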


def _update_gradients_hessians_least_squares(
G_H_DTYPE_C [::1] gradients, # OUT
G_H_DTYPE_C [::1] hessians, # OUT
const Y_DTYPE_C [::1] y_true, # IN
const Y_DTYPE_C [::1] raw_predictions, # IN
const Y_DTYPE_C [::1] sample_weight): # IN

cdef:
int n_samples
int i

n_samples = raw_predictions.shape[0]
for i in prange(n_samples, schedule='static', nogil=True):
# Note: a more correct expression is 2 * (raw_predictions - y_true) * sample_weight
# but since we use 1 for the constant hessian value (and not 2) this
# is strictly equivalent for the leaves values.
gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i]
hessians[i] = sample_weight[i]


def _update_gradients_hessians_least_absolute_deviation(
G_H_DTYPE_C [::1] gradients, # OUT
G_H_DTYPE_C [::1] hessians, # OUT
const Y_DTYPE_C [::1] y_true, # IN
const Y_DTYPE_C [::1] raw_predictions, # IN
const Y_DTYPE_C [::1] sample_weight): # IN

cdef:
int n_samples
int i

n_samples = raw_predictions.shape[0]
for i in prange(n_samples, schedule='static', nogil=True):
# gradient = sign(raw_prediction - y_true) * sample_weight
gradients[i] = sample_weight[i] * (2 *
(y_true[i] - raw_predictions[i] < 0) - 1)
hessians[i] = sample_weight[i]

[Review thread]

Member: Does this work because sample_weight is non-negative? If so, let's leave a comment?

Member: > Does this work because sample_weight is non-negative?

No, this works because:

  • accounting for SW means we need to multiply gradients and hessians by SW
  • without SW, the hessian of this loss is constant and is equal to 1.

Member: I was thinking about the math. lightgbm does the same thing with the l1 loss.

https://github.com/microsoft/LightGBM/blob/4adb9ff71f41f6b5c7a51f667a8fb9adf38cf602/src/objective/regression_objective.hpp#L225-L229

When I see the derivative of sign(x), I think of the Dirac delta function: https://en.wikipedia.org/wiki/Sign_function

Member: Ah, I think I see what you did here: #13896 (comment)

Member Author: Yeah, also, if you look at the sklearn wrapper in lightgbm, all hessians and gradients are simply multiplied by SW.
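
A small NumPy sketch (editorial, made-up numbers) of the weighted LAD update in the
hunk above, also showing that `2 * (y_true - raw_predictions < 0) - 1` is just the
sign of the residual (ties, where raw_predictions == y_true, are sent to -1 by the
kernel while `np.sign` would return 0):

import numpy as np
y_true = np.array([1.0, 2.0, 3.0])
raw_predictions = np.array([2.0, 1.5, 3.5])
sample_weight = np.array([1.0, 0.5, 2.0])
sign = 2.0 * (y_true - raw_predictions < 0) - 1   # +1 where raw > y_true, else -1
gradients = sample_weight * sign                  # multiply the gradient by SW
hessians = sample_weight.copy()                   # constant hessian 1, times SW
assert np.array_equal(sign, np.sign(raw_predictions - y_true))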



def _update_gradients_least_absolute_deviation(
G_H_DTYPE_C [::1] gradients, # OUT
const Y_DTYPE_C [::1] y_true, # IN
@@ -49,44 +91,66 @@ def _update_gradients_hessians_binary_crossentropy(
G_H_DTYPE_C [::1] gradients, # OUT
G_H_DTYPE_C [::1] hessians, # OUT
const Y_DTYPE_C [::1] y_true, # IN
const Y_DTYPE_C [::1] raw_predictions): # IN
const Y_DTYPE_C [::1] raw_predictions, # IN
const Y_DTYPE_C [::1] sample_weight): # IN
cdef:
int n_samples
Y_DTYPE_C p_i # proba that ith sample belongs to positive class
int i

n_samples = raw_predictions.shape[0]
for i in prange(n_samples, schedule='static', nogil=True):
p_i = _cexpit(raw_predictions[i])
gradients[i] = p_i - y_true[i]
hessians[i] = p_i * (1. - p_i)
if sample_weight is None:
for i in prange(n_samples, schedule='static', nogil=True):
p_i = _cexpit(raw_predictions[i])
gradients[i] = p_i - y_true[i]
hessians[i] = p_i * (1. - p_i)
else:
for i in prange(n_samples, schedule='static', nogil=True):
p_i = _cexpit(raw_predictions[i])
gradients[i] = (p_i - y_true[i]) * sample_weight[i]
hessians[i] = p_i * (1. - p_i) * sample_weight[i]
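
An editorial NumPy sketch of the weighted branch above, with `scipy.special.expit`
standing in for the `_cexpit` helper:

import numpy as np
from scipy.special import expit
raw_predictions = np.array([-1.0, 0.0, 2.0])
y_true = np.array([0.0, 1.0, 1.0])
sample_weight = np.array([1.0, 2.0, 0.5])
p = expit(raw_predictions)                 # probability of the positive class
gradients = (p - y_true) * sample_weight
hessians = p * (1.0 - p) * sample_weight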


def _update_gradients_hessians_categorical_crossentropy(
G_H_DTYPE_C [:, ::1] gradients, # OUT
G_H_DTYPE_C [:, ::1] hessians, # OUT
const Y_DTYPE_C [::1] y_true, # IN
const Y_DTYPE_C [:, ::1] raw_predictions): # IN
const Y_DTYPE_C [:, ::1] raw_predictions, # IN
const Y_DTYPE_C [::1] sample_weight): # IN
cdef:
int prediction_dim = raw_predictions.shape[0]
int n_samples = raw_predictions.shape[1]
int k # class index
int i # sample index
Y_DTYPE_C sw
# p[i, k] is the probability that class(ith sample) == k.
# It's the softmax of the raw predictions
Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
Y_DTYPE_C p_i_k

for i in prange(n_samples, schedule='static', nogil=True):
# first compute softmaxes of sample i for each class
for k in range(prediction_dim):
p[i, k] = raw_predictions[k, i] # prepare softmax
_compute_softmax(p, i)
# then update gradients and hessians
for k in range(prediction_dim):
p_i_k = p[i, k]
gradients[k, i] = p_i_k - (y_true[i] == k)
hessians[k, i] = p_i_k * (1. - p_i_k)
if sample_weight is None:
for i in prange(n_samples, schedule='static', nogil=True):
# first compute softmaxes of sample i for each class
for k in range(prediction_dim):
p[i, k] = raw_predictions[k, i] # prepare softmax
_compute_softmax(p, i)
# then update gradients and hessians
for k in range(prediction_dim):
p_i_k = p[i, k]
gradients[k, i] = p_i_k - (y_true[i] == k)
hessians[k, i] = p_i_k * (1. - p_i_k)
else:
for i in prange(n_samples, schedule='static', nogil=True):
# first compute softmaxes of sample i for each class
for k in range(prediction_dim):
p[i, k] = raw_predictions[k, i] # prepare softmax
_compute_softmax(p, i)
# then update gradients and hessians
sw = sample_weight[i]
for k in range(prediction_dim):
p_i_k = p[i, k]
gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw
hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw
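
An editorial vectorized NumPy sketch of the weighted multiclass update above
(shapes follow the `(prediction_dim, n_samples)` layout of `raw_predictions`):

import numpy as np
rng = np.random.default_rng(0)
n_classes, n_samples = 3, 5
raw_predictions = rng.normal(size=(n_classes, n_samples))
y_true = rng.integers(0, n_classes, size=n_samples)
sample_weight = rng.random(n_samples)
# softmax over classes, stabilized by subtracting the per-sample max
p = np.exp(raw_predictions - raw_predictions.max(axis=0))
p /= p.sum(axis=0)
one_hot = (np.arange(n_classes)[:, None] == y_true)   # shape (n_classes, n_samples)
gradients = (p - one_hot) * sample_weight              # broadcast over classes
hessians = p * (1.0 - p) * sample_weight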


cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: