diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index f87f22ab8837e..ac7e77e2a1f99 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -32,6 +32,9 @@ parser.add_argument('--n-samples-max', type=int, default=int(1e6)) parser.add_argument('--n-features', type=int, default=20) parser.add_argument('--max-bins', type=int, default=255) +parser.add_argument('--random-sample-weights', action="store_true", + default=False, + help="generate and use random sample weights") args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes @@ -46,6 +49,7 @@ def get_estimator_and_data(): n_features=args.n_features, n_classes=args.n_classes, n_clusters_per_class=1, + n_informative=args.n_classes, random_state=0) return X, y, HistGradientBoostingClassifier elif args.problem == 'regression': @@ -60,8 +64,19 @@ def get_estimator_and_data(): np.bool) X[mask] = np.nan -X_train_, X_test_, y_train_, y_test_ = train_test_split( - X, y, test_size=0.5, random_state=0) +if args.random_sample_weights: + sample_weight = np.random.rand(len(X)) * 10 +else: + sample_weight = None + +if sample_weight is not None: + (X_train_, X_test_, y_train_, y_test_, + sample_weight_train_, _) = train_test_split( + X, y, sample_weight, test_size=0.5, random_state=0) +else: + X_train_, X_test_, y_train_, y_test_ = train_test_split( + X, y, test_size=0.5, random_state=0) + sample_weight_train_ = None def one_run(n_samples): @@ -69,6 +84,10 @@ def one_run(n_samples): X_test = X_test_[:n_samples] y_train = y_train_[:n_samples] y_test = y_test_[:n_samples] + if sample_weight is not None: + sample_weight_train = sample_weight_train_[:n_samples] + else: + sample_weight_train = None assert X_train.shape[0] == n_samples assert X_test.shape[0] == n_samples print("Data size: %d samples train, %d samples test." @@ -93,7 +112,7 @@ def one_run(n_samples): if loss == 'default': loss = 'least_squares' est.set_params(loss=loss) - est.fit(X_train, y_train) + est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic tic = time() sklearn_score = est.score(X_test, y_test) @@ -110,7 +129,7 @@ def one_run(n_samples): lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') tic = time() - lightgbm_est.fit(X_train, y_train) + lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train) lightgbm_fit_duration = time() - tic tic = time() lightgbm_score = lightgbm_est.score(X_test, y_test) @@ -127,7 +146,7 @@ def one_run(n_samples): xgb_est = get_equivalent_estimator(est, lib='xgboost') tic = time() - xgb_est.fit(X_train, y_train) + xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train) xgb_fit_duration = time() - tic tic = time() xgb_score = xgb_est.score(X_test, y_test) @@ -144,7 +163,7 @@ def one_run(n_samples): cat_est = get_equivalent_estimator(est, lib='catboost') tic = time() - cat_est.fit(X_train, y_train) + cat_est.fit(X_train, y_train, sample_weight=sample_weight_train) cat_fit_duration = time() - tic tic = time() cat_score = cat_est.score(X_test, y_test) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index deefaaf642c39..3d6798e0d548b 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -856,8 +856,7 @@ leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees. 
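
The benchmark above passes `sample_weight` as an extra array to `train_test_split`, which splits any number of index-aligned arrays consistently, so each training sample keeps its own weight through the split. A minimal, self-contained sketch of that pattern on synthetic data (the variable names here are illustrative, not part of the benchmark):

    import numpy as np
    from sklearn.model_selection import train_test_split

    # Synthetic stand-ins for the benchmark's X and y.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(1000, 20))
    y = rng.normal(size=1000)
    # Random per-sample weights, as with --random-sample-weights.
    sample_weight = rng.rand(len(X)) * 10

    # train_test_split splits every array it is given with the same indices,
    # so the weights stay aligned with their samples.
    (X_train, X_test, y_train, y_test,
     sw_train, sw_test) = train_test_split(
        X, y, sample_weight, test_size=0.5, random_state=0)

    assert len(sw_train) == len(X_train)
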
The API of these estimators is slightly different, and some of the features
from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`
-are not yet supported: in particular sample weights, and some loss
-functions.
+are not yet supported, for instance some loss functions.
 
 These estimators are still **experimental**: their predictions
 and their API might change without any deprecation cycle. To use them, you
@@ -957,6 +956,39 @@
 If no missing values were encountered for a given feature during training,
 then samples with missing values are mapped to whichever child has the most
 samples.
 
+Sample weight support
+---------------------
+
+:class:`HistGradientBoostingClassifier` and
+:class:`HistGradientBoostingRegressor` support sample weights during
+:term:`fit`.
+
+The following toy example demonstrates how the model ignores the samples with
+zero sample weights:
+
+    >>> X = [[1, 0],
+    ...      [1, 0],
+    ...      [1, 0],
+    ...      [0, 1]]
+    >>> y = [0, 0, 1, 0]
+    >>> # ignore the first 2 training samples by setting their weight to 0
+    >>> sample_weight = [0, 0, 1, 1]
+    >>> gb = HistGradientBoostingClassifier(min_samples_leaf=1)
+    >>> gb.fit(X, y, sample_weight=sample_weight)
+    HistGradientBoostingClassifier(...)
+    >>> gb.predict([[1, 0]])
+    array([1])
+    >>> gb.predict_proba([[1, 0]])[0, 1]
+    0.99...
+
+As you can see, the sample `[1, 0]` is comfortably classified as `1` since the
+first two samples are ignored due to their sample weights.
+
+Implementation detail: taking sample weights into account amounts to
+multiplying the gradients (and the hessians) by the sample weights. Note that
+the binning stage (specifically the quantiles computation) does not take the
+weights into account.
+
 Low-level parallelism
 ---------------------
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index bbd35676defcf..399f6352410e9 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -401,11 +401,11 @@ Changelog
 ` and :user:`Caio Oliveira ` and
 :pr:`15138` by :user:`Jon Cusick `..
 
-- Many improvements were made to
+- |MajorFeature| Many improvements were made to
   :class:`ensemble.HistGradientBoostingClassifier` and
   :class:`ensemble.HistGradientBoostingRegressor`:
 
-  - |MajorFeature| Estimators now natively support dense data with missing
+  - |Feature| Estimators now natively support dense data with missing
     values both for training and predicting. They also support infinite
     values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_
     and `Olivier Grisel`_.
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 1167fc1bf4556..720a19cd862f6 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -142,6 +142,10 @@ Changelog
 :mod:`sklearn.ensemble`
 .......................
 
+- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor` now support
+  :term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_.
+
 - |API| Added boolean `verbose` flag to classes:
   :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`.
:pr:`15991` by :user:`Sam Bail `, diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 418a9124d37fa..821a81a48fcf3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -27,9 +27,51 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. gradients[i] = raw_predictions[i] - y_true[i] +def _update_gradients_hessians_least_squares( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) * sample_weight + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i] + hessians[i] = sample_weight[i] + + +def _update_gradients_hessians_least_absolute_deviation( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # gradient = sign(raw_predicition - y_pred) * sample_weight + gradients[i] = sample_weight[i] * (2 * + (y_true[i] - raw_predictions[i] < 0) - 1) + hessians[i] = sample_weight[i] + + def _update_gradients_least_absolute_deviation( G_H_DTYPE_C [::1] gradients, # OUT const Y_DTYPE_C [::1] y_true, # IN @@ -49,44 +91,66 @@ def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions): # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN cdef: int n_samples Y_DTYPE_C p_i # proba that ith sample belongs to positive class int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): - p_i = _cexpit(raw_predictions[i]) - gradients[i] = p_i - y_true[i] - hessians[i] = p_i * (1. - p_i) + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + p_i = _cexpit(raw_predictions[i]) + gradients[i] = p_i - y_true[i] + hessians[i] = p_i * (1. - p_i) + else: + for i in prange(n_samples, schedule='static', nogil=True): + p_i = _cexpit(raw_predictions[i]) + gradients[i] = (p_i - y_true[i]) * sample_weight[i] + hessians[i] = p_i * (1. 
- p_i) * sample_weight[i] def _update_gradients_hessians_categorical_crossentropy( G_H_DTYPE_C [:, ::1] gradients, # OUT G_H_DTYPE_C [:, ::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [:, ::1] raw_predictions): # IN + const Y_DTYPE_C [:, ::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] int k # class index int i # sample index + Y_DTYPE_C sw # p[i, k] is the probability that class(ith sample) == k. # It's the softmax of the raw predictions Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) Y_DTYPE_C p_i_k - for i in prange(n_samples, schedule='static', nogil=True): - # first compute softmaxes of sample i for each class - for k in range(prediction_dim): - p[i, k] = raw_predictions[k, i] # prepare softmax - _compute_softmax(p, i) - # then update gradients and hessians - for k in range(prediction_dim): - p_i_k = p[i, k] - gradients[k, i] = p_i_k - (y_true[i] == k) - hessians[k, i] = p_i_k * (1. - p_i_k) + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + _compute_softmax(p, i) + # then update gradients and hessians + for k in range(prediction_dim): + p_i_k = p[i, k] + gradients[k, i] = p_i_k - (y_true[i] == k) + hessians[k, i] = p_i_k * (1. - p_i_k) + else: + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + _compute_softmax(p, i) + # then update gradients and hessians + sw = sample_weight[i] + for k in range(prediction_dim): + p_i_k = p[i, k] + gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw + hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 376ab4f940a29..97fcdb4713802 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -9,7 +9,8 @@ from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) from ...utils import check_X_y, check_random_state, check_array, resample -from ...utils.validation import check_is_fitted +from ...utils.validation import (check_is_fitted, + check_consistent_length, _check_sample_weight) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -81,7 +82,7 @@ def _validate_parameters(self): raise ValueError('max_bins={} should be no smaller than 2 ' 'and no larger than 255.'.format(self.max_bins)) - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. Parameters @@ -92,6 +93,9 @@ def fit(self, X, y): y : array-like of shape (n_samples,) Target values. + sample_weight : array-like of shape (n_samples,) default=None + Weights of training data. + Returns ------- self : object @@ -104,6 +108,14 @@ def fit(self, X, y): acc_prediction_time = 0. 
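
The weighted kernels in `_loss.pyx` above all follow the same rule: the weighted gradients and hessians are the unweighted ones multiplied elementwise by `sample_weight`. A plain NumPy sketch of that rule for the binary cross-entropy case (illustrative only, not the shipped Cython code):

    import numpy as np
    from scipy.special import expit

    def binary_crossentropy_grad_hess(y_true, raw_predictions,
                                      sample_weight=None):
        # Unweighted first and second derivatives of
        # log(1 + exp(raw)) - y * raw with respect to the raw prediction.
        p = expit(raw_predictions)  # probability of the positive class
        gradients = p - y_true
        hessians = p * (1. - p)
        if sample_weight is not None:
            # Weighting the per-sample loss simply scales both derivatives.
            gradients = gradients * sample_weight
            hessians = hessians * sample_weight
        return gradients, hessians
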
X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) y = self._encode_y(y) + check_consistent_length(X, y) + # Do not create unit sample weights by default to later skip some + # computation + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=np.float64) + # TODO: remove when PDP suports sample weights + self._fitted_with_sw = True rng = check_random_state(self.random_state) @@ -127,7 +139,7 @@ def fit(self, X, y): # data. self._in_fit = True - self.loss_ = self._get_loss() + self.loss_ = self._get_loss(sample_weight=sample_weight) if self.early_stopping == 'auto': self.do_early_stopping_ = n_samples > 10000 else: @@ -143,12 +155,23 @@ def fit(self, X, y): # This is needed in order to have the same split when using # warm starting. - X_train, X_val, y_train, y_val = train_test_split( - X, y, test_size=self.validation_fraction, stratify=stratify, - random_state=self._random_seed) + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, y, test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed) + sample_weight_train = sample_weight_val = None + else: + # TODO: incorporate sample_weight in sampling here, as well as + # stratify + (X_train, X_val, y_train, y_val, sample_weight_train, + sample_weight_val) = train_test_split( + X, y, sample_weight, test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed) else: - X_train, y_train = X, y - X_val, y_val = None, None + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) @@ -185,7 +208,7 @@ def fit(self, X, y): # n_trees_per_iterations is n_classes in multiclass classification, # else 1. self._baseline_prediction = self.loss_.get_baseline_prediction( - y_train, self.n_trees_per_iteration_ + y_train, sample_weight_train, self.n_trees_per_iteration_ ) raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), @@ -193,6 +216,14 @@ def fit(self, X, y): ) raw_predictions += self._baseline_prediction + # initialize gradients and hessians (empty arrays). + # shape = (n_trees_per_iteration, n_samples). 
+ gradients, hessians = self.loss_.init_gradients_and_hessians( + n_samples=n_samples, + prediction_dim=self.n_trees_per_iteration_, + sample_weight=sample_weight_train + ) + # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] @@ -227,7 +258,9 @@ def fit(self, X, y): raw_predictions_val += self._baseline_prediction self._check_early_stopping_loss(raw_predictions, y_train, - raw_predictions_val, y_val) + sample_weight_train, + raw_predictions_val, y_val, + sample_weight_val) else: self.scorer_ = check_scoring(self, self.scoring) # scorer_ is a callable with signature (est, X, y) and @@ -239,12 +272,15 @@ def fit(self, X, y): # Compute the subsample set (X_binned_small_train, - y_small_train) = self._get_small_trainset( - X_binned_train, y_train, self._random_seed) + y_small_train, + sample_weight_small_train) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, + self._random_seed) self._check_early_stopping_scorer( X_binned_small_train, y_small_train, - X_binned_val, y_val, + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val, ) begin_at_stage = 0 @@ -270,8 +306,18 @@ def fit(self, X, y): if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set - X_binned_small_train, y_small_train = self._get_small_trainset( - X_binned_train, y_train, self._random_seed) + (X_binned_small_train, + y_small_train, + sample_weight_small_train) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, + self._random_seed) + + # Initialize the gradients and hessians + gradients, hessians = self.loss_.init_gradients_and_hessians( + n_samples=n_samples, + sample_weight=sample_weight_train, + prediction_dim=self.n_trees_per_iteration_ + ) # Get the predictors from the previous fit predictors = self._predictors @@ -282,7 +328,8 @@ def fit(self, X, y): # shape = (n_trees_per_iteration, n_samples). 
gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ + prediction_dim=self.n_trees_per_iteration_, + sample_weight=sample_weight_train ) for iteration in range(begin_at_stage, self.max_iter): @@ -294,7 +341,8 @@ def fit(self, X, y): # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, - y_train, raw_predictions) + y_train, raw_predictions, + sample_weight_train) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -320,7 +368,8 @@ def fit(self, X, y): if self.loss_.need_update_leaves_values: self.loss_.update_leaves_values(grower, y_train, - raw_predictions[k, :]) + raw_predictions[k, :], + sample_weight_train) predictor = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_ @@ -348,14 +397,15 @@ def fit(self, X, y): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, y_train, - raw_predictions_val, y_val + raw_predictions, y_train, sample_weight_train, + raw_predictions_val, y_val, sample_weight_val ) else: should_early_stop = self._check_early_stopping_scorer( X_binned_small_train, y_small_train, - X_binned_val, y_val, + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val ) if self.verbose: @@ -400,12 +450,14 @@ def _clear_state(self): if hasattr(self, var): delattr(self, var) - def _get_small_trainset(self, X_binned_train, y_train, seed): + def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, + seed): """Compute the indices of the subsample set and return this set. For efficiency, we need to subsample the training set to compute scores with scorers. """ + # TODO: incorporate sample_weights here in `resample` subsample_size = 10000 if X_binned_train.shape[0] > subsample_size: indices = np.arange(X_binned_train.shape[0]) @@ -415,29 +467,48 @@ def _get_small_trainset(self, X_binned_train, y_train, seed): stratify=stratify) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] + if sample_weight_train is not None: + sample_weight_small_train = sample_weight_train[indices] + else: + sample_weight_small_train = None X_binned_small_train = np.ascontiguousarray(X_binned_small_train) - return X_binned_small_train, y_small_train + return (X_binned_small_train, y_small_train, + sample_weight_small_train) else: - return X_binned_train, y_train + return X_binned_train, y_train, sample_weight_train def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, - X_binned_val, y_val): + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val): """Check if fitting should be early-stopped based on scorer. Scores are computed on validation data or on training data. 
""" if is_classifier(self): y_small_train = self.classes_[y_small_train.astype(int)] - self.train_score_.append( - self.scorer_(self, X_binned_small_train, y_small_train) - ) + + if sample_weight_small_train is None: + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train) + ) + else: + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train, + sample_weight=sample_weight_small_train) + ) if self._use_validation_data: if is_classifier(self): y_val = self.classes_[y_val.astype(int)] - self.validation_score_.append( - self.scorer_(self, X_binned_val, y_val) - ) + if sample_weight_val is None: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val) + ) + else: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val, + sample_weight=sample_weight_val) + ) return self._should_stop(self.validation_score_) else: return self._should_stop(self.train_score_) @@ -445,20 +516,22 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, def _check_early_stopping_loss(self, raw_predictions, y_train, + sample_weight_train, raw_predictions_val, - y_val): + y_val, + sample_weight_val): """Check if fitting should be early-stopped based on loss. Scores are computed on validation data or on training data. """ self.train_score_.append( - -self.loss_(y_train, raw_predictions) + -self.loss_(y_train, raw_predictions, sample_weight_train) ) if self._use_validation_data: self.validation_score_.append( - -self.loss_(y_val, raw_predictions_val) + -self.loss_(y_val, raw_predictions_val, sample_weight_val) ) return self._should_stop(self.validation_score_) else: @@ -609,6 +682,13 @@ def _compute_partial_dependence_recursion(self, grid, target_features): (n_trees_per_iteration, n_samples) The value of the partial dependence function on each grid point. 
""" + + if getattr(self, '_fitted_with_sw', False): + raise NotImplementedError("{} does not support partial dependence " + "plots with the 'recursion' method when " + "sample weights were given during fit " + "time.".format(self.__class__.__name__)) + grid = np.asarray(grid, dtype=X_DTYPE, order='C') averaged_predictions = np.zeros( (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE) @@ -626,7 +706,7 @@ def _more_tags(self): return {'allow_nan': True} @abstractmethod - def _get_loss(self): + def _get_loss(self, sample_weight): pass @abstractmethod @@ -820,8 +900,8 @@ def _encode_y(self, y): y = y.astype(Y_DTYPE, copy=False) return y - def _get_loss(self): - return _LOSSES[self.loss]() + def _get_loss(self, sample_weight): + return _LOSSES[self.loss](sample_weight=sample_weight) class HistGradientBoostingClassifier(BaseHistGradientBoosting, @@ -1058,7 +1138,7 @@ def _encode_y(self, y): encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y - def _get_loss(self): + def _get_loss(self, sample_weight): if (self.loss == 'categorical_crossentropy' and self.n_trees_per_iteration_ == 1): raise ValueError("'categorical_crossentropy' is not suitable for " @@ -1067,8 +1147,10 @@ def _get_loss(self): if self.loss == 'auto': if self.n_trees_per_iteration_ == 1: - return _LOSSES['binary_crossentropy']() + return _LOSSES['binary_crossentropy']( + sample_weight=sample_weight) else: - return _LOSSES['categorical_crossentropy']() + return _LOSSES['categorical_crossentropy']( + sample_weight=sample_weight) - return _LOSSES[self.loss]() + return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index dae85f57134e4..2dbf8bd58773e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -18,13 +18,27 @@ from .common import Y_DTYPE from .common import G_H_DTYPE from ._loss import _update_gradients_least_squares +from ._loss import _update_gradients_hessians_least_squares from ._loss import _update_gradients_least_absolute_deviation +from ._loss import _update_gradients_hessians_least_absolute_deviation from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy +from ...utils.stats import _weighted_percentile class BaseLoss(ABC): """Base class for a loss.""" + def __init__(self, hessians_are_constant): + self.hessians_are_constant = hessians_are_constant + + def __call__(self, y_true, raw_predictions, sample_weight): + """Return the weighted average loss""" + return np.average(self.pointwise_loss(y_true, raw_predictions), + weights=sample_weight) + + @abstractmethod + def pointwise_loss(self, y_true, raw_predictions): + """Return loss value for each input""" # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to @@ -36,7 +50,8 @@ class BaseLoss(ABC): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. need_update_leaves_values = False - def init_gradients_and_hessians(self, n_samples, prediction_dim): + def init_gradients_and_hessians(self, n_samples, prediction_dim, + sample_weight): """Return initial gradients and hessians. 
Unless hessians are constant, arrays are initialized with undefined @@ -46,12 +61,16 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): ---------- n_samples : int The number of samples passed to `fit()`. + prediction_dim : int The dimension of a raw prediction, i.e. the number of trees built at each iteration. Equals 1 for regression and binary classification, or K where K is the number of classes for multiclass classification. + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + Returns ------- gradients : ndarray, shape (prediction_dim, n_samples) @@ -63,6 +82,7 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): """ shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + if self.hessians_are_constant: # If the hessians are constant, we consider they are equal to 1. # - This is correct for the half LS loss @@ -75,13 +95,17 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): return gradients, hessians @abstractmethod - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): """Return initial predictions (before the first iteration). Parameters ---------- y_train : ndarray, shape (n_samples,) The target training values. + + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + prediction_dim : int The dimension of one prediction: 1 for binary classification and regression, n_classes for multiclass classification. @@ -94,7 +118,7 @@ def get_baseline_prediction(self, y_train, prediction_dim): @abstractmethod def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): """Update gradients and hessians arrays, inplace. The gradients (resp. hessians) are the first (resp. second) order @@ -105,14 +129,20 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, ---------- gradients : ndarray, shape (prediction_dim, n_samples) The gradients (treated as OUT array). + hessians : ndarray, shape (prediction_dim, n_samples) or \ (1,) The hessians (treated as OUT array). + y_true : ndarray, shape (n_samples,) The true target values or each training sample. + raw_predictions : ndarray, shape (prediction_dim, n_samples) The raw_predictions (i.e. values from the trees) of the tree ensemble at iteration ``i - 1``. + + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. """ @@ -123,34 +153,43 @@ class LeastSquares(BaseLoss): loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 - This actually computes the half least squares loss to optimize simplify + This actually computes the half least squares loss to simplify the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). """ + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. + super().__init__(hessians_are_constant=sample_weight is None) - hessians_are_constant = True - - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. 
raw_predictions = raw_predictions.reshape(-1) loss = 0.5 * np.power(y_true - raw_predictions, 2) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + return np.average(y_train, weights=sample_weight) @staticmethod def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_squares(gradients, y_true, raw_predictions) + if sample_weight is None: + _update_gradients_least_squares(gradients, y_true, raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_squares(gradients, hessians, + y_true, raw_predictions, + sample_weight) class LeastAbsoluteDeviation(BaseLoss): @@ -160,8 +199,12 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. + super().__init__(hessians_are_constant=sample_weight is None) - hessians_are_constant = True # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -172,30 +215,39 @@ class LeastAbsoluteDeviation(BaseLoss): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. need_update_leaves_values = True - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = np.abs(y_true - raw_predictions) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.median(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + if sample_weight is None: + return np.median(y_train) + else: + return _weighted_percentile(y_train, sample_weight, 50) @staticmethod def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_absolute_deviation(gradients, y_true, - raw_predictions) + if sample_weight is None: + _update_gradients_least_absolute_deviation(gradients, y_true, + raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_absolute_deviation( + gradients, hessians, y_true, raw_predictions, sample_weight) - def update_leaves_values(self, grower, y_true, raw_predictions): + def update_leaves_values(self, grower, y_true, raw_predictions, + sample_weight): # Update the values predicted by the tree with # median(y_true - raw_predictions). # See note about need_update_leaves_values in BaseLoss. 
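
The `update_leaves_values` hunk that follows swaps the plain median of the residuals for `_weighted_percentile(..., percentile=50)`. As a rough sketch, a weighted percentile of this kind can be computed from the cumulative weights of the sorted values (one common definition; it is not guaranteed to match `sklearn.utils.stats._weighted_percentile` in every tie or edge case):

    import numpy as np

    def weighted_percentile(values, sample_weight, percentile=50):
        # Sort the values, accumulate their weights, and return the first
        # value whose cumulative weight reaches the requested fraction of
        # the total weight.
        order = np.argsort(values)
        sorted_values = np.asarray(values)[order]
        cum_weights = np.cumsum(np.asarray(sample_weight)[order])
        threshold = percentile / 100. * cum_weights[-1]
        return sorted_values[np.searchsorted(cum_weights, threshold)]

    # With unit weights this reduces to a plain (lower-interpolation) median:
    weighted_percentile([3., 1., 2.], [1., 1., 1.])  # -> 2.0
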
@@ -205,7 +257,14 @@ def update_leaves_values(self, grower, y_true, raw_predictions): # requires a cython version of median() for leaf in grower.finalized_leaves: indices = leaf.sample_indices - median_res = np.median(y_true[indices] - raw_predictions[indices]) + if sample_weight is None: + median_res = np.median(y_true[indices] + - raw_predictions[indices]) + else: + median_res = _weighted_percentile(y_true[indices] + - raw_predictions[indices], + sample_weight=sample_weight, + percentile=50) leaf.value = grower.shrinkage * median_res # Note that the regularization is ignored here @@ -222,24 +281,26 @@ class BinaryCrossEntropy(BaseLoss): section 4.4.1 (about logistic regression). """ - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + inverse_link_function = staticmethod(expit) - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) # logaddexp(0, x) = log(1 + exp(x)) loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if prediction_dim > 2: raise ValueError( "loss='binary_crossentropy' is not defined for multiclass" " classification with n_classes=%d, use" " loss='categorical_crossentropy' instead" % prediction_dim) - proba_positive_class = np.mean(y_train) + proba_positive_class = np.average(y_train, weights=sample_weight) eps = np.finfo(y_train.dtype).eps proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) # log(x / 1 - x) is the anti function of sigmoid, or the link function @@ -247,14 +308,14 @@ def get_baseline_prediction(self, y_train, prediction_dim): return np.log(proba_positive_class / (1 - proba_positive_class)) def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to @@ -274,9 +335,10 @@ class CategoricalCrossEntropy(BaseLoss): cross-entropy to more than 2 classes. 
""" - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) prediction_dim = raw_predictions.shape[0] for k in range(prediction_dim): @@ -284,22 +346,23 @@ def __call__(self, y_true, raw_predictions, average=True): loss = (logsumexp(raw_predictions, axis=0) - (one_hot_true * raw_predictions).sum(axis=0)) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): - proba_kth_class = np.mean(y_train == k) + proba_kth_class = np.average(y_train == k, + weights=sample_weight) proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) init_value[k, :] += np.log(proba_kth_class) return init_value def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # TODO: This could be done in parallel diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0850ac6638a80..c5b4a143591d6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split @@ -11,6 +11,8 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -301,7 +303,8 @@ def test_small_trainset(): gb = HistGradientBoostingClassifier() # Compute the small training set - X_small, y_small = gb._get_small_trainset(X, y, seed=42) + X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42, + sample_weight_train=None) # Compute the class distribution in the small training set unique, counts = np.unique(y_small, return_counts=True) @@ -435,6 +438,20 @@ def test_infinite_values(): np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) +def test_consistent_lengths(): + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array([.1, .3, .1]) + gbdt = HistGradientBoostingRegressor() + with pytest.raises(ValueError, + match=r"sample_weight.shape == \(3,\), expected"): + gbdt.fit(X, y, sample_weight) + + with pytest.raises(ValueError, + match="Found input variables with inconsistent number"): + gbdt.fit(X, y[1:]) + + def 
test_infinite_values_missing_values(): # High level test making sure that inf and nan values are properly handled # when both are present. This is similar to @@ -474,6 +491,148 @@ def test_string_target_early_stopping(scoring): gbrt.fit(X, y) +def test_zero_sample_weights_regression(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingRegressor(min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] > 0.5 + + +def test_zero_sample_weights_classification(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingClassifier(loss='binary_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1], + [1, 1]] + y = [0, 0, 1, 0, 2] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1, 1] + gb = HistGradientBoostingClassifier(loss='categorical_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@pytest.mark.parametrize('problem', ( + 'regression', + 'binary_classification', + 'multiclass_classification' +)) +@pytest.mark.parametrize('duplication', ('half', 'all')) +def test_sample_weight_effect(problem, duplication): + # High level test to make sure that duplicating a sample is equivalent to + # giving it weight of 2. + + # fails for n_samples > 255 because binning does not take sample weights + # into account. Keeping n_samples <= 255 makes + # sure only unique values are used so SW have no effect on binning. + n_samples = 255 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=0) + Klass = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == 'binary_classification' else 3 + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, random_state=0) + Klass = HistGradientBoostingClassifier + + # This test can't pass if min_samples_leaf > 1 because that would force 2 + # samples to be in the same node in est_sw, while these samples would be + # free to be separate in est_dup: est_dup would just group together the + # duplicated samples. 
+ est = Klass(min_samples_leaf=1) + + # Create dataset with duplicate and corresponding sample weights + if duplication == 'half': + lim = n_samples // 2 + else: + lim = n_samples + X_dup = np.r_[X, X[:lim]] + y_dup = np.r_[y, y[:lim]] + sample_weight = np.ones(shape=(n_samples)) + sample_weight[:lim] = 2 + + est_sw = clone(est).fit(X, y, sample_weight=sample_weight) + est_dup = clone(est).fit(X_dup, y_dup) + + # checking raw_predict is stricter than just predict for classification + assert np.allclose(est_sw._raw_predict(X_dup), + est_dup._raw_predict(X_dup)) + + +@pytest.mark.parametrize('loss_name', ('least_squares', + 'least_absolute_deviation')) +def test_sum_hessians_are_sample_weight(loss_name): + # For losses with constant hessians, the sum_hessians field of the + # histograms must be equal to the sum of the sample weight of samples at + # the corresponding bin. + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + X, y = make_regression(n_samples=n_samples, n_features=n_features, + random_state=rng) + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + sample_weight = rng.normal(size=n_samples) + + loss = _LOSSES[loss_name](sample_weight=sample_weight) + gradients, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight) + raw_predictions = rng.normal(size=(1, n_samples)) + loss.update_gradients_and_hessians(gradients, hessians, y, + raw_predictions, sample_weight) + + # build sum_sample_weight which contains the sum of the sample weights at + # each bin (for each feature). This must be equal to the sum_hessians + # field of the corresponding histogram + sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins)) + for feature_idx in range(n_features): + for sample_idx in range(n_samples): + sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += ( + sample_weight[sample_idx]) + + # Build histogram + grower = TreeGrower(X_binned, gradients[0], hessians[0], + n_bins=bin_mapper.n_bins) + histograms = grower.histogram_builder.compute_histograms_brute( + grower.root.sample_indices) + + for feature_idx in range(n_features): + for bin_idx in range(bin_mapper.n_bins): + assert histograms[feature_idx, bin_idx]['sum_hessians'] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5)) + + def test_max_depth_max_leaf_nodes(): # Non regression test for # https://github.com/scikit-learn/scikit-learn/issues/16179 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 8c300db993d3d..915dc300e4760 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -20,7 +20,7 @@ def get_gradients(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) return gradients def get_hessians(y_true, raw_predictions): @@ -28,7 +28,7 @@ def get_hessians(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) if loss.__class__.__name__ == 'LeastSquares': # hessians aren't updated because they're constant: @@ -62,13 +62,13 @@ def test_derivatives(loss, x0, y_true): # 
using Halley's method with the first and second order derivatives # computed by the Loss instance. - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=Y_DTYPE) x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) get_gradients, get_hessians = get_derivatives_helper(loss) def func(x): - return loss(y_true, x) + return loss.pointwise_loss(y_true, x) def fprime(x): return get_gradients(y_true, x) @@ -78,7 +78,7 @@ def fprime2(x): optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) assert np.allclose(loss.inverse_link_function(optimum), y_true) - assert np.allclose(loss(y_true, optimum), 0) + assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) assert np.allclose(get_gradients(y_true, optimum), 0) @@ -105,7 +105,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): raw_predictions = rng.normal( size=(prediction_dim, n_samples) ).astype(Y_DTYPE) - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) get_gradients, get_hessians = get_derivatives_helper(loss) # only take gradients and hessians of first tree / class. @@ -120,16 +120,16 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): eps = 1e-9 offset = np.zeros_like(raw_predictions) offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2) numerical_gradients = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset, average=False) - f = loss(y_true, raw_predictions, average=False) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset) + f = loss.pointwise_loss(y_true, raw_predictions) numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7) @@ -139,9 +139,9 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): def test_baseline_least_squares(): rng = np.random.RandomState(0) - loss = _LOSSES['least_squares']() + loss = _LOSSES['least_squares'](sample_weight=None) y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the mean of all targets @@ -153,9 +153,9 @@ def test_baseline_least_squares(): def test_baseline_least_absolute_deviation(): rng = np.random.RandomState(0) - loss = _LOSSES['least_absolute_deviation']() + loss = _LOSSES['least_absolute_deviation'](sample_weight=None) y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the median of all targets @@ -167,10 +167,10 @@ def test_baseline_least_absolute_deviation(): def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) - loss = 
_LOSSES['binary_crossentropy']() + loss = _LOSSES['binary_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert_all_finite(baseline_prediction) assert np.allclose(loss.inverse_link_function(baseline_prediction), y_train[0]) @@ -181,7 +181,7 @@ def test_baseline_binary_crossentropy(): # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) # So we want raw_prediction = link_function(p) = log(p / (1 - p)) y_train = rng.randint(0, 2, size=100).astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype p = y_train.mean() @@ -192,10 +192,10 @@ def test_baseline_categorical_crossentropy(): rng = np.random.RandomState(0) prediction_dim = 4 - loss = _LOSSES['categorical_crossentropy']() + loss = _LOSSES['categorical_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, + baseline_prediction = loss.get_baseline_prediction(y_train, None, prediction_dim) assert baseline_prediction.dtype == y_train.dtype assert_all_finite(baseline_prediction) @@ -203,8 +203,85 @@ def test_baseline_categorical_crossentropy(): # Same logic as for above test. Here inverse_link_function = softmax and # link_function = log y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) - baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) + baseline_prediction = loss.get_baseline_prediction(y_train, None, + prediction_dim) assert baseline_prediction.shape == (prediction_dim, 1) for k in range(prediction_dim): p = (y_train == k).mean() assert np.allclose(baseline_prediction[k, :], np.log(p)) + + +@pytest.mark.parametrize('loss, problem', [ + ('least_squares', 'regression'), + ('least_absolute_deviation', 'regression'), + ('binary_crossentropy', 'classification'), + ('categorical_crossentropy', 'classification') + ]) +@pytest.mark.parametrize('sample_weight', ['ones', 'random']) +def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): + # Make sure that passing sample weights to the gradient and hessians + # computation methods is equivalent to multiplying by the weights. 
+ + rng = np.random.RandomState(42) + n_samples = 1000 + + if loss == 'categorical_crossentropy': + n_classes = prediction_dim = 3 + else: + n_classes = prediction_dim = 1 + + if problem == 'regression': + y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + else: + y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) + + if sample_weight == 'ones': + sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE) + else: + sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) + + loss_ = _LOSSES[loss](sample_weight=sample_weight) + + baseline_prediction = loss_.get_baseline_prediction( + y_true, None, prediction_dim + ) + raw_predictions = np.zeros(shape=(prediction_dim, n_samples), + dtype=baseline_prediction.dtype) + raw_predictions += baseline_prediction + + gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients, hessians, y_true, + raw_predictions, None) + + gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true, + raw_predictions, sample_weight) + + assert np.allclose(gradients * sample_weight, gradients_sw) + assert np.allclose(hessians * sample_weight, hessians_sw) + + +def test_init_gradient_and_hessians_sample_weight(): + # Make sure that passing sample_weight to a loss correctly influences the + # hessians_are_constant attribute, and consequently the shape of the + # hessians array. + + prediction_dim = 2 + n_samples = 5 + sample_weight = None + loss = _LOSSES['least_squares'](sample_weight=sample_weight) + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=None) + assert loss.hessians_are_constant + assert hessians.shape == (1, 1) + + sample_weight = np.ones(n_samples) + loss = _LOSSES['least_squares'](sample_weight=sample_weight) + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=sample_weight) + assert not loss.hessians_are_constant + assert hessians.shape == (prediction_dim, n_samples) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 7f9c37dfca642..530a53b83dce4 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -520,6 +520,16 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 +def test_hist_gbdt_sw_not_supported(): + # TODO: remove/fix when PDP supports HGBT with sample weights + clf = HistGradientBoostingRegressor(random_state=1) + clf.fit(X, y, sample_weight=np.ones(len(X))) + + with pytest.raises(NotImplementedError, + match="does not support partial dependence"): + partial_dependence(clf, X, features=[1]) + + # TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates @ignore_warnings(category=FutureWarning) def test_partial_dependence_pipeline():
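
Putting the pieces together, the behaviour exercised by `test_sample_weight_effect` can also be checked at the estimator level: weighting a sample by 2 should be equivalent to duplicating it, as long as binning is unaffected. A sketch under those assumptions (keeping `n_samples <= 255` so every unique value gets its own bin, as in the test):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Keep n_samples <= 255 so that the (unweighted) binning stage cannot
    # break the equivalence between weighting and duplication.
    X, y = make_regression(n_samples=255, n_features=2, n_informative=2,
                           random_state=0)

    # Weighting the first half of the samples by 2 ...
    sample_weight = np.ones(X.shape[0])
    sample_weight[:127] = 2
    est_sw = HistGradientBoostingRegressor(min_samples_leaf=1)
    est_sw.fit(X, y, sample_weight=sample_weight)

    # ... versus duplicating those samples outright.
    est_dup = HistGradientBoostingRegressor(min_samples_leaf=1)
    est_dup.fit(np.r_[X, X[:127]], np.r_[y, y[:127]])

    assert np.allclose(est_sw.predict(X), est_dup.predict(X))
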