sample with replacement before binning · adrinjalali/scikit-learn@76dc710

Commit 76dc710

sample with replacement before binning
1 parent b759142 commit 76dc710

3 files changed: +29 −12 lines changed

sklearn/ensemble/_hist_gradient_boosting/binning.py

Lines changed: 18 additions & 7 deletions
@@ -16,7 +16,8 @@
 from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
 
 
-def _find_binning_thresholds(data, max_bins, subsample, random_state):
+def _find_binning_thresholds(data, sample_weight, max_bins, subsample,
+                             random_state):
     """Extract feature-wise quantiles from numerical data.
 
     Missing values are ignored for finding the thresholds.
@@ -25,6 +26,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
     ----------
     data : array-like, shape (n_samples, n_features)
         The data to bin.
+    sample_weight : ndarray of shape (n_samples,), or None
+        Sample weights associated with the data.
     max_bins: int
         The maximum number of bins to use for non-missing values. If for a
         given feature the number of unique values is less than ``max_bins``,
@@ -46,9 +49,15 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
         n_features``.
     """
     rng = check_random_state(random_state)
-    if subsample is not None and data.shape[0] > subsample:
-        subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False)
-        data = data.take(subset, axis=0)
+    sample_size = min(subsample, data.shape[0])
+    if sample_weight is not None:
+        subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
+                            replace=True,
+                            p=sample_weight / sample_weight.sum())
+    else:
+        subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
+                            replace=True)
+    data = data.take(subset, axis=0)
 
     binning_thresholds = []
     for f_idx in range(data.shape[1]):
@@ -136,7 +145,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
         self.subsample = subsample
         self.random_state = random_state
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, sample_weight=None):
         """Fit data X by computing the binning thresholds.
 
         The last bin is reserved for missing values, whether missing values
@@ -146,8 +155,10 @@ def fit(self, X, y=None):
         ----------
         X : array-like, shape (n_samples, n_features)
             The data to bin.
-        y: None
+        y : None
             Ignored.
+        sample_weight : ndarray of shape (n_samples,), or None
+            Sample weights associated with the data.
 
         Returns
         -------
@@ -161,7 +172,7 @@ def fit(self, X, y=None):
         X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
         max_bins = self.n_bins - 1
         self.bin_thresholds_ = _find_binning_thresholds(
-            X, max_bins, subsample=self.subsample,
+            X, sample_weight, max_bins, subsample=self.subsample,
             random_state=self.random_state)
 
         self.n_bins_non_missing_ = np.array(
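
The substantive change is the subsampling block: where the old code drew at most subsample rows without replacement (and skipped sampling entirely for small inputs), the new code always resamples sample_size rows with replacement, biased by sample_weight when it is given, so that the plain percentile computation further down approximates weighted quantiles. A minimal standalone sketch of that logic; the toy data, sample_weight, and subsample values below are illustrative, not from the commit:

import numpy as np

# Sketch of the new subsampling step in _find_binning_thresholds.
rng = np.random.RandomState(42)
data = rng.normal(size=(1000, 3))
sample_weight = rng.uniform(size=1000)
subsample = 200

sample_size = min(subsample, data.shape[0])
if sample_weight is not None:
    # Heavier rows are drawn proportionally more often, so unweighted
    # percentiles of the resample approximate weighted percentiles.
    subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
                        replace=True,
                        p=sample_weight / sample_weight.sum())
else:
    subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
                        replace=True)
data = data.take(subset, axis=0)
print(data.shape)  # (200, 3)

Two details worth noting, both visible in the diff: the unweighted branch also switches to replace=True, and the new code assumes subsample is an integer, since min(None, n) would raise a TypeError where the old guard tolerated subsample=None.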

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 10 additions & 4 deletions
@@ -184,9 +184,13 @@ def fit(self, X, y, sample_weight=None):
         n_bins = self.max_bins + 1  # + 1 for missing values
         self.bin_mapper_ = _BinMapper(n_bins=n_bins,
                                       random_state=self._random_seed)
-        X_binned_train = self._bin_data(X_train, is_training_data=True)
+        X_binned_train = self._bin_data(X_train,
+                                        sample_weight=sample_weight_train,
+                                        is_training_data=True)
         if X_val is not None:
-            X_binned_val = self._bin_data(X_val, is_training_data=False)
+            X_binned_val = self._bin_data(X_val,
+                                          sample_weight=sample_weight_val,
+                                          is_training_data=False)
         else:
             X_binned_val = None
 
@@ -554,7 +558,7 @@ def _should_stop(self, scores):
                                for score in recent_scores]
         return not any(recent_improvements)
 
-    def _bin_data(self, X, is_training_data):
+    def _bin_data(self, X, sample_weight, is_training_data):
         """Bin data X.
 
         If is_training_data, then set the bin_mapper_ attribute.
@@ -567,7 +571,9 @@ def _bin_data(self, X, is_training_data):
                X.nbytes / 1e9, description), end="", flush=True)
         tic = time()
         if is_training_data:
-            X_binned = self.bin_mapper_.fit_transform(X)  # F-aligned array
+            # F-aligned array
+            X_binned = self.bin_mapper_.fit_transform(
+                X, sample_weight=sample_weight)
         else:
             X_binned = self.bin_mapper_.transform(X)  # F-aligned array
         # We convert the array to C-contiguous since predicting is faster
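
The weights now travel fit -> _bin_data -> _BinMapper.fit_transform, so the training weights influence the bin thresholds themselves; the validation split still goes through transform(), which reuses the fitted thresholds, so sample_weight_val is accepted but does not affect the mapping. A hedged usage sketch against the private mapper (private API, arbitrary shapes and seed):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

# On this branch, _BinMapper.fit takes sample_weight, and the inherited
# fit_transform forwards it, which is the same call _bin_data makes above.
rng = np.random.RandomState(0)
X = rng.normal(size=(500, 2))          # float64, matching X_DTYPE
sample_weight = rng.uniform(size=500)

mapper = _BinMapper(n_bins=256, random_state=0)
X_binned = mapper.fit_transform(X, sample_weight=sample_weight)
print(X_binned.shape, X_binned.dtype)  # (500, 2) uint8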

sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
                              random_state=None):
     # Just a redef to avoid having to pass arguments all the time (as the
     # function is private we don't use default values for parameters)
-    return _find_binning_thresholds_orig(data, max_bins, subsample,
+    return _find_binning_thresholds_orig(data, None, max_bins, subsample,
                                          random_state)
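
With the test wrapper updated, one informal way to exercise the new weighted path (a sketch, not a test from this commit) is to check that integer weights behave roughly like row duplication, since the thresholds are percentiles of a weight-proportional resample:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import (
    _find_binning_thresholds as _find_binning_thresholds_orig)

# Hypothetical sanity check: weighting a row by an integer w should act
# roughly like repeating it w times. The comparison is statistical, so we
# only look at the largest deviation rather than asserting equality.
rng = np.random.RandomState(0)
data = rng.normal(size=(2000, 1))
weights = rng.randint(1, 4, size=2000).astype(np.float64)
repeated = np.repeat(data, weights.astype(int), axis=0)

weighted = _find_binning_thresholds_orig(
    data, weights, max_bins=255, subsample=int(2e5), random_state=0)
duplicated = _find_binning_thresholds_orig(
    repeated, None, max_bins=255, subsample=int(2e5), random_state=0)
print(np.abs(weighted[0] - duplicated[0]).max())  # expected to be small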
