From 376c477fd4deea26c11960cbd6494decafbc7ad9 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2019 11:46:26 +0200 Subject: [PATCH 01/55] check consistent lengths --- .../_hist_gradient_boosting/gradient_boosting.py | 13 +++++++++++-- .../tests/test_gradient_boosting.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4c40f662d0656..bb65ff400fcac 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -8,7 +8,8 @@ from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) from ...utils import check_X_y, check_random_state, check_array, resample -from ...utils.validation import check_is_fitted +from ...utils.validation import (check_is_fitted, column_or_1d, + check_consistent_length) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -75,7 +76,7 @@ def _validate_parameters(self): raise ValueError('tol={} ' 'must not be smaller than 0.'.format(self.tol)) - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. Parameters @@ -86,6 +87,9 @@ def fit(self, X, y): y : array-like, shape=(n_samples,) Target values. + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + Returns ------- self : object @@ -98,6 +102,11 @@ def fit(self, X, y): acc_prediction_time = 0. X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) y = self._encode_y(y) + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + check_consistent_length(X, y, sample_weight) + else: + check_consistent_length(X, y) # The rng state must be preserved if warm_start is True if (self.warm_start and hasattr(self, '_rng')): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index be7e424a844bc..5390effba7117 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -230,3 +230,17 @@ def test_infinite_values(): gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +def test_consistent_lengths(): + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array([.1, .3, .1]) + gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) + with pytest.raises(ValueError, + match="Found input variables with inconsistent numbers"): + gbdt.fit(X, y, sample_weight) + + with pytest.raises(ValueError, + match="Found input variables with inconsistent numbers"): + gbdt.fit(X, y[1:]) From f9e0a1bfb826ded54b19282ca1e441c0743160fc Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Aug 2019 18:10:13 +0200 Subject: [PATCH 02/55] changes to loss and gradient_boosting.py --- .../gradient_boosting.py | 77 +++++++++++++------ .../ensemble/_hist_gradient_boosting/loss.py | 20 +++-- 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index bb65ff400fcac..c195e57fac005 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -145,12 +145,20 @@ def fit(self, X, y, sample_weight=None): if not (self._is_fitted() and self.warm_start): self._train_val_split_seed = rng.randint(1024) - X_train, X_val, y_train, y_val = train_test_split( - X, y, test_size=self.validation_fraction, stratify=stratify, - random_state=self._train_val_split_seed) + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, y, test_size=self.validation_fraction, + stratify=stratify, + random_state=self._train_val_split_seed) + else: + (X_train, X_val, y_train, y_val, sample_weight_train, + sample_weight_val) = train_test_split( + X, y, sample_weight, test_size=self.validation_fraction, + stratify=stratify, + random_state=self._train_val_split_seed) else: - X_train, y_train = X, y - X_val, y_val = None, None + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None # Bin the data self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) @@ -176,7 +184,7 @@ def fit(self, X, y, sample_weight=None): # n_trees_per_iterations is n_classes in multiclass classification, # else 1. self._baseline_prediction = self.loss_.get_baseline_prediction( - y_train, self.n_trees_per_iteration_ + y_train, sample_weight_train, self.n_trees_per_iteration_ ) raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), @@ -225,7 +233,9 @@ def fit(self, X, y, sample_weight=None): raw_predictions_val += self._baseline_prediction self._check_early_stopping_loss(raw_predictions, y_train, - raw_predictions_val, y_val) + sample_weight_train, + raw_predictions_val, y_val, + sample_weight_val) else: self.scorer_ = check_scoring(self, self.scoring) # scorer_ is a callable with signature (est, X, y) and @@ -240,12 +250,15 @@ def fit(self, X, y, sample_weight=None): # Compute the subsample set (X_binned_small_train, - y_small_train) = self._get_small_trainset( - X_binned_train, y_train, self._small_trainset_seed) + y_small_train, + sample_weight_small_train) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, + self._small_trainset_seed) self._check_early_stopping_scorer( X_binned_small_train, y_small_train, - X_binned_val, y_val, + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val, ) begin_at_stage = 0 @@ -269,8 +282,11 @@ def fit(self, X, y, sample_weight=None): if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set - X_binned_small_train, y_small_train = self._get_small_trainset( - X_binned_train, y_train, self._small_trainset_seed) + (X_binned_small_train, + y_small_train, + sample_weight_small_train) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, + self._small_trainset_seed) # Initialize the gradients and hessians gradients, hessians = self.loss_.init_gradients_and_hessians( @@ -395,6 +411,7 @@ def _get_small_trainset(self, X_binned_train, y_train, seed): For efficiency, we need to subsample the training set to compute scores with scorers. 
""" + # TODO: incorporate sample_weights here in `resample` subsample_size = 10000 if X_binned_train.shape[0] > subsample_size: indices = np.arange(X_binned_train.shape[0]) @@ -410,19 +427,33 @@ def _get_small_trainset(self, X_binned_train, y_train, seed): return X_binned_train, y_train def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, - X_binned_val, y_val): + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val): """Check if fitting should be early-stopped based on scorer. Scores are computed on validation data or on training data. """ - self.train_score_.append( - self.scorer_(self, X_binned_small_train, y_small_train) - ) + # TODO: handle when _scorer doesn't accept sample_weight, but + # sample_weight is provided + if sample_weight_small_train is None: + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train) + ) + else: + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train, + sample_weight_small_train) + ) if self._use_validation_data: - self.validation_score_.append( - self.scorer_(self, X_binned_val, y_val) - ) + if sample_weight_val is None: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val) + ) + else: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val, sample_weight_val) + ) return self._should_stop(self.validation_score_) else: return self._should_stop(self.train_score_) @@ -430,20 +461,22 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, def _check_early_stopping_loss(self, raw_predictions, y_train, + sample_weight_train, raw_predictions_val, - y_val): + y_val, + sample_weight_val): """Check if fitting should be early-stopped based on loss. Scores are computed on validation data or on training data. """ self.train_score_.append( - -self.loss_(y_train, raw_predictions) + -self.loss_(y_train, raw_predictions, sample_weight_train) ) if self._use_validation_data: self.validation_score_.append( - -self.loss_(y_val, raw_predictions_val) + -self.loss_(y_val, raw_predictions_val, sample_weight_val) ) return self._should_stop(self.validation_score_) else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 5d7c68ea0b38f..691bfcecc7da7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -111,22 +111,32 @@ class LeastSquares(BaseLoss): loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 - This actually computes the half least squares loss to optimize simplify + This actually computes the half least squares loss to simplify the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). """ hessians_are_constant = True - def __call__(self, y_true, raw_predictions, average=True): + def __call__(self, y_true, raw_predictions, sample_weight, + average=True): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. 
raw_predictions = raw_predictions.reshape(-1) loss = 0.5 * np.power(y_true - raw_predictions, 2) - return loss.mean() if average else loss + if sample_weight is not None: + loss = sample_weight * loss + + if average: + if sample_weight is None: + return loss.mean() + else: + return loss.sum() / sample_weight.sum + else: + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + return np.average(y_train, sample_weight) @staticmethod def inverse_link_function(raw_predictions): From 379366169d3950fa4ef74c65a27f1a00414bacb7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 21 Aug 2019 15:55:56 +0200 Subject: [PATCH 03/55] pep8 --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5390effba7117..96972585f74f4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -238,9 +238,9 @@ def test_consistent_lengths(): sample_weight = np.array([.1, .3, .1]) gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) with pytest.raises(ValueError, - match="Found input variables with inconsistent numbers"): + match="Found input variables with inconsistent number"): gbdt.fit(X, y, sample_weight) with pytest.raises(ValueError, - match="Found input variables with inconsistent numbers"): + match="Found input variables with inconsistent number"): gbdt.fit(X, y[1:]) From 4e686d43461193ea567586bfcde95fe27e54615a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 30 Aug 2019 12:53:26 +0200 Subject: [PATCH 04/55] revert loss.py to not take into account sample weights, the caller handles it for now --- .../_hist_gradient_boosting/binning.py | 1 + .../gradient_boosting.py | 4 +++- .../ensemble/_hist_gradient_boosting/loss.py | 19 ++++++------------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a6c779ca0a97b..60ff19662634c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -47,6 +47,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): """ rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: + # TODO: depends on the weights and stratify maybe subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6150080373c6c..43544c25d2947 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -155,6 +155,7 @@ def fit(self, X, y, sample_weight=None): X, y, test_size=self.validation_fraction, stratify=stratify, random_state=self._train_val_split_seed) + sample_weight_train = sample_weight_val = None else: (X_train, X_val, y_train, y_val, sample_weight_train, sample_weight_val) = train_test_split( @@ -425,7 +426,8 @@ def _clear_state(self): if hasattr(self, var): delattr(self, var) - def _get_small_trainset(self, 
X_binned_train, y_train, seed): + def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, + seed): """Compute the indices of the subsample set and return this set. For efficiency, we need to subsample the training set to compute scores diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 45d119cb38412..b5ea27e74f397 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -118,25 +118,18 @@ class LeastSquares(BaseLoss): hessians_are_constant = True - def __call__(self, y_true, raw_predictions, sample_weight, - average=True): + def __call__(self, y_true, raw_predictions, average=True): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = 0.5 * np.power(y_true - raw_predictions, 2) - if sample_weight is not None: - loss = sample_weight * loss - - if average: - if sample_weight is None: - return loss.mean() - else: - return loss.sum() / sample_weight.sum - else: - return loss + return loss.mean() if average else loss def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - return np.average(y_train, sample_weight) + if sample_weight is None: + return np.mean(y_train) + else: + return np.average(y_train, sample_weight) @staticmethod def inverse_link_function(raw_predictions): From a9c30d54662e5e90fa1c866c3055a7527bb4cd47 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 30 Aug 2019 14:04:23 +0200 Subject: [PATCH 05/55] gb tests pass, loss has a different average method --- .../gradient_boosting.py | 20 ++++++++++++------- .../ensemble/_hist_gradient_boosting/loss.py | 19 ++++++++++++++---- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5a71b1ca4f306..88180933b8322 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -374,14 +374,15 @@ def fit(self, X, y, sample_weight=None): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, y_train, - raw_predictions_val, y_val + raw_predictions, y_train, sample_weight_train, + raw_predictions_val, y_val, sample_weight_val ) else: should_early_stop = self._check_early_stopping_scorer( X_binned_small_train, y_small_train, - X_binned_val, y_val, + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val ) if self.verbose: @@ -443,10 +444,12 @@ def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, stratify=stratify) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] + sample_weight_small_train = sample_weight_train[indices] X_binned_small_train = np.ascontiguousarray(X_binned_small_train) - return X_binned_small_train, y_small_train + return (X_binned_small_train, y_small_train, + sample_weight_small_train) else: - return X_binned_train, y_train + return X_binned_train, y_train, sample_weight_train def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, sample_weight_small_train, @@ -460,6 +463,7 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, # TODO: handle when _scorer doesn't accept sample_weight, but # sample_weight is provided + import sys if sample_weight_small_train is None: self.train_score_.append( self.scorer_(self, 
X_binned_small_train, y_small_train) @@ -498,12 +502,14 @@ def _check_early_stopping_loss(self, """ self.train_score_.append( - -self.loss_(y_train, raw_predictions, sample_weight_train) + -self.loss_.get_average_loss(y_train, raw_predictions, + sample_weight_train) ) if self._use_validation_data: self.validation_score_.append( - -self.loss_(y_val, raw_predictions_val, sample_weight_val) + -self.loss_.get_average_loss(y_val, raw_predictions_val, + sample_weight_val) ) return self._should_stop(self.validation_score_) else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index b5ea27e74f397..1d41f04d90d0e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -63,13 +63,17 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): return gradients, hessians @abstractmethod - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): """Return initial predictions (before the first iteration). Parameters ---------- y_train : ndarray, shape (n_samples,) The target training values. + + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + prediction_dim : int The dimension of one prediction: 1 for binary classification and regression, n_classes for multiclass classification. @@ -118,12 +122,19 @@ class LeastSquares(BaseLoss): hessians_are_constant = True - def __call__(self, y_true, raw_predictions, average=True): + def __call__(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = 0.5 * np.power(y_true - raw_predictions, 2) - return loss.mean() if average else loss + return loss + + def get_average_loss(self, y_true, raw_predictions, sample_weight): + if sample_weight is None: + return self(y_true, raw_predictions).mean() + else: + return np.average(self(y_true, raw_predictions), + weights=sample_weight) def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if sample_weight is None: @@ -167,7 +178,7 @@ def __call__(self, y_true, raw_predictions, average=True): loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions return loss.mean() if average else loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if prediction_dim > 2: raise ValueError( "loss='binary_crossentropy' is not defined for multiclass" From f09beb67a7bcb17bfd3f2756d92f9ff0a8aa8335 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 30 Aug 2019 19:27:40 +0200 Subject: [PATCH 06/55] loss handles sample weight --- .../gradient_boosting.py | 8 +++- .../ensemble/_hist_gradient_boosting/loss.py | 46 ++++++++++++++----- .../tests/test_gradient_boosting.py | 3 +- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 88180933b8322..8e3b689606ed1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -324,7 +324,8 @@ def fit(self, X, y, sample_weight=None): # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, - y_train, raw_predictions) + y_train, raw_predictions, + 
sample_weight_train) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -444,7 +445,10 @@ def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, stratify=stratify) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] - sample_weight_small_train = sample_weight_train[indices] + if sample_weight_train is not None: + sample_weight_small_train = sample_weight_train[indices] + else: + sample_weight_small_train = None X_binned_small_train = np.ascontiguousarray(X_binned_small_train) return (X_binned_small_train, y_small_train, sample_weight_small_train) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 1d41f04d90d0e..849824124b1ac 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -25,6 +25,20 @@ class BaseLoss(ABC): """Base class for a loss.""" + @abstractmethod + def __call__(self, y_true, raw_predictions): + """Return the loss for each value + """ + + def get_average_loss(self, y_true, raw_predictions, sample_weight): + """Return the average loss, weighted if sample_weight is not None + """ + if sample_weight is None: + return self(y_true, raw_predictions).mean() + else: + return np.average(self(y_true, raw_predictions), + weights=sample_weight) + def init_gradients_and_hessians(self, n_samples, prediction_dim): """Return initial gradients and hessians. @@ -86,7 +100,7 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): @abstractmethod def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): """Update gradients and hessians arrays, inplace. The gradients (resp. hessians) are the first (resp. second) order @@ -97,14 +111,20 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, ---------- gradients : ndarray, shape (prediction_dim, n_samples) The gradients (treated as OUT array). + hessians : ndarray, shape (prediction_dim, n_samples) or \ (1,) The hessians (treated as OUT array). + y_true : ndarray, shape (n_samples,) The true target values or each training sample. + raw_predictions : ndarray, shape (prediction_dim, n_samples) The raw_predictions (i.e. values from the trees) of the tree ensemble at iteration ``i - 1``. + + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. """ @@ -129,13 +149,6 @@ def __call__(self, y_true, raw_predictions): loss = 0.5 * np.power(y_true - raw_predictions, 2) return loss - def get_average_loss(self, y_true, raw_predictions, sample_weight): - if sample_weight is None: - return self(y_true, raw_predictions).mean() - else: - return np.average(self(y_true, raw_predictions), - weights=sample_weight) - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if sample_weight is None: return np.mean(y_train) @@ -147,12 +160,15 @@ def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. 
raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) _update_gradients_least_squares(gradients, y_true, raw_predictions) + if sample_weight is not None: + np.multiply(gradients, sample_weight, out=gradients) + np.multiply(hessians, sample_weight, out=hessians) class BinaryCrossEntropy(BaseLoss): @@ -192,7 +208,7 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): return np.log(proba_positive_class / (1 - proba_positive_class)) def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) @@ -200,6 +216,9 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, hessians = hessians.reshape(-1) _update_gradients_hessians_binary_crossentropy( gradients, hessians, y_true, raw_predictions) + if sample_weight is not None: + np.multiply(gradients, sample_weight, out=gradients) + np.multiply(hessians, sample_weight, out=hessians) def predict_proba(self, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to @@ -231,7 +250,7 @@ def __call__(self, y_true, raw_predictions, average=True): (one_hot_true * raw_predictions).sum(axis=0)) return loss.mean() if average else loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): @@ -242,9 +261,12 @@ def get_baseline_prediction(self, y_train, prediction_dim): return init_value def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): _update_gradients_hessians_categorical_crossentropy( gradients, hessians, y_true, raw_predictions) + if sample_weight is not None: + np.multiply(gradients, sample_weight, out=gradients) + np.multiply(hessians, sample_weight, out=hessians) def predict_proba(self, raw_predictions): # TODO: This could be done in parallel diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 6ae53e33e83e6..0d183b22ccf7d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -266,7 +266,8 @@ def test_small_trainset(): gb = HistGradientBoostingClassifier() # Compute the small training set - X_small, y_small = gb._get_small_trainset(X, y, seed=42) + X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42, + sample_weight_train=None) # Compute the class distribution in the small training set unique, counts = np.unique(y_small, return_counts=True) From c922d30c7552aaa917173fd0a7f2cdf295ee985d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 30 Aug 2019 19:46:32 +0200 Subject: [PATCH 07/55] more fixes for tests --- .../ensemble/_hist_gradient_boosting/loss.py | 8 +++--- .../tests/test_loss.py | 25 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 849824124b1ac..fcb89d98e93d9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -186,13 +186,13 @@ class 
BinaryCrossEntropy(BaseLoss): hessians_are_constant = False inverse_link_function = staticmethod(expit) - def __call__(self, y_true, raw_predictions, average=True): + def __call__(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) # logaddexp(0, x) = log(1 + exp(x)) loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss.mean() if average else loss + return loss def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if prediction_dim > 2: @@ -240,7 +240,7 @@ class CategoricalCrossEntropy(BaseLoss): hessians_are_constant = False - def __call__(self, y_true, raw_predictions, average=True): + def __call__(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) prediction_dim = raw_predictions.shape[0] for k in range(prediction_dim): @@ -248,7 +248,7 @@ def __call__(self, y_true, raw_predictions, average=True): loss = (logsumexp(raw_predictions, axis=0) - (one_hot_true * raw_predictions).sum(axis=0)) - return loss.mean() if average else loss + return loss def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index b49acc52b6e40..d7c0c88d0ce20 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -20,7 +20,7 @@ def get_gradients(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) return gradients def get_hessians(y_true, raw_predictions): @@ -28,7 +28,7 @@ def get_hessians(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) if loss.__class__.__name__ == 'LeastSquares': # hessians aren't updated because they're constant: @@ -116,16 +116,16 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): eps = 1e-9 offset = np.zeros_like(raw_predictions) offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) + f_plus_eps = loss(y_true, raw_predictions + offset / 2) + f_minus_eps = loss(y_true, raw_predictions - offset / 2) numerical_gradients = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset, average=False) - f = loss(y_true, raw_predictions, average=False) + f_plus_eps = loss(y_true, raw_predictions + offset) + f_minus_eps = loss(y_true, raw_predictions - offset) + f = loss(y_true, raw_predictions) numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 def relative_error(a, b): @@ -140,7 +140,7 @@ def test_baseline_least_squares(): loss = _LOSSES['least_squares']() y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = 
loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the mean of all targets @@ -153,7 +153,7 @@ def test_baseline_binary_crossentropy(): loss = _LOSSES['binary_crossentropy']() for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert_all_finite(baseline_prediction) assert np.allclose(loss.inverse_link_function(baseline_prediction), y_train[0]) @@ -164,7 +164,7 @@ def test_baseline_binary_crossentropy(): # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) # So we want raw_prediction = link_function(p) = log(p / (1 - p)) y_train = rng.randint(0, 2, size=100).astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype p = y_train.mean() @@ -178,7 +178,7 @@ def test_baseline_categorical_crossentropy(): loss = _LOSSES['categorical_crossentropy']() for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, + baseline_prediction = loss.get_baseline_prediction(y_train, None, prediction_dim) assert baseline_prediction.dtype == y_train.dtype assert_all_finite(baseline_prediction) @@ -186,7 +186,8 @@ def test_baseline_categorical_crossentropy(): # Same logic as for above test. Here inverse_link_function = softmax and # link_function = log y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) - baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) + baseline_prediction = loss.get_baseline_prediction(y_train, None, + prediction_dim) assert baseline_prediction.shape == (prediction_dim, 1) for k in range(prediction_dim): p = (y_train == k).mean() From 39282a0e099f5b3a1b453934f9e87dff9e15964f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 30 Aug 2019 19:49:08 +0200 Subject: [PATCH 08/55] pep8 --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 8e3b689606ed1..6f7b995695e12 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -467,7 +467,6 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, # TODO: handle when _scorer doesn't accept sample_weight, but # sample_weight is provided - import sys if sample_weight_small_train is None: self.train_score_.append( self.scorer_(self, X_binned_small_train, y_small_train) From 934323e6d4b8e5d6f89ef3caee835303c3d37110 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 31 Aug 2019 13:24:11 +0200 Subject: [PATCH 09/55] fix constant hessian and sample weight --- .../gradient_boosting.py | 3 ++- .../ensemble/_hist_gradient_boosting/loss.py | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6f7b995695e12..cc872ef7db745 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -212,7 +212,8 @@ def fit(self, X, y, sample_weight=None): # shape = (n_trees_per_iteration, n_samples). gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ + prediction_dim=self.n_trees_per_iteration_, + sample_weight=sample_weight ) # predictors is a matrix (list of lists) of TreePredictor objects diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index fcb89d98e93d9..0e2045ebc61c4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -39,7 +39,8 @@ def get_average_loss(self, y_true, raw_predictions, sample_weight): return np.average(self(y_true, raw_predictions), weights=sample_weight) - def init_gradients_and_hessians(self, n_samples, prediction_dim): + def init_gradients_and_hessians(self, n_samples, prediction_dim, + sample_weight): """Return initial gradients and hessians. Unless hessians are constant, arrays are initialized with undefined @@ -49,12 +50,16 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): ---------- n_samples : int The number of samples passed to `fit()`. + prediction_dim : int The dimension of a raw prediction, i.e. the number of trees built at each iteration. Equals 1 for regression and binary classification, or K where K is the number of classes for multiclass classification. + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + Returns ------- gradients : ndarray, shape (prediction_dim, n_samples) @@ -66,10 +71,14 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): """ shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + if sample_weight is not None: + self.hessians_are_constant = False if self.hessians_are_constant: # if the hessians are constant, we consider they are equal to 1. # this is correct as long as we adjust the gradients. See e.g. LS - # loss + # loss. If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessains are + # equal to sample weights. 
hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) else: hessians = np.empty(shape=shape, dtype=G_H_DTYPE) @@ -153,7 +162,7 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if sample_weight is None: return np.mean(y_train) else: - return np.average(y_train, sample_weight) + return np.average(y_train, weights=sample_weight) @staticmethod def inverse_link_function(raw_predictions): @@ -168,7 +177,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, _update_gradients_least_squares(gradients, y_true, raw_predictions) if sample_weight is not None: np.multiply(gradients, sample_weight, out=gradients) - np.multiply(hessians, sample_weight, out=hessians) + hessians[:] = sample_weight class BinaryCrossEntropy(BaseLoss): From 54d3c2760bc1f22b359b98306229ece6af2e72c9 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 31 Aug 2019 21:03:56 +0200 Subject: [PATCH 10/55] fix classification losses, and test --- .../ensemble/_hist_gradient_boosting/loss.py | 11 +++++-- .../tests/test_gradient_boosting.py | 30 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 0e2045ebc61c4..66223df683b6b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -209,7 +209,10 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): "loss='binary_crossentropy' is not defined for multiclass" " classification with n_classes=%d, use" " loss='categorical_crossentropy' instead" % prediction_dim) - proba_positive_class = np.mean(y_train) + if sample_weight is None: + proba_positive_class = np.mean(y_train) + else: + proba_positive_class = np.average(y_train, weights=sample_weight) eps = np.finfo(y_train.dtype).eps proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) # log(x / 1 - x) is the anti function of sigmoid, or the link function @@ -263,7 +266,11 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): - proba_kth_class = np.mean(y_train == k) + if sample_weight is None: + proba_kth_class = np.mean(y_train == k) + else: + proba_kth_class = np.average(y_train == k, + weights=sample_weight) proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) init_value[k, :] += np.log(proba_kth_class) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0d183b22ccf7d..9cc180bdaba78 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split @@ -441,3 +441,31 @@ def test_string_target_early_stopping(scoring): y = np.array(['x'] * 50 + ['y'] * 50, dtype=object) gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring) gbrt.fit(X, y) + + +def test_non_uniform_weights_toy_edge_case_reg(): + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 
1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingRegressor(min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] > 0.5 + + +@pytest.mark.parametrize("loss", ['binary_crossentropy', + 'categorical_crossentropy']) +def test_non_uniform_weights_toy_edge_case_clf(loss): + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingClassifier(loss=loss, min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) From df2891931891a457ddf1a9581a8878296b775806 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 31 Aug 2019 21:10:56 +0200 Subject: [PATCH 11/55] fix the test --- .../tests/test_gradient_boosting.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 9cc180bdaba78..5f33a4757f075 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -466,6 +466,20 @@ def test_non_uniform_weights_toy_edge_case_clf(loss): y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] - gb = HistGradientBoostingClassifier(loss=loss, min_samples_leaf=1) + gb = HistGradientBoostingClassifier(loss='binary_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1], + [1, 1]] + y = [0, 0, 1, 0, 2] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1, 1] + gb = HistGradientBoostingClassifier(loss='categorical_crossentropy', + min_samples_leaf=1) gb.fit(X, y, sample_weight=sample_weight) assert_array_equal(gb.predict([[1, 0]]), [1]) From 56b385a6e71bc7dc77f47fd017af1ff3579c9de3 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 1 Sep 2019 12:44:31 +0200 Subject: [PATCH 12/55] minor fix --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index cc872ef7db745..02559bf7c5022 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -308,6 +308,7 @@ def fit(self, X, y, sample_weight=None): # Initialize the gradients and hessians gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, + sample_weight=sample_weight_train, prediction_dim=self.n_trees_per_iteration_ ) From 66f7ad39547d2375097c181ed824268c70a6290b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 3 Sep 2019 14:46:17 +0200 Subject: [PATCH 13/55] address comments, move sample_weight to cython --- .../_hist_gradient_boosting/_loss.pyx | 50 +++++++++++++---- .../gradient_boosting.py | 12 ++--- .../ensemble/_hist_gradient_boosting/loss.py | 53 +++++++------------ .../tests/test_gradient_boosting.py | 2 +- 4 files changed, 67 insertions(+), 50 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx 
b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index ff17654840005..1b7456afd3cd2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -27,39 +27,68 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - # Note: a more correct exp is 2 * (raw_predictions - y_true) but - # since we use 1 for the constant hessian value (and not 2) this + # Note: a more correct exp is 2 * (raw_predictions - y_true) + # but since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. gradients[i] = raw_predictions[i] - y_true[i] +def _update_gradients_hessians_least_squares( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) * sample_weight + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i] + hessians[i] = sample_weight[i] + + def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions): # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN cdef: int n_samples Y_DTYPE_C p_i # proba that ith sample belongs to positive class int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): - p_i = _cexpit(raw_predictions[i]) - gradients[i] = p_i - y_true[i] - hessians[i] = p_i * (1. - p_i) + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + p_i = _cexpit(raw_predictions[i]) + gradients[i] = p_i - y_true[i] + hessians[i] = p_i * (1. - p_i) + else: + for i in prange(n_samples, schedule='static', nogil=True): + p_i = _cexpit(raw_predictions[i]) + gradients[i] = (p_i - y_true[i]) * sample_weight[i] + hessians[i] = p_i * (1. - p_i) * sample_weight[i] def _update_gradients_hessians_categorical_crossentropy( G_H_DTYPE_C [:, ::1] gradients, # OUT G_H_DTYPE_C [:, ::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [:, ::1] raw_predictions): # IN + const Y_DTYPE_C [:, ::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] int k # class index int i # sample index + Y_DTYPE_C sw # p[i, k] is the probability that class(ith sample) == k. # It's the softmax of the raw predictions Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) @@ -73,8 +102,9 @@ def _update_gradients_hessians_categorical_crossentropy( # then update gradients and hessians for k in range(prediction_dim): p_i_k = p[i, k] - gradients[k, i] = p_i_k - (y_true[i] == k) - hessians[k, i] = p_i_k * (1. - p_i_k) + sw = 1 if sample_weight is None else sample_weight[i] + gradients[k, i] = p_i_k - (y_true[i] == k) * sw + hessians[k, i] = p_i_k * (1. 
- p_i_k) * sw cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 02559bf7c5022..df9dfa9f70eee 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -9,8 +9,8 @@ from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) from ...utils import check_X_y, check_random_state, check_array, resample -from ...utils.validation import (check_is_fitted, column_or_1d, - check_consistent_length) +from ...utils.validation import (check_is_fitted, + check_consistent_length, _check_sample_weight) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -107,11 +107,9 @@ def fit(self, X, y, sample_weight=None): acc_prediction_time = 0. X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) y = self._encode_y(y) + check_consistent_length(X, y) if sample_weight is not None: - sample_weight = column_or_1d(sample_weight) - check_consistent_length(X, y, sample_weight) - else: - check_consistent_length(X, y) + sample_weight = _check_sample_weight(sample_weight, X) # The rng state must be preserved if warm_start is True if (self.warm_start and hasattr(self, '_rng')): @@ -157,6 +155,8 @@ def fit(self, X, y, sample_weight=None): random_state=self._train_val_split_seed) sample_weight_train = sample_weight_val = None else: + # TODO: incorporate sample_weight in sampling here, as well as + # stratify (X_train, X_val, y_train, y_val, sample_weight_train, sample_weight_val) = train_test_split( X, y, sample_weight, test_size=self.validation_fraction, diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 66223df683b6b..c92046e8ccf85 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -18,6 +18,7 @@ from .common import Y_DTYPE from .common import G_H_DTYPE from ._loss import _update_gradients_least_squares +from ._loss import _update_gradients_hessians_least_squares from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy @@ -33,11 +34,8 @@ def __call__(self, y_true, raw_predictions): def get_average_loss(self, y_true, raw_predictions, sample_weight): """Return the average loss, weighted if sample_weight is not None """ - if sample_weight is None: - return self(y_true, raw_predictions).mean() - else: - return np.average(self(y_true, raw_predictions), - weights=sample_weight) + return np.average(self(y_true, raw_predictions), + weights=sample_weight) def init_gradients_and_hessians(self, n_samples, prediction_dim, sample_weight): @@ -72,13 +70,14 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim, shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) if sample_weight is not None: + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. self.hessians_are_constant = False if self.hessians_are_constant: # if the hessians are constant, we consider they are equal to 1. # this is correct as long as we adjust the gradients. See e.g. LS - # loss. 
If sample weights are provided, the hessians and gradients - # are multiplied by sample_weight, which means the hessains are - # equal to sample weights. + # loss. hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) else: hessians = np.empty(shape=shape, dtype=G_H_DTYPE) @@ -159,10 +158,7 @@ def __call__(self, y_true, raw_predictions): return loss def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - if sample_weight is None: - return np.mean(y_train) - else: - return np.average(y_train, weights=sample_weight) + return np.average(y_train, weights=sample_weight) @staticmethod def inverse_link_function(raw_predictions): @@ -174,10 +170,13 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_squares(gradients, y_true, raw_predictions) - if sample_weight is not None: - np.multiply(gradients, sample_weight, out=gradients) - hessians[:] = sample_weight + hessians = hessians.reshape(-1) + if sample_weight is None: + _update_gradients_least_squares(gradients, y_true, raw_predictions) + else: + _update_gradients_hessians_least_squares(gradients, hessians, + y_true, raw_predictions, + sample_weight) class BinaryCrossEntropy(BaseLoss): @@ -209,10 +208,7 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): "loss='binary_crossentropy' is not defined for multiclass" " classification with n_classes=%d, use" " loss='categorical_crossentropy' instead" % prediction_dim) - if sample_weight is None: - proba_positive_class = np.mean(y_train) - else: - proba_positive_class = np.average(y_train, weights=sample_weight) + proba_positive_class = np.average(y_train, weights=sample_weight) eps = np.finfo(y_train.dtype).eps proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) # log(x / 1 - x) is the anti function of sigmoid, or the link function @@ -227,10 +223,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions) - if sample_weight is not None: - np.multiply(gradients, sample_weight, out=gradients) - np.multiply(hessians, sample_weight, out=hessians) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # shape (1, n_samples) --> (n_samples,). 
reshape(-1) is more likely to @@ -266,11 +259,8 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): - if sample_weight is None: - proba_kth_class = np.mean(y_train == k) - else: - proba_kth_class = np.average(y_train == k, - weights=sample_weight) + proba_kth_class = np.average(y_train == k, + weights=sample_weight) proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) init_value[k, :] += np.log(proba_kth_class) @@ -279,10 +269,7 @@ def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight): _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions) - if sample_weight is not None: - np.multiply(gradients, sample_weight, out=gradients) - np.multiply(hessians, sample_weight, out=hessians) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # TODO: This could be done in parallel diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5f33a4757f075..f65c4cd24e938 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -407,7 +407,7 @@ def test_consistent_lengths(): sample_weight = np.array([.1, .3, .1]) gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) with pytest.raises(ValueError, - match="Found input variables with inconsistent number"): + match=r"sample_weight.shape == \(3,\), expected"): gbdt.fit(X, y, sample_weight) with pytest.raises(ValueError, From a1440bb648d66654a34566f272fbd55f55dea4db Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 4 Sep 2019 22:57:13 +0200 Subject: [PATCH 14/55] change loss API --- .../gradient_boosting.py | 6 ++---- .../ensemble/_hist_gradient_boosting/loss.py | 20 +++++++++---------- .../tests/test_loss.py | 14 ++++++------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index df9dfa9f70eee..c67ee05b6c9ee 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -507,14 +507,12 @@ def _check_early_stopping_loss(self, """ self.train_score_.append( - -self.loss_.get_average_loss(y_train, raw_predictions, - sample_weight_train) + -self.loss_(y_train, raw_predictions, sample_weight_train) ) if self._use_validation_data: self.validation_score_.append( - -self.loss_.get_average_loss(y_val, raw_predictions_val, - sample_weight_val) + -self.loss_(y_val, raw_predictions_val, sample_weight_val) ) return self._should_stop(self.validation_score_) else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index c92046e8ccf85..40a7b8049005b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -26,16 +26,16 @@ class BaseLoss(ABC): """Base class for a loss.""" - @abstractmethod - def __call__(self, y_true, raw_predictions): - """Return the loss for each value + def __call__(self, y_true, raw_predictions, sample_weight): + 
"""Return the weighted average loss """ + return np.average(self.pointwise_loss(y_true, raw_predictions), + weights=sample_weight) - def get_average_loss(self, y_true, raw_predictions, sample_weight): - """Return the average loss, weighted if sample_weight is not None + @abstractmethod + def pointwise_loss(self, y_true, raw_predictions): + """Return loss value for each input """ - return np.average(self(y_true, raw_predictions), - weights=sample_weight) def init_gradients_and_hessians(self, n_samples, prediction_dim, sample_weight): @@ -150,7 +150,7 @@ class LeastSquares(BaseLoss): hessians_are_constant = True - def __call__(self, y_true, raw_predictions): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) @@ -194,7 +194,7 @@ class BinaryCrossEntropy(BaseLoss): hessians_are_constant = False inverse_link_function = staticmethod(expit) - def __call__(self, y_true, raw_predictions): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) @@ -245,7 +245,7 @@ class CategoricalCrossEntropy(BaseLoss): hessians_are_constant = False - def __call__(self, y_true, raw_predictions): + def pointwise_loss(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) prediction_dim = raw_predictions.shape[0] for k in range(prediction_dim): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index d7c0c88d0ce20..dfa7e7cbd718c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -65,7 +65,7 @@ def test_derivatives(loss, x0, y_true): get_gradients, get_hessians = get_derivatives_helper(loss) def func(x): - return loss(y_true, x) + return loss.pointwise_loss(y_true, x) def fprime(x): return get_gradients(y_true, x) @@ -75,7 +75,7 @@ def fprime2(x): optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) assert np.allclose(loss.inverse_link_function(optimum), y_true) - assert np.allclose(loss(y_true, optimum), 0) + assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) assert np.allclose(get_gradients(y_true, optimum), 0) @@ -116,16 +116,16 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): eps = 1e-9 offset = np.zeros_like(raw_predictions) offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset / 2) - f_minus_eps = loss(y_true, raw_predictions - offset / 2) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2) numerical_gradients = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset) - f_minus_eps = loss(y_true, raw_predictions - offset) - f = loss(y_true, raw_predictions) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset) + f = loss.pointwise_loss(y_true, raw_predictions) numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 def relative_error(a, b): From 34ad5a597c002b0d6fa11015dec595f0147d5770 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 4 Sep 2019 23:06:00 +0200 Subject: [PATCH 
15/55] _loss perf improvement --- .../_hist_gradient_boosting/_loss.pyx | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 1b7456afd3cd2..52ed557c7b895 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -94,17 +94,29 @@ def _update_gradients_hessians_categorical_crossentropy( Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) Y_DTYPE_C p_i_k - for i in prange(n_samples, schedule='static', nogil=True): - # first compute softmaxes of sample i for each class - for k in range(prediction_dim): - p[i, k] = raw_predictions[k, i] # prepare softmax - _compute_softmax(p, i) - # then update gradients and hessians - for k in range(prediction_dim): - p_i_k = p[i, k] - sw = 1 if sample_weight is None else sample_weight[i] - gradients[k, i] = p_i_k - (y_true[i] == k) * sw - hessians[k, i] = p_i_k * (1. - p_i_k) * sw + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + _compute_softmax(p, i) + # then update gradients and hessians + for k in range(prediction_dim): + p_i_k = p[i, k] + gradients[k, i] = p_i_k - (y_true[i] == k) + hessians[k, i] = p_i_k * (1. - p_i_k) + else: + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + _compute_softmax(p, i) + # then update gradients and hessians + for k in range(prediction_dim): + p_i_k = p[i, k] + sw = sample_weight[i] + gradients[k, i] = p_i_k - (y_true[i] == k) * sw + hessians[k, i] = p_i_k * (1. 
- p_i_k) * sw cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: From 892a5b54456b57d6b6d661b7a73e8917cc1c8f8c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 6 Sep 2019 10:46:00 +0200 Subject: [PATCH 16/55] adding more tests --- .../tests/test_gradient_boosting.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index f65c4cd24e938..346770bac3ef8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -456,9 +456,7 @@ def test_non_uniform_weights_toy_edge_case_reg(): assert gb.predict([[1, 0]])[0] > 0.5 -@pytest.mark.parametrize("loss", ['binary_crossentropy', - 'categorical_crossentropy']) -def test_non_uniform_weights_toy_edge_case_clf(loss): +def test_non_uniform_weights_toy_edge_case_clf(): X = [[1, 0], [1, 0], [1, 0], @@ -483,3 +481,24 @@ def test_non_uniform_weights_toy_edge_case_clf(loss): min_samples_leaf=1) gb.fit(X, y, sample_weight=sample_weight) assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@pytest.mark.parametrize( + "model, X, y", + [(HistGradientBoostingClassifier(), + *make_classification(n_classes=2)), + (HistGradientBoostingClassifier(), + *make_classification(n_classes=4, n_informative=16)), + (HistGradientBoostingRegressor(), *make_regression())]) +def test_sample_weight_effect(model, X, y): + n_samples = X.shape[0] + X_ = np.r_[X, X[:n_samples // 2, :]] + y_ = np.r_[y, y[:n_samples // 2, ]] + sample_weight = np.ones(shape=(n_samples)) + sample_weight[:n_samples // 2] = 2 + + no_dup_no_sw = model.fit(X, y).predict(X_) + dup_no_sw = model.fit(X_, y_).predict(X_) + no_dup_sw = model.fit(X, y, sample_weight=sample_weight) + assert np.all(dup_no_sw[n_samples // 2:, ] == no_dup_sw) + assert not np.all(no_dup_no_sw, dup_no_sw[n_samples // 2:, ]) From 78b8f643d75dd51a9066a9fce3c83576c0731c2a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 17 Sep 2019 17:00:25 +0200 Subject: [PATCH 17/55] fixing more of the sample weight for LAD --- .../_hist_gradient_boosting/_loss.pyx | 32 +++++++++++++++++-- .../ensemble/_hist_gradient_boosting/loss.py | 6 ++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 1ec2a3e05894d..20480aa02df82 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -27,6 +27,9 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. gradients[i] = raw_predictions[i] - y_true[i] @@ -43,12 +46,35 @@ def _update_gradients_hessians_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - # gradient = sign(raw_predicition - y_pred) - gradients[i] = (sample_weight[i] * - (2 * (y_true[i] - raw_predictions[i] < 0) - 1)) + # Note: a more correct exp is 2 * (raw_predictions - y_true) * sample_weight + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. 
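        # (Side note, for illustration: a leaf value is computed as
        # -sum(gradients) / (sum(hessians) + l2_regularization), so with the
        # default l2_regularization=0, dividing both the gradients and the
        # constant hessians by 2 leaves the resulting leaf values unchanged.)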
+ gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i] hessians[i] = sample_weight[i] +def _update_gradients_least_absolute_deviation( + G_H_DTYPE_C [::1] gradients, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # gradient = sign(raw_predicition - y_pred) + gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 + else: + for i in prange(n_samples, schedule='static', nogil=True): + # gradient = sign(raw_predicition - y_pred) * sample_weight + gradients[i] = (sample_weight[i] * 2 * + (y_true[i] - raw_predictions[i] < 0) - 1) + + def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 95e54fcad168a..2b81ef6c5d553 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -210,14 +210,14 @@ class LeastAbsoluteDeviation(BaseLoss): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. need_update_leaves_values = True - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = np.abs(y_true - raw_predictions) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): return np.median(y_train) @staticmethod From eb3f8f16d97f9f6e5d7f7bcbf0af7fe06c392009 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 18 Sep 2019 15:48:41 +0200 Subject: [PATCH 18/55] fix LAD sample weight API --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 2b81ef6c5d553..a8723981eec16 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -225,13 +225,14 @@ def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. 
raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) _update_gradients_least_absolute_deviation(gradients, y_true, - raw_predictions) + raw_predictions, + sample_weight) def update_leaves_values(self, grower, y_true, raw_predictions): # Update the values predicted by the tree with From cccde961384633238c97e5e064a529f6b129fb6e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 18 Sep 2019 15:51:55 +0200 Subject: [PATCH 19/55] apply Guillaume's suggestions --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d1508c2cc1f66..2804201fdd845 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -92,7 +92,7 @@ def fit(self, X, y, sample_weight=None): y : array-like, shape=(n_samples,) Target values. - sample_weight : array-like of shape(n_samples,) default=None + sample_weight : array-like of shape (n_samples,) default=None Weights of training data. Returns @@ -108,6 +108,8 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) y = self._encode_y(y) check_consistent_length(X, y) + # Do not create unit sample weights by default to later skip some + # computation if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) From 7500c7892c22ccc11d6e0b1392a9f138ebdb1a72 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 18 Sep 2019 18:49:00 +0200 Subject: [PATCH 20/55] almost weighted binning --- .../_hist_gradient_boosting/binning.py | 45 +++++++++++++++++-- .../gradient_boosting.py | 11 +++-- .../tests/test_binning.py | 2 +- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 6c8fbf40d6576..33938df044f5c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,7 +16,44 @@ from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF -def _find_binning_thresholds(data, max_bins, subsample, random_state): +def weighted_quantile(values, quantiles, sample_weight=None, + values_sorted=False, old_style=True): + """ Very close to numpy.percentile, but supports weights. + NOTE: quantiles should be in [0, 1]! + :param values: numpy.array with data + :param quantiles: array-like with many quantiles needed + :param sample_weight: array-like of the same length as `array` + :param values_sorted: bool, if True, then will avoid sorting of + initial array + :param old_style: if True, will correct output to be consistent + with numpy.percentile. + :return: numpy.array with computed quantiles. 
+ """ + values = np.array(values) + quantiles = np.array(quantiles) + if sample_weight is None: + sample_weight = np.ones(len(values)) + sample_weight = np.array(sample_weight) + assert np.all(quantiles >= 0) and np.all(quantiles <= 1), \ + 'quantiles should be in [0, 1]' + + if not values_sorted: + sorter = np.argsort(values) + values = values[sorter] + sample_weight = sample_weight[sorter] + + weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight + if old_style: + # To be convenient with numpy.percentile + weighted_quantiles -= weighted_quantiles[0] + weighted_quantiles /= weighted_quantiles[-1] + else: + weighted_quantiles /= np.sum(sample_weight) + return np.interp(quantiles, weighted_quantiles, values) + + +def _find_binning_thresholds(data, sample_weight, max_bins, subsample, + random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -73,6 +110,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) + #midpoints = weighted_quantile(col_data, percentiles, + # sample_weight=sample_weight).astype(X_DTYPE) assert midpoints.shape[0] == max_bins - 1 # We avoid having +inf thresholds: +inf thresholds are only allowed in @@ -137,7 +176,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """Fit data X by computing the binning thresholds. The last bin is reserved for missing values, whether missing values @@ -162,7 +201,7 @@ def fit(self, X, y=None): X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, max_bins, subsample=self.subsample, + X, sample_weight, max_bins, subsample=self.subsample, random_state=self.random_state) self.n_bins_non_missing_ = np.array( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 2804201fdd845..f9a8b2a7312ff 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -180,9 +180,11 @@ def fit(self, X, y, sample_weight=None): # convention is that n_bins == max_bins + 1 n_bins = self.max_bins + 1 # + 1 for missing values self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=rng) - X_binned_train = self._bin_data(X_train, rng, is_training_data=True) + X_binned_train = self._bin_data(X_train, sample_weight_train, rng, + is_training_data=True) if X_val is not None: - X_binned_val = self._bin_data(X_val, rng, is_training_data=False) + X_binned_val = self._bin_data(X_val, sample_weight_val, rng, + is_training_data=False) else: X_binned_val = None @@ -544,7 +546,7 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _bin_data(self, X, rng, is_training_data): + def _bin_data(self, X, sample_weight, rng, is_training_data): """Bin data X. If is_training_data, then set the bin_mapper_ attribute. 
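# A minimal, self-contained sketch of the idea behind the `weighted_quantile`
# helper above: with unit weights its interpolation grid reduces to
# np.linspace(0, 1, n), so the result matches np.percentile's default linear
# interpolation. (Illustration only; the formula is re-derived here instead of
# importing the helper, which only exists on this branch.)
import numpy as np

rng = np.random.RandomState(0)
values = np.sort(rng.uniform(0, 1000, size=1000))
quantiles = np.linspace(0, 1, num=256)[1:-1]

weights = np.ones_like(values)
grid = np.cumsum(weights) - 0.5 * weights
grid -= grid[0]
grid /= grid[-1]  # == np.linspace(0, 1, len(values))

assert np.allclose(np.interp(quantiles, grid, values),
                   np.percentile(values, 100 * quantiles))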
@@ -557,7 +559,8 @@ def _bin_data(self, X, rng, is_training_data): X.nbytes / 1e9, description), end="", flush=True) tic = time() if is_training_data: - X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array + X_binned = self.bin_mapper_.fit( + X, None, sample_weight).transform(X) # F-aligned array else: X_binned = self.bin_mapper_.transform(X) # F-aligned array # We convert the array to C-contiguous since predicting is faster diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 06e38d62f7638..15f43be2f7167 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -21,7 +21,7 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - return _find_binning_thresholds_orig(data, max_bins, subsample, + return _find_binning_thresholds_orig(data, None, max_bins, subsample, random_state) From ccc527b6a57ffcb069f11ae4bcba802a6fb44116 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 19 Sep 2019 15:04:50 +0200 Subject: [PATCH 21/55] weighted quantiles --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 33938df044f5c..ec982960513d2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -106,12 +106,12 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. 
- percentiles = np.linspace(0, 100, num=max_bins + 1) + percentiles = np.linspace(0, 1, num=max_bins + 1) percentiles = percentiles[1:-1] - midpoints = np.percentile(col_data, percentiles, - interpolation='midpoint').astype(X_DTYPE) - #midpoints = weighted_quantile(col_data, percentiles, - # sample_weight=sample_weight).astype(X_DTYPE) + #midpoints = np.percentile(col_data, percentiles, + # interpolation='midpoint').astype(X_DTYPE) + midpoints = weighted_quantile(col_data, percentiles, + sample_weight=sample_weight).astype(X_DTYPE) assert midpoints.shape[0] == max_bins - 1 # We avoid having +inf thresholds: +inf thresholds are only allowed in From a2f79b24b57851a93e76daf7d5da780bffc4324d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 1 Oct 2019 20:38:25 +0200 Subject: [PATCH 22/55] add loss tests, and fixes --- .../_hist_gradient_boosting/_loss.pyx | 36 +++++++++----- .../ensemble/_hist_gradient_boosting/loss.py | 27 ++++++++--- .../tests/test_loss.py | 48 +++++++++++++++++++ 3 files changed, 93 insertions(+), 18 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 20480aa02df82..ef6783156d804 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -53,8 +53,9 @@ def _update_gradients_hessians_least_squares( hessians[i] = sample_weight[i] -def _update_gradients_least_absolute_deviation( +def _update_gradients_hessians_least_absolute_deviation( G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN const Y_DTYPE_C [::1] raw_predictions, # IN const Y_DTYPE_C [::1] sample_weight): # IN @@ -64,15 +65,26 @@ def _update_gradients_least_absolute_deviation( int i n_samples = raw_predictions.shape[0] - if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True): - # gradient = sign(raw_predicition - y_pred) - gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 - else: - for i in prange(n_samples, schedule='static', nogil=True): - # gradient = sign(raw_predicition - y_pred) * sample_weight - gradients[i] = (sample_weight[i] * 2 * - (y_true[i] - raw_predictions[i] < 0) - 1) + for i in prange(n_samples, schedule='static', nogil=True): + # gradient = sign(raw_predicition - y_pred) * sample_weight + gradients[i] = sample_weight[i] * (2 * + (y_true[i] - raw_predictions[i] < 0) - 1) + hessians[i] = sample_weight[i] + + +def _update_gradients_least_absolute_deviation( + G_H_DTYPE_C [::1] gradients, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # gradient = sign(raw_predicition - y_pred) + gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 def _update_gradients_hessians_binary_crossentropy( @@ -137,8 +149,8 @@ def _update_gradients_hessians_categorical_crossentropy( for k in range(prediction_dim): p_i_k = p[i, k] sw = sample_weight[i] - gradients[k, i] = p_i_k - (y_true[i] == k) * sw - hessians[k, i] = p_i_k * (1. - p_i_k) * sw + gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw + hessians[k, i] = (p_i_k * (1. 
- p_i_k)) * sw cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index a8723981eec16..91a9984563d83 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -20,8 +20,10 @@ from ._loss import _update_gradients_least_squares from ._loss import _update_gradients_hessians_least_squares from ._loss import _update_gradients_least_absolute_deviation +from ._loss import _update_gradients_hessians_least_absolute_deviation from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy +from ...utils.stats import _weighted_percentile class BaseLoss(ABC): @@ -218,7 +220,10 @@ def pointwise_loss(self, y_true, raw_predictions): return loss def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - return np.median(y_train) + if sample_weight is None: + return np.median(y_train) + else: + return _weighted_percentile(y_train, sample_weight, 50) @staticmethod def inverse_link_function(raw_predictions): @@ -230,11 +235,16 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_absolute_deviation(gradients, y_true, - raw_predictions, - sample_weight) + if sample_weight is None: + _update_gradients_least_absolute_deviation(gradients, y_true, + raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_absolute_deviation( + gradients, hessians, y_true, raw_predictions, sample_weight) - def update_leaves_values(self, grower, y_true, raw_predictions): + def update_leaves_values(self, grower, y_true, raw_predictions, + sample_weights): # Update the values predicted by the tree with # median(y_true - raw_predictions). # See note about need_update_leaves_values in BaseLoss. 
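# A small sketch of the weighted median used for the baseline above and for
# the leaf values below (illustration only; `_weighted_percentile` is a
# private scikit-learn helper): with these integer weights it agrees with the
# plain median of the correspondingly duplicated samples.
import numpy as np
from sklearn.utils.stats import _weighted_percentile

y = np.array([1.0, 2.0, 3.0, 10.0])
w = np.array([1.0, 1.0, 3.0, 1.0])

weighted_median = _weighted_percentile(y, w, percentile=50)
duplicated_median = np.median(np.repeat(y, w.astype(int)))
assert weighted_median == duplicated_median == 3.0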
@@ -244,7 +254,12 @@ def update_leaves_values(self, grower, y_true, raw_predictions): # requires a cython version of median() for leaf in grower.finalized_leaves: indices = leaf.sample_indices - median_res = np.median(y_true[indices] - raw_predictions[indices]) + if sample_weights is None: + median_res = np.median(y_true[indices] + - raw_predictions[indices]) + else: + median_res = _weighted_percentile(y_true[indices] + - raw_predictions[indices]) leaf.value = grower.shrinkage * median_res # Note that the regularization is ignored here diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index ee90809d02981..a572e24419561 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -209,3 +209,51 @@ def test_baseline_categorical_crossentropy(): for k in range(prediction_dim): p = (y_train == k).mean() assert np.allclose(baseline_prediction[k, :], np.log(p)) + + +@pytest.mark.parametrize('loss, problem', [ + ('least_squares', 'regression'), + ('least_absolute_deviation', 'regression'), + ('binary_crossentropy', 'classification'), + ('categorical_crossentropy', 'classification') + ]) +def test_sample_weight(loss, problem): + rng = np.random.RandomState(42) + n_samples = 100 + n_classes = 3 + prediction_dim = n_classes if loss == "categorical_crossentropy" else 1 + if problem == 'regression': + y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + else: + y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) + raw_predictions = rng.normal( + size=(prediction_dim, n_samples) + ).astype(Y_DTYPE) + + sample_weight = rng.random(size=(n_samples,)) + ones = np.ones(shape=(n_samples,)) + + loss_ = _LOSSES[loss]() + gradients0, hessians0 = loss_.init_gradients_and_hessians( + n_samples, prediction_dim, None) + loss_.update_gradients_and_hessians(gradients0, hessians0, y_true, + raw_predictions, None) + + loss_ = _LOSSES[loss]() + gradients1, hessians1 = loss_.init_gradients_and_hessians( + n_samples, prediction_dim, ones) + loss_.update_gradients_and_hessians(gradients1, hessians1, y_true, + raw_predictions, ones) + + # passing ones as sample weights shouldn't change the values + assert np.allclose(gradients0, gradients1) + assert np.allclose(hessians0, hessians1) + + loss_ = _LOSSES[loss]() + gradients2, hessians2 = loss_.init_gradients_and_hessians( + n_samples, prediction_dim, ones) + loss_.update_gradients_and_hessians(gradients2, hessians2, y_true, + raw_predictions, sample_weight) + + assert np.allclose(gradients1 * sample_weight, gradients2) + assert np.allclose(hessians1 * sample_weight, hessians2) From 0ff3b7279b2e40666321e10634dcc3236aa5fd4b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 2 Oct 2019 11:34:18 +0200 Subject: [PATCH 23/55] fix missing arg --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 77777fd1c23f4..d4ad4f1b7eeb4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -355,7 +355,8 @@ def fit(self, X, y, sample_weight=None): if self.loss_.need_update_leaves_values: self.loss_.update_leaves_values(grower, y_train, - raw_predictions[k, :]) + raw_predictions[k, :], + sample_weight_train) predictor 
= grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_ From ed24433984e21669fd67030bc91cb884aeabd4de Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 2 Oct 2019 11:34:56 +0200 Subject: [PATCH 24/55] add comment --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index ec982960513d2..20390215168f8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -108,8 +108,9 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, # work on a fixed-size subsample of the full data. percentiles = np.linspace(0, 1, num=max_bins + 1) percentiles = percentiles[1:-1] - #midpoints = np.percentile(col_data, percentiles, + # midpoints = np.percentile(col_data, percentiles, # interpolation='midpoint').astype(X_DTYPE) + # the utils.stat._weighted_percentile is not suitable here midpoints = weighted_quantile(col_data, percentiles, sample_weight=sample_weight).astype(X_DTYPE) assert midpoints.shape[0] == max_bins - 1 From 76566572e27cca434bcf647330a9698ff17fd3b4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Oct 2019 16:18:16 -0400 Subject: [PATCH 25/55] fix missing loss param in test --- sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index a572e24419561..16ef405a4547c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -155,7 +155,7 @@ def test_baseline_least_absolute_deviation(): loss = _LOSSES['least_absolute_deviation']() y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the median of all targets From 44b5d1c57178a9ba375133a455b4a5ded7e0f87e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Oct 2019 17:27:04 -0400 Subject: [PATCH 26/55] factorized tests and used reasonable values for raw_predictions for gradients to be not too extreme --- .../gradient_boosting.py | 2 +- .../tests/test_loss.py | 58 ++++++++++--------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5284d1d1468c2..d278d25a8f935 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -111,7 +111,7 @@ def fit(self, X, y, sample_weight=None): # Do not create unit sample weights by default to later skip some # computation if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) rng = check_random_state(self.random_state) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 16ef405a4547c..a777dd2aa67d2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ 
b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -217,43 +217,47 @@ def test_baseline_categorical_crossentropy(): ('binary_crossentropy', 'classification'), ('categorical_crossentropy', 'classification') ]) -def test_sample_weight(loss, problem): +@pytest.mark.parametrize('sample_weight', ['ones', 'random']) +def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): + # Make sure that passing sample weights to the gradient and hessians + # computation methods is equivalent to multiplying by the weights. + rng = np.random.RandomState(42) - n_samples = 100 - n_classes = 3 - prediction_dim = n_classes if loss == "categorical_crossentropy" else 1 + n_samples = 1000 + + if loss == 'categorical_crossentropy': + n_classes = prediction_dim = 3 + else: + n_classes = prediction_dim = 1 + if problem == 'regression': y_true = rng.normal(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) - raw_predictions = rng.normal( - size=(prediction_dim, n_samples) - ).astype(Y_DTYPE) - sample_weight = rng.random(size=(n_samples,)) - ones = np.ones(shape=(n_samples,)) + if sample_weight == 'ones': + sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE) + else: + sample_weight = rng.random(size=n_samples).astype(Y_DTYPE) loss_ = _LOSSES[loss]() - gradients0, hessians0 = loss_.init_gradients_and_hessians( - n_samples, prediction_dim, None) - loss_.update_gradients_and_hessians(gradients0, hessians0, y_true, - raw_predictions, None) - loss_ = _LOSSES[loss]() - gradients1, hessians1 = loss_.init_gradients_and_hessians( - n_samples, prediction_dim, ones) - loss_.update_gradients_and_hessians(gradients1, hessians1, y_true, - raw_predictions, ones) + baseline_prediction = loss_.get_baseline_prediction( + y_true, None, prediction_dim + ) + raw_predictions = np.zeros(shape=(prediction_dim, n_samples), + dtype=baseline_prediction.dtype) + raw_predictions += baseline_prediction - # passing ones as sample weights shouldn't change the values - assert np.allclose(gradients0, gradients1) - assert np.allclose(hessians0, hessians1) + gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients, hessians, y_true, + raw_predictions, None) - loss_ = _LOSSES[loss]() - gradients2, hessians2 = loss_.init_gradients_and_hessians( - n_samples, prediction_dim, ones) - loss_.update_gradients_and_hessians(gradients2, hessians2, y_true, + gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true, raw_predictions, sample_weight) - assert np.allclose(gradients1 * sample_weight, gradients2) - assert np.allclose(hessians1 * sample_weight, hessians2) + assert np.allclose(gradients * sample_weight, gradients_sw) + assert np.allclose(hessians * sample_weight, hessians_sw) From 728b32e628595951f86a01d87c16e3daf1dd0c3d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Oct 2019 17:42:21 -0400 Subject: [PATCH 27/55] Added test for init_gradient_and_hessians --- .../ensemble/_hist_gradient_boosting/loss.py | 2 ++ .../tests/test_loss.py | 24 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 91a9984563d83..4489309d06668 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -82,11 +82,13 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim, """ shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + if sample_weight is not None: # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are # equal to sample weights. self.hessians_are_constant = False + if self.hessians_are_constant: # If the hessians are constant, we consider they are equal to 1. # - This is correct for the half LS loss diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index a777dd2aa67d2..4177a6dd5ada8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -261,3 +261,27 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): assert np.allclose(gradients * sample_weight, gradients_sw) assert np.allclose(hessians * sample_weight, hessians_sw) + + +def test_init_gradient_and_hessians_sample_weight(): + # Make sure that passing sample_weight to a loss correctly influences the + # hessians_are_constant attribute, and consequently the shape of the + # hessians array. + + loss = _LOSSES['least_squares']() + assert loss.hessians_are_constant + + prediction_dim = 2 + n_samples = 5 + + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=None) + assert loss.hessians_are_constant + assert hessians.shape == (1, 1) + + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=np.ones(100)) + assert not loss.hessians_are_constant + assert hessians.shape == (prediction_dim, n_samples) From 3a0b62f13f0e31f6e8cc4cff62c886b5b09f1b98 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Oct 2019 17:48:57 -0400 Subject: [PATCH 28/55] fix typo in test --- sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 4177a6dd5ada8..42253233970c6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -282,6 +282,6 @@ def test_init_gradient_and_hessians_sample_weight(): _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, - sample_weight=np.ones(100)) + sample_weight=np.ones(n_samples)) assert not loss.hessians_are_constant assert hessians.shape == (prediction_dim, n_samples) From cb8b94c61012894226a734b7425ee2b33b8c8691 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Oct 2019 10:52:36 -0400 Subject: [PATCH 29/55] Added test for sum_hessians in histogram --- .../tests/test_gradient_boosting.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 19a364ab00e80..4ee283626113b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -11,6 +11,8 @@ from sklearn.experimental import 
enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -522,3 +524,47 @@ def test_sample_weight_effect(model, X, y): no_dup_sw = model.fit(X, y, sample_weight=sample_weight) assert np.all(dup_no_sw[n_samples // 2:, ] == no_dup_sw) assert not np.all(no_dup_no_sw, dup_no_sw[n_samples // 2:, ]) + + +@pytest.mark.parametrize('loss_name', ('least_squares', + 'least_absolute_deviation')) +def test_sum_hessians_are_sample_weight(loss_name): + # For losses with constant hessians, the sum_hessians field of the + # histograms must be equal to the sum of the sample weight of samples at + # the corresponding bin. + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=rng) + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + sample_weight = rng.normal(size=n_samples) + + loss = _LOSSES[loss_name]() + gradients, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight) + raw_predictions = rng.normal(size=(1, n_samples)) + loss.update_gradients_and_hessians(gradients, hessians, y, + raw_predictions, sample_weight) + + # build sum_sample_weight which contains the sum of the sample weights at + # each bin (for each feature). This must be equal to the sum_hessians + # field of the corresponding histogram + sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins)) + for feature_idx in range(n_features): + for sample_idx in range(n_samples): + sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += ( + sample_weight[sample_idx]) + + # Build histogram + grower = TreeGrower(X_binned, gradients[0], hessians[0], + n_bins=bin_mapper.n_bins) + histograms = grower.histogram_builder.compute_histograms_brute( + grower.root.sample_indices) + + for feature_idx in range(n_features): + for bin_idx in range(bin_mapper.n_bins): + assert histograms[feature_idx][bin_idx]['sum_hessians'] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5)) From 90550afc932ab79923f9dafb585d6db3d90e5ddb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Oct 2019 13:15:40 -0400 Subject: [PATCH 30/55] make test pass, but need to unit test binning --- .../tests/test_gradient_boosting.py | 56 +++++++++++++------ 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 4ee283626113b..6dd43fd5b32e7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -505,25 +505,47 @@ def test_non_uniform_weights_toy_edge_case_clf(): assert_array_equal(gb.predict([[1, 0]]), [1]) -@pytest.mark.parametrize( - "model, X, y", - [(HistGradientBoostingClassifier(), - *make_classification(n_classes=2)), - (HistGradientBoostingClassifier(), - *make_classification(n_classes=4, n_informative=16)), - (HistGradientBoostingRegressor(), *make_regression())]) -def test_sample_weight_effect(model, X, y): - n_samples = X.shape[0] - X_ = np.r_[X, X[:n_samples // 2, :]] - y_ = np.r_[y, 
y[:n_samples // 2, ]] +@pytest.mark.parametrize('problem', ( + 'regression', + 'binary_classification', + 'multiclass_classification' +)) +@pytest.mark.parametrize('duplication', ('half', 'all')) +@pytest.mark.parametrize('seed', range(1)) +def test_sample_weight_effect(problem, duplication, seed): + # High level test to make sure that duplicating a sample is equivalent to + # giving it weight of 2. + + n_samples = 255 # fails for n_samples > 256 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=seed) + Klass = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == 'binary_classification' else 3 + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, random_state=seed) + Klass = HistGradientBoostingClassifier + + est = Klass(min_samples_leaf=1) # fails if min_samples_leaf > 1 + + # Create dataset with duplicate and corresponding sample weights + if duplication == 'half': + lim = n_samples // 2 + else: + lim = n_samples + X_dup = np.r_[X, X[:lim]] + y_dup = np.r_[y, y[:lim]] sample_weight = np.ones(shape=(n_samples)) - sample_weight[:n_samples // 2] = 2 + sample_weight[:lim] = 2 + + no_dup_sw = est.fit(X, y, sample_weight=sample_weight).predict(X_dup) + dup_no_sw = est.fit(X_dup, y_dup).predict(X_dup) - no_dup_no_sw = model.fit(X, y).predict(X_) - dup_no_sw = model.fit(X_, y_).predict(X_) - no_dup_sw = model.fit(X, y, sample_weight=sample_weight) - assert np.all(dup_no_sw[n_samples // 2:, ] == no_dup_sw) - assert not np.all(no_dup_no_sw, dup_no_sw[n_samples // 2:, ]) + assert np.allclose(dup_no_sw, no_dup_sw) @pytest.mark.parametrize('loss_name', ('least_squares', From b233c63aef8f740c2e29d81780243b43ffe49cd6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Oct 2019 15:15:41 -0400 Subject: [PATCH 31/55] fix test --- sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 42253233970c6..ee6b383543a06 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -238,7 +238,7 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): if sample_weight == 'ones': sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE) else: - sample_weight = rng.random(size=n_samples).astype(Y_DTYPE) + sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) loss_ = _LOSSES[loss]() From 964c68afb406ee66bb94e544f62bf7802cee60ea Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Oct 2019 16:01:52 -0400 Subject: [PATCH 32/55] Added some tests for binning (failing) --- .../_hist_gradient_boosting/binning.py | 11 +++--- .../tests/test_binning.py | 36 +++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 20390215168f8..ef840038d53bf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,8 +16,8 @@ from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF -def weighted_quantile(values, quantiles, sample_weight=None, - values_sorted=False, old_style=True): +def 
_weighted_quantile(values, quantiles, sample_weight=None, + values_sorted=False, old_style=True): """ Very close to numpy.percentile, but supports weights. NOTE: quantiles should be in [0, 1]! :param values: numpy.array with data @@ -85,8 +85,11 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: # TODO: depends on the weights and stratify maybe + # Question: should we subsample based on the weight if we also compute + # weight-based quantiles?? subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) + sample_weight = sample_weight.take(subset) binning_thresholds = [] for f_idx in range(data.shape[1]): @@ -111,8 +114,8 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, # midpoints = np.percentile(col_data, percentiles, # interpolation='midpoint').astype(X_DTYPE) # the utils.stat._weighted_percentile is not suitable here - midpoints = weighted_quantile(col_data, percentiles, - sample_weight=sample_weight).astype(X_DTYPE) + midpoints = _weighted_quantile(col_data, percentiles, + sample_weight=sample_weight).astype(X_DTYPE) assert midpoints.shape[0] == max_bins - 1 # We avoid having +inf thresholds: +inf thresholds are only allowed in diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 15f43be2f7167..6b69e44390ff1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -312,3 +312,39 @@ def test_infinite_values(): expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) assert_array_equal(bin_mapper.transform(X), expected_binned_X) + + +def test_sample_weight_small_number_unique_values(): + # Make sure that when the number of unique values is small, the thresholds + # are the same whether sample_weight are passed or not. 
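    # (Illustrative note: with 255 samples drawn from a continuous
    # distribution all values are distinct, so there are at most max_bins
    # unique values and the thresholds are simply the midpoints between
    # consecutive unique values; the weighted-quantile path is never hit.)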
+ n_samples = 255 + rng = np.random.RandomState(0) + X = rng.uniform(0, 1000, size=n_samples).reshape(-1, 1) + sample_weight = rng.uniform(0, 1000, size=n_samples) + + with_sw = _BinMapper().fit(X, sample_weight=sample_weight) + without_sw = _BinMapper().fit(X) + + np.testing.assert_allclose(with_sw.bin_thresholds_, + without_sw.bin_thresholds_) + +def test_sample_weight_equiv_duplication(): + from sklearn.ensemble._hist_gradient_boosting.binning import _weighted_quantile + + n_samples = 500 + n_quantiles = 100 + rng = np.random.RandomState(0) + + X = rng.uniform(0, 1000, size=n_samples) + lim = n_samples // 2 + X_dup = np.r_[X, X[:lim]] + + sample_weight = np.ones(n_samples) + sample_weight[:lim] = 2 + + quantiles = np.linspace(0, 1, n_quantiles)[1:-1] + no_dup_sw = _weighted_quantile(X, quantiles, sample_weight=sample_weight) + dup_no_sw = _weighted_quantile(X_dup, quantiles, sample_weight=None) + print(dup_no_sw - no_dup_sw) + np.testing.assert_allclose(dup_no_sw, no_dup_sw) # fails + From 09319208234c02c3251144aeac43bb1f348f7484 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 28 Oct 2019 14:54:41 -0400 Subject: [PATCH 33/55] WIP --- .../tests/test_gradient_boosting.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 6dd43fd5b32e7..d29d342f2abd4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -516,11 +516,11 @@ def test_sample_weight_effect(problem, duplication, seed): # High level test to make sure that duplicating a sample is equivalent to # giving it weight of 2. 
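    # A toy numeric check of that premise (sketch only, not part of the
    # original test): for a sample-additive quantity such as the
    # least-squares gradient sum, a sample with weight 2 contributes exactly
    # as much as a duplicated sample.
    y_toy = np.array([1.0, 2.0, 3.0])
    pred_toy = np.array([0.5, 2.5, 2.0])
    w_toy = np.array([2.0, 1.0, 1.0])
    weighted_sum = np.sum((pred_toy - y_toy) * w_toy)
    duplicated_sum = np.sum(np.r_[pred_toy, pred_toy[:1]]
                            - np.r_[y_toy, y_toy[:1]])
    assert np.isclose(weighted_sum, duplicated_sum)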
- n_samples = 255 # fails for n_samples > 256 + n_samples = 10 # fails for n_samples > 256 n_features = 2 if problem == 'regression': X, y = make_regression(n_samples=n_samples, n_features=n_features, - n_informative=n_features, random_state=seed) + n_informative=n_features, random_state=seed) Klass = HistGradientBoostingRegressor else: n_classes = 2 if problem == 'binary_classification' else 3 @@ -530,7 +530,7 @@ def test_sample_weight_effect(problem, duplication, seed): n_classes=n_classes, random_state=seed) Klass = HistGradientBoostingClassifier - est = Klass(min_samples_leaf=1) # fails if min_samples_leaf > 1 + est = Klass(min_samples_leaf=2, max_iter=1) # fails if min_samples_leaf > 1 # Create dataset with duplicate and corresponding sample weights if duplication == 'half': @@ -542,8 +542,11 @@ def test_sample_weight_effect(problem, duplication, seed): sample_weight = np.ones(shape=(n_samples)) sample_weight[:lim] = 2 - no_dup_sw = est.fit(X, y, sample_weight=sample_weight).predict(X_dup) - dup_no_sw = est.fit(X_dup, y_dup).predict(X_dup) + # Check decision function instead of just classes for classification + print(est.fit(X_dup, y_dup)._raw_predict(X_dup)) + print(est.fit(X, y, sample_weight=sample_weight)._raw_predict(X_dup)) + # no_dup_sw = est.fit(X, y, sample_weight=sample_weight)._raw_predict(X_dup) + # dup_no_sw = est.fit(X_dup, y_dup)._raw_predict(X_dup) assert np.allclose(dup_no_sw, no_dup_sw) From 06161634ef47ca529f370564a0f0596a0545e1ce Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 28 Oct 2019 15:22:41 -0400 Subject: [PATCH 34/55] Slight test refactoring --- .../tests/test_gradient_boosting.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index d29d342f2abd4..83e1d08957522 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -516,7 +516,10 @@ def test_sample_weight_effect(problem, duplication, seed): # High level test to make sure that duplicating a sample is equivalent to # giving it weight of 2. - n_samples = 10 # fails for n_samples > 256 + # fails for n_samples > 255 because binning implementation isn't strictly + # equivalent to just duplicating samples. Keeping n_samples <= 255 makes + # sure only unique values are used so SW have no effect on binning. + n_samples = 255 n_features = 2 if problem == 'regression': X, y = make_regression(n_samples=n_samples, n_features=n_features, @@ -530,7 +533,11 @@ def test_sample_weight_effect(problem, duplication, seed): n_classes=n_classes, random_state=seed) Klass = HistGradientBoostingClassifier - est = Klass(min_samples_leaf=2, max_iter=1) # fails if min_samples_leaf > 1 + # This test can't pass if min_samples_leaf > 1 because that would force 2 + # samples to be in the same node in est_sw, while these samples would be + # free to be separate in est_dup: est_dup would just group together the + # duplicated samples. 
+ est = Klass(min_samples_leaf=1) # Create dataset with duplicate and corresponding sample weights if duplication == 'half': @@ -542,13 +549,12 @@ def test_sample_weight_effect(problem, duplication, seed): sample_weight = np.ones(shape=(n_samples)) sample_weight[:lim] = 2 - # Check decision function instead of just classes for classification - print(est.fit(X_dup, y_dup)._raw_predict(X_dup)) - print(est.fit(X, y, sample_weight=sample_weight)._raw_predict(X_dup)) - # no_dup_sw = est.fit(X, y, sample_weight=sample_weight)._raw_predict(X_dup) - # dup_no_sw = est.fit(X_dup, y_dup)._raw_predict(X_dup) + est_sw = clone(est).fit(X, y, sample_weight=sample_weight) + est_dup = clone(est).fit(X_dup, y_dup) - assert np.allclose(dup_no_sw, no_dup_sw) + # Check decision function instead of just classes for classification + assert np.allclose(est_sw._raw_predict(X_dup), + est_dup._raw_predict(X_dup)) @pytest.mark.parametrize('loss_name', ('least_squares', From 6116a0d82ffcb26cf18664c2c1846a8a3515d600 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 30 Oct 2019 16:22:55 +0100 Subject: [PATCH 35/55] whats_new --- doc/whats_new/v0.22.rst | 2 + .../_hist_gradient_boosting/binning.py | 56 ++----------------- .../gradient_boosting.py | 3 +- .../tests/test_binning.py | 38 +------------ 4 files changed, 10 insertions(+), 89 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index d8ca30898774a..59a172b74ce8e 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -217,6 +217,8 @@ Changelog values both for training and predicting. They also support infinite values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_ and `Olivier Grisel`_. + - |MajorFeature| Estimators now support :term:`sample_weight`. :pr:`14696` by + `Adrin Jalali`_ and `Nicolas Hug`_. - |Feature| Estimators now have an additional `warm_start` parameter that enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. - |Enhancement| for :class:`ensemble.HistGradientBoostingClassifier` the diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index ef840038d53bf..18cddca2d867f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,44 +16,7 @@ from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF -def _weighted_quantile(values, quantiles, sample_weight=None, - values_sorted=False, old_style=True): - """ Very close to numpy.percentile, but supports weights. - NOTE: quantiles should be in [0, 1]! - :param values: numpy.array with data - :param quantiles: array-like with many quantiles needed - :param sample_weight: array-like of the same length as `array` - :param values_sorted: bool, if True, then will avoid sorting of - initial array - :param old_style: if True, will correct output to be consistent - with numpy.percentile. - :return: numpy.array with computed quantiles. 
- """ - values = np.array(values) - quantiles = np.array(quantiles) - if sample_weight is None: - sample_weight = np.ones(len(values)) - sample_weight = np.array(sample_weight) - assert np.all(quantiles >= 0) and np.all(quantiles <= 1), \ - 'quantiles should be in [0, 1]' - - if not values_sorted: - sorter = np.argsort(values) - values = values[sorter] - sample_weight = sample_weight[sorter] - - weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight - if old_style: - # To be convenient with numpy.percentile - weighted_quantiles -= weighted_quantiles[0] - weighted_quantiles /= weighted_quantiles[-1] - else: - weighted_quantiles /= np.sum(sample_weight) - return np.interp(quantiles, weighted_quantiles, values) - - -def _find_binning_thresholds(data, sample_weight, max_bins, subsample, - random_state): +def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -84,12 +47,8 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, """ rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: - # TODO: depends on the weights and stratify maybe - # Question: should we subsample based on the weight if we also compute - # weight-based quantiles?? subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) - sample_weight = sample_weight.take(subset) binning_thresholds = [] for f_idx in range(data.shape[1]): @@ -109,13 +68,10 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. - percentiles = np.linspace(0, 1, num=max_bins + 1) + percentiles = np.linspace(0, 100, num=max_bins + 1) percentiles = percentiles[1:-1] - # midpoints = np.percentile(col_data, percentiles, - # interpolation='midpoint').astype(X_DTYPE) - # the utils.stat._weighted_percentile is not suitable here - midpoints = _weighted_quantile(col_data, percentiles, - sample_weight=sample_weight).astype(X_DTYPE) + midpoints = np.percentile(col_data, percentiles, + interpolation='midpoint').astype(X_DTYPE) assert midpoints.shape[0] == max_bins - 1 # We avoid having +inf thresholds: +inf thresholds are only allowed in @@ -180,7 +136,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None): """Fit data X by computing the binning thresholds. 
The last bin is reserved for missing values, whether missing values @@ -205,7 +161,7 @@ def fit(self, X, y=None, sample_weight=None): X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, sample_weight, max_bins, subsample=self.subsample, + X, max_bins, subsample=self.subsample, random_state=self.random_state) self.n_bins_non_missing_ = np.array( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d278d25a8f935..fca2044279f27 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -558,8 +558,7 @@ def _bin_data(self, X, sample_weight, is_training_data): X.nbytes / 1e9, description), end="", flush=True) tic = time() if is_training_data: - X_binned = self.bin_mapper_.fit( - X, None, sample_weight).transform(X) # F-aligned array + X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array else: X_binned = self.bin_mapper_.transform(X) # F-aligned array # We convert the array to C-contiguous since predicting is faster diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 6b69e44390ff1..06e38d62f7638 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -21,7 +21,7 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - return _find_binning_thresholds_orig(data, None, max_bins, subsample, + return _find_binning_thresholds_orig(data, max_bins, subsample, random_state) @@ -312,39 +312,3 @@ def test_infinite_values(): expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) assert_array_equal(bin_mapper.transform(X), expected_binned_X) - - -def test_sample_weight_small_number_unique_values(): - # Make sure that when the number of unique values is small, the thresholds - # are the same whether sample_weight are passed or not. 
- n_samples = 255 - rng = np.random.RandomState(0) - X = rng.uniform(0, 1000, size=n_samples).reshape(-1, 1) - sample_weight = rng.uniform(0, 1000, size=n_samples) - - with_sw = _BinMapper().fit(X, sample_weight=sample_weight) - without_sw = _BinMapper().fit(X) - - np.testing.assert_allclose(with_sw.bin_thresholds_, - without_sw.bin_thresholds_) - -def test_sample_weight_equiv_duplication(): - from sklearn.ensemble._hist_gradient_boosting.binning import _weighted_quantile - - n_samples = 500 - n_quantiles = 100 - rng = np.random.RandomState(0) - - X = rng.uniform(0, 1000, size=n_samples) - lim = n_samples // 2 - X_dup = np.r_[X, X[:lim]] - - sample_weight = np.ones(n_samples) - sample_weight[:lim] = 2 - - quantiles = np.linspace(0, 1, n_quantiles)[1:-1] - no_dup_sw = _weighted_quantile(X, quantiles, sample_weight=sample_weight) - dup_no_sw = _weighted_quantile(X_dup, quantiles, sample_weight=None) - print(dup_no_sw - no_dup_sw) - np.testing.assert_allclose(dup_no_sw, no_dup_sw) # fails - From 9ff57a63124d7f913041b3795a66f193ee5cbf86 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 30 Oct 2019 16:56:42 +0100 Subject: [PATCH 36/55] add content to user guide --- doc/modules/ensemble.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index ec7b337a20593..ff8969b2d329c 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -941,6 +941,31 @@ If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. +Sample Weight Support +--------------------- + +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` sample weights during :term:`fit`. + +The following toy example demonstrates how the model ignores the samples with +zero sample weights: + + >>> X = [[1, 0], + ... [1, 0], + ... [1, 0], + ... [0, 1]] + >>> y = [0, 0, 1, 0] + >>> # ignore the first 2 training samples by setting their weight to 0 + >>> sample_weight = [0, 0, 1, 1] + >>> gb = HistGradientBoostingClassifier(min_samples_leaf=1) + >>> gb.fit(X, y, sample_weight=sample_weight) + HistGradientBoostingClassifier(...) + >>> gb.predict([[1, 0]]) + array([1]) + +As you can see, the `[1, 0]` is comfortably classified as `1` since the first +two samples are ignored due to their sample weights. 
+ Low-level parallelism --------------------- From 2ffee14a8ea724080ea08a6491368d3ce9a5c5da Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 31 Oct 2019 09:52:24 +0100 Subject: [PATCH 37/55] remove todo, fix pep8 --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 5 ++--- .../tests/test_gradient_boosting.py | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index fca2044279f27..c1059551981ed 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -111,7 +111,8 @@ def fit(self, X, y, sample_weight=None): # Do not create unit sample weights by default to later skip some # computation if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + sample_weight = _check_sample_weight(sample_weight, X, + dtype=np.float64) rng = check_random_state(self.random_state) @@ -474,8 +475,6 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, if is_classifier(self): y_small_train = self.classes_[y_small_train.astype(int)] - # TODO: handle when _scorer doesn't accept sample_weight, but - # sample_weight is provided if sample_weight_small_train is None: self.train_score_.append( self.scorer_(self, X_binned_small_train, y_small_train) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 83e1d08957522..a9479b09f6412 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -567,7 +567,8 @@ def test_sum_hessians_are_sample_weight(loss_name): rng = np.random.RandomState(0) n_samples = 1000 n_features = 2 - X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=rng) + X, y = make_regression(n_samples=n_samples, n_features=n_features, + random_state=rng) bin_mapper = _BinMapper() X_binned = bin_mapper.fit_transform(X) @@ -597,5 +598,5 @@ def test_sum_hessians_are_sample_weight(loss_name): for feature_idx in range(n_features): for bin_idx in range(bin_mapper.n_bins): - assert histograms[feature_idx][bin_idx]['sum_hessians'] == ( - pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5)) + assert histograms[feature_idx][bin_idx]['sum_hessians'] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5)) From d6e0482d1d19db60a85c10a439c03b70007076dc Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 1 Nov 2019 12:56:14 +0100 Subject: [PATCH 38/55] pdp notimplementederror --- .../gradient_boosting.py | 9 +++++++ .../tests/test_partial_dependence.py | 26 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index c1059551981ed..f0cb2c8906b03 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -113,6 +113,7 @@ def fit(self, X, y, sample_weight=None): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + self._fitted_with_sw = True rng = check_random_state(self.random_state) @@ -669,6 +670,14 @@ def _compute_partial_dependence_recursion(self, grid, target_features): 
(n_trees_per_iteration, n_samples) The value of the partial dependence function on each grid point. """ + + if getattr(self, '_fitted_with_sw', False): + raise NotImplementedError("{} does not support partial dependence" + " plots when sample weights were given " + "during fit time.".format( + self.__class__.__name__ + )) + grid = np.asarray(grid, dtype=X_DTYPE, order='C') averaged_predictions = np.zeros( (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 8d3194f34249f..27d4678ee9ac0 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -442,6 +442,32 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 +def test_partial_dependence_sample_weight_hgbt(): + # Test near perfect correlation between partial dependence and diagonal + # when sample weights emphasize y = x predictions + # non-regression test for #13193 + # TODO: check properly when sample weight is supported in pdp of HGBT + N = 1000 + rng = np.random.RandomState(123456) + mask = rng.randint(2, size=N, dtype=bool) + + x = rng.rand(N) + # set y = x on mask and y = -x outside + y = x.copy() + y[~mask] = -y[~mask] + X = np.c_[mask, x] + # sample weights to emphasize data points where y = x + sample_weight = np.ones(N) + sample_weight[mask] = 1000. + + clf = HistGradientBoostingRegressor(random_state=1) + clf.fit(X, y, sample_weight=sample_weight) + + with pytest.raises(NotImplementedError, + match="does not support partial dependence"): + partial_dependence(clf, X, features=[1]) + + def test_partial_dependence_pipeline(): # check that the partial dependence support pipeline iris = load_iris() From 942d542c635c2726c10549d6310b6aea19f5019a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 4 Nov 2019 12:58:19 +0100 Subject: [PATCH 39/55] apply Nicolas's suggestions --- doc/modules/ensemble.rst | 13 +++++++---- doc/whats_new/v0.22.rst | 6 ++--- .../_hist_gradient_boosting/_loss.pyx | 2 +- .../gradient_boosting.py | 14 ++++++------ .../ensemble/_hist_gradient_boosting/loss.py | 9 ++++---- .../tests/test_gradient_boosting.py | 18 ++++++++++----- .../tests/test_loss.py | 3 +++ .../tests/test_partial_dependence.py | 22 +++---------------- sklearn/utils/validation.py | 16 +++++++++----- 9 files changed, 52 insertions(+), 51 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index ff8969b2d329c..b0e0ab5478ec7 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -842,8 +842,7 @@ leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees. The API of these estimators is slightly different, and some of the features from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` -are not yet supported: in particular sample weights, and some loss -functions. +are not yet supported, for instance some loss functions. These estimators are still **experimental**: their predictions and their API might change without any deprecation cycle. To use them, you @@ -941,11 +940,12 @@ If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. 
-Sample Weight Support
+Sample weight support
 ---------------------
 
 :class:`HistGradientBoostingClassifier` and
-:class:`HistGradientBoostingRegressor` sample weights during :term:`fit`.
+:class:`HistGradientBoostingRegressor` support sample weights during
+:term:`fit`.
 
 The following toy example demonstrates how the model ignores the samples with
 zero sample weights:
@@ -966,6 +966,11 @@ zero sample weights:
 As you can see, the `[1, 0]` is comfortably classified as `1` since the first
 two samples are ignored due to their sample weights.
 
+Implementation detail: taking sample weights into accounts amounts to
+multiplying the gradients (and the hessians) by the sample weights. Note that
+the binning stage (specifically the quantiles computation) does not take the
+weights into account.
+
 Low-level parallelism
 ---------------------
 
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index a4baf57c8e13e..a662724fd6390 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -209,15 +209,15 @@ Changelog
 ` and :user:`Caio Oliveira ` and :pr:`15138` by
 :user:`Jon Cusick `..
 
-- Many improvements were made to
+- |MajorFeature| Many improvements were made to
   :class:`ensemble.HistGradientBoostingClassifier` and
   :class:`ensemble.HistGradientBoostingRegressor`:
 
-  - |MajorFeature| Estimators now natively support dense data with missing
+  - |Feature| Estimators now natively support dense data with missing
     values both for training and predicting. They also support infinite
     values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_
     and `Olivier Grisel`_.
-  - |MajorFeature| Estimators now support :term:`sample_weight`. :pr:`14696` by
+  - |Feature| Estimators now support :term:`sample_weight`. :pr:`14696` by
     `Adrin Jalali`_ and `Nicolas Hug`_.
   - |Feature| Estimators now have an additional `warm_start` parameter that
     enables warm starting. :pr:`14012` by :user:`Johann Faouzi `.
diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
index ef6783156d804..821a81a48fcf3 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
@@ -146,9 +146,9 @@ def _update_gradients_hessians_categorical_crossentropy(
             p[i, k] = raw_predictions[k, i]  # prepare softmax
         _compute_softmax(p, i)
         # then update gradients and hessians
+        sw = sample_weight[i]
         for k in range(prediction_dim):
             p_i_k = p[i, k]
-            sw = sample_weight[i]
             gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw
             hessians[k, i] = (p_i_k * (1.
- p_i_k)) * sw diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f0cb2c8906b03..67f22a717a94f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -110,9 +110,11 @@ def fit(self, X, y, sample_weight=None): check_consistent_length(X, y) # Do not create unit sample weights by default to later skip some # computation + sample_weight = _check_sample_weight(sample_weight, X, + dtype=np.float64, + return_ones=False) + # TODO: remove when PDP suports sample weights if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, - dtype=np.float64) self._fitted_with_sw = True rng = check_random_state(self.random_state) @@ -183,11 +185,9 @@ def fit(self, X, y, sample_weight=None): n_bins = self.max_bins + 1 # + 1 for missing values self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=self._random_seed) - X_binned_train = self._bin_data(X_train, sample_weight_train, - is_training_data=True) + X_binned_train = self._bin_data(X_train, is_training_data=True) if X_val is not None: - X_binned_val = self._bin_data(X_val, sample_weight_val, - is_training_data=False) + X_binned_val = self._bin_data(X_val, is_training_data=False) else: X_binned_val = None @@ -545,7 +545,7 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _bin_data(self, X, sample_weight, is_training_data): + def _bin_data(self, X, is_training_data): """Bin data X. If is_training_data, then set the bin_mapper_ attribute. diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 4489309d06668..25da3a78384bb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -30,15 +30,13 @@ class BaseLoss(ABC): """Base class for a loss.""" def __call__(self, y_true, raw_predictions, sample_weight): - """Return the weighted average loss - """ + """Return the weighted average loss""" return np.average(self.pointwise_loss(y_true, raw_predictions), weights=sample_weight) @abstractmethod def pointwise_loss(self, y_true, raw_predictions): - """Return loss value for each input - """ + """Return loss value for each input""" # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. 
The trees are trained to @@ -261,7 +259,8 @@ def update_leaves_values(self, grower, y_true, raw_predictions, - raw_predictions[indices]) else: median_res = _weighted_percentile(y_true[indices] - - raw_predictions[indices]) + - raw_predictions[indices], + percentile=50) leaf.value = grower.shrinkage * median_res # Note that the regularization is ignored here diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index a9479b09f6412..51f10f47a17a8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -416,7 +416,7 @@ def test_consistent_lengths(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) y = np.array([0, 0, 1, 1]) sample_weight = np.array([.1, .3, .1]) - gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) + gbdt = HistGradientBoostingRegressor() with pytest.raises(ValueError, match=r"sample_weight.shape == \(3,\), expected"): gbdt.fit(X, y, sample_weight) @@ -465,7 +465,10 @@ def test_string_target_early_stopping(scoring): gbrt.fit(X, y) -def test_non_uniform_weights_toy_edge_case_reg(): +def test_zero_sample_weights_regression(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + X = [[1, 0], [1, 0], [1, 0], @@ -478,7 +481,10 @@ def test_non_uniform_weights_toy_edge_case_reg(): assert gb.predict([[1, 0]])[0] > 0.5 -def test_non_uniform_weights_toy_edge_case_clf(): +def test_zero_sample_weights_classification(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + X = [[1, 0], [1, 0], [1, 0], @@ -516,8 +522,8 @@ def test_sample_weight_effect(problem, duplication, seed): # High level test to make sure that duplicating a sample is equivalent to # giving it weight of 2. - # fails for n_samples > 255 because binning implementation isn't strictly - # equivalent to just duplicating samples. Keeping n_samples <= 255 makes + # fails for n_samples > 255 because binning does not take sample weights + # into account. Keeping n_samples <= 255 makes # sure only unique values are used so SW have no effect on binning. n_samples = 255 n_features = 2 @@ -552,7 +558,7 @@ def test_sample_weight_effect(problem, duplication, seed): est_sw = clone(est).fit(X, y, sample_weight=sample_weight) est_dup = clone(est).fit(X_dup, y_dup) - # Check decision function instead of just classes for classification + # checking raw_predict is stricter than just predict for classification assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index ee6b383543a06..2a27b30f78df7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -283,5 +283,8 @@ def test_init_gradient_and_hessians_sample_weight(): _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=np.ones(n_samples)) + # the `hessians_are_constant` is true for the class, but not for the + # instance. 
+ assert _LOSSES['least_squares'].hessians_are_constant # still true assert not loss.hessians_are_constant assert hessians.shape == (prediction_dim, n_samples) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 27d4678ee9ac0..359d95a2c5ebd 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -442,26 +442,10 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 -def test_partial_dependence_sample_weight_hgbt(): - # Test near perfect correlation between partial dependence and diagonal - # when sample weights emphasize y = x predictions - # non-regression test for #13193 - # TODO: check properly when sample weight is supported in pdp of HGBT - N = 1000 - rng = np.random.RandomState(123456) - mask = rng.randint(2, size=N, dtype=bool) - - x = rng.rand(N) - # set y = x on mask and y = -x outside - y = x.copy() - y[~mask] = -y[~mask] - X = np.c_[mask, x] - # sample weights to emphasize data points where y = x - sample_weight = np.ones(N) - sample_weight[mask] = 1000. - +def check_hist_gbdt_sw_not_supported(): + # TODO: remove/fix when PDP supports HGBT with sample weights clf = HistGradientBoostingRegressor(random_state=1) - clf.fit(X, y, sample_weight=sample_weight) + clf.fit(X, y, sample_weight=np.ones(len(X))) with pytest.raises(NotImplementedError, match="does not support partial dependence"): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index dad56850f2235..a1231fb56761a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1020,13 +1020,11 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_sample_weight(sample_weight, X, dtype=None): +def _check_sample_weight(sample_weight, X, dtype=None, return_ones=True): """Validate sample weights. - Note that passing sample_weight=None will output an array of ones. - Therefore, in some cases, you may want to protect the call with: - if sample_weight is not None: - sample_weight = _check_sample_weight(...) + Note that passing sample_weight=None will output an array of ones if + ``return_ones=True``, otherwise ``None``. Parameters ---------- @@ -1043,9 +1041,13 @@ def _check_sample_weight(sample_weight, X, dtype=None): is be allocated. If `dtype` is not one of `float32`, `float64`, `None`, the output will be of dtype `float64`. + return_ones: boolean, default=True + If ``True``, returns an array of ones if ``sample_weight is None``. + Otherwise returns ``None`` when ``sample_weight is None``. + Returns ------- - sample_weight : ndarray, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,), or None Validated sample weight. It is guaranteed to be "C" contiguous. 
""" n_samples = _num_samples(X) @@ -1053,6 +1055,8 @@ def _check_sample_weight(sample_weight, X, dtype=None): if dtype is not None and dtype not in [np.float32, np.float64]: dtype = np.float64 + if sample_weight is None and not return_ones: + sample_weight = None if sample_weight is None or isinstance(sample_weight, numbers.Number): if sample_weight is None: sample_weight = np.ones(n_samples, dtype=dtype) From 45b360cf1bc08c14061643787d340648124faefd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 4 Nov 2019 13:00:48 +0100 Subject: [PATCH 40/55] fix --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 25da3a78384bb..bd9c2f84debe5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -244,7 +244,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, gradients, hessians, y_true, raw_predictions, sample_weight) def update_leaves_values(self, grower, y_true, raw_predictions, - sample_weights): + sample_weight): # Update the values predicted by the tree with # median(y_true - raw_predictions). # See note about need_update_leaves_values in BaseLoss. @@ -254,12 +254,13 @@ def update_leaves_values(self, grower, y_true, raw_predictions, # requires a cython version of median() for leaf in grower.finalized_leaves: indices = leaf.sample_indices - if sample_weights is None: + if sample_weight is None: median_res = np.median(y_true[indices] - raw_predictions[indices]) else: median_res = _weighted_percentile(y_true[indices] - raw_predictions[indices], + sample_weight=sample_weight, percentile=50) leaf.value = grower.shrinkage * median_res # Note that the regularization is ignored here From fc2ad10af6840085c30c288212144db4212efb76 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 4 Nov 2019 14:47:24 +0100 Subject: [PATCH 41/55] revert return_ones --- .../_hist_gradient_boosting/gradient_boosting.py | 7 +++---- sklearn/utils/validation.py | 16 ++++++---------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 2a064714d9b6a..2df3ce7547ee8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -110,11 +110,10 @@ def fit(self, X, y, sample_weight=None): check_consistent_length(X, y) # Do not create unit sample weights by default to later skip some # computation - sample_weight = _check_sample_weight(sample_weight, X, - dtype=np.float64, - return_ones=False) - # TODO: remove when PDP suports sample weights if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=np.float64) + # TODO: remove when PDP suports sample weights self._fitted_with_sw = True rng = check_random_state(self.random_state) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a1231fb56761a..dad56850f2235 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1020,11 +1020,13 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_sample_weight(sample_weight, X, dtype=None, return_ones=True): +def _check_sample_weight(sample_weight, X, dtype=None): """Validate 
sample weights. - Note that passing sample_weight=None will output an array of ones if - ``return_ones=True``, otherwise ``None``. + Note that passing sample_weight=None will output an array of ones. + Therefore, in some cases, you may want to protect the call with: + if sample_weight is not None: + sample_weight = _check_sample_weight(...) Parameters ---------- @@ -1041,13 +1043,9 @@ def _check_sample_weight(sample_weight, X, dtype=None, return_ones=True): is be allocated. If `dtype` is not one of `float32`, `float64`, `None`, the output will be of dtype `float64`. - return_ones: boolean, default=True - If ``True``, returns an array of ones if ``sample_weight is None``. - Otherwise returns ``None`` when ``sample_weight is None``. - Returns ------- - sample_weight : ndarray of shape (n_samples,), or None + sample_weight : ndarray, shape (n_samples,) Validated sample weight. It is guaranteed to be "C" contiguous. """ n_samples = _num_samples(X) @@ -1055,8 +1053,6 @@ def _check_sample_weight(sample_weight, X, dtype=None, return_ones=True): if dtype is not None and dtype not in [np.float32, np.float64]: dtype = np.float64 - if sample_weight is None and not return_ones: - sample_weight = None if sample_weight is None or isinstance(sample_weight, numbers.Number): if sample_weight is None: sample_weight = np.ones(n_samples, dtype=dtype) From ac3df845b304c848ed7087d00f7e0b9430a4a31a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 5 Nov 2019 10:30:50 +0100 Subject: [PATCH 42/55] add sw to the benchmark --- benchmarks/bench_hist_gradient_boosting.py | 31 +++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 9bfd6d743ee4f..b801ea43512f6 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -32,6 +32,9 @@ parser.add_argument('--n-samples-max', type=int, default=int(1e6)) parser.add_argument('--n-features', type=int, default=20) parser.add_argument('--max-bins', type=int, default=255) +parser.add_argument('--random-sample-weights', action="store_true", + default=False, + help="generate and use random sample weights") args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes @@ -46,6 +49,7 @@ def get_estimator_and_data(): n_features=args.n_features, n_classes=args.n_classes, n_clusters_per_class=1, + n_informative=args.n_classes, random_state=0) return X, y, HistGradientBoostingClassifier elif args.problem == 'regression': @@ -60,8 +64,19 @@ def get_estimator_and_data(): np.bool) X[mask] = np.nan -X_train_, X_test_, y_train_, y_test_ = train_test_split( - X, y, test_size=0.5, random_state=0) +if args.random_sample_weights: + sample_weight = np.random.rand(len(X)) * 10 +else: + sample_weight = None + +if sample_weight is not None: + (X_train_, X_test_, y_train_, y_test_, + sample_weight_train_, _) = train_test_split( + X, y, sample_weight, test_size=0.5, random_state=0) +else: + X_train_, X_test_, y_train_, y_test_ = train_test_split( + X, y, test_size=0.5, random_state=0) + sample_weight_train_ = None def one_run(n_samples): @@ -69,6 +84,10 @@ def one_run(n_samples): X_test = X_test_[:n_samples] y_train = y_train_[:n_samples] y_test = y_test_[:n_samples] + if sample_weight is not None: + sample_weight_train = sample_weight_train_[:n_samples] + else: + sample_weight_train = None assert X_train.shape[0] == n_samples assert X_test.shape[0] == n_samples print("Data size: %d samples train, %d samples test." 
@@ -93,7 +112,7 @@ def one_run(n_samples): if loss == 'default': loss = 'least_squares' est.set_params(loss=loss) - est.fit(X_train, y_train) + est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic tic = time() sklearn_score = est.score(X_test, y_test) @@ -110,7 +129,7 @@ def one_run(n_samples): lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') tic = time() - lightgbm_est.fit(X_train, y_train) + lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train) lightgbm_fit_duration = time() - tic tic = time() lightgbm_score = lightgbm_est.score(X_test, y_test) @@ -127,7 +146,7 @@ def one_run(n_samples): xgb_est = get_equivalent_estimator(est, lib='xgboost') tic = time() - xgb_est.fit(X_train, y_train) + xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train) xgb_fit_duration = time() - tic tic = time() xgb_score = xgb_est.score(X_test, y_test) @@ -144,7 +163,7 @@ def one_run(n_samples): cat_est = get_equivalent_estimator(est, lib='catboost') tic = time() - cat_est.fit(X_train, y_train) + cat_est.fit(X_train, y_train, sample_weight=sample_weight_train) cat_fit_duration = time() - tic tic = time() cat_score = cat_est.score(X_test, y_test) From 577a4f3eb05869680e6878b0b307a83c158c09be Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 5 Nov 2019 11:08:19 +0100 Subject: [PATCH 43/55] check -> test in test --- sklearn/inspection/tests/test_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 359d95a2c5ebd..4d54c00e5c356 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -442,7 +442,7 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 -def check_hist_gbdt_sw_not_supported(): +def test_hist_gbdt_sw_not_supported(): # TODO: remove/fix when PDP supports HGBT with sample weights clf = HistGradientBoostingRegressor(random_state=1) clf.fit(X, y, sample_weight=np.ones(len(X))) From 916eaa5ade26616b2872257fe86a5d6f28fd1eed Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 5 Nov 2019 16:09:31 +0100 Subject: [PATCH 44/55] typo --- doc/modules/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index b0e0ab5478ec7..83c68daa6c115 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -966,7 +966,7 @@ zero sample weights: As you can see, the `[1, 0]` is comfortably classified as `1` since the first two samples are ignored due to their sample weights. -Implementation detail: taking sample weights into accounts amounts to +Implementation detail: taking sample weights into account amounts to multiplying the gradients (and the hessians) by the sample weights. Note that the binning stage (specifically the quantiles computation) does not take the weights into account. 
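[Editor's note -- not part of any patch in this series] The documentation
paragraph above summarizes the core mechanism of this PR: sample weights enter
the boosting process only by scaling the per-sample gradients and hessians,
while the binning/quantile stage ignores them. Below is a minimal NumPy sketch
of that idea for a least-squares-type loss whose gradient is
``raw_prediction - y_true`` and whose unweighted hessian is the unit hessian
mentioned in the loss docstrings; all variable names and values here are
illustrative only, not taken from the scikit-learn API::

    import numpy as np

    y_true = np.array([1.0, 2.0, 3.0])
    raw_predictions = np.array([1.5, 1.0, 2.0])
    sample_weight = np.array([1.0, 2.0, 0.0])  # a weight of 0 ignores a sample

    # Unweighted least squares: gradient = prediction - target, hessian = 1
    gradients = raw_predictions - y_true
    hessians = np.ones_like(gradients)

    # Taking sample weights into account: scale both by the weights
    gradients *= sample_weight
    hessians *= sample_weight

    print(gradients)  # [ 0.5 -2.   0. ] -> zero-weight sample contributes nothing
    print(hessians)   # [ 1.   2.   0. ] -> sum of hessians equals sum of weights

Because a weight of 2 scales a sample's gradient and hessian exactly as
duplicating that sample would, ``test_sample_weight_effect`` above can compare
a weighted fit against a fit on duplicated rows, and it keeps
``n_samples <= 255`` precisely because the binning stage does not see the
weights. The ``test_sum_hessians_are_sample_weight`` test relies on the same
identity shown in the last line of the sketch.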
From 1938175fc46956539f0ddd0ce6d2ea40e584b105 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 11 Nov 2019 12:47:34 +0100 Subject: [PATCH 45/55] pass sample_weight to loss's init, and set hessians_are_constant there --- .../gradient_boosting.py | 18 +++++----- .../ensemble/_hist_gradient_boosting/loss.py | 34 +++++++++++++------ .../tests/test_gradient_boosting.py | 2 +- .../tests/test_loss.py | 27 +++++++-------- 4 files changed, 47 insertions(+), 34 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 2df3ce7547ee8..71e7777d6fb17 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -138,7 +138,7 @@ def fit(self, X, y, sample_weight=None): # data. self._in_fit = True - self.loss_ = self._get_loss() + self.loss_ = self._get_loss(sample_weight=sample_weight) self.do_early_stopping_ = (self.n_iter_no_change is not None and self.n_iter_no_change > 0) @@ -694,7 +694,7 @@ def _more_tags(self): return {'allow_nan': True} @abstractmethod - def _get_loss(self): + def _get_loss(self, sample_weight): pass @abstractmethod @@ -882,8 +882,8 @@ def _encode_y(self, y): y = y.astype(Y_DTYPE, copy=False) return y - def _get_loss(self): - return _LOSSES[self.loss]() + def _get_loss(self, sample_weight): + return _LOSSES[self.loss](sample_weight=sample_weight) class HistGradientBoostingClassifier(BaseHistGradientBoosting, @@ -1111,7 +1111,7 @@ def _encode_y(self, y): encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y - def _get_loss(self): + def _get_loss(self, sample_weight): if (self.loss == 'categorical_crossentropy' and self.n_trees_per_iteration_ == 1): raise ValueError("'categorical_crossentropy' is not suitable for " @@ -1120,8 +1120,10 @@ def _get_loss(self): if self.loss == 'auto': if self.n_trees_per_iteration_ == 1: - return _LOSSES['binary_crossentropy']() + return _LOSSES['binary_crossentropy']( + sample_weight=sample_weight) else: - return _LOSSES['categorical_crossentropy']() + return _LOSSES['categorical_crossentropy']( + sample_weight=sample_weight) - return _LOSSES[self.loss]() + return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index bd9c2f84debe5..8b8e54ca7c0c1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -28,6 +28,8 @@ class BaseLoss(ABC): """Base class for a loss.""" + def __init__(self, hessians_are_constant): + self.hessians_are_constant = hessians_are_constant def __call__(self, y_true, raw_predictions, sample_weight): """Return the weighted average loss""" @@ -81,12 +83,6 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim, shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) - if sample_weight is not None: - # If sample weights are provided, the hessians and gradients - # are multiplied by sample_weight, which means the hessians are - # equal to sample weights. - self.hessians_are_constant = False - if self.hessians_are_constant: # If the hessians are constant, we consider they are equal to 1. # - This is correct for the half LS loss @@ -161,8 +157,15 @@ class LeastSquares(BaseLoss): the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). 
""" + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. - hessians_are_constant = True + hessians_are_constant = True + if sample_weight is not None: + hessians_are_constant = False + super().__init__(hessians_are_constant=hessians_are_constant) def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to @@ -200,8 +203,16 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. + + hessians_are_constant = True + if sample_weight is not None: + hessians_are_constant = False + super().__init__(hessians_are_constant=hessians_are_constant) - hessians_are_constant = True # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -278,7 +289,9 @@ class BinaryCrossEntropy(BaseLoss): section 4.4.1 (about logistic regression). """ - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + inverse_link_function = staticmethod(expit) def pointwise_loss(self, y_true, raw_predictions): @@ -330,7 +343,8 @@ class CategoricalCrossEntropy(BaseLoss): cross-entropy to more than 2 classes. """ - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) def pointwise_loss(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 51f10f47a17a8..0c0b6d2b05896 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -580,7 +580,7 @@ def test_sum_hessians_are_sample_weight(loss_name): sample_weight = rng.normal(size=n_samples) - loss = _LOSSES[loss_name]() + loss = _LOSSES[loss_name](sample_weight=sample_weight) gradients, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight) raw_predictions = rng.normal(size=(1, n_samples)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 2a27b30f78df7..915dc300e4760 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -62,7 +62,7 @@ def test_derivatives(loss, x0, y_true): # using Halley's method with the first and second order derivatives # computed by the Loss instance. 
- loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=Y_DTYPE) x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) get_gradients, get_hessians = get_derivatives_helper(loss) @@ -105,7 +105,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): raw_predictions = rng.normal( size=(prediction_dim, n_samples) ).astype(Y_DTYPE) - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) get_gradients, get_hessians = get_derivatives_helper(loss) # only take gradients and hessians of first tree / class. @@ -139,7 +139,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): def test_baseline_least_squares(): rng = np.random.RandomState(0) - loss = _LOSSES['least_squares']() + loss = _LOSSES['least_squares'](sample_weight=None) y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar @@ -153,7 +153,7 @@ def test_baseline_least_squares(): def test_baseline_least_absolute_deviation(): rng = np.random.RandomState(0) - loss = _LOSSES['least_absolute_deviation']() + loss = _LOSSES['least_absolute_deviation'](sample_weight=None) y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar @@ -167,7 +167,7 @@ def test_baseline_least_absolute_deviation(): def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) - loss = _LOSSES['binary_crossentropy']() + loss = _LOSSES['binary_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) @@ -192,7 +192,7 @@ def test_baseline_categorical_crossentropy(): rng = np.random.RandomState(0) prediction_dim = 4 - loss = _LOSSES['categorical_crossentropy']() + loss = _LOSSES['categorical_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) baseline_prediction = loss.get_baseline_prediction(y_train, None, @@ -240,7 +240,7 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): else: sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) - loss_ = _LOSSES[loss]() + loss_ = _LOSSES[loss](sample_weight=sample_weight) baseline_prediction = loss_.get_baseline_prediction( y_true, None, prediction_dim @@ -268,23 +268,20 @@ def test_init_gradient_and_hessians_sample_weight(): # hessians_are_constant attribute, and consequently the shape of the # hessians array. - loss = _LOSSES['least_squares']() - assert loss.hessians_are_constant - prediction_dim = 2 n_samples = 5 - + sample_weight = None + loss = _LOSSES['least_squares'](sample_weight=sample_weight) _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=None) assert loss.hessians_are_constant assert hessians.shape == (1, 1) + sample_weight = np.ones(n_samples) + loss = _LOSSES['least_squares'](sample_weight=sample_weight) _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, - sample_weight=np.ones(n_samples)) - # the `hessians_are_constant` is true for the class, but not for the - # instance. 
- assert _LOSSES['least_squares'].hessians_are_constant # still true + sample_weight=sample_weight) assert not loss.hessians_are_constant assert hessians.shape == (prediction_dim, n_samples) From 469b6d9682c0d103f5cc56cf110b027ccf5a4eea Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 11 Nov 2019 16:34:56 +0100 Subject: [PATCH 46/55] simply hessians_are_constant --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 8b8e54ca7c0c1..e0fa9a951f9e4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -161,11 +161,7 @@ def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are # equal to sample weights. - - hessians_are_constant = True - if sample_weight is not None: - hessians_are_constant = False - super().__init__(hessians_are_constant=hessians_are_constant) + super().__init__(hessians_are_constant=sample_weight is None) def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to @@ -207,11 +203,7 @@ def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are # equal to sample weights. - - hessians_are_constant = True - if sample_weight is not None: - hessians_are_constant = False - super().__init__(hessians_are_constant=hessians_are_constant) + super().__init__(hessians_are_constant=sample_weight is None) # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to From b759142ac7ada080718bf6f596ebf4fed0592d7c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 6 Jan 2020 11:56:38 +0100 Subject: [PATCH 47/55] pass tests after merge --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d2cbfb4518c60..9069b33c8d5a6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -219,7 +219,7 @@ def fit(self, X, y, sample_weight=None): gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=self.n_trees_per_iteration_, - sample_weight=sample_weight + sample_weight=sample_weight_train ) # predictors is a matrix (list of lists) of TreePredictor objects @@ -326,7 +326,8 @@ def fit(self, X, y, sample_weight=None): # shape = (n_trees_per_iteration, n_samples). 
gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ + prediction_dim=self.n_trees_per_iteration_, + sample_weight=sample_weight_train ) for iteration in range(begin_at_stage, self.max_iter): From 76dc71080a11016d5e973301bc4c65c26d26292f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 6 Jan 2020 12:31:38 +0100 Subject: [PATCH 48/55] sample with replacement before binning --- .../_hist_gradient_boosting/binning.py | 25 +++++++++++++------ .../gradient_boosting.py | 14 ++++++++--- .../tests/test_binning.py | 2 +- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a4dec15763940..38df3df384bf2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,7 +16,8 @@ from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF -def _find_binning_thresholds(data, max_bins, subsample, random_state): +def _find_binning_thresholds(data, sample_weight, max_bins, subsample, + random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -25,6 +26,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): ---------- data : array-like, shape (n_samples, n_features) The data to bin. + sample_weight : ndarray of shape(n_samples,), or None + Sample weights associated with the data. max_bins: int The maximum number of bins to use for non-missing values. If for a given feature the number of unique values is less than ``max_bins``, @@ -46,9 +49,15 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): n_features``. """ rng = check_random_state(random_state) - if subsample is not None and data.shape[0] > subsample: - subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) - data = data.take(subset, axis=0) + sample_size = min(subsample, data.shape[0]) + if sample_weight is not None: + subset = rng.choice(np.arange(data.shape[0]), size=sample_size, + replace=True, + p=sample_weight / sample_weight.sum()) + else: + subset = rng.choice(np.arange(data.shape[0]), size=sample_size, + replace=True) + data = data.take(subset, axis=0) binning_thresholds = [] for f_idx in range(data.shape[1]): @@ -136,7 +145,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """Fit data X by computing the binning thresholds. The last bin is reserved for missing values, whether missing values @@ -146,8 +155,10 @@ def fit(self, X, y=None): ---------- X : array-like, shape (n_samples, n_features) The data to bin. - y: None + y : None Ignored. + sample_weight : ndarray of shape(n_samples,), or None + Sample weights associated with the data. 
Returns ------- @@ -161,7 +172,7 @@ def fit(self, X, y=None): X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, max_bins, subsample=self.subsample, + X, sample_weight, max_bins, subsample=self.subsample, random_state=self.random_state) self.n_bins_non_missing_ = np.array( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 9069b33c8d5a6..2d1970080377a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -184,9 +184,13 @@ def fit(self, X, y, sample_weight=None): n_bins = self.max_bins + 1 # + 1 for missing values self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=self._random_seed) - X_binned_train = self._bin_data(X_train, is_training_data=True) + X_binned_train = self._bin_data(X_train, + sample_weight=sample_weight_train, + is_training_data=True) if X_val is not None: - X_binned_val = self._bin_data(X_val, is_training_data=False) + X_binned_val = self._bin_data(X_val, + sample_weight=sample_weight_val, + is_training_data=False) else: X_binned_val = None @@ -554,7 +558,7 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _bin_data(self, X, is_training_data): + def _bin_data(self, X, sample_weight, is_training_data): """Bin data X. If is_training_data, then set the bin_mapper_ attribute. @@ -567,7 +571,9 @@ def _bin_data(self, X, is_training_data): X.nbytes / 1e9, description), end="", flush=True) tic = time() if is_training_data: - X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array + # F-aligned array + X_binned = self.bin_mapper_.fit_transform( + X, sample_weight=sample_weight) else: X_binned = self.bin_mapper_.transform(X) # F-aligned array # We convert the array to C-contiguous since predicting is faster diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 06e38d62f7638..15f43be2f7167 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -21,7 +21,7 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - return _find_binning_thresholds_orig(data, max_bins, subsample, + return _find_binning_thresholds_orig(data, None, max_bins, subsample, random_state) From 9bc22d9c467783d39d03ecea1d7184b37576b998 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 14 Jan 2020 15:37:10 +0100 Subject: [PATCH 49/55] pass ints to choice and don't always subsample --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 38df3df384bf2..9bb3952432fb2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -51,11 +51,11 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, rng = check_random_state(random_state) sample_size = min(subsample, data.shape[0]) if sample_weight is not None: - subset = rng.choice(np.arange(data.shape[0]), 
size=sample_size, + subset = rng.choice(data.shape[0], size=sample_size, replace=True, p=sample_weight / sample_weight.sum()) - else: - subset = rng.choice(np.arange(data.shape[0]), size=sample_size, + elif subsample is not None and data.shape[0] > subsample: + subset = rng.choice(data.shape[0], size=sample_size, replace=True) data = data.take(subset, axis=0) From a4877f9068194ef580861c60e60de795d4bdbcd5 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 14 Jan 2020 16:25:15 +0100 Subject: [PATCH 50/55] fix local var --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 9bb3952432fb2..a6721879d54d2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -49,15 +49,19 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, n_features``. """ rng = check_random_state(random_state) - sample_size = min(subsample, data.shape[0]) + if subsample: + sample_size = min(subsample, data.shape[0]) + else: + sample_size = data.shape[0] if sample_weight is not None: subset = rng.choice(data.shape[0], size=sample_size, replace=True, p=sample_weight / sample_weight.sum()) + data = data.take(subset, axis=0) elif subsample is not None and data.shape[0] > subsample: subset = rng.choice(data.shape[0], size=sample_size, replace=True) - data = data.take(subset, axis=0) + data = data.take(subset, axis=0) binning_thresholds = [] for f_idx in range(data.shape[1]): From d5f98d7afc61bd4faf2b3958d4c13ec16f6b91c6 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 11 Feb 2020 11:11:32 +0100 Subject: [PATCH 51/55] Revert "fix local var" This reverts commit a4877f9068194ef580861c60e60de795d4bdbcd5. --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a6721879d54d2..9bb3952432fb2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -49,19 +49,15 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, n_features``. """ rng = check_random_state(random_state) - if subsample: - sample_size = min(subsample, data.shape[0]) - else: - sample_size = data.shape[0] + sample_size = min(subsample, data.shape[0]) if sample_weight is not None: subset = rng.choice(data.shape[0], size=sample_size, replace=True, p=sample_weight / sample_weight.sum()) - data = data.take(subset, axis=0) elif subsample is not None and data.shape[0] > subsample: subset = rng.choice(data.shape[0], size=sample_size, replace=True) - data = data.take(subset, axis=0) + data = data.take(subset, axis=0) binning_thresholds = [] for f_idx in range(data.shape[1]): From cfaa05794379057e312809ef9e5526282d00437b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 11 Feb 2020 11:12:43 +0100 Subject: [PATCH 52/55] Revert "pass ints to choice and don't always subsample" This reverts commit 9bc22d9c467783d39d03ecea1d7184b37576b998. 
--- sklearn/ensemble/_hist_gradient_boosting/binning.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 9bb3952432fb2..38df3df384bf2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -51,11 +51,11 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, rng = check_random_state(random_state) sample_size = min(subsample, data.shape[0]) if sample_weight is not None: - subset = rng.choice(data.shape[0], size=sample_size, + subset = rng.choice(np.arange(data.shape[0]), size=sample_size, replace=True, p=sample_weight / sample_weight.sum()) - elif subsample is not None and data.shape[0] > subsample: - subset = rng.choice(data.shape[0], size=sample_size, + else: + subset = rng.choice(np.arange(data.shape[0]), size=sample_size, replace=True) data = data.take(subset, axis=0) From f5960f2afda9cf20595974471e36e1b09a19ef8c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 11 Feb 2020 11:12:52 +0100 Subject: [PATCH 53/55] Revert "sample with replacement before binning" This reverts commit 76dc71080a11016d5e973301bc4c65c26d26292f. --- .../_hist_gradient_boosting/binning.py | 25 ++++++------------- .../gradient_boosting.py | 14 +++-------- .../tests/test_binning.py | 2 +- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 38df3df384bf2..a4dec15763940 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,8 +16,7 @@ from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF -def _find_binning_thresholds(data, sample_weight, max_bins, subsample, - random_state): +def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -26,8 +25,6 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, ---------- data : array-like, shape (n_samples, n_features) The data to bin. - sample_weight : ndarray of shape(n_samples,), or None - Sample weights associated with the data. max_bins: int The maximum number of bins to use for non-missing values. If for a given feature the number of unique values is less than ``max_bins``, @@ -49,15 +46,9 @@ def _find_binning_thresholds(data, sample_weight, max_bins, subsample, n_features``. """ rng = check_random_state(random_state) - sample_size = min(subsample, data.shape[0]) - if sample_weight is not None: - subset = rng.choice(np.arange(data.shape[0]), size=sample_size, - replace=True, - p=sample_weight / sample_weight.sum()) - else: - subset = rng.choice(np.arange(data.shape[0]), size=sample_size, - replace=True) - data = data.take(subset, axis=0) + if subsample is not None and data.shape[0] > subsample: + subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) + data = data.take(subset, axis=0) binning_thresholds = [] for f_idx in range(data.shape[1]): @@ -145,7 +136,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None): """Fit data X by computing the binning thresholds. 
The last bin is reserved for missing values, whether missing values @@ -155,10 +146,8 @@ def fit(self, X, y=None, sample_weight=None): ---------- X : array-like, shape (n_samples, n_features) The data to bin. - y : None + y: None Ignored. - sample_weight : ndarray of shape(n_samples,), or None - Sample weights associated with the data. Returns ------- @@ -172,7 +161,7 @@ def fit(self, X, y=None, sample_weight=None): X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, sample_weight, max_bins, subsample=self.subsample, + X, max_bins, subsample=self.subsample, random_state=self.random_state) self.n_bins_non_missing_ = np.array( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 2d1970080377a..9069b33c8d5a6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -184,13 +184,9 @@ def fit(self, X, y, sample_weight=None): n_bins = self.max_bins + 1 # + 1 for missing values self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=self._random_seed) - X_binned_train = self._bin_data(X_train, - sample_weight=sample_weight_train, - is_training_data=True) + X_binned_train = self._bin_data(X_train, is_training_data=True) if X_val is not None: - X_binned_val = self._bin_data(X_val, - sample_weight=sample_weight_val, - is_training_data=False) + X_binned_val = self._bin_data(X_val, is_training_data=False) else: X_binned_val = None @@ -558,7 +554,7 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _bin_data(self, X, sample_weight, is_training_data): + def _bin_data(self, X, is_training_data): """Bin data X. If is_training_data, then set the bin_mapper_ attribute. 
@@ -571,9 +567,7 @@ def _bin_data(self, X, sample_weight, is_training_data): X.nbytes / 1e9, description), end="", flush=True) tic = time() if is_training_data: - # F-aligned array - X_binned = self.bin_mapper_.fit_transform( - X, sample_weight=sample_weight) + X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array else: X_binned = self.bin_mapper_.transform(X) # F-aligned array # We convert the array to C-contiguous since predicting is faster diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 15f43be2f7167..06e38d62f7638 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -21,7 +21,7 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - return _find_binning_thresholds_orig(data, None, max_bins, subsample, + return _find_binning_thresholds_orig(data, max_bins, subsample, random_state) From 13d120a21b2a7cf456d480424d5ae7f0ef178288 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 22 Feb 2020 16:42:07 -0500 Subject: [PATCH 54/55] CLN Address comments --- doc/whats_new/v0.22.rst | 2 -- doc/whats_new/v0.23.rst | 4 ++++ .../_hist_gradient_boosting/gradient_boosting.py | 9 ++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 4fcce372a10af..399f6352410e9 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -409,8 +409,6 @@ Changelog values both for training and predicting. They also support infinite values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_ and `Olivier Grisel`_. - - |Feature| Estimators now support :term:`sample_weight`. :pr:`14696` by - `Adrin Jalali`_ and `Nicolas Hug`_. - |Feature| Estimators now have an additional `warm_start` parameter that enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. - |Feature| :func:`inspection.partial_dependence` and diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index a1830229b57ec..463a7e626438c 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -105,6 +105,10 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now support + :term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_. + - |API| Added boolean `verbose` flag to classes: :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. 
:pr:`15991` by :user:`Sam Bail `, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e3fc25231e9ed..3dd14690aa560 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -681,11 +681,10 @@ def _compute_partial_dependence_recursion(self, grid, target_features): """ if getattr(self, '_fitted_with_sw', False): - raise NotImplementedError("{} does not support partial dependence" - " plots when sample weights were given " - "during fit time.".format( - self.__class__.__name__ - )) + raise NotImplementedError("{} does not support partial dependence " + "plots with the 'recursion' method when " + "sample weights were given during fit " + "time.".format(self.__class__.__name__)) grid = np.asarray(grid, dtype=X_DTYPE, order='C') averaged_predictions = np.zeros( From dcccf01e41383ad64b3a485c9fa06c10814d0f86 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 24 Feb 2020 11:24:07 +0100 Subject: [PATCH 55/55] address Thomas's comments --- doc/modules/ensemble.rst | 2 ++ .../_hist_gradient_boosting/gradient_boosting.py | 5 +++-- sklearn/ensemble/_hist_gradient_boosting/loss.py | 2 +- .../tests/test_gradient_boosting.py | 9 ++++----- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 40e1e102a4fa6..a4c302bc1f8a2 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -977,6 +977,8 @@ zero sample weights: HistGradientBoostingClassifier(...) >>> gb.predict([[1, 0]]) array([1]) + >>> gb.predict_proba([[1, 0]])[0, 1] + 0.99... As you can see, the `[1, 0]` is comfortably classified as `1` since the first two samples are ignored due to their sample weights. diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e3fc25231e9ed..1641a593a01f6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -492,7 +492,7 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, else: self.train_score_.append( self.scorer_(self, X_binned_small_train, y_small_train, - sample_weight_small_train) + sample_weight=sample_weight_small_train) ) if self._use_validation_data: @@ -504,7 +504,8 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, ) else: self.validation_score_.append( - self.scorer_(self, X_binned_val, y_val, sample_weight_val) + self.scorer_(self, X_binned_val, y_val, + sample_weight=sample_weight_val) ) return self._should_stop(self.validation_score_) else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index a7e34d292b4e3..2dbf8bd58773e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -183,10 +183,10 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, # return a view. 
raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - hessians = hessians.reshape(-1) if sample_weight is None: _update_gradients_least_squares(gradients, y_true, raw_predictions) else: + hessians = hessians.reshape(-1) _update_gradients_hessians_least_squares(gradients, hessians, y_true, raw_predictions, sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b417fc5471768..029f2f6822f23 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -516,8 +516,7 @@ def test_zero_sample_weights_classification(): 'multiclass_classification' )) @pytest.mark.parametrize('duplication', ('half', 'all')) -@pytest.mark.parametrize('seed', range(1)) -def test_sample_weight_effect(problem, duplication, seed): +def test_sample_weight_effect(problem, duplication): # High level test to make sure that duplicating a sample is equivalent to # giving it weight of 2. @@ -528,14 +527,14 @@ def test_sample_weight_effect(problem, duplication, seed): n_features = 2 if problem == 'regression': X, y = make_regression(n_samples=n_samples, n_features=n_features, - n_informative=n_features, random_state=seed) + n_informative=n_features, random_state=0) Klass = HistGradientBoostingRegressor else: n_classes = 2 if problem == 'binary_classification' else 3 X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_features, n_redundant=0, n_clusters_per_class=1, - n_classes=n_classes, random_state=seed) + n_classes=n_classes, random_state=0) Klass = HistGradientBoostingClassifier # This test can't pass if min_samples_leaf > 1 because that would force 2 @@ -603,7 +602,7 @@ def test_sum_hessians_are_sample_weight(loss_name): for feature_idx in range(n_features): for bin_idx in range(bin_mapper.n_bins): - assert histograms[feature_idx][bin_idx]['sum_hessians'] == ( + assert histograms[feature_idx, bin_idx]['sum_hessians'] == ( pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5))
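
As a quick end-to-end illustration of the behaviour these final patches test (this sketch is not part of the patch series itself), the snippet below mirrors what test_sample_weight_effect checks: giving a sample a weight of 2 should be equivalent to duplicating it. The toy data and parameters are illustrative only; 255 samples are chosen so that the weighted and the duplicated fits should share identical bin thresholds, and min_samples_leaf=1 avoids the leaf-count mismatch mentioned in the test comment above.

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa (needed for scikit-learn < 1.0)
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Toy data: 255 samples so every distinct value should get its own bin,
    # keeping the binning thresholds identical for both fits below.
    X, y = make_regression(n_samples=255, n_features=2, n_informative=2,
                           random_state=0)

    # Weight the first half of the samples twice as much ...
    sample_weight = np.ones(X.shape[0])
    sample_weight[:127] = 2
    est_sw = HistGradientBoostingRegressor(min_samples_leaf=1, random_state=0)
    est_sw.fit(X, y, sample_weight=sample_weight)

    # ... which should behave like duplicating those samples outright.
    X_dup = np.r_[X, X[:127]]
    y_dup = np.r_[y, y[:127]]
    est_dup = HistGradientBoostingRegressor(min_samples_leaf=1, random_state=0)
    est_dup.fit(X_dup, y_dup)

    # The two models should make essentially identical predictions.
    print(np.max(np.abs(est_sw.predict(X) - est_dup.predict(X))))  # expected: ~0.0

This is the same duplication-versus-weight equivalence that test_sample_weight_effect asserts, and it complements test_sum_hessians_are_sample_weight above, which ties the per-bin hessian sums to the summed sample weights.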