diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index f87f22ab8837e..ac7e77e2a1f99 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -32,6 +32,9 @@ parser.add_argument('--n-samples-max', type=int, default=int(1e6)) parser.add_argument('--n-features', type=int, default=20) parser.add_argument('--max-bins', type=int, default=255) +parser.add_argument('--random-sample-weights', action="store_true", + default=False, + help="generate and use random sample weights") args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes @@ -46,6 +49,7 @@ def get_estimator_and_data(): n_features=args.n_features, n_classes=args.n_classes, n_clusters_per_class=1, + n_informative=args.n_classes, random_state=0) return X, y, HistGradientBoostingClassifier elif args.problem == 'regression': @@ -60,8 +64,19 @@ def get_estimator_and_data(): np.bool) X[mask] = np.nan -X_train_, X_test_, y_train_, y_test_ = train_test_split( - X, y, test_size=0.5, random_state=0) +if args.random_sample_weights: + sample_weight = np.random.rand(len(X)) * 10 +else: + sample_weight = None + +if sample_weight is not None: + (X_train_, X_test_, y_train_, y_test_, + sample_weight_train_, _) = train_test_split( + X, y, sample_weight, test_size=0.5, random_state=0) +else: + X_train_, X_test_, y_train_, y_test_ = train_test_split( + X, y, test_size=0.5, random_state=0) + sample_weight_train_ = None def one_run(n_samples): @@ -69,6 +84,10 @@ def one_run(n_samples): X_test = X_test_[:n_samples] y_train = y_train_[:n_samples] y_test = y_test_[:n_samples] + if sample_weight is not None: + sample_weight_train = sample_weight_train_[:n_samples] + else: + sample_weight_train = None assert X_train.shape[0] == n_samples assert X_test.shape[0] == n_samples print("Data size: %d samples train, %d samples test." @@ -93,7 +112,7 @@ def one_run(n_samples): if loss == 'default': loss = 'least_squares' est.set_params(loss=loss) - est.fit(X_train, y_train) + est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic tic = time() sklearn_score = est.score(X_test, y_test) @@ -110,7 +129,7 @@ def one_run(n_samples): lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') tic = time() - lightgbm_est.fit(X_train, y_train) + lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train) lightgbm_fit_duration = time() - tic tic = time() lightgbm_score = lightgbm_est.score(X_test, y_test) @@ -127,7 +146,7 @@ def one_run(n_samples): xgb_est = get_equivalent_estimator(est, lib='xgboost') tic = time() - xgb_est.fit(X_train, y_train) + xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train) xgb_fit_duration = time() - tic tic = time() xgb_score = xgb_est.score(X_test, y_test) @@ -144,7 +163,7 @@ def one_run(n_samples): cat_est = get_equivalent_estimator(est, lib='catboost') tic = time() - cat_est.fit(X_train, y_train) + cat_est.fit(X_train, y_train, sample_weight=sample_weight_train) cat_fit_duration = time() - tic tic = time() cat_score = cat_est.score(X_test, y_test) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index deefaaf642c39..3d6798e0d548b 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -856,8 +856,7 @@ leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees. 
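
The benchmark above passes `sample_weight` as an extra array to `train_test_split`, which splits any number of index-aligned arrays consistently, so each training sample keeps its own weight through the split. A minimal, self-contained sketch of that pattern on synthetic data (the variable names here are illustrative, not part of the benchmark):

    import numpy as np
    from sklearn.model_selection import train_test_split

    # Synthetic stand-ins for the benchmark's X and y.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(1000, 20))
    y = rng.normal(size=1000)
    # Random per-sample weights, as with --random-sample-weights.
    sample_weight = rng.rand(len(X)) * 10

    # train_test_split splits every array it is given with the same indices,
    # so the weights stay aligned with their samples.
    (X_train, X_test, y_train, y_test,
     sw_train, sw_test) = train_test_split(
        X, y, sample_weight, test_size=0.5, random_state=0)

    assert len(sw_train) == len(X_train)
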
The API of these estimators is slightly different, and some of the features
from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`
-are not yet supported: in particular sample weights, and some loss
-functions.
+are not yet supported, for instance some loss functions.
 
 These estimators are still **experimental**: their predictions
 and their API might change without any deprecation cycle. To use them, you
@@ -957,6 +956,39 @@
 If no missing values were encountered for a given feature during training,
 then samples with missing values are mapped to whichever child has the most
 samples.
 
+Sample weight support
+---------------------
+
+:class:`HistGradientBoostingClassifier` and
+:class:`HistGradientBoostingRegressor` support sample weights during
+:term:`fit`.
+
+The following toy example demonstrates how the model ignores the samples with
+zero sample weights:
+
+    >>> X = [[1, 0],
+    ...      [1, 0],
+    ...      [1, 0],
+    ...      [0, 1]]
+    >>> y = [0, 0, 1, 0]
+    >>> # ignore the first 2 training samples by setting their weight to 0
+    >>> sample_weight = [0, 0, 1, 1]
+    >>> gb = HistGradientBoostingClassifier(min_samples_leaf=1)
+    >>> gb.fit(X, y, sample_weight=sample_weight)
+    HistGradientBoostingClassifier(...)
+    >>> gb.predict([[1, 0]])
+    array([1])
+    >>> gb.predict_proba([[1, 0]])[0, 1]
+    0.99...
+
+As you can see, the sample `[1, 0]` is comfortably classified as `1` since the
+first two samples are ignored due to their sample weights.
+
+Implementation detail: taking sample weights into account amounts to
+multiplying the gradients (and the hessians) by the sample weights. Note that
+the binning stage (specifically the quantiles computation) does not take the
+weights into account.
+
 Low-level parallelism
 ---------------------
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index bbd35676defcf..399f6352410e9 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -401,11 +401,11 @@ Changelog
 ` and :user:`Caio Oliveira ` and
 :pr:`15138` by :user:`Jon Cusick `..
 
-- Many improvements were made to
+- |MajorFeature| Many improvements were made to
   :class:`ensemble.HistGradientBoostingClassifier` and
   :class:`ensemble.HistGradientBoostingRegressor`:
 
-  - |MajorFeature| Estimators now natively support dense data with missing
+  - |Feature| Estimators now natively support dense data with missing
     values both for training and predicting. They also support infinite
     values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_
     and `Olivier Grisel`_.
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 1167fc1bf4556..720a19cd862f6 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -142,6 +142,10 @@ Changelog
 :mod:`sklearn.ensemble`
 .......................
 
+- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor` now support
+  :term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_.
+
 - |API| Added boolean `verbose` flag to classes:
   :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`.
:pr:`15991` by :user:`Sam Bail `, diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 418a9124d37fa..821a81a48fcf3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -27,9 +27,51 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. gradients[i] = raw_predictions[i] - y_true[i] +def _update_gradients_hessians_least_squares( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) * sample_weight + # but since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i] + hessians[i] = sample_weight[i] + + +def _update_gradients_hessians_least_absolute_deviation( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # gradient = sign(raw_predicition - y_pred) * sample_weight + gradients[i] = sample_weight[i] * (2 * + (y_true[i] - raw_predictions[i] < 0) - 1) + hessians[i] = sample_weight[i] + + def _update_gradients_least_absolute_deviation( G_H_DTYPE_C [::1] gradients, # OUT const Y_DTYPE_C [::1] y_true, # IN @@ -49,44 +91,66 @@ def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions): # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN cdef: int n_samples Y_DTYPE_C p_i # proba that ith sample belongs to positive class int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): - p_i = _cexpit(raw_predictions[i]) - gradients[i] = p_i - y_true[i] - hessians[i] = p_i * (1. - p_i) + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + p_i = _cexpit(raw_predictions[i]) + gradients[i] = p_i - y_true[i] + hessians[i] = p_i * (1. - p_i) + else: + for i in prange(n_samples, schedule='static', nogil=True): + p_i = _cexpit(raw_predictions[i]) + gradients[i] = (p_i - y_true[i]) * sample_weight[i] + hessians[i] = p_i * (1. 
- p_i) * sample_weight[i] def _update_gradients_hessians_categorical_crossentropy( G_H_DTYPE_C [:, ::1] gradients, # OUT G_H_DTYPE_C [:, ::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [:, ::1] raw_predictions): # IN + const Y_DTYPE_C [:, ::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] int k # class index int i # sample index + Y_DTYPE_C sw # p[i, k] is the probability that class(ith sample) == k. # It's the softmax of the raw predictions Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) Y_DTYPE_C p_i_k - for i in prange(n_samples, schedule='static', nogil=True): - # first compute softmaxes of sample i for each class - for k in range(prediction_dim): - p[i, k] = raw_predictions[k, i] # prepare softmax - _compute_softmax(p, i) - # then update gradients and hessians - for k in range(prediction_dim): - p_i_k = p[i, k] - gradients[k, i] = p_i_k - (y_true[i] == k) - hessians[k, i] = p_i_k * (1. - p_i_k) + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + _compute_softmax(p, i) + # then update gradients and hessians + for k in range(prediction_dim): + p_i_k = p[i, k] + gradients[k, i] = p_i_k - (y_true[i] == k) + hessians[k, i] = p_i_k * (1. - p_i_k) + else: + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + _compute_softmax(p, i) + # then update gradients and hessians + sw = sample_weight[i] + for k in range(prediction_dim): + p_i_k = p[i, k] + gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw + hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 376ab4f940a29..97fcdb4713802 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -9,7 +9,8 @@ from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) from ...utils import check_X_y, check_random_state, check_array, resample -from ...utils.validation import check_is_fitted +from ...utils.validation import (check_is_fitted, + check_consistent_length, _check_sample_weight) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -81,7 +82,7 @@ def _validate_parameters(self): raise ValueError('max_bins={} should be no smaller than 2 ' 'and no larger than 255.'.format(self.max_bins)) - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. Parameters @@ -92,6 +93,9 @@ def fit(self, X, y): y : array-like of shape (n_samples,) Target values. + sample_weight : array-like of shape (n_samples,) default=None + Weights of training data. + Returns ------- self : object @@ -104,6 +108,14 @@ def fit(self, X, y): acc_prediction_time = 0. 
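
The weighted kernels in `_loss.pyx` above all follow the same rule: the weighted gradients and hessians are the unweighted ones multiplied elementwise by `sample_weight`. A plain NumPy sketch of that rule for the binary cross-entropy case (illustrative only, not the shipped Cython code):

    import numpy as np
    from scipy.special import expit

    def binary_crossentropy_grad_hess(y_true, raw_predictions,
                                      sample_weight=None):
        # Unweighted first and second derivatives of
        # log(1 + exp(raw)) - y * raw with respect to the raw prediction.
        p = expit(raw_predictions)  # probability of the positive class
        gradients = p - y_true
        hessians = p * (1. - p)
        if sample_weight is not None:
            # Weighting the per-sample loss simply scales both derivatives.
            gradients = gradients * sample_weight
            hessians = hessians * sample_weight
        return gradients, hessians
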
X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) y = self._encode_y(y) + check_consistent_length(X, y) + # Do not create unit sample weights by default to later skip some + # computation + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, + dtype=np.float64) + # TODO: remove when PDP suports sample weights + self._fitted_with_sw = True rng = check_random_state(self.random_state) @@ -127,7 +139,7 @@ def fit(self, X, y): # data. self._in_fit = True - self.loss_ = self._get_loss() + self.loss_ = self._get_loss(sample_weight=sample_weight) if self.early_stopping == 'auto': self.do_early_stopping_ = n_samples > 10000 else: @@ -143,12 +155,23 @@ def fit(self, X, y): # This is needed in order to have the same split when using # warm starting. - X_train, X_val, y_train, y_val = train_test_split( - X, y, test_size=self.validation_fraction, stratify=stratify, - random_state=self._random_seed) + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, y, test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed) + sample_weight_train = sample_weight_val = None + else: + # TODO: incorporate sample_weight in sampling here, as well as + # stratify + (X_train, X_val, y_train, y_val, sample_weight_train, + sample_weight_val) = train_test_split( + X, y, sample_weight, test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed) else: - X_train, y_train = X, y - X_val, y_val = None, None + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) @@ -185,7 +208,7 @@ def fit(self, X, y): # n_trees_per_iterations is n_classes in multiclass classification, # else 1. self._baseline_prediction = self.loss_.get_baseline_prediction( - y_train, self.n_trees_per_iteration_ + y_train, sample_weight_train, self.n_trees_per_iteration_ ) raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), @@ -193,6 +216,14 @@ def fit(self, X, y): ) raw_predictions += self._baseline_prediction + # initialize gradients and hessians (empty arrays). + # shape = (n_trees_per_iteration, n_samples). 
+ gradients, hessians = self.loss_.init_gradients_and_hessians( + n_samples=n_samples, + prediction_dim=self.n_trees_per_iteration_, + sample_weight=sample_weight_train + ) + # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] @@ -227,7 +258,9 @@ def fit(self, X, y): raw_predictions_val += self._baseline_prediction self._check_early_stopping_loss(raw_predictions, y_train, - raw_predictions_val, y_val) + sample_weight_train, + raw_predictions_val, y_val, + sample_weight_val) else: self.scorer_ = check_scoring(self, self.scoring) # scorer_ is a callable with signature (est, X, y) and @@ -239,12 +272,15 @@ def fit(self, X, y): # Compute the subsample set (X_binned_small_train, - y_small_train) = self._get_small_trainset( - X_binned_train, y_train, self._random_seed) + y_small_train, + sample_weight_small_train) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, + self._random_seed) self._check_early_stopping_scorer( X_binned_small_train, y_small_train, - X_binned_val, y_val, + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val, ) begin_at_stage = 0 @@ -270,8 +306,18 @@ def fit(self, X, y): if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set - X_binned_small_train, y_small_train = self._get_small_trainset( - X_binned_train, y_train, self._random_seed) + (X_binned_small_train, + y_small_train, + sample_weight_small_train) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, + self._random_seed) + + # Initialize the gradients and hessians + gradients, hessians = self.loss_.init_gradients_and_hessians( + n_samples=n_samples, + sample_weight=sample_weight_train, + prediction_dim=self.n_trees_per_iteration_ + ) # Get the predictors from the previous fit predictors = self._predictors @@ -282,7 +328,8 @@ def fit(self, X, y): # shape = (n_trees_per_iteration, n_samples). 
gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ + prediction_dim=self.n_trees_per_iteration_, + sample_weight=sample_weight_train ) for iteration in range(begin_at_stage, self.max_iter): @@ -294,7 +341,8 @@ def fit(self, X, y): # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, - y_train, raw_predictions) + y_train, raw_predictions, + sample_weight_train) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -320,7 +368,8 @@ def fit(self, X, y): if self.loss_.need_update_leaves_values: self.loss_.update_leaves_values(grower, y_train, - raw_predictions[k, :]) + raw_predictions[k, :], + sample_weight_train) predictor = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_ @@ -348,14 +397,15 @@ def fit(self, X, y): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, y_train, - raw_predictions_val, y_val + raw_predictions, y_train, sample_weight_train, + raw_predictions_val, y_val, sample_weight_val ) else: should_early_stop = self._check_early_stopping_scorer( X_binned_small_train, y_small_train, - X_binned_val, y_val, + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val ) if self.verbose: @@ -400,12 +450,14 @@ def _clear_state(self): if hasattr(self, var): delattr(self, var) - def _get_small_trainset(self, X_binned_train, y_train, seed): + def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, + seed): """Compute the indices of the subsample set and return this set. For efficiency, we need to subsample the training set to compute scores with scorers. """ + # TODO: incorporate sample_weights here in `resample` subsample_size = 10000 if X_binned_train.shape[0] > subsample_size: indices = np.arange(X_binned_train.shape[0]) @@ -415,29 +467,48 @@ def _get_small_trainset(self, X_binned_train, y_train, seed): stratify=stratify) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] + if sample_weight_train is not None: + sample_weight_small_train = sample_weight_train[indices] + else: + sample_weight_small_train = None X_binned_small_train = np.ascontiguousarray(X_binned_small_train) - return X_binned_small_train, y_small_train + return (X_binned_small_train, y_small_train, + sample_weight_small_train) else: - return X_binned_train, y_train + return X_binned_train, y_train, sample_weight_train def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, - X_binned_val, y_val): + sample_weight_small_train, + X_binned_val, y_val, sample_weight_val): """Check if fitting should be early-stopped based on scorer. Scores are computed on validation data or on training data. 
""" if is_classifier(self): y_small_train = self.classes_[y_small_train.astype(int)] - self.train_score_.append( - self.scorer_(self, X_binned_small_train, y_small_train) - ) + + if sample_weight_small_train is None: + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train) + ) + else: + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train, + sample_weight=sample_weight_small_train) + ) if self._use_validation_data: if is_classifier(self): y_val = self.classes_[y_val.astype(int)] - self.validation_score_.append( - self.scorer_(self, X_binned_val, y_val) - ) + if sample_weight_val is None: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val) + ) + else: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val, + sample_weight=sample_weight_val) + ) return self._should_stop(self.validation_score_) else: return self._should_stop(self.train_score_) @@ -445,20 +516,22 @@ def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, def _check_early_stopping_loss(self, raw_predictions, y_train, + sample_weight_train, raw_predictions_val, - y_val): + y_val, + sample_weight_val): """Check if fitting should be early-stopped based on loss. Scores are computed on validation data or on training data. """ self.train_score_.append( - -self.loss_(y_train, raw_predictions) + -self.loss_(y_train, raw_predictions, sample_weight_train) ) if self._use_validation_data: self.validation_score_.append( - -self.loss_(y_val, raw_predictions_val) + -self.loss_(y_val, raw_predictions_val, sample_weight_val) ) return self._should_stop(self.validation_score_) else: @@ -609,6 +682,13 @@ def _compute_partial_dependence_recursion(self, grid, target_features): (n_trees_per_iteration, n_samples) The value of the partial dependence function on each grid point. 
""" + + if getattr(self, '_fitted_with_sw', False): + raise NotImplementedError("{} does not support partial dependence " + "plots with the 'recursion' method when " + "sample weights were given during fit " + "time.".format(self.__class__.__name__)) + grid = np.asarray(grid, dtype=X_DTYPE, order='C') averaged_predictions = np.zeros( (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE) @@ -626,7 +706,7 @@ def _more_tags(self): return {'allow_nan': True} @abstractmethod - def _get_loss(self): + def _get_loss(self, sample_weight): pass @abstractmethod @@ -820,8 +900,8 @@ def _encode_y(self, y): y = y.astype(Y_DTYPE, copy=False) return y - def _get_loss(self): - return _LOSSES[self.loss]() + def _get_loss(self, sample_weight): + return _LOSSES[self.loss](sample_weight=sample_weight) class HistGradientBoostingClassifier(BaseHistGradientBoosting, @@ -1058,7 +1138,7 @@ def _encode_y(self, y): encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y - def _get_loss(self): + def _get_loss(self, sample_weight): if (self.loss == 'categorical_crossentropy' and self.n_trees_per_iteration_ == 1): raise ValueError("'categorical_crossentropy' is not suitable for " @@ -1067,8 +1147,10 @@ def _get_loss(self): if self.loss == 'auto': if self.n_trees_per_iteration_ == 1: - return _LOSSES['binary_crossentropy']() + return _LOSSES['binary_crossentropy']( + sample_weight=sample_weight) else: - return _LOSSES['categorical_crossentropy']() + return _LOSSES['categorical_crossentropy']( + sample_weight=sample_weight) - return _LOSSES[self.loss]() + return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index dae85f57134e4..2dbf8bd58773e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -18,13 +18,27 @@ from .common import Y_DTYPE from .common import G_H_DTYPE from ._loss import _update_gradients_least_squares +from ._loss import _update_gradients_hessians_least_squares from ._loss import _update_gradients_least_absolute_deviation +from ._loss import _update_gradients_hessians_least_absolute_deviation from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy +from ...utils.stats import _weighted_percentile class BaseLoss(ABC): """Base class for a loss.""" + def __init__(self, hessians_are_constant): + self.hessians_are_constant = hessians_are_constant + + def __call__(self, y_true, raw_predictions, sample_weight): + """Return the weighted average loss""" + return np.average(self.pointwise_loss(y_true, raw_predictions), + weights=sample_weight) + + @abstractmethod + def pointwise_loss(self, y_true, raw_predictions): + """Return loss value for each input""" # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to @@ -36,7 +50,8 @@ class BaseLoss(ABC): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. need_update_leaves_values = False - def init_gradients_and_hessians(self, n_samples, prediction_dim): + def init_gradients_and_hessians(self, n_samples, prediction_dim, + sample_weight): """Return initial gradients and hessians. 
Unless hessians are constant, arrays are initialized with undefined @@ -46,12 +61,16 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): ---------- n_samples : int The number of samples passed to `fit()`. + prediction_dim : int The dimension of a raw prediction, i.e. the number of trees built at each iteration. Equals 1 for regression and binary classification, or K where K is the number of classes for multiclass classification. + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + Returns ------- gradients : ndarray, shape (prediction_dim, n_samples) @@ -63,6 +82,7 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): """ shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + if self.hessians_are_constant: # If the hessians are constant, we consider they are equal to 1. # - This is correct for the half LS loss @@ -75,13 +95,17 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): return gradients, hessians @abstractmethod - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): """Return initial predictions (before the first iteration). Parameters ---------- y_train : ndarray, shape (n_samples,) The target training values. + + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. + prediction_dim : int The dimension of one prediction: 1 for binary classification and regression, n_classes for multiclass classification. @@ -94,7 +118,7 @@ def get_baseline_prediction(self, y_train, prediction_dim): @abstractmethod def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): """Update gradients and hessians arrays, inplace. The gradients (resp. hessians) are the first (resp. second) order @@ -105,14 +129,20 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, ---------- gradients : ndarray, shape (prediction_dim, n_samples) The gradients (treated as OUT array). + hessians : ndarray, shape (prediction_dim, n_samples) or \ (1,) The hessians (treated as OUT array). + y_true : ndarray, shape (n_samples,) The true target values or each training sample. + raw_predictions : ndarray, shape (prediction_dim, n_samples) The raw_predictions (i.e. values from the trees) of the tree ensemble at iteration ``i - 1``. + + sample_weight : array-like of shape(n_samples,) default=None + Weights of training data. """ @@ -123,34 +153,43 @@ class LeastSquares(BaseLoss): loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 - This actually computes the half least squares loss to optimize simplify + This actually computes the half least squares loss to simplify the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). """ + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. + super().__init__(hessians_are_constant=sample_weight is None) - hessians_are_constant = True - - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. 
raw_predictions = raw_predictions.reshape(-1) loss = 0.5 * np.power(y_true - raw_predictions, 2) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + return np.average(y_train, weights=sample_weight) @staticmethod def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_squares(gradients, y_true, raw_predictions) + if sample_weight is None: + _update_gradients_least_squares(gradients, y_true, raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_squares(gradients, hessians, + y_true, raw_predictions, + sample_weight) class LeastAbsoluteDeviation(BaseLoss): @@ -160,8 +199,12 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): + # If sample weights are provided, the hessians and gradients + # are multiplied by sample_weight, which means the hessians are + # equal to sample weights. + super().__init__(hessians_are_constant=sample_weight is None) - hessians_are_constant = True # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -172,30 +215,39 @@ class LeastAbsoluteDeviation(BaseLoss): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. need_update_leaves_values = True - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = np.abs(y_true - raw_predictions) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): - return np.median(y_train) + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + if sample_weight is None: + return np.median(y_train) + else: + return _weighted_percentile(y_train, sample_weight, 50) @staticmethod def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_absolute_deviation(gradients, y_true, - raw_predictions) + if sample_weight is None: + _update_gradients_least_absolute_deviation(gradients, y_true, + raw_predictions) + else: + hessians = hessians.reshape(-1) + _update_gradients_hessians_least_absolute_deviation( + gradients, hessians, y_true, raw_predictions, sample_weight) - def update_leaves_values(self, grower, y_true, raw_predictions): + def update_leaves_values(self, grower, y_true, raw_predictions, + sample_weight): # Update the values predicted by the tree with # median(y_true - raw_predictions). # See note about need_update_leaves_values in BaseLoss. 
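
The `update_leaves_values` hunk that follows swaps the plain median of the residuals for `_weighted_percentile(..., percentile=50)`. As a rough sketch, a weighted percentile of this kind can be computed from the cumulative weights of the sorted values (one common definition; it is not guaranteed to match `sklearn.utils.stats._weighted_percentile` in every tie or edge case):

    import numpy as np

    def weighted_percentile(values, sample_weight, percentile=50):
        # Sort the values, accumulate their weights, and return the first
        # value whose cumulative weight reaches the requested fraction of
        # the total weight.
        order = np.argsort(values)
        sorted_values = np.asarray(values)[order]
        cum_weights = np.cumsum(np.asarray(sample_weight)[order])
        threshold = percentile / 100. * cum_weights[-1]
        return sorted_values[np.searchsorted(cum_weights, threshold)]

    # With unit weights this reduces to a plain (lower-interpolation) median:
    weighted_percentile([3., 1., 2.], [1., 1., 1.])  # -> 2.0
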
@@ -205,7 +257,14 @@ def update_leaves_values(self, grower, y_true, raw_predictions): # requires a cython version of median() for leaf in grower.finalized_leaves: indices = leaf.sample_indices - median_res = np.median(y_true[indices] - raw_predictions[indices]) + if sample_weight is None: + median_res = np.median(y_true[indices] + - raw_predictions[indices]) + else: + median_res = _weighted_percentile(y_true[indices] + - raw_predictions[indices], + sample_weight=sample_weight, + percentile=50) leaf.value = grower.shrinkage * median_res # Note that the regularization is ignored here @@ -222,24 +281,26 @@ class BinaryCrossEntropy(BaseLoss): section 4.4.1 (about logistic regression). """ - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + inverse_link_function = staticmethod(expit) - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) # logaddexp(0, x) = log(1 + exp(x)) loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): if prediction_dim > 2: raise ValueError( "loss='binary_crossentropy' is not defined for multiclass" " classification with n_classes=%d, use" " loss='categorical_crossentropy' instead" % prediction_dim) - proba_positive_class = np.mean(y_train) + proba_positive_class = np.average(y_train, weights=sample_weight) eps = np.finfo(y_train.dtype).eps proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) # log(x / 1 - x) is the anti function of sigmoid, or the link function @@ -247,14 +308,14 @@ def get_baseline_prediction(self, y_train, prediction_dim): return np.log(proba_positive_class / (1 - proba_positive_class)) def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to @@ -274,9 +335,10 @@ class CategoricalCrossEntropy(BaseLoss): cross-entropy to more than 2 classes. 
""" - hessians_are_constant = False + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) - def __call__(self, y_true, raw_predictions, average=True): + def pointwise_loss(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) prediction_dim = raw_predictions.shape[0] for k in range(prediction_dim): @@ -284,22 +346,23 @@ def __call__(self, y_true, raw_predictions, average=True): loss = (logsumexp(raw_predictions, axis=0) - (one_hot_true * raw_predictions).sum(axis=0)) - return loss.mean() if average else loss + return loss - def get_baseline_prediction(self, y_train, prediction_dim): + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): - proba_kth_class = np.mean(y_train == k) + proba_kth_class = np.average(y_train == k, + weights=sample_weight) proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) init_value[k, :] += np.log(proba_kth_class) return init_value def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): + raw_predictions, sample_weight): _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions) + gradients, hessians, y_true, raw_predictions, sample_weight) def predict_proba(self, raw_predictions): # TODO: This could be done in parallel diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0850ac6638a80..c5b4a143591d6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split @@ -11,6 +11,8 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -301,7 +303,8 @@ def test_small_trainset(): gb = HistGradientBoostingClassifier() # Compute the small training set - X_small, y_small = gb._get_small_trainset(X, y, seed=42) + X_small, y_small, _ = gb._get_small_trainset(X, y, seed=42, + sample_weight_train=None) # Compute the class distribution in the small training set unique, counts = np.unique(y_small, return_counts=True) @@ -435,6 +438,20 @@ def test_infinite_values(): np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) +def test_consistent_lengths(): + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array([.1, .3, .1]) + gbdt = HistGradientBoostingRegressor() + with pytest.raises(ValueError, + match=r"sample_weight.shape == \(3,\), expected"): + gbdt.fit(X, y, sample_weight) + + with pytest.raises(ValueError, + match="Found input variables with inconsistent number"): + gbdt.fit(X, y[1:]) + + def 
test_infinite_values_missing_values(): # High level test making sure that inf and nan values are properly handled # when both are present. This is similar to @@ -474,6 +491,148 @@ def test_string_target_early_stopping(scoring): gbrt.fit(X, y) +def test_zero_sample_weights_regression(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingRegressor(min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] > 0.5 + + +def test_zero_sample_weights_classification(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingClassifier(loss='binary_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + X = [[1, 0], + [1, 0], + [1, 0], + [0, 1], + [1, 1]] + y = [0, 0, 1, 0, 2] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1, 1] + gb = HistGradientBoostingClassifier(loss='categorical_crossentropy', + min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@pytest.mark.parametrize('problem', ( + 'regression', + 'binary_classification', + 'multiclass_classification' +)) +@pytest.mark.parametrize('duplication', ('half', 'all')) +def test_sample_weight_effect(problem, duplication): + # High level test to make sure that duplicating a sample is equivalent to + # giving it weight of 2. + + # fails for n_samples > 255 because binning does not take sample weights + # into account. Keeping n_samples <= 255 makes + # sure only unique values are used so SW have no effect on binning. + n_samples = 255 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=0) + Klass = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == 'binary_classification' else 3 + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, random_state=0) + Klass = HistGradientBoostingClassifier + + # This test can't pass if min_samples_leaf > 1 because that would force 2 + # samples to be in the same node in est_sw, while these samples would be + # free to be separate in est_dup: est_dup would just group together the + # duplicated samples. 
+ est = Klass(min_samples_leaf=1) + + # Create dataset with duplicate and corresponding sample weights + if duplication == 'half': + lim = n_samples // 2 + else: + lim = n_samples + X_dup = np.r_[X, X[:lim]] + y_dup = np.r_[y, y[:lim]] + sample_weight = np.ones(shape=(n_samples)) + sample_weight[:lim] = 2 + + est_sw = clone(est).fit(X, y, sample_weight=sample_weight) + est_dup = clone(est).fit(X_dup, y_dup) + + # checking raw_predict is stricter than just predict for classification + assert np.allclose(est_sw._raw_predict(X_dup), + est_dup._raw_predict(X_dup)) + + +@pytest.mark.parametrize('loss_name', ('least_squares', + 'least_absolute_deviation')) +def test_sum_hessians_are_sample_weight(loss_name): + # For losses with constant hessians, the sum_hessians field of the + # histograms must be equal to the sum of the sample weight of samples at + # the corresponding bin. + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + X, y = make_regression(n_samples=n_samples, n_features=n_features, + random_state=rng) + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + sample_weight = rng.normal(size=n_samples) + + loss = _LOSSES[loss_name](sample_weight=sample_weight) + gradients, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight) + raw_predictions = rng.normal(size=(1, n_samples)) + loss.update_gradients_and_hessians(gradients, hessians, y, + raw_predictions, sample_weight) + + # build sum_sample_weight which contains the sum of the sample weights at + # each bin (for each feature). This must be equal to the sum_hessians + # field of the corresponding histogram + sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins)) + for feature_idx in range(n_features): + for sample_idx in range(n_samples): + sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += ( + sample_weight[sample_idx]) + + # Build histogram + grower = TreeGrower(X_binned, gradients[0], hessians[0], + n_bins=bin_mapper.n_bins) + histograms = grower.histogram_builder.compute_histograms_brute( + grower.root.sample_indices) + + for feature_idx in range(n_features): + for bin_idx in range(bin_mapper.n_bins): + assert histograms[feature_idx, bin_idx]['sum_hessians'] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5)) + + def test_max_depth_max_leaf_nodes(): # Non regression test for # https://github.com/scikit-learn/scikit-learn/issues/16179 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 8c300db993d3d..915dc300e4760 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -20,7 +20,7 @@ def get_gradients(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) return gradients def get_hessians(y_true, raw_predictions): @@ -28,7 +28,7 @@ def get_hessians(y_true, raw_predictions): gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, - raw_predictions) + raw_predictions, None) if loss.__class__.__name__ == 'LeastSquares': # hessians aren't updated because they're constant: @@ -62,13 +62,13 @@ def test_derivatives(loss, x0, y_true): # 
using Halley's method with the first and second order derivatives # computed by the Loss instance. - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=Y_DTYPE) x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) get_gradients, get_hessians = get_derivatives_helper(loss) def func(x): - return loss(y_true, x) + return loss.pointwise_loss(y_true, x) def fprime(x): return get_gradients(y_true, x) @@ -78,7 +78,7 @@ def fprime2(x): optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) assert np.allclose(loss.inverse_link_function(optimum), y_true) - assert np.allclose(loss(y_true, optimum), 0) + assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) assert np.allclose(get_gradients(y_true, optimum), 0) @@ -105,7 +105,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): raw_predictions = rng.normal( size=(prediction_dim, n_samples) ).astype(Y_DTYPE) - loss = _LOSSES[loss]() + loss = _LOSSES[loss](sample_weight=None) get_gradients, get_hessians = get_derivatives_helper(loss) # only take gradients and hessians of first tree / class. @@ -120,16 +120,16 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): eps = 1e-9 offset = np.zeros_like(raw_predictions) offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2) numerical_gradients = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square offset[0, :] = eps - f_plus_eps = loss(y_true, raw_predictions + offset, average=False) - f_minus_eps = loss(y_true, raw_predictions - offset, average=False) - f = loss(y_true, raw_predictions, average=False) + f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset) + f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset) + f = loss.pointwise_loss(y_true, raw_predictions) numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7) @@ -139,9 +139,9 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): def test_baseline_least_squares(): rng = np.random.RandomState(0) - loss = _LOSSES['least_squares']() + loss = _LOSSES['least_squares'](sample_weight=None) y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the mean of all targets @@ -153,9 +153,9 @@ def test_baseline_least_squares(): def test_baseline_least_absolute_deviation(): rng = np.random.RandomState(0) - loss = _LOSSES['least_absolute_deviation']() + loss = _LOSSES['least_absolute_deviation'](sample_weight=None) y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the median of all targets @@ -167,10 +167,10 @@ def test_baseline_least_absolute_deviation(): def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) - loss = 
_LOSSES['binary_crossentropy']() + loss = _LOSSES['binary_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert_all_finite(baseline_prediction) assert np.allclose(loss.inverse_link_function(baseline_prediction), y_train[0]) @@ -181,7 +181,7 @@ def test_baseline_binary_crossentropy(): # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) # So we want raw_prediction = link_function(p) = log(p / (1 - p)) y_train = rng.randint(0, 2, size=100).astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, 1) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype p = y_train.mean() @@ -192,10 +192,10 @@ def test_baseline_categorical_crossentropy(): rng = np.random.RandomState(0) prediction_dim = 4 - loss = _LOSSES['categorical_crossentropy']() + loss = _LOSSES['categorical_crossentropy'](sample_weight=None) for y_train in (np.zeros(shape=100), np.ones(shape=100)): y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, + baseline_prediction = loss.get_baseline_prediction(y_train, None, prediction_dim) assert baseline_prediction.dtype == y_train.dtype assert_all_finite(baseline_prediction) @@ -203,8 +203,85 @@ def test_baseline_categorical_crossentropy(): # Same logic as for above test. Here inverse_link_function = softmax and # link_function = log y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) - baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) + baseline_prediction = loss.get_baseline_prediction(y_train, None, + prediction_dim) assert baseline_prediction.shape == (prediction_dim, 1) for k in range(prediction_dim): p = (y_train == k).mean() assert np.allclose(baseline_prediction[k, :], np.log(p)) + + +@pytest.mark.parametrize('loss, problem', [ + ('least_squares', 'regression'), + ('least_absolute_deviation', 'regression'), + ('binary_crossentropy', 'classification'), + ('categorical_crossentropy', 'classification') + ]) +@pytest.mark.parametrize('sample_weight', ['ones', 'random']) +def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): + # Make sure that passing sample weights to the gradient and hessians + # computation methods is equivalent to multiplying by the weights. 
+ + rng = np.random.RandomState(42) + n_samples = 1000 + + if loss == 'categorical_crossentropy': + n_classes = prediction_dim = 3 + else: + n_classes = prediction_dim = 1 + + if problem == 'regression': + y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + else: + y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) + + if sample_weight == 'ones': + sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE) + else: + sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) + + loss_ = _LOSSES[loss](sample_weight=sample_weight) + + baseline_prediction = loss_.get_baseline_prediction( + y_true, None, prediction_dim + ) + raw_predictions = np.zeros(shape=(prediction_dim, n_samples), + dtype=baseline_prediction.dtype) + raw_predictions += baseline_prediction + + gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients, hessians, y_true, + raw_predictions, None) + + gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) + loss_.update_gradients_and_hessians(gradients_sw, hessians_sw, y_true, + raw_predictions, sample_weight) + + assert np.allclose(gradients * sample_weight, gradients_sw) + assert np.allclose(hessians * sample_weight, hessians_sw) + + +def test_init_gradient_and_hessians_sample_weight(): + # Make sure that passing sample_weight to a loss correctly influences the + # hessians_are_constant attribute, and consequently the shape of the + # hessians array. + + prediction_dim = 2 + n_samples = 5 + sample_weight = None + loss = _LOSSES['least_squares'](sample_weight=sample_weight) + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=None) + assert loss.hessians_are_constant + assert hessians.shape == (1, 1) + + sample_weight = np.ones(n_samples) + loss = _LOSSES['least_squares'](sample_weight=sample_weight) + _, hessians = loss.init_gradients_and_hessians( + n_samples=n_samples, prediction_dim=prediction_dim, + sample_weight=sample_weight) + assert not loss.hessians_are_constant + assert hessians.shape == (prediction_dim, n_samples) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 7f9c37dfca642..530a53b83dce4 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -520,6 +520,16 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 +def test_hist_gbdt_sw_not_supported(): + # TODO: remove/fix when PDP supports HGBT with sample weights + clf = HistGradientBoostingRegressor(random_state=1) + clf.fit(X, y, sample_weight=np.ones(len(X))) + + with pytest.raises(NotImplementedError, + match="does not support partial dependence"): + partial_dependence(clf, X, features=[1]) + + # TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates @ignore_warnings(category=FutureWarning) def test_partial_dependence_pipeline():
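
Putting the pieces together, the behaviour exercised by `test_sample_weight_effect` can also be checked at the estimator level: weighting a sample by 2 should be equivalent to duplicating it, as long as binning is unaffected. A sketch under those assumptions (keeping `n_samples <= 255` so every unique value gets its own bin, as in the test):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Keep n_samples <= 255 so that the (unweighted) binning stage cannot
    # break the equivalence between weighting and duplication.
    X, y = make_regression(n_samples=255, n_features=2, n_informative=2,
                           random_state=0)

    # Weighting the first half of the samples by 2 ...
    sample_weight = np.ones(X.shape[0])
    sample_weight[:127] = 2
    est_sw = HistGradientBoostingRegressor(min_samples_leaf=1)
    est_sw.fit(X, y, sample_weight=sample_weight)

    # ... versus duplicating those samples outright.
    est_dup = HistGradientBoostingRegressor(min_samples_leaf=1)
    est_dup.fit(np.r_[X, X[:127]], np.r_[y, y[:127]])

    assert np.allclose(est_sw.predict(X), est_dup.predict(X))
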