[MRG] GBM & meta-ensembles - support for class_weight by trevorstephens · Pull Request #4215 · scikit-learn/scikit-learn · GitHub
[MRG] GBM & meta-ensembles - support for class_weight #4215

Closed
Changes from all commits
8 changes: 6 additions & 2 deletions doc/whats_new.rst
@@ -166,8 +166,12 @@ Enhancements
faster in general. By `Joel Nothman`_.

- Add ``class_weight`` parameter to automatically weight samples by class
frequency for :class:`ensemble.RandomForestClassifier`,
:class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier`
frequency for :class:`ensemble.AdaBoostClassifier`,
:class:`ensemble.BaggingClassifier`,
:class:`ensemble.ExtraTreesClassifier`,
:class:`ensemble.GradientBoostingClassifier`,
:class:`ensemble.RandomForestClassifier`,
:class:`tree.DecisionTreeClassifier`,
and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_.

- :class:`grid_search.RandomizedSearchCV` now does sampling without
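For readers skimming the changelog entry above, a minimal usage sketch of the new parameter. This is not part of the patch: it assumes the branch from this PR (stock scikit-learn releases may not expose ``class_weight`` on these estimators), and it uses the ``'auto'`` preset name current at the time (later releases renamed it ``'balanced'``).

```python
# Hypothetical usage sketch against this PR's branch, on an imbalanced toy set.
import numpy as np
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = np.array([0] * 180 + [1] * 20)   # 9:1 class imbalance

# Inverse-frequency weights derived from y at fit time.
bag = BaggingClassifier(class_weight='auto', random_state=0).fit(X, y)
gbm = GradientBoostingClassifier(class_weight='auto', random_state=0).fit(X, y)

# Explicit per-class weights are also accepted.
bag_dict = BaggingClassifier(class_weight={0: 1, 1: 9}, random_state=0).fit(X, y)
```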
69 changes: 61 additions & 8 deletions sklearn/ensemble/bagging.py
@@ -13,11 +13,12 @@

from ..base import ClassifierMixin, RegressorMixin
from ..externals.joblib import Parallel, delayed
from ..externals.six import with_metaclass
from ..externals.six import with_metaclass, string_types
from ..externals.six.moves import zip
from ..metrics import r2_score, accuracy_score
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..utils import check_random_state, check_X_y, check_array, column_or_1d
from ..utils import compute_sample_weight
from ..utils.random import sample_without_replacement
from ..utils.validation import has_fit_parameter, check_is_fitted
from ..utils.fixes import bincount
@@ -32,7 +33,7 @@


def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
seeds, verbose):
class_weight, seeds, verbose):
"""Private function used to build a batch of estimators within a job."""
# Retrieve settings
n_samples, n_features = X.shape
@@ -52,7 +53,6 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
"sample_weight")


# Build estimators
estimators = []
estimators_samples = []
@@ -99,6 +99,13 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,

curr_sample_weight[not_indices] = 0

if class_weight == 'subsample':
indices = np.where(curr_sample_weight > 0)

if class_weight == 'subsample':
# Multiply all weights by subsample weights
curr_sample_weight *= compute_sample_weight('auto', y, indices)

estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
samples = curr_sample_weight > 0.
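For context on the hunk above: in ``'subsample'`` mode the in-bag rows (``curr_sample_weight > 0``) are re-weighted by class frequencies measured on that bootstrap draw rather than on the full ``y``. A standalone numpy illustration of the idea, not code from the patch (the exact normalisation of the ``'auto'`` preset may differ slightly):

```python
# Illustration only: inverse-class-frequency weights computed from the in-bag
# rows, then expanded to a per-sample weight for every row of y.
import numpy as np

def subsample_class_weights(y, in_bag_indices):
    classes = np.unique(y)
    counts = np.array([np.sum(y[in_bag_indices] == c) for c in classes], dtype=float)
    per_class = len(in_bag_indices) / (len(classes) * counts)  # n_in_bag / (k * count)
    lookup = dict(zip(classes, per_class))
    return np.array([lookup[label] for label in y])

y = np.array([0, 0, 0, 0, 1, 1])
in_bag = np.array([0, 1, 2, 4])             # bootstrap drew three 0s and one 1
print(subsample_class_weights(y, in_bag))   # minority-class rows get the larger weight
```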

@@ -204,6 +211,7 @@ def __init__(self,
bootstrap=True,
bootstrap_features=False,
oob_score=False,
class_weight=None,
n_jobs=1,
random_state=None,
verbose=0):
@@ -216,6 +224,7 @@
self.bootstrap = bootstrap
self.bootstrap_features = bootstrap_features
self.oob_score = oob_score
self.class_weight = class_weight
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
@@ -251,7 +260,7 @@ def fit(self, X, y, sample_weight=None):

# Remap output
n_samples, self.n_features_ = X.shape
y = self._validate_y(y)
y, expanded_class_weight = self._validate_y_class_weight(y)

# Check parameters
self._validate_estimator()
@@ -276,6 +285,13 @@ def fit(self, X, y, sample_weight=None):
raise ValueError("Out of bag estimation only available"
" if bootstrap=True")

# Apply class_weights to sample weights
if expanded_class_weight is not None:
if sample_weight is not None:
sample_weight = sample_weight * expanded_class_weight
else:
sample_weight = expanded_class_weight

# Free allocated memory, if any
self.estimators_ = None
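The ``expanded_class_weight`` block above follows the same convention as the tree and forest estimators: class-derived weights are multiplied into any user-supplied ``sample_weight`` rather than replacing it. A small illustration with assumed values:

```python
# Assumed values, for illustration only.
import numpy as np

y = np.array([0, 0, 1])
class_weight = {0: 0.5, 1: 2.0}
expanded_class_weight = np.array([class_weight[label] for label in y])  # [0.5, 0.5, 2.0]

sample_weight = np.array([1.0, 3.0, 1.0])      # user-supplied weights
effective = sample_weight * expanded_class_weight
print(effective)                               # [0.5, 1.5, 2.0]
```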

@@ -291,6 +307,7 @@ def fit(self, X, y, sample_weight=None):
X,
y,
sample_weight,
self.class_weight,
seeds[starts[i]:starts[i + 1]],
verbose=self.verbose)
for i in range(n_jobs))
@@ -312,9 +329,9 @@ def fit(self, X, y, sample_weight=None):
def _set_oob_score(self, X, y):
"""Calculate out of bag predictions and score."""

def _validate_y(self, y):
def _validate_y_class_weight(self, y):
# Default implementation
return column_or_1d(y, warn=True)
return column_or_1d(y, warn=True), None


class BaggingClassifier(BaseBagging, ClassifierMixin):
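The rename above turns ``_validate_y`` into a hook that also reports class weights: the base class keeps a no-op default (``return y, None``) and classifier subclasses override it. A stripped-down sketch of the pattern, with hypothetical class names (the real implementations are ``BaseBagging`` and ``BaggingClassifier`` in this diff):

```python
# Simplified, hypothetical sketch of the validation hook; dict weights only.
import numpy as np

class _BaseEnsemble:
    def _validate_y_class_weight(self, y):
        # Default (regressor-friendly): no label remapping, no class weights.
        return y, None

class _ClassifierEnsemble(_BaseEnsemble):
    def __init__(self, class_weight=None):
        self.class_weight = class_weight

    def _validate_y_class_weight(self, y):
        classes, y_encoded = np.unique(y, return_inverse=True)
        expanded = None
        if isinstance(self.class_weight, dict):
            per_class = np.array([self.class_weight.get(c, 1.0) for c in classes])
            expanded = per_class[y_encoded]       # one weight per sample
        return y_encoded, expanded

# Tiny check: labels are remapped to 0/1 and weights expanded per sample.
y_enc, w = _ClassifierEnsemble({'a': 1.0, 'b': 4.0})._validate_y_class_weight(
    np.array(['a', 'a', 'b']))
```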
@@ -366,6 +383,23 @@ class BaggingClassifier(BaseBagging, ClassifierMixin):
Whether to use out-of-bag samples to estimate
the generalization error.

class_weight : dict, "auto", "subsample" or None, optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.

The "subsample" mode is the same as "auto" except that weights are
computed based on the bootstrap or sub-sample for every tree grown as
defined by the ``max_features`` and/or ``bootstrap`` options.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

Note that this is supported only if the base estimator supports
sample weighting.

n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
@@ -433,6 +467,7 @@ def __init__(self,
bootstrap=True,
bootstrap_features=False,
oob_score=False,
class_weight=None,
n_jobs=1,
random_state=None,
verbose=0):
@@ -445,6 +480,7 @@
bootstrap=bootstrap,
bootstrap_features=bootstrap_features,
oob_score=oob_score,
class_weight=class_weight,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)
@@ -493,12 +529,29 @@ def _set_oob_score(self, X, y):
self.oob_decision_function_ = oob_decision_function
self.oob_score_ = oob_score

def _validate_y(self, y):
def _validate_y_class_weight(self, y):
y = column_or_1d(y, warn=True)
expanded_class_weight = None

if self.class_weight is not None:
y_original = np.copy(y)

self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)

return y
if self.class_weight is not None:
valid_presets = ('auto', 'subsample')
if isinstance(self.class_weight, string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"auto" and "subsample". Given "%s".'
% self.class_weight)

if self.class_weight != 'subsample':
expanded_class_weight = compute_sample_weight(
self.class_weight, y_original)

return y, expanded_class_weight
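As the ``class_weight`` docstring above notes, the weights are only honoured when the base estimator's ``fit`` accepts ``sample_weight``. A hedged construction sketch against this PR's branch:

```python
# Sketch against this PR's branch: pair class_weight with a base estimator
# that exposes a sample_weight fit parameter (DecisionTreeClassifier does).
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    class_weight='subsample',   # re-balance on each bootstrap draw
    n_estimators=25,
    random_state=0,
)
```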

def predict(self, X):
"""Predict class for X.
73 changes: 64 additions & 9 deletions sklearn/ensemble/gradient_boosting.py
@@ -35,7 +35,7 @@
from ..base import ClassifierMixin
from ..base import RegressorMixin
from ..utils import check_random_state, check_array, check_X_y, column_or_1d
from ..utils import check_consistent_length
from ..utils import check_consistent_length, compute_sample_weight
from ..utils.extmath import logsumexp
from ..utils.fixes import expit, bincount
from ..utils.stats import _weighted_percentile
@@ -711,7 +711,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
min_samples_leaf, min_weight_fraction_leaf,
max_depth, init, subsample, max_features,
random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
warm_start=False):
warm_start=False, class_weight=None):

self.n_estimators = n_estimators
self.learning_rate = learning_rate
@@ -728,6 +728,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
self.verbose = verbose
self.max_leaf_nodes = max_leaf_nodes
self.warm_start = warm_start
self.class_weight = class_weight

self.estimators_ = np.empty((0, 0), dtype=np.object)

@@ -739,6 +740,12 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
loss = self.loss_
original_y = y

if self.class_weight == 'subsample':
indices = np.where(sample_mask)
# Multiply sample weights by balanced class weights
sample_weight = (sample_weight *
compute_sample_weight('auto', y, indices))

for k in range(loss.K):
if loss.is_multi_class:
y = np.array(original_y == k, dtype=np.float64)
@@ -947,7 +954,14 @@ def fit(self, X, y, sample_weight=None, monitor=None):

check_consistent_length(X, y, sample_weight)

y = self._validate_y(y)
y, expanded_class_weight = self._validate_y_class_weight(y)

# Apply class_weights to sample weights
if expanded_class_weight is not None:
if sample_weight is not None:
sample_weight = sample_weight * expanded_class_weight
else:
sample_weight = expanded_class_weight

random_state = check_random_state(self.random_state)
self._check_params()
@@ -1144,11 +1158,11 @@ def feature_importances_(self):
importances = total_sum / len(self.estimators_)
return importances

def _validate_y(self, y):
def _validate_y_class_weight(self, y):
self.n_classes_ = 1

# Default implementation
return y
return y, None


class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
Expand Down Expand Up @@ -1241,6 +1255,21 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
and add more estimators to the ensemble, otherwise, just erase the
previous solution.

class_weight : dict, "auto", "subsample" or None, optional

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.

The "subsample" mode is the same as "auto" except that weights are
computed based on the bootstrap or sub-sample for every tree grown as
defined by the ``max_features`` and/or ``bootstrap`` options.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

Attributes
----------
feature_importances_ : array, shape = [n_features]
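The ``'subsample'`` preset described above only differs from ``'auto'`` when each stage sees a subsample of the rows, i.e. with stochastic gradient boosting (``subsample < 1.0``). A construction sketch against this PR's branch:

```python
# Sketch against this PR's branch: per-stage re-balancing with stochastic GBM.
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(
    n_estimators=200,
    subsample=0.5,              # each boosting stage fits on a random half of the rows
    class_weight='subsample',   # balanced class weights recomputed on that half
    random_state=0,
)
```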
@@ -1290,7 +1319,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, init=None, random_state=None,
max_features=None, verbose=0,
max_leaf_nodes=None, warm_start=False):
max_leaf_nodes=None, warm_start=False, class_weight=None):

super(GradientBoostingClassifier, self).__init__(
loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1300,12 +1329,38 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
max_depth=max_depth, init=init, subsample=subsample,
max_features=max_features,
random_state=random_state, verbose=verbose,
max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
class_weight=class_weight)

def _validate_y_class_weight(self, y):
expanded_class_weight = None
if self.class_weight is not None:
y_original = np.copy(y)

def _validate_y(self, y):
self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)
return y

if self.class_weight is not None:
valid_presets = ('auto', 'subsample')
if isinstance(self.class_weight, six.string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"auto" and "subsample". Given "%s".'
% self.class_weight)
if self.warm_start:
warn('class_weight preset "auto" is not recommended for '
'warm_start if the fitted data differs from the '
'full dataset. In order to use "auto" weights, use '
'compute_class_weight("auto", classes, y). In place '
'of y you can use a large enough sample of the full '
'training set target to properly estimate the class '
'frequency distributions. Pass the resulting '
'weights as the class_weight parameter.')
if self.class_weight != 'subsample':
expanded_class_weight = compute_sample_weight(
self.class_weight, y_original)

return y, expanded_class_weight
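The warm_start warning above suggests precomputing the class weights once on a representative sample of the targets and passing them explicitly. A sketch of that workaround, my illustration rather than code from the patch (``'auto'`` was the preset name at the time; current releases spell it ``'balanced'``):

```python
# Workaround sketch: estimate per-class weights once, pass them as a dict so
# subsequent warm_start fits keep using the same weights.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_sample = np.array([0] * 900 + [1] * 100)       # representative sample of the targets
classes = np.unique(y_sample)
weights = compute_class_weight('balanced', classes=classes, y=y_sample)
class_weight = dict(zip(classes, weights))       # e.g. {0: 0.556, 1: 5.0}
# ...then: GradientBoostingClassifier(class_weight=class_weight, warm_start=True, ...)
```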

def predict(self, X):
"""Predict class for X.