add support for class_weights · scikit-learn/scikit-learn@de35bea · GitHub

Commit de35bea

add support for class_weights

1 parent 6dab7c5 commit de35bea

File tree

3 files changed: +114 −14 lines changed

sklearn/ensemble/forest.py

Lines changed: 65 additions & 14 deletions
@@ -58,7 +58,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
 from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor,
                     ExtraTreeClassifier, ExtraTreeRegressor)
 from ..tree._tree import DTYPE, DOUBLE
-from ..utils import check_random_state, check_array
+from ..utils import check_random_state, check_array, compute_class_weight
 from ..utils.validation import DataConversionWarning
 from .base import BaseEnsemble, _partition_estimators

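For context, `compute_class_weight` (newly imported above) turns a class_weight specification plus the observed labels into one weight per class. A minimal sketch of how it is called, using the positional signature this commit relies on (illustrative only; the exact 'auto' scaling is an implementation detail):

    import numpy as np
    from sklearn.utils import compute_class_weight

    y = np.array([0, 0, 0, 1])
    classes = np.array([0, 1])  # sorted unique labels, as np.unique returns them

    # 'auto': weights inversely proportional to class frequencies,
    # so the rare class 1 receives the larger weight.
    w_auto = compute_class_weight('auto', classes, y)

    # dict: explicit per-class weights, returned aligned with `classes`.
    w_dict = compute_class_weight({0: 1.0, 1: 10.0}, classes, y)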
@@ -122,7 +122,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 class_weight=None):
         super(BaseForest, self).__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators,
@@ -134,6 +135,7 @@ def __init__(self,
         self.random_state = random_state
         self.verbose = verbose
         self.warm_start = warm_start
+        self.class_weight = class_weight

     def apply(self, X):
         """Apply trees in the forest to X, return leaf indices.
@@ -211,11 +213,17 @@ def fit(self, X, y, sample_weight=None):

         self.n_outputs_ = y.shape[1]

-        y = self._validate_y(y)
+        y, cw = self._validate_y_cw(y)

         if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
             y = np.ascontiguousarray(y, dtype=DOUBLE)

+        if cw is not None:
+            if sample_weight is not None:
+                sample_weight *= cw
+            else:
+                sample_weight = cw
+
         # Check parameters
         self._validate_estimator()

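The added block folds the expanded class weights `cw` into sample_weight multiplicatively. A standalone sketch of that logic (hypothetical helper, not part of the diff):

    import numpy as np

    def combine_weights(sample_weight, cw):
        # `cw` holds one weight per sample, derived from class_weight;
        # combine it multiplicatively with any user-supplied sample_weight.
        if cw is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * cw
            else:
                sample_weight = cw
        return sample_weight

    cw = np.array([1.0, 100.0, 1.0])
    combine_weights(None, cw)                    # array([  1., 100.,   1.])
    combine_weights(np.array([2., 2., 2.]), cw)  # array([  2., 200.,   2.])

Note that the diff uses an in-place `sample_weight *= cw`, which mutates the array the caller passed to fit; the sketch above uses an out-of-place product to avoid that side effect.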
@@ -279,9 +287,9 @@ def fit(self, X, y, sample_weight=None):
     def _set_oob_score(self, X, y):
         """Calculate out of bag predictions and score."""

-    def _validate_y(self, y):
+    def _validate_y_cw(self, y):
         # Default implementation
-        return y
+        return y, None

     @property
     def feature_importances_(self):
@@ -320,7 +328,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 class_weight=None):

         super(ForestClassifier, self).__init__(
             base_estimator,
@@ -331,7 +340,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            class_weight=class_weight)

     def _set_oob_score(self, X, y):
         """Compute out-of-bag score"""
@@ -377,8 +387,9 @@ def _set_oob_score(self, X, y):

         self.oob_score_ = oob_score / self.n_outputs_

-    def _validate_y(self, y):
-        y = np.copy(y)
+    def _validate_y_cw(self, y_org):
+        y = np.copy(y_org)
+        cw = None

         self.classes_ = []
         self.n_classes_ = []
@@ -388,7 +399,19 @@ def _validate_y(self, y):
             self.classes_.append(classes_k)
             self.n_classes_.append(classes_k.shape[0])

-        return y
+        if self.class_weight is not None:
+            if self.n_outputs_ == 1:
+                cw = compute_class_weight(self.class_weight,
+                                          self.classes_[0],
+                                          y_org[:, 0])
+                cw = cw[np.searchsorted(self.classes_[0], y_org[:, 0])]
+            else:
+                raise NotImplementedError('class_weights are not supported '
+                                          'for multi-output. You may use '
+                                          'sample_weights in the fit method '
+                                          'to weight by sample.')
+
+        return y, cw

     def predict(self, X):
         """Predict class for X.
@@ -707,6 +730,18 @@ class RandomForestClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

+    class_weight : dict {class_label: weight}, "auto" or None, optional
+        Weights associated with classes. If not given, all classes
+        are supposed to have weight one.
+
+        The "auto" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies.
+
+        Note that this is only supported for single-output classification.
+
+        Note that these weights will be multiplied with sample_weight
+        (passed through the fit method) if sample_weight is specified.
+
     Attributes
     ----------
     estimators_ : list of DecisionTreeClassifier
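A possible call pattern for the new parameter (a sketch mirroring the tests added below, not an excerpt from the diff):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier

    iris = load_iris()

    # Explicit per-class weights: up-weight class 1 a hundredfold...
    clf = RandomForestClassifier(class_weight={0: 1., 1: 100., 2: 1.},
                                 random_state=0)
    clf.fit(iris.data, iris.target)

    # ...or let 'auto' derive weights from the class frequencies in y.
    clf_auto = RandomForestClassifier(class_weight='auto', random_state=0)
    clf_auto.fit(iris.data, iris.target)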
@@ -755,7 +790,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 class_weight=None):
         super(RandomForestClassifier, self).__init__(
             base_estimator=DecisionTreeClassifier(),
             n_estimators=n_estimators,
@@ -768,7 +804,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            class_weight=class_weight)

         self.criterion = criterion
         self.max_depth = max_depth
@@ -1017,6 +1054,18 @@ class ExtraTreesClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

+    class_weight : dict {class_label: weight}, "auto" or None, optional
+        Weights associated with classes. If not given, all classes
+        are supposed to have weight one.
+
+        The "auto" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies.
+
+        Note that this is only supported for single-output classification.
+
+        Note that these weights will be multiplied with sample_weight
+        (passed through the fit method) if sample_weight is specified.
+
     Attributes
     ----------
     estimators_ : list of DecisionTreeClassifier
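As with RandomForestClassifier, the parameter is single-output only; per the `_validate_y_cw` change above, fitting with a multi-output y raises NotImplementedError. An illustrative sketch:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.ensemble import ExtraTreesClassifier

    iris = load_iris()
    y_multi = np.vstack((iris.target, iris.target * 2)).T  # two output columns

    clf = ExtraTreesClassifier(class_weight='auto')
    try:
        clf.fit(iris.data, y_multi)
    except NotImplementedError as exc:
        print(exc)  # class_weights are not supported for multi-output...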
@@ -1068,7 +1117,8 @@ def __init__(self,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 class_weight=None):
         super(ExtraTreesClassifier, self).__init__(
             base_estimator=ExtraTreeClassifier(),
             n_estimators=n_estimators,
@@ -1080,7 +1130,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            class_weight=class_weight)

         self.criterion = criterion
         self.max_depth = max_depth

sklearn/ensemble/tests/test_forest.py

Lines changed: 47 additions & 0 deletions
@@ -747,6 +747,53 @@ def test_1d_input():
         yield check_1d_input, name, X, X_2d, y


+def check_class_weights(name):
+    """Check class_weights resemble sample_weights behavior."""
+    ForestClassifier = FOREST_CLASSIFIERS[name]
+
+    # Iris is balanced, so no effect expected for using 'auto' weights
+    clf1 = ForestClassifier(random_state=0)
+    clf1.fit(iris.data, iris.target)
+    clf2 = ForestClassifier(class_weight='auto', random_state=0)
+    clf2.fit(iris.data, iris.target)
+    assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
+
+    # Inflate importance of class 1, check against user-defined weights
+    sample_weight = np.ones(iris.target.shape)
+    sample_weight[iris.target == 1] *= 100
+    class_weight = {0: 1., 1: 100., 2: 1.}
+    clf1 = ForestClassifier(random_state=0)
+    clf1.fit(iris.data, iris.target, sample_weight)
+    clf2 = ForestClassifier(class_weight=class_weight, random_state=0)
+    clf2.fit(iris.data, iris.target)
+    assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
+
+    # Check that sample_weight and class_weight are multiplicative
+    clf1 = ForestClassifier(random_state=0)
+    clf1.fit(iris.data, iris.target, sample_weight**2)
+    clf2 = ForestClassifier(class_weight=class_weight, random_state=0)
+    clf2.fit(iris.data, iris.target, sample_weight)
+    assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
+
+
+def test_class_weights():
+    for name in FOREST_CLASSIFIERS:
+        yield check_class_weights, name
+
+
+def check_class_weight_failure_multi_output(name):
+    """Test class_weight failure for multi-output."""
+    ForestClassifier = FOREST_CLASSIFIERS[name]
+    _y = np.vstack((y, np.array(y) * 2)).T
+    clf = ForestClassifier(class_weight='auto')
+    assert_raises(NotImplementedError, clf.fit, X, _y)
+
+
+def test_class_weight_failure_multi_output():
+    for name in FOREST_CLASSIFIERS:
+        yield check_class_weight_failure_multi_output, name
+
+
 def check_warm_start(name, random_state=42):
     """Test if fitting incrementally with warm start gives a forest of the
     right size and the same results as a normal fit."""

sklearn/utils/estimator_checks.py

Lines changed: 2 additions & 0 deletions
@@ -737,6 +737,8 @@ def check_class_weight_classifiers(name, Classifier):
     classifier = Classifier(class_weight=class_weight)
     if hasattr(classifier, "n_iter"):
         classifier.set_params(n_iter=100)
+    if hasattr(classifier, "min_weight_fraction_leaf"):
+        classifier.set_params(min_weight_fraction_leaf=0.01)

     set_random_state(classifier)
     classifier.fit(X_train, y_train)
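Presumably `min_weight_fraction_leaf` is enabled here so that class_weight visibly changes the grown trees in this common check, rather than being absorbed by fully grown, zero-error leaves (the commit does not state the rationale). The guarded set_params pattern itself keeps the check generic across estimators:

    from sklearn.ensemble import RandomForestClassifier

    classifier = RandomForestClassifier(class_weight='auto')
    # Only estimators exposing the knob are configured with it;
    # min_weight_fraction_leaf requires each leaf to hold at least
    # 1% of the total (weighted) number of samples.
    if hasattr(classifier, "min_weight_fraction_leaf"):
        classifier.set_params(min_weight_fraction_leaf=0.01)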

0 commit comments