From 01d024a8f9d87723330477a0d95114b9c0dbeb13 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Fri, 6 Feb 2015 18:23:19 -0800 Subject: [PATCH] support class_weight for remaining ensembles --- doc/whats_new.rst | 8 +- sklearn/ensemble/bagging.py | 69 ++++++++++++++++-- sklearn/ensemble/gradient_boosting.py | 73 ++++++++++++++++--- sklearn/ensemble/tests/test_bagging.py | 61 +++++++++++++++- .../ensemble/tests/test_gradient_boosting.py | 61 ++++++++++++++++ .../ensemble/tests/test_weight_boosting.py | 46 +++++++++++- sklearn/ensemble/weight_boosting.py | 35 ++++++++- sklearn/utils/estimator_checks.py | 9 +++ 8 files changed, 337 insertions(+), 25 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 5ec4591b29ae7..0273c0c28d170 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -166,8 +166,12 @@ Enhancements faster in general. By `Joel Nothman`_. - Add ``class_weight`` parameter to automatically weight samples by class - frequency for :class:`ensemble.RandomForestClassifier`, - :class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier` + frequency for :class:`ensemble.AdaBoostClassifier`, + :class:`ensemble.BaggingClassifier`, + :class:`ensemble.ExtraTreesClassifier`, + :class:`ensemble.GradientBoostingClassifier`, + :class:`ensemble.RandomForestClassifier`, + :class:`tree.DecisionTreeClassifier`, and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_. - :class:`grid_search.RandomizedSearchCV` now does sampling without diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py index 9b3df5b1d26e1..1e1028308c69d 100644 --- a/sklearn/ensemble/bagging.py +++ b/sklearn/ensemble/bagging.py @@ -13,11 +13,12 @@ from ..base import ClassifierMixin, RegressorMixin from ..externals.joblib import Parallel, delayed -from ..externals.six import with_metaclass +from ..externals.six import with_metaclass, string_types from ..externals.six.moves import zip from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, check_X_y, check_array, column_or_1d +from ..utils import compute_sample_weight from ..utils.random import sample_without_replacement from ..utils.validation import has_fit_parameter, check_is_fitted from ..utils.fixes import bincount @@ -32,7 +33,7 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, - seeds, verbose): + class_weight, seeds, verbose): """Private function used to build a batch of estimators within a job.""" # Retrieve settings n_samples, n_features = X.shape @@ -52,7 +53,6 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight") - # Build estimators estimators = [] estimators_samples = [] @@ -99,6 +99,13 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, curr_sample_weight[not_indices] = 0 + if class_weight == 'subsample': + indices = np.where(curr_sample_weight > 0) + + if class_weight == 'subsample': + # Multiply all weights by subsample weights + curr_sample_weight *= compute_sample_weight('auto', y, indices) + estimator.fit(X[:, features], y, sample_weight=curr_sample_weight) samples = curr_sample_weight > 0. 
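The "subsample" branch above re-weights classes using only the in-bag samples of
each estimator. As a rough standalone sketch of what
``compute_sample_weight('auto', y, indices)`` is doing there (the helper name is
made up for illustration, and the exact normalisation of scikit-learn's "auto"
preset may differ slightly)::

    import numpy as np

    def balanced_subsample_weights(y, in_bag_indices):
        # Weight each sample inversely to its class frequency *within the
        # in-bag indices*; out-of-bag samples receive their class's weight too.
        y = np.asarray(y)
        y_in_bag = y[np.asarray(in_bag_indices).ravel()]
        classes, counts = np.unique(y_in_bag, return_counts=True)
        class_weight = {c: len(y_in_bag) / float(len(classes) * n)
                        for c, n in zip(classes, counts)}
        return np.array([class_weight.get(label, 1.0) for label in y])

    y = np.array([0, 0, 0, 0, 1, 1])
    in_bag = np.array([0, 1, 2, 4])               # hypothetical bootstrap draw
    print(balanced_subsample_weights(y, in_bag))  # ~[0.67 0.67 0.67 0.67 2. 2.]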
@@ -204,6 +211,7 @@ def __init__(self,
                  bootstrap=True,
                  bootstrap_features=False,
                  oob_score=False,
+                 class_weight=None,
                  n_jobs=1,
                  random_state=None,
                  verbose=0):
@@ -216,6 +224,7 @@ def __init__(self,
         self.bootstrap = bootstrap
         self.bootstrap_features = bootstrap_features
         self.oob_score = oob_score
+        self.class_weight = class_weight
         self.n_jobs = n_jobs
         self.random_state = random_state
         self.verbose = verbose
@@ -251,7 +260,7 @@ def fit(self, X, y, sample_weight=None):
 
         # Remap output
         n_samples, self.n_features_ = X.shape
-        y = self._validate_y(y)
+        y, expanded_class_weight = self._validate_y_class_weight(y)
 
         # Check parameters
         self._validate_estimator()
@@ -276,6 +285,13 @@ def fit(self, X, y, sample_weight=None):
             raise ValueError("Out of bag estimation only available"
                              " if bootstrap=True")
 
+        # Apply class_weights to sample weights
+        if expanded_class_weight is not None:
+            if sample_weight is not None:
+                sample_weight = sample_weight * expanded_class_weight
+            else:
+                sample_weight = expanded_class_weight
+
         # Free allocated memory, if any
         self.estimators_ = None
 
@@ -291,6 +307,7 @@ def fit(self, X, y, sample_weight=None):
                 X,
                 y,
                 sample_weight,
+                self.class_weight,
                 seeds[starts[i]:starts[i + 1]],
                 verbose=self.verbose)
             for i in range(n_jobs))
@@ -312,9 +329,9 @@ def fit(self, X, y, sample_weight=None):
     def _set_oob_score(self, X, y):
         """Calculate out of bag predictions and score."""
 
-    def _validate_y(self, y):
+    def _validate_y_class_weight(self, y):
         # Default implementation
-        return column_or_1d(y, warn=True)
+        return column_or_1d(y, warn=True), None
 
 
 class BaggingClassifier(BaseBagging, ClassifierMixin):
@@ -366,6 +383,23 @@ class BaggingClassifier(BaseBagging, ClassifierMixin):
         Whether to use out-of-bag samples to estimate
         the generalization error.
 
+    class_weight : dict, "auto", "subsample" or None, optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "auto" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data.
+
+        The "subsample" mode is the same as "auto" except that weights are
+        computed based on the bootstrap or sub-sample drawn for every base
+        estimator, as defined by the ``max_samples`` and ``bootstrap`` options.
+
+        Note that these weights will be multiplied with sample_weight (passed
+        through the fit method) if sample_weight is specified.
+
+        Note that this is supported only if the base estimator supports
+        sample weighting.
+
     n_jobs : int, optional (default=1)
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
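A usage sketch for the new ``BaggingClassifier`` parameter documented above. It
only works once this patch is applied, and the presets assume the base estimator
accepts ``sample_weight``, as the default ``DecisionTreeClassifier`` does::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1],
                               random_state=0)

    # Preset: rebalance the minority class automatically.
    clf = BaggingClassifier(class_weight='auto', random_state=0).fit(X, y)

    # Explicit dict: expanded to per-sample weights and combined
    # multiplicatively with any sample_weight passed to fit.
    clf = BaggingClassifier(class_weight={0: 1., 1: 9.}, random_state=0)
    clf.fit(X, y, sample_weight=np.ones(len(y)))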
@@ -433,6 +467,7 @@ def __init__(self, bootstrap=True, bootstrap_features=False, oob_score=False, + class_weight=None, n_jobs=1, random_state=None, verbose=0): @@ -445,6 +480,7 @@ def __init__(self, bootstrap=bootstrap, bootstrap_features=bootstrap_features, oob_score=oob_score, + class_weight=class_weight, n_jobs=n_jobs, random_state=random_state, verbose=verbose) @@ -493,12 +529,29 @@ def _set_oob_score(self, X, y): self.oob_decision_function_ = oob_decision_function self.oob_score_ = oob_score - def _validate_y(self, y): + def _validate_y_class_weight(self, y): y = column_or_1d(y, warn=True) + expanded_class_weight = None + + if self.class_weight is not None: + y_original = np.copy(y) + self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) - return y + if self.class_weight is not None: + valid_presets = ('auto', 'subsample') + if isinstance(self.class_weight, string_types): + if self.class_weight not in valid_presets: + raise ValueError('Valid presets for class_weight include ' + '"auto" and "subsample". Given "%s".' + % self.class_weight) + + if self.class_weight != 'subsample': + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original) + + return y, expanded_class_weight def predict(self, X): """Predict class for X. diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 820638c5befe5..75e66bcf0ebe5 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -35,7 +35,7 @@ from ..base import ClassifierMixin from ..base import RegressorMixin from ..utils import check_random_state, check_array, check_X_y, column_or_1d -from ..utils import check_consistent_length +from ..utils import check_consistent_length, compute_sample_weight from ..utils.extmath import logsumexp from ..utils.fixes import expit, bincount from ..utils.stats import _weighted_percentile @@ -711,7 +711,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, - warm_start=False): + warm_start=False, class_weight=None): self.n_estimators = n_estimators self.learning_rate = learning_rate @@ -728,6 +728,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.verbose = verbose self.max_leaf_nodes = max_leaf_nodes self.warm_start = warm_start + self.class_weight = class_weight self.estimators_ = np.empty((0, 0), dtype=np.object) @@ -739,6 +740,12 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, loss = self.loss_ original_y = y + if self.class_weight == 'subsample': + indices = np.where(sample_mask) + # Multiply sample weights by balanced class weights + sample_weight = (sample_weight * + compute_sample_weight('auto', y, indices)) + for k in range(loss.K): if loss.is_multi_class: y = np.array(original_y == k, dtype=np.float64) @@ -947,7 +954,14 @@ def fit(self, X, y, sample_weight=None, monitor=None): check_consistent_length(X, y, sample_weight) - y = self._validate_y(y) + y, expanded_class_weight = self._validate_y_class_weight(y) + + # Apply class_weights to sample weights + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight random_state = check_random_state(self.random_state) self._check_params() @@ -1144,11 +1158,11 @@ def feature_importances_(self): 
         importances = total_sum / len(self.estimators_)
         return importances
 
-    def _validate_y(self, y):
+    def _validate_y_class_weight(self, y):
         self.n_classes_ = 1
 
         # Default implementation
-        return y
+        return y, None
 
 
 class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
@@ -1241,6 +1255,21 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
         and add more estimators to the ensemble, otherwise, just erase the
         previous solution.
 
+    class_weight : dict, "auto", "subsample" or None, optional
+
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "auto" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data.
+
+        The "subsample" mode is the same as "auto" except that weights are
+        recomputed based on the sub-sample drawn for every stage, as defined
+        by the ``subsample`` option.
+
+        Note that these weights will be multiplied with sample_weight (passed
+        through the fit method) if sample_weight is specified.
+
     Attributes
     ----------
     feature_importances_ : array, shape = [n_features]
@@ -1290,7 +1319,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
                  max_depth=3, init=None, random_state=None,
                  max_features=None, verbose=0,
-                 max_leaf_nodes=None, warm_start=False):
+                 max_leaf_nodes=None, warm_start=False, class_weight=None):
 
         super(GradientBoostingClassifier, self).__init__(
             loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1300,12 +1329,38 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
             max_depth=max_depth, init=init, subsample=subsample,
             max_features=max_features,
             random_state=random_state, verbose=verbose,
-            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+            class_weight=class_weight)
+
+    def _validate_y_class_weight(self, y):
+        expanded_class_weight = None
+        if self.class_weight is not None:
+            y_original = np.copy(y)
 
-    def _validate_y(self, y):
         self.classes_, y = np.unique(y, return_inverse=True)
         self.n_classes_ = len(self.classes_)
-        return y
+
+        if self.class_weight is not None:
+            valid_presets = ('auto', 'subsample')
+            if isinstance(self.class_weight, six.string_types):
+                if self.class_weight not in valid_presets:
+                    raise ValueError('Valid presets for class_weight include '
+                                     '"auto" and "subsample". Given "%s".'
+                                     % self.class_weight)
+                if self.warm_start:
+                    warn('class_weight preset "auto" is not recommended for '
+                         'warm_start if the fitted data differs from the '
+                         'full dataset. In order to use "auto" weights, use '
+                         'compute_class_weight("auto", classes, y). In place '
+                         'of y you can use a large enough sample of the full '
+                         'training set target to properly estimate the class '
+                         'frequency distributions. Pass the resulting '
+                         'weights as the class_weight parameter.')
+            if self.class_weight != 'subsample':
+                expanded_class_weight = compute_sample_weight(
+                    self.class_weight, y_original)
+
+        return y, expanded_class_weight
 
     def predict(self, X):
         """Predict class for X.
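Similarly for ``GradientBoostingClassifier``, assuming the patch is applied
(``class_weight`` is not a parameter of the released estimator). The "subsample"
preset only differs from "auto" when ``subsample < 1.0``, since it recomputes
the weights on each stage's sub-sample as in the ``_fit_stage`` change above::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_classification(n_samples=300, weights=[0.85, 0.15],
                               random_state=0)

    # 'auto': class weights expanded to per-sample weights once, before boosting.
    clf = GradientBoostingClassifier(class_weight='auto', random_state=0)
    clf.fit(X, y)

    # 'subsample': 'auto'-style weights recomputed on every stage's sub-sample.
    clf = GradientBoostingClassifier(subsample=0.8, class_weight='subsample',
                                     random_state=0)
    clf.fit(X, y)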
diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 7d905641719a3..2e89b2c380e89 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -25,7 +25,7 @@ from sklearn.svm import SVC, SVR from sklearn.pipeline import make_pipeline from sklearn.feature_selection import SelectKBest -from sklearn.cross_validation import train_test_split +from sklearn.cross_validation import train_test_split, StratifiedKFold from sklearn.datasets import load_boston, load_iris from sklearn.utils import check_random_state @@ -482,7 +482,7 @@ def test_parallel_regression(): def test_gridsearch(): """Check that bagging ensembles can be grid-searched.""" # Transform iris into a binary classification task - X, y = iris.data, iris.target + X, y = iris.data, iris.target.copy() y[y == 2] = 1 # Grid search with scoring based on decision_function @@ -551,6 +551,63 @@ def test_bagging_with_pipeline(): estimator.fit(iris.data, iris.target) +def test_class_weights(): + """Check class_weights resemble sample_weights behavior.""" + for train_index, test_index in StratifiedKFold(iris.target, n_folds=2, + random_state=0): + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = BaggingClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index]) + clf2 = BaggingClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index]) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target[train_index].shape) + sample_weight[iris.target[train_index] == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = BaggingClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index], + sample_weight) + clf2 = BaggingClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index]) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + # Check that sample_weight and class_weight are multiplicative + clf1 = BaggingClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index], + sample_weight**2) + clf2 = BaggingClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index], + sample_weight) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + +def check_class_weight_subsample(): + """Test class_weight works for subsample option""" + clf = BaggingClassifier(max_samples=0.8, bootstrap=True, + class_weight='subsample', random_state=0) + clf.fit(iris.data, iris.target) + clf = BaggingClassifier(max_samples=0.8, bootstrap=False, + class_weight='subsample', random_state=0) + clf.fit(iris.data, iris.target) + clf = BaggingClassifier(max_samples=1.0, bootstrap=True, + class_weight='subsample', random_state=0) + clf.fit(iris.data, iris.target) + + +def check_class_weight_errors(): + """Test if class_weight raises errors and warnings when expected.""" + + # Invalid preset string + clf = BaggingClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, iris.data, iris.target) + + if __name__ == "__main__": import nose nose.runmodule() diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py 
b/sklearn/ensemble/tests/test_gradient_boosting.py index e8e5683d5a6e1..78c61539fc5ab 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -23,6 +23,7 @@ from sklearn.utils.testing import assert_warns from sklearn.utils.validation import DataConversionWarning from sklearn.utils.validation import NotFittedError +from sklearn.cross_validation import StratifiedKFold # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -984,6 +985,66 @@ def test_non_uniform_weights_toy_edge_case_clf(): assert_array_equal(gb.predict([[1, 0]]), [1]) +def test_class_weights(): + """Check class_weights resemble sample_weights behavior.""" + for train_index, test_index in StratifiedKFold(iris.target, n_folds=2, + random_state=0): + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = GradientBoostingClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index]) + clf2 = GradientBoostingClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index]) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target[train_index].shape) + sample_weight[iris.target[train_index] == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = GradientBoostingClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index], + sample_weight) + clf2 = GradientBoostingClassifier(class_weight=class_weight, + random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index]) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + # Check that sample_weight and class_weight are multiplicative + clf1 = GradientBoostingClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index], + sample_weight**2) + clf2 = GradientBoostingClassifier(class_weight=class_weight, + random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index], + sample_weight) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + +def check_class_weight_subsample(): + """Test class_weight works for subsample option""" + clf = GradientBoostingClassifier(subsample=0.8, class_weight='subsample', + random_state=0) + clf.fit(iris.data, iris.target) + clf = GradientBoostingClassifier(subsample=1.0, class_weight='subsample', + random_state=0) + clf.fit(iris.data, iris.target) + + +def check_class_weight_errors(): + """Test if class_weight raises errors and warnings when expected.""" + + # Invalid preset string + clf = GradientBoostingClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, iris.data, iris.target) + # Warning warm_start with preset + clf = GradientBoostingClassifier(class_weight='auto', warm_start=True, + random_state=0) + assert_warns(UserWarning, clf.fit, iris.data, iris.target) + + if __name__ == "__main__": import nose nose.runmodule() diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 608007623d81d..fb06dee91e1a1 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -6,7 +6,7 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises, assert_raises_regexp 
-from sklearn.cross_validation import train_test_split +from sklearn.cross_validation import train_test_split, StratifiedKFold from sklearn.grid_search import GridSearchCV from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostRegressor @@ -421,6 +421,50 @@ def fit(self, X, y, sample_weight=None): for t in types]) +def test_class_weights(): + """Check class_weights resemble sample_weights behavior.""" + for train_index, test_index in StratifiedKFold(iris.target, n_folds=2, + random_state=0): + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = AdaBoostClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index]) + clf2 = AdaBoostClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index]) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target[train_index].shape) + sample_weight[iris.target[train_index] == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = AdaBoostClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index], + sample_weight) + clf2 = AdaBoostClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index]) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + # Check that sample_weight and class_weight are multiplicative + clf1 = AdaBoostClassifier(random_state=0) + clf1.fit(iris.data[train_index], iris.target[train_index], + sample_weight**2) + clf2 = AdaBoostClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data[train_index], iris.target[train_index], + sample_weight) + assert_array_almost_equal(clf1.predict(iris.data[test_index]), + clf2.predict(iris.data[test_index])) + + +def check_class_weight_errors(): + """Test if class_weight raises errors and warnings when expected.""" + + # Invalid preset string + clf = AdaBoostClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, iris.data, iris.target) + + if __name__ == "__main__": import nose nose.runmodule() diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 3a4187f6fae5c..4f0ec20cdb53f 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -38,10 +38,9 @@ from ..tree.tree import BaseDecisionTree from ..tree._tree import DTYPE from ..utils import check_array, check_X_y, check_random_state +from ..utils import compute_sample_weight from ..metrics import accuracy_score, r2_score -from sklearn.utils.validation import ( - has_fit_parameter, - check_is_fitted) +from sklearn.utils.validation import has_fit_parameter, check_is_fitted __all__ = [ 'AdaBoostClassifier', @@ -62,6 +61,7 @@ def __init__(self, n_estimators=50, estimator_params=tuple(), learning_rate=1., + class_weight=None, random_state=None): super(BaseWeightBoosting, self).__init__( @@ -70,6 +70,7 @@ def __init__(self, estimator_params=estimator_params) self.learning_rate = learning_rate + self.class_weight = class_weight self.random_state = random_state def fit(self, X, y, sample_weight=None): @@ -111,6 +112,20 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype) + if self.class_weight is not None: + if isinstance(self.class_weight, six.string_types): + if 
self.class_weight != 'auto': + raise ValueError('The only supported preset for ' + 'class_weight is "auto". Given "%s".' + % self.class_weight) + + expanded_class_weight = compute_sample_weight(self.class_weight, + y) + if sample_weight is None: + sample_weight = expanded_class_weight + else: + sample_weight = expanded_class_weight * sample_weight + if sample_weight is None: # Initialize weights to 1 / n_samples sample_weight = np.empty(X.shape[0], dtype=np.float) @@ -271,6 +286,7 @@ def _validate_X_predict(self, X): return X + def _samme_proba(estimator, n_classes, X): """Calculate algorithm 4, step 2, equation c) of Zhu et al [1]. @@ -313,6 +329,17 @@ class AdaBoostClassifier(BaseWeightBoosting, ClassifierMixin): The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. + class_weight : dict, "auto", or None, optional (default=None) + + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + learning_rate : float, optional (default=1.) Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and @@ -368,6 +395,7 @@ def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1., + class_weight=None, algorithm='SAMME.R', random_state=None): @@ -375,6 +403,7 @@ def __init__(self, base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, + class_weight=class_weight, random_state=random_state) self.algorithm = algorithm diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index a7344afc23e14..96537c16d7695 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -32,6 +32,8 @@ from sklearn.feature_selection import SelectKBest from sklearn.svm.base import BaseLibSVM from sklearn.pipeline import make_pipeline +from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier +from sklearn.tree import DecisionTreeClassifier from sklearn.utils.validation import DataConversionWarning, NotFittedError from sklearn.cross_validation import train_test_split @@ -788,8 +790,15 @@ def check_class_weight_classifiers(name, Classifier): classifier = Classifier(class_weight=class_weight) if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) + # The below attributes are set because the dataset is noisy and + # tree-based classifiers are not regularized if hasattr(classifier, "min_weight_fraction_leaf"): classifier.set_params(min_weight_fraction_leaf=0.01) + if isinstance(classifier, (BaggingClassifier, AdaBoostClassifier)): + classifier.set_params(base_estimator=DecisionTreeClassifier( + max_depth=1, min_weight_fraction_leaf=0.01)) + if hasattr(classifier, "learning_rate"): + classifier.set_params(learning_rate=0.1) set_random_state(classifier) classifier.fit(X_train, y_train)
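Finally, a usage sketch for ``AdaBoostClassifier`` under this patch: only a dict
or the "auto" preset is accepted (there is no "subsample" mode), and the
expanded class weights become, or multiply into, the initial boosting
``sample_weight``::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1],
                               random_state=0)

    clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=50,
        class_weight='auto',   # or an explicit dict such as {0: 1., 1: 9.}
        random_state=0,
    ).fit(X, y)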