[MRG] GBM & meta-ensembles - support for class_weight by trevorstephens · Pull Request #4215 · scikit-learn/scikit-learn · GitHub
[MRG] GBM & meta-ensembles - support for class_weight #4215

Closed
Changes from all commits
8 changes: 6 additions & 2 deletions doc/whats_new.rst
@@ -166,8 +166,12 @@ Enhancements
faster in general. By `Joel Nothman`_.

- Add ``class_weight`` parameter to automatically weight samples by class
frequency for :class:`ensemble.RandomForestClassifier`,
:class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier`
frequency for :class:`ensemble.AdaBoostClassifier`,
:class:`ensemble.BaggingClassifier`,
:class:`ensemble.ExtraTreesClassifier`,
:class:`ensemble.GradientBoostingClassifier`,
:class:`ensemble.RandomForestClassifier`,
:class:`tree.DecisionTreeClassifier`,
and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_.

- :class:`grid_search.RandomizedSearchCV` now does sampling without
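For readers skimming the changelog entry above, a minimal usage sketch of the new parameter. This is not part of the patch: it assumes the branch from this PR (stock scikit-learn releases may not expose ``class_weight`` on these estimators), and it uses the ``'auto'`` preset name current at the time (later releases renamed it ``'balanced'``).

```python
# Hypothetical usage sketch against this PR's branch, on an imbalanced toy set.
import numpy as np
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = np.array([0] * 180 + [1] * 20)   # 9:1 class imbalance

# Inverse-frequency weights derived from y at fit time.
bag = BaggingClassifier(class_weight='auto', random_state=0).fit(X, y)
gbm = GradientBoostingClassifier(class_weight='auto', random_state=0).fit(X, y)

# Explicit per-class weights are also accepted.
bag_dict = BaggingClassifier(class_weight={0: 1, 1: 9}, random_state=0).fit(X, y)
```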
69 changes: 61 additions & 8 deletions sklearn/ensemble/bagging.py
@@ -13,11 +13,12 @@

from ..base import ClassifierMixin, RegressorMixin
from ..externals.joblib import Parallel, delayed
from ..externals.six import with_metaclass
from ..externals.six import with_metaclass, string_types
from ..externals.six.moves import zip
from ..metrics import r2_score, accuracy_score
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..utils import check_random_state, check_X_y, check_array, column_or_1d
from ..utils import compute_sample_weight
from ..utils.random import sample_without_replacement
from ..utils.validation import has_fit_parameter, check_is_fitted
from ..utils.fixes import bincount
@@ -32,7 +33,7 @@


def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
seeds, verbose):
class_weight, seeds, verbose):
"""Private function used to build a batch of estimators within a job."""
# Retrieve settings
n_samples, n_features = X.shape
@@ -52,7 +53,6 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
"sample_weight")


# Build estimators
estimators = []
estimators_samples = []
@@ -99,6 +99,13 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,

curr_sample_weight[not_indices] = 0

if class_weight == 'subsample':
indices = np.where(curr_sample_weight > 0)

if class_weight == 'subsample':
# Multiply all weights by subsample weights
curr_sample_weight *= compute_sample_weight('auto', y, indices)

estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
samples = curr_sample_weight > 0.
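For context on the hunk above: in ``'subsample'`` mode the in-bag rows (``curr_sample_weight > 0``) are re-weighted by class frequencies measured on that bootstrap draw rather than on the full ``y``. A standalone numpy illustration of the idea, not code from the patch (the exact normalisation of the ``'auto'`` preset may differ slightly):

```python
# Illustration only: inverse-class-frequency weights computed from the in-bag
# rows, then expanded to a per-sample weight for every row of y.
import numpy as np

def subsample_class_weights(y, in_bag_indices):
    classes = np.unique(y)
    counts = np.array([np.sum(y[in_bag_indices] == c) for c in classes], dtype=float)
    per_class = len(in_bag_indices) / (len(classes) * counts)  # n_in_bag / (k * count)
    lookup = dict(zip(classes, per_class))
    return np.array([lookup[label] for label in y])

y = np.array([0, 0, 0, 0, 1, 1])
in_bag = np.array([0, 1, 2, 4])             # bootstrap drew three 0s and one 1
print(subsample_class_weights(y, in_bag))   # minority-class rows get the larger weight
```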

@@ -204,6 +211,7 @@ def __init__(self,
bootstrap=True,
bootstrap_features=False,
oob_score=False,
class_weight=None,
n_jobs=1,
random_state=None,
verbose=0):
@@ -216,6 +224,7 @@
self.bootstrap = bootstrap
self.bootstrap_features = bootstrap_features
self.oob_score = oob_score
self.class_weight = class_weight
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
@@ -251,7 +260,7 @@ def fit(self, X, y, sample_weight=None):

# Remap output
n_samples, self.n_features_ = X.shape
y = self._validate_y(y)
y, expanded_class_weight = self._validate_y_class_weight(y)

# Check parameters
self._validate_estimator()
@@ -276,6 +285,13 @@ def fit(self, X, y, sample_weight=None):
raise ValueError("Out of bag estimation only available"
" if bootstrap=True")

# Apply class_weights to sample weights
if expanded_class_weight is not None:
if sample_weight is not None:
sample_weight = sample_weight * expanded_class_weight
else:
sample_weight = expanded_class_weight

# Free allocated memory, if any
self.estimators_ = None
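The ``expanded_class_weight`` block above follows the same convention as the tree and forest estimators: class-derived weights are multiplied into any user-supplied ``sample_weight`` rather than replacing it. A small illustration with assumed values:

```python
# Assumed values, for illustration only.
import numpy as np

y = np.array([0, 0, 1])
class_weight = {0: 0.5, 1: 2.0}
expanded_class_weight = np.array([class_weight[label] for label in y])  # [0.5, 0.5, 2.0]

sample_weight = np.array([1.0, 3.0, 1.0])      # user-supplied weights
effective = sample_weight * expanded_class_weight
print(effective)                               # [0.5, 1.5, 2.0]
```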

@@ -291,6 +307,7 @@ def fit(self, X, y, sample_weight=None):
X,
y,
sample_weight,
self.class_weight,
seeds[starts[i]:starts[i + 1]],
verbose=self.verbose)
for i in range(n_jobs))
@@ -312,9 +329,9 @@ def fit(self, X, y, sample_weight=None):
def _set_oob_score(self, X, y):
"""Calculate out of bag predictions and score."""

def _validate_y(self, y):
def _validate_y_class_weight(self, y):
# Default implementation
return column_or_1d(y, warn=True)
return column_or_1d(y, warn=True), None


class BaggingClassifier(BaseBagging, ClassifierMixin):
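The rename above turns ``_validate_y`` into a hook that also reports class weights: the base class keeps a no-op default (``return y, None``) and classifier subclasses override it. A stripped-down sketch of the pattern, with hypothetical class names (the real implementations are ``BaseBagging`` and ``BaggingClassifier`` in this diff):

```python
# Simplified, hypothetical sketch of the validation hook; dict weights only.
import numpy as np

class _BaseEnsemble:
    def _validate_y_class_weight(self, y):
        # Default (regressor-friendly): no label remapping, no class weights.
        return y, None

class _ClassifierEnsemble(_BaseEnsemble):
    def __init__(self, class_weight=None):
        self.class_weight = class_weight

    def _validate_y_class_weight(self, y):
        classes, y_encoded = np.unique(y, return_inverse=True)
        expanded = None
        if isinstance(self.class_weight, dict):
            per_class = np.array([self.class_weight.get(c, 1.0) for c in classes])
            expanded = per_class[y_encoded]       # one weight per sample
        return y_encoded, expanded

# Tiny check: labels are remapped to 0/1 and weights expanded per sample.
y_enc, w = _ClassifierEnsemble({'a': 1.0, 'b': 4.0})._validate_y_class_weight(
    np.array(['a', 'a', 'b']))
```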
@@ -366,6 +383,23 @@ class BaggingClassifier(BaseBagging, ClassifierMixin):
Whether to use out-of-bag samples to estimate
the generalization error.

class_weight : dict, "auto", "subsample" or None, optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.

The "subsample" mode is the same as "auto" except that weights are
computed based on the bootstrap or sub-sample for every tree grown as
defined by the ``max_features`` and/or ``bootstrap`` options.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

Note that this is supported only if the base estimator supports
sample weighting.

n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
@@ -433,6 +467,7 @@ def __init__(self,
bootstrap=True,
bootstrap_features=False,
oob_score=False,
class_weight=None,
n_jobs=1,
random_state=None,
verbose=0):
@@ -445,6 +480,7 @@
bootstrap=bootstrap,
bootstrap_features=bootstrap_features,
oob_score=oob_score,
class_weight=class_weight,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)
@@ -493,12 +529,29 @@ def _set_oob_score(self, X, y):
self.oob_decision_function_ = oob_decision_function
self.oob_score_ = oob_score

def _validate_y(self, y):
def _validate_y_class_weight(self, y):
y = column_or_1d(y, warn=True)
expanded_class_weight = None

if self.class_weight is not None:
y_original = np.copy(y)

self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)

return y
if self.class_weight is not None:
valid_presets = ('auto', 'subsample')
if isinstance(self.class_weight, string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"auto" and "subsample". Given "%s".'
% self.class_weight)

if self.class_weight != 'subsample':
expanded_class_weight = compute_sample_weight(
self.class_weight, y_original)

return y, expanded_class_weight
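As the ``class_weight`` docstring above notes, the weights are only honoured when the base estimator's ``fit`` accepts ``sample_weight``. A hedged construction sketch against this PR's branch:

```python
# Sketch against this PR's branch: pair class_weight with a base estimator
# that exposes a sample_weight fit parameter (DecisionTreeClassifier does).
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    class_weight='subsample',   # re-balance on each bootstrap draw
    n_estimators=25,
    random_state=0,
)
```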

def predict(self, X):
"""Predict class for X.
73 changes: 64 additions & 9 deletions sklearn/ensemble/gradient_boosting.py
@@ -35,7 +35,7 @@
from ..base import ClassifierMixin
from ..base import RegressorMixin
from ..utils import check_random_state, check_array, check_X_y, column_or_1d
from ..utils import check_consistent_length
from ..utils import check_consistent_length, compute_sample_weight
from ..utils.extmath import logsumexp
from ..utils.fixes import expit, bincount
from ..utils.stats import _weighted_percentile
@@ -711,7 +711,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
min_samples_leaf, min_weight_fraction_leaf,
max_depth, init, subsample, max_features,
random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
warm_start=False):
warm_start=False, class_weight=None):

self.n_estimators = n_estimators
self.learning_rate = learning_rate
@@ -728,6 +728,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
self.verbose = verbose
self.max_leaf_nodes = max_leaf_nodes
self.warm_start = warm_start
self.class_weight = class_weight

self.estimators_ = np.empty((0, 0), dtype=np.object)

@@ -739,6 +740,12 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
loss = self.loss_
original_y = y

if self.class_weight == 'subsample':
indices = np.where(sample_mask)
# Multiply sample weights by balanced class weights
sample_weight = (sample_weight *
compute_sample_weight('auto', y, indices))

for k in range(loss.K):
if loss.is_multi_class:
y = np.array(original_y == k, dtype=np.float64)
@@ -947,7 +954,14 @@ def fit(self, X, y, sample_weight=None, monitor=None):

check_consistent_length(X, y, sample_weight)

y = self._validate_y(y)
y, expanded_class_weight = self._validate_y_class_weight(y)

# Apply class_weights to sample weights
if expanded_class_weight is not None:
if sample_weight is not None:
sample_weight = sample_weight * expanded_class_weight
else:
sample_weight = expanded_class_weight

random_state = check_random_state(self.random_state)
self._check_params()
@@ -1144,11 +1158,11 @@ def feature_importances_(self):
importances = total_sum / len(self.estimators_)
return importances

def _validate_y(self, y):
def _validate_y_class_weight(self, y):
self.n_classes_ = 1

# Default implementation
return y
return y, None


class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
Expand Down Expand Up @@ -1241,6 +1255,21 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
and add more estimators to the ensemble, otherwise, just erase the
previous solution.

class_weight : dict, "auto", "subsample" or None, optional

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.

The "subsample" mode is the same as "auto" except that weights are
computed based on the bootstrap or sub-sample for every tree grown as
defined by the ``max_features`` and/or ``bootstrap`` options.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

Attributes
----------
feature_importances_ : array, shape = [n_features]
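The ``'subsample'`` preset described above only differs from ``'auto'`` when each stage sees a subsample of the rows, i.e. with stochastic gradient boosting (``subsample < 1.0``). A construction sketch against this PR's branch:

```python
# Sketch against this PR's branch: per-stage re-balancing with stochastic GBM.
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(
    n_estimators=200,
    subsample=0.5,              # each boosting stage fits on a random half of the rows
    class_weight='subsample',   # balanced class weights recomputed on that half
    random_state=0,
)
```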
@@ -1290,7 +1319,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, init=None, random_state=None,
max_features=None, verbose=0,
max_leaf_nodes=None, warm_start=False):
max_leaf_nodes=None, warm_start=False, class_weight=None):

super(GradientBoostingClassifier, self).__init__(
loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1300,12 +1329,38 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
max_depth=max_depth, init=init, subsample=subsample,
max_features=max_features,
random_state=random_state, verbose=verbose,
max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
class_weight=class_weight)

def _validate_y_class_weight(self, y):
expanded_class_weight = None
if self.class_weight is not None:
y_original = np.copy(y)

def _validate_y(self, y):
self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)
return y

if self.class_weight is not None:
valid_presets = ('auto', 'subsample')
if isinstance(self.class_weight, six.string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"auto" and "subsample". Given "%s".'
% self.class_weight)
if self.warm_start:
warn('class_weight preset "auto" is not recommended for '
'warm_start if the fitted data differs from the '
'full dataset. In order to use "auto" weights, use '
'compute_class_weight("auto", classes, y). In place '
'of y you can use a large enough sample of the full '
'training set target to properly estimate the class '
'frequency distributions. Pass the resulting '
'weights as the class_weight parameter.')
if self.class_weight != 'subsample':
expanded_class_weight = compute_sample_weight(
self.class_weight, y_original)

return y, expanded_class_weight
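The warm_start warning above suggests precomputing the class weights once on a representative sample of the targets and passing them explicitly. A sketch of that workaround, my illustration rather than code from the patch (``'auto'`` was the preset name at the time; current releases spell it ``'balanced'``):

```python
# Workaround sketch: estimate per-class weights once, pass them as a dict so
# subsequent warm_start fits keep using the same weights.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_sample = np.array([0] * 900 + [1] * 100)       # representative sample of the targets
classes = np.unique(y_sample)
weights = compute_class_weight('balanced', classes=classes, y=y_sample)
class_weight = dict(zip(classes, weights))       # e.g. {0: 0.556, 1: 5.0}
# ...then: GradientBoostingClassifier(class_weight=class_weight, warm_start=True, ...)
```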

def predict(self, X):
"""Predict class for X.