Fix some docstrings for class_weight=auto/balanced · scikit-learn/scikit-learn@1920ff0 · GitHub

Commit 1920ff0

Fix some docstrings for class_weight=auto/balanced. Totally how I wanted to spend my afternoon....
1 parent 9c3055e commit 1920ff0

14 files changed: +132, -108 lines changed

doc/modules/svm.rst

Lines changed: 1 addition & 1 deletion
@@ -405,7 +405,7 @@ Tips on Practical Use
     approximates the fraction of training errors and support vectors.
 
   * In :class:`SVC`, if data for classification are unbalanced (e.g. many
-    positive and few negative), set ``class_weight='auto'`` and/or try
+    positive and few negative), set ``class_weight='balanced'`` and/or try
     different penalty parameters ``C``.
 
   * The underlying :class:`LinearSVC` implementation uses a random
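
A minimal sketch of the tip above (toy, deliberately imbalanced data; the names are illustrative, not from the commit):

    import numpy as np
    from sklearn.svm import SVC

    # Toy imbalanced problem: 95 "negative" samples, 5 "positive" ones.
    rng = np.random.RandomState(0)
    X = np.r_[rng.randn(95, 2), rng.randn(5, 2) + 2]
    y = np.r_[np.zeros(95, dtype=int), np.ones(5, dtype=int)]

    # 'balanced' reweights each class by n_samples / (n_classes * np.bincount(y)),
    # so the rare positive class counts as much as the common negative one;
    # the penalty parameter C can still be tuned on top of that.
    clf = SVC(class_weight='balanced', C=1.0).fit(X, y)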

sklearn/ensemble/forest.py

Lines changed: 15 additions & 13 deletions
@@ -89,7 +89,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         curr_sample_weight *= sample_counts
 
         if class_weight == 'subsample':
-            curr_sample_weight *= compute_sample_weight('auto', y, indices)
+            curr_sample_weight *= compute_sample_weight('balanced', y, indices)
 
         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
 
@@ -408,17 +408,17 @@ def _validate_y_class_weight(self, y):
             self.n_classes_.append(classes_k.shape[0])
 
         if self.class_weight is not None:
-            valid_presets = ('auto', 'subsample')
+            valid_presets = ('auto', 'balanced', 'subsample')
             if isinstance(self.class_weight, six.string_types):
                 if self.class_weight not in valid_presets:
                     raise ValueError('Valid presets for class_weight include '
-                                     '"auto" and "subsample". Given "%s".'
+                                     '"balanced" and "subsample". Given "%s".'
                                      % self.class_weight)
                 if self.warm_start:
-                    warn('class_weight presets "auto" or "subsample" are '
+                    warn('class_weight presets "balanced" or "subsample" are '
                          'not recommended for warm_start if the fitted data '
                          'differs from the full dataset. In order to use '
-                         '"auto" weights, use compute_class_weight("auto", '
+                         '"auto" weights, use compute_class_weight("balanced", '
                          'classes, y). In place of y you can use a large '
                          'enough sample of the full training set target to '
                          'properly estimate the class frequency '
@@ -427,7 +427,7 @@ def _validate_y_class_weight(self, y):
 
         if self.class_weight != 'subsample' or not self.bootstrap:
             if self.class_weight == 'subsample':
-                class_weight = 'auto'
+                class_weight = 'balanced'
             else:
                 class_weight = self.class_weight
             expanded_class_weight = compute_sample_weight(class_weight,
@@ -760,17 +760,18 @@ class RandomForestClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.
 
-    class_weight : dict, list of dicts, "auto", "subsample" or None, optional
+    class_weight : dict, list of dicts, "balanced", "subsample" or None, optional
 
         Weights associated with classes in the form ``{class_label: weight}``.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.
 
-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
-        The "subsample" mode is the same as "auto" except that weights are
+        The "subsample" mode is the same as "balanced" except that weights are
         computed based on the bootstrap sample for every tree grown.
 
         For multi-output, the weights of each column of y will be multiplied.
@@ -1097,10 +1098,11 @@ class ExtraTreesClassifier(ForestClassifier):
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.
 
-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
-        The "subsample" mode is the same as "auto" except that weights are
+        The "subsample" mode is the same as "balanced" except that weights are
         computed based on the bootstrap sample for every tree grown.
 
         For multi-output, the weights of each column of y will be multiplied.
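
The formula these docstrings now quote is easy to check by hand. A small sketch with made-up labels; the last lines illustrate the per-tree 'subsample' variant via compute_sample_weight, whose indices argument limits the frequency counts to a bootstrap sample (import path as in current scikit-learn):

    import numpy as np
    from sklearn.utils.class_weight import compute_sample_weight

    # The documented rule: class k gets n_samples / (n_classes * np.bincount(y))[k].
    y = np.array([0, 0, 0, 0, 1])            # 4 samples of class 0, 1 of class 1
    weights = len(y) / (len(np.unique(y)) * np.bincount(y))
    print(weights)                           # [0.625 2.5] -- the rare class is upweighted

    # 'subsample' mode applies the same rule per tree, counting class
    # frequencies only inside that tree's bootstrap sample:
    bootstrap = np.array([0, 1, 1, 2, 4])    # hypothetical bootstrap indices
    print(compute_sample_weight('balanced', y, indices=bootstrap))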

sklearn/ensemble/tests/test_forest.py

Lines changed: 11 additions & 11 deletions
@@ -329,7 +329,7 @@ def test_parallel():
         yield check_parallel, name, iris.data, iris.target
 
     for name in FOREST_REGRESSORS:
-        yield check_parallel, name, boston.data, boston.target
+        yield check_parallel, name, boston.data, boston.target
 
 
 def check_pickle(name, X, y):
@@ -352,7 +352,7 @@ def test_pickle():
         yield check_pickle, name, iris.data[::2], iris.target[::2]
 
     for name in FOREST_REGRESSORS:
-        yield check_pickle, name, boston.data[::2], boston.target[::2]
+        yield check_pickle, name, boston.data[::2], boston.target[::2]
 
 
 def check_multioutput(name):
@@ -749,10 +749,10 @@ def check_class_weights(name):
     """Check class_weights resemble sample_weights behavior."""
     ForestClassifier = FOREST_CLASSIFIERS[name]
 
-    # Iris is balanced, so no effect expected for using 'auto' weights
+    # Iris is balanced, so no effect expected for using 'balanced' weights
     clf1 = ForestClassifier(random_state=0)
     clf1.fit(iris.data, iris.target)
-    clf2 = ForestClassifier(class_weight='auto', random_state=0)
+    clf2 = ForestClassifier(class_weight='balanced', random_state=0)
     clf2.fit(iris.data, iris.target)
     assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
 
@@ -765,8 +765,8 @@ def check_class_weights(name):
                             random_state=0)
     clf3.fit(iris.data, iris_multi)
     assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
-    # Check against multi-output "auto" which should also have no effect
-    clf4 = ForestClassifier(class_weight='auto', random_state=0)
+    # Check against multi-output "balanced" which should also have no effect
+    clf4 = ForestClassifier(class_weight='balanced', random_state=0)
     clf4.fit(iris.data, iris_multi)
     assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)
 
@@ -782,7 +782,7 @@ def check_class_weights(name):
 
     # Check that sample_weight and class_weight are multiplicative
     clf1 = ForestClassifier(random_state=0)
-    clf1.fit(iris.data, iris.target, sample_weight**2)
+    clf1.fit(iris.data, iris.target, sample_weight ** 2)
     clf2 = ForestClassifier(class_weight=class_weight, random_state=0)
     clf2.fit(iris.data, iris.target, sample_weight)
     assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
@@ -793,11 +793,11 @@ def test_class_weights():
         yield check_class_weights, name
 
 
-def check_class_weight_auto_and_bootstrap_multi_output(name):
+def check_class_weight_balanced_and_bootstrap_multi_output(name):
     """Test class_weight works for multi-output"""
     ForestClassifier = FOREST_CLASSIFIERS[name]
     _y = np.vstack((y, np.array(y) * 2)).T
-    clf = ForestClassifier(class_weight='auto', random_state=0)
+    clf = ForestClassifier(class_weight='balanced', random_state=0)
     clf.fit(X, _y)
     clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}],
                            random_state=0)
@@ -806,9 +806,9 @@ def check_class_weight_auto_and_bootstrap_multi_output(name):
     clf.fit(X, _y)
 
 
-def test_class_weight_auto_and_bootstrap_multi_output():
+def test_class_weight_balanced_and_bootstrap_multi_output():
     for name in FOREST_CLASSIFIERS:
-        yield check_class_weight_auto_and_bootstrap_multi_output, name
+        yield check_class_weight_balanced_and_bootstrap_multi_output, name
 
 
 def check_class_weight_errors(name):

sklearn/linear_model/logistic.py

Lines changed: 35 additions & 25 deletions
@@ -456,11 +456,13 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         is called repeatedly with the same data, as y is modified
         along the path.
 
-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     dual : bool
         Dual or primal formulation. Dual formulation is only implemented for
@@ -729,11 +731,13 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
     tol : float
         Tolerance for stopping criteria.
 
-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     verbose : int
         For the liblinear and lbfgs solvers set verbose to any positive
@@ -897,11 +901,13 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         To lessen the effect of regularization on synthetic feature weight
         (and therefore on the intercept) intercept_scaling has to be increased.
 
-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     max_iter : int
         Useful only for the newton-cg and lbfgs solvers. Maximum number of
@@ -1147,11 +1153,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
         Specifies if a constant (a.k.a. bias or intercept) should be
        added the decision function.
 
-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     cv : integer or cross-validation generator
         The default cross-validation generator used is Stratified K-Folds.
@@ -1182,11 +1190,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
     max_iter : int, optional
        Maximum number of iterations of the optimization algorithm.
 
-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     n_jobs : int, optional
         Number of CPU cores used during the cross-validation loop. If given
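
All five rewritten docstrings describe the same rule, so one sketch covers them: on made-up imbalanced labels, class_weight='balanced' should coincide with passing the formula's weights as an explicit dict (assuming a deterministic solver):

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Hypothetical imbalanced toy data.
    rng = np.random.RandomState(0)
    X = rng.randn(60, 3)
    y = np.r_[np.zeros(50, dtype=int), np.ones(10, dtype=int)]

    # Spell out n_samples / (n_classes * np.bincount(y)) by hand...
    w = len(y) / (2 * np.bincount(y))
    # ...and check it matches the 'balanced' preset.
    clf_auto = LogisticRegression(class_weight='balanced').fit(X, y)
    clf_dict = LogisticRegression(class_weight={0: w[0], 1: w[1]}).fit(X, y)
    np.testing.assert_allclose(clf_auto.coef_, clf_dict.coef_)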

sklearn/linear_model/perceptron.py

Lines changed: 4 additions & 3 deletions
@@ -44,14 +44,15 @@ class Perceptron(BaseSGDClassifier, _LearntSelectorMixin):
     eta0 : double
         Constant by which the updates are multiplied. Defaults to 1.
 
-    class_weight : dict, {class_label: weight} or "auto" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
         Preset for the class_weight fit parameter.
 
         Weights associated with classes. If not given, all classes
         are supposed to have weight one.
 
-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     warm_start : bool, optional
         When set to True, reuse the solution of the previous call to fit as

sklearn/linear_model/ridge.py

Lines changed: 15 additions & 9 deletions
@@ -21,7 +21,7 @@
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
 from ..utils import check_X_y
-from ..utils import compute_sample_weight, compute_class_weight
+from ..utils import compute_sample_weight
 from ..utils import column_or_1d
 from ..preprocessing import LabelBinarizer
 from ..grid_search import GridSearchCV
@@ -521,10 +521,13 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
         ``(2*C)^-1`` in other linear models such as LogisticRegression or
         LinearSVC.
 
-    class_weight : dict, optional
-        Weights associated with classes in the form
-        ``{class_label : weight}``. If not given, all classes are
-        supposed to have weight one.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -1008,10 +1011,13 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV):
         If None, Generalized Cross-Validation (efficient Leave-One-Out)
         will be used.
 
-    class_weight : dict, optional
-        Weights associated with classes in the form
-        ``{class_label : weight}``. If not given, all classes are
-        supposed to have weight one.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``
 
     Attributes
     ----------

sklearn/linear_model/stochastic_gradient.py

Lines changed: 4 additions & 4 deletions
@@ -511,15 +511,15 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
         -------
         self : returns an instance of self.
         """
-        if self.class_weight == 'auto':
-            raise ValueError("class_weight 'auto' is not supported for "
+        if self.class_weight in ['balanced', 'auto']:
+            raise ValueError("class_weight '{0}' is not supported for "
                              "partial_fit. In order to use 'auto' weights, "
-                             "use compute_class_weight('auto', classes, y). "
+                             "use compute_class_weight('{0}', classes, y). "
                              "In place of y you can us a large enough sample "
                              "of the full training set target to properly "
                              "estimate the class frequency distributions. "
                              "Pass the resulting weights as the class_weight "
-                             "parameter.")
+                             "parameter.".format(self.class_weight))
         return self._partial_fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss,
                                  learning_rate=self.learning_rate, n_iter=1,
                                  classes=classes, sample_weight=sample_weight,
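
The workaround this error message spells out can be sketched as follows (toy data; compute_class_weight lives under sklearn.utils.class_weight in current releases and takes keyword arguments, while versions contemporary with this commit accepted positional ones):

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.utils.class_weight import compute_class_weight

    # Hypothetical imbalanced stream of labels.
    rng = np.random.RandomState(0)
    X = rng.randn(100, 4)
    y = np.r_[np.zeros(90, dtype=int), np.ones(10, dtype=int)]

    # Precompute the 'balanced' weights once on a representative sample,
    # then hand them to the estimator as an explicit dict before partial_fit.
    classes = np.unique(y)
    weights = compute_class_weight('balanced', classes=classes, y=y)
    clf = SGDClassifier(class_weight=dict(zip(classes, weights)))
    clf.partial_fit(X, y, classes=classes)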

sklearn/linear_model/tests/test_logistic.py

Lines changed: 1 addition & 1 deletion
@@ -477,7 +477,7 @@ def test_logistic_regressioncv_class_weights():
     clf_lib.fit(X, y_)
     assert_array_equal(clf_lib.classes_, [0, 1])
 
-    # Test for class_weight=auto
+    # Test for class_weight=balanced
     X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                                random_state=0)
     clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,

sklearn/linear_model/tests/test_ridge.py

Lines changed: 8 additions & 8 deletions
@@ -477,18 +477,18 @@ def test_class_weights():
     # the prediction on this point should shift
     assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
 
-    # check if class_weight = 'auto' can handle negative labels.
-    clf = RidgeClassifier(class_weight='auto')
+    # check if class_weight = 'balanced' can handle negative labels.
+    clf = RidgeClassifier(class_weight='balanced')
     clf.fit(X, y)
     assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
 
-    # class_weight = 'auto', and class_weight = None should return
+    # class_weight = 'balanced', and class_weight = None should return
     # same values when y has equal number of all labels
     X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
     y = [1, 1, -1, -1]
     clf = RidgeClassifier(class_weight=None)
     clf.fit(X, y)
-    clfa = RidgeClassifier(class_weight='auto')
+    clfa = RidgeClassifier(class_weight='balanced')
     clfa.fit(X, y)
     assert_equal(len(clfa.classes_), 2)
     assert_array_almost_equal(clf.coef_, clfa.coef_)
@@ -570,12 +570,12 @@ def fit_ridge_not_ok_2():
         ridge.fit(X, y, sample_weights_not_OK_2)
 
     assert_raise_message(ValueError,
-                        "Sample weights must be 1D array or scalar",
-                        fit_ridge_not_ok)
+                         "Sample weights must be 1D array or scalar",
+                         fit_ridge_not_ok)
 
     assert_raise_message(ValueError,
-                        "Sample weights must be 1D array or scalar",
-                        fit_ridge_not_ok_2)
+                         "Sample weights must be 1D array or scalar",
+                         fit_ridge_not_ok_2)
 
 
 def test_sparse_design_with_sample_weights():
