Merge pull request #6817 from TomDLT/logistic_class_weight · scikit-learn/scikit-learn@af171b8 · GitHub
[go: up one dir, main page]

Skip to content

Commit af171b8

Browse files
committed
Merge pull request #6817 from TomDLT/logistic_class_weight
[MRG] use class_weight through sample_weight in LogisticRegression with liblinear
2 parents 20f89ef + 1107f22 commit af171b8

File tree

4 files changed

+44
-52
lines changed

4 files changed

+44
-52
lines changed

doc/whats_new.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,11 @@ Bug fixes
203203
- Fix bug where expected and adjusted mutual information were incorrect if
204204
cluster contingency cells exceeded ``2**16``. By `Joel Nothman`_.
205205

206+
- Fix bug in :class:`linear_model.LogisticRegressionCV` where
207+
``solver='liblinear'`` did not accept ``class_weight='balanced'``.
208+
(`#6817 <https://github.com/scikit-learn/scikit-learn/pull/6817>`_).
209+
By `Tom Dupre la Tour`_.
210+
206211

207212
API changes summary
208213
-------------------

sklearn/linear_model/logistic.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -618,23 +618,9 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
618618
# are assigned to the original labels. If it is "balanced", then
619619
# the class_weights are assigned after masking the labels with a OvR.
620620
le = LabelEncoder()
621-
622621
if isinstance(class_weight, dict) or multi_class == 'multinomial':
623-
if solver == "liblinear":
624-
if classes.size == 2:
625-
# Reconstruct the weights with keys 1 and -1
626-
temp = {1: class_weight[pos_class],
627-
-1: class_weight[classes[0]]}
628-
class_weight = temp.copy()
629-
else:
630-
raise ValueError("In LogisticRegressionCV the liblinear "
631-
"solver cannot handle multiclass with "
632-
"class_weight of type dict. Use the lbfgs, "
633-
"newton-cg or sag solvers or set "
634-
"class_weight='balanced'")
635-
else:
636-
class_weight_ = compute_class_weight(class_weight, classes, y)
637-
sample_weight *= class_weight_[le.fit_transform(y)]
622+
class_weight_ = compute_class_weight(class_weight, classes, y)
623+
sample_weight *= class_weight_[le.fit_transform(y)]
638624

639625
# For doing a ovr, we need to mask the labels first. for the
640626
# multinomial case this is not necessary.
@@ -740,7 +726,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
740726
maxiter=max_iter, tol=tol)
741727
elif solver == 'liblinear':
742728
coef_, intercept_, n_iter_i, = _fit_liblinear(
743-
X, target, C, fit_intercept, intercept_scaling, class_weight,
729+
X, target, C, fit_intercept, intercept_scaling, None,
744730
penalty, dual, verbose, max_iter, tol, random_state,
745731
sample_weight=sample_weight)
746732
if fit_intercept:

sklearn/linear_model/tests/test_logistic.py

Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -547,35 +547,35 @@ def test_logistic_regression_solvers_multiclass():
547547

548548

549549
def test_logistic_regressioncv_class_weights():
550-
X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
551-
n_classes=3, random_state=0)
552-
553-
msg = ("In LogisticRegressionCV the liblinear solver cannot handle "
554-
"multiclass with class_weight of type dict. Use the lbfgs, "
555-
"newton-cg or sag solvers or set class_weight='balanced'")
556-
clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
557-
solver='liblinear')
558-
assert_raise_message(ValueError, msg, clf_lib.fit, X, y)
559-
y_ = y.copy()
560-
y_[y == 2] = 1
561-
clf_lib.fit(X, y_)
562-
assert_array_equal(clf_lib.classes_, [0, 1])
563-
564-
# Test for class_weight=balanced
565-
X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
566-
random_state=0)
567-
clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
568-
class_weight='balanced')
569-
clf_lbf.fit(X, y)
570-
clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
571-
class_weight='balanced')
572-
clf_lib.fit(X, y)
573-
clf_sag = LogisticRegressionCV(solver='sag', fit_intercept=False,
574-
class_weight='balanced', max_iter=2000)
575-
clf_sag.fit(X, y)
576-
assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
577-
assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4)
578-
assert_array_almost_equal(clf_lib.coef_, clf_sag.coef_, decimal=4)
550+
for weight in [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}]:
551+
n_classes = len(weight)
552+
for class_weight in (weight, 'balanced'):
553+
X, y = make_classification(n_samples=30, n_features=3,
554+
n_repeated=0,
555+
n_informative=3, n_redundant=0,
556+
n_classes=n_classes, random_state=0)
557+
558+
clf_lbf = LogisticRegressionCV(solver='lbfgs', Cs=1,
559+
fit_intercept=False,
560+
class_weight=class_weight)
561+
clf_ncg = LogisticRegressionCV(solver='newton-cg', Cs=1,
562+
fit_intercept=False,
563+
class_weight=class_weight)
564+
clf_lib = LogisticRegressionCV(solver='liblinear', Cs=1,
565+
fit_intercept=False,
566+
class_weight=class_weight)
567+
clf_sag = LogisticRegressionCV(solver='sag', Cs=1,
568+
fit_intercept=False,
569+
class_weight=class_weight,
570+
tol=1e-5, max_iter=10000,
571+
random_state=0)
572+
clf_lbf.fit(X, y)
573+
clf_ncg.fit(X, y)
574+
clf_lib.fit(X, y)
575+
clf_sag.fit(X, y)
576+
assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
577+
assert_array_almost_equal(clf_ncg.coef_, clf_lbf.coef_, decimal=4)
578+
assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4)
579579

580580

581581
def test_logistic_regression_sample_weights():
@@ -926,7 +926,6 @@ def test_n_iter():
926926
assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs))
927927

928928

929-
@ignore_warnings
930929
def test_warm_start():
931930
# A 1-iteration second fit on same data should give almost same result
932931
# with warm starting, and quite different result without warm starting.
@@ -947,11 +946,11 @@ def test_warm_start():
947946
solver=solver,
948947
random_state=42, max_iter=100,
949948
fit_intercept=fit_intercept)
950-
clf.fit(X, y)
951-
coef_1 = clf.coef_
949+
with ignore_warnings(category=ConvergenceWarning):
950+
clf.fit(X, y)
951+
coef_1 = clf.coef_
952952

953-
clf.max_iter = 1
954-
with ignore_warnings():
953+
clf.max_iter = 1
955954
clf.fit(X, y)
956955
cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
957956
msg = ("Warm starting issue with %s solver in %s mode "

sklearn/utils/optimize.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import warnings
1818
from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
1919

20+
from ..exceptions import ConvergenceWarning
21+
2022

2123
class _LineSearchError(RuntimeError):
2224
pass
@@ -198,5 +200,5 @@ def newton_cg(grad_hess, func, grad, x0, args=(), tol=1e-4,
198200

199201
if warn and k >= maxiter:
200202
warnings.warn("newton-cg failed to converge. Increase the "
201-
"number of iterations.")
203+
"number of iterations.", ConvergenceWarning)
202204
return xk, k

0 commit comments

Comments (0)