[MRG + 2] FIX LogisticRegressionCV to correctly handle string labels … · maskani-moh/scikit-learn@c19c27e · GitHub
[go: up one dir, main page]

Skip to content

Commit c19c27e

Browse files
raghavrv authored and maskani-moh committed
[MRG + 2] FIX LogisticRegressionCV to correctly handle string labels (scikit-learn#5874)
* TST if LogisticRegressionCV handles string labels properly
* TST Add a test with class_weight dict
* ENH Encode y and class_weight dict
* Better variable names
* TYPO casses --> classes
* FIX Use dict comprehension; classes_labels --> classes
* Revert dict comprehension (for Python 2.6 compat)
* MNT reorder validation to improve clarity
* Add whatsnew entry
1 parent a46d105 commit c19c27e

File tree

3 files changed

+90
-46
lines changed

3 files changed

+90
-46
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ Bug fixes
9797
attribute in `transform()`. :issue:`7553` by :user:`Ekaterina
9898
Krivich <kiote>`.
9999

100+
- :class:`sklearn.linear_model.LogisticRegressionCV` now correctly handles
101+
string labels. :issue:`5874` by `Raghav RV`_.
102+
103+
100104
.. _changes_0_18_1:
101105

102106
Version 0.18.1

sklearn/linear_model/logistic.py

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
"""
32
Logistic Regression
43
"""
@@ -28,7 +27,6 @@
2827
from ..utils.extmath import row_norms
2928
from ..utils.optimize import newton_cg
3029
from ..utils.validation import check_X_y
31-
from ..exceptions import DataConversionWarning
3230
from ..exceptions import NotFittedError
3331
from ..utils.fixes import expit
3432
from ..utils.multiclass import check_classification_targets
@@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
925923
y_test = np.ones(y_test.shape, dtype=np.float64)
926924
y_test[~mask] = -1.
927925

928-
# To deal with object dtypes, we need to convert into an array of floats.
929-
y_test = check_array(y_test, dtype=np.float64, ensure_2d=False)
930-
931926
scores = list()
932927

933928
if isinstance(scoring, six.string_types):
@@ -1561,64 +1556,64 @@ def fit(self, X, y, sample_weight=None):
15611556

15621557
X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
15631558
order="C")
1559+
check_classification_targets(y)
1560+
1561+
class_weight = self.class_weight
1562+
if class_weight and not(isinstance(class_weight, dict) or
1563+
class_weight in ['balanced', 'auto']):
1564+
# 'auto' is deprecated and will be removed in 0.19
1565+
raise ValueError("class_weight provided should be a "
1566+
"dict or 'balanced'")
1567+
1568+
# Encode for string labels
1569+
label_encoder = LabelEncoder().fit(y)
1570+
y = label_encoder.transform(y)
1571+
if isinstance(class_weight, dict):
1572+
class_weight = dict((label_encoder.transform([cls])[0], v)
1573+
for cls, v in class_weight.items())
1574+
1575+
# The original class labels
1576+
classes = self.classes_ = label_encoder.classes_
1577+
encoded_labels = label_encoder.transform(label_encoder.classes_)
15641578

15651579
if self.solver == 'sag':
15661580
max_squared_sum = row_norms(X, squared=True).max()
15671581
else:
15681582
max_squared_sum = None
15691583

1570-
check_classification_targets(y)
1571-
1572-
if y.ndim == 2 and y.shape[1] == 1:
1573-
warnings.warn(
1574-
"A column-vector y was passed when a 1d array was"
1575-
" expected. Please change the shape of y to "
1576-
"(n_samples, ), for example using ravel().",
1577-
DataConversionWarning)
1578-
y = np.ravel(y)
1579-
1580-
check_consistent_length(X, y)
1581-
15821584
# init cross-validation generator
15831585
cv = check_cv(self.cv, y, classifier=True)
15841586
folds = list(cv.split(X, y))
15851587

1586-
self._enc = LabelEncoder()
1587-
self._enc.fit(y)
1588-
1589-
labels = self.classes_ = np.unique(y)
1590-
n_classes = len(labels)
1588+
# Use the label encoded classes
1589+
n_classes = len(encoded_labels)
15911590

15921591
if n_classes < 2:
15931592
raise ValueError("This solver needs samples of at least 2 classes"
15941593
" in the data, but the data contains only one"
1595-
" class: %r" % self.classes_[0])
1594+
" class: %r" % classes[0])
1595+
15961596
if n_classes == 2:
15971597
# OvR in case of binary problems is as good as fitting
15981598
# the higher label
15991599
n_classes = 1
1600-
labels = labels[1:]
1600+
encoded_labels = encoded_labels[1:]
1601+
classes = classes[1:]
16011602

16021603
# We need this hack to iterate only once over labels, in the case of
16031604
# multi_class = multinomial, without changing the value of the labels.
1604-
iter_labels = labels
16051605
if self.multi_class == 'multinomial':
1606-
iter_labels = [None]
1607-
1608-
if self.class_weight and not(isinstance(self.class_weight, dict) or
1609-
self.class_weight in
1610-
['balanced', 'auto']):
1611-
# 'auto' is deprecated and will be removed in 0.19
1612-
raise ValueError("class_weight provided should be a "
1613-
"dict or 'balanced'")
1606+
iter_encoded_labels = iter_classes = [None]
1607+
else:
1608+
iter_encoded_labels = encoded_labels
1609+
iter_classes = classes
16141610

16151611
# compute the class weights for the entire dataset y
1616-
if self.class_weight in ("auto", "balanced"):
1617-
classes = np.unique(y)
1618-
class_weight = compute_class_weight(self.class_weight, classes, y)
1619-
class_weight = dict(zip(classes, class_weight))
1620-
else:
1621-
class_weight = self.class_weight
1612+
if class_weight in ("auto", "balanced"):
1613+
class_weight = compute_class_weight(class_weight,
1614+
np.arange(len(self.classes_)),
1615+
y)
1616+
class_weight = dict(enumerate(class_weight))
16221617

16231618
path_func = delayed(_log_reg_scoring_path)
16241619

@@ -1638,7 +1633,7 @@ def fit(self, X, y, sample_weight=None):
16381633
max_squared_sum=max_squared_sum,
16391634
sample_weight=sample_weight
16401635
)
1641-
for label in iter_labels
1636+
for label in iter_encoded_labels
16421637
for train, test in folds)
16431638

16441639
if self.multi_class == 'multinomial':
@@ -1669,9 +1664,9 @@ def fit(self, X, y, sample_weight=None):
16691664
self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds),
16701665
len(self.Cs_)))
16711666

1672-
self.coefs_paths_ = dict(zip(labels, coefs_paths))
1667+
self.coefs_paths_ = dict(zip(classes, coefs_paths))
16731668
scores = np.reshape(scores, (n_classes, len(folds), -1))
1674-
self.scores_ = dict(zip(labels, scores))
1669+
self.scores_ = dict(zip(classes, scores))
16751670

16761671
self.C_ = list()
16771672
self.coef_ = np.empty((n_classes, X.shape[1]))
@@ -1682,10 +1677,14 @@ def fit(self, X, y, sample_weight=None):
16821677
scores = multi_scores
16831678
coefs_paths = multi_coefs_paths
16841679

1685-
for index, label in enumerate(iter_labels):
1680+
for index, (cls, encoded_label) in enumerate(
1681+
zip(iter_classes, iter_encoded_labels)):
1682+
16861683
if self.multi_class == 'ovr':
1687-
scores = self.scores_[label]
1688-
coefs_paths = self.coefs_paths_[label]
1684+
# The scores_ / coefs_paths_ dict have unencoded class
1685+
# labels as their keys
1686+
scores = self.scores_[cls]
1687+
coefs_paths = self.coefs_paths_[cls]
16891688

16901689
if self.refit:
16911690
best_index = scores.sum(axis=0).argmax()
@@ -1698,8 +1697,10 @@ def fit(self, X, y, sample_weight=None):
16981697
else:
16991698
coef_init = np.mean(coefs_paths[:, best_index, :], axis=0)
17001699

1700+
# Note that y is label encoded and hence pos_class must be
1701+
# the encoded label / None (for 'multinomial')
17011702
w, _, _ = logistic_regression_path(
1702-
X, y, pos_class=label, Cs=[C_], solver=self.solver,
1703+
X, y, pos_class=encoded_label, Cs=[C_], solver=self.solver,
17031704
fit_intercept=self.fit_intercept, coef=coef_init,
17041705
max_iter=self.max_iter, tol=self.tol,
17051706
penalty=self.penalty, copy=False,

sklearn/linear_model/tests/test_logistic.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from sklearn.model_selection import StratifiedKFold
2828
from sklearn.datasets import load_iris, make_classification
2929
from sklearn.metrics import log_loss
30+
from sklearn.preprocessing import LabelEncoder
3031

3132
X = [[-1, 0], [0, 1], [1, 1]]
3233
X_sp = sp.csr_matrix(X)
@@ -398,6 +399,44 @@ def test_logistic_cv():
398399
assert_array_equal(scores.shape, (1, 3, 1))
399400

400401

402+
def test_multinomial_logistic_regression_string_inputs():
403+
# Test with string labels for LogisticRegression(CV)
404+
n_samples, n_features, n_classes = 50, 5, 3
405+
X_ref, y = make_classification(n_samples=n_samples, n_features=n_features,
406+
n_classes=n_classes, n_informative=3)
407+
y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y)
408+
# For numerical labels, let y values be taken from set (-1, 0, 1)
409+
y = np.array(y) - 1
410+
# Test for string labels
411+
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
412+
lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
413+
lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial')
414+
lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
415+
416+
lr.fit(X_ref, y)
417+
lr_cv.fit(X_ref, y)
418+
lr_str.fit(X_ref, y_str)
419+
lr_cv_str.fit(X_ref, y_str)
420+
421+
assert_array_almost_equal(lr.coef_, lr_str.coef_)
422+
assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
423+
assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_)
424+
assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
425+
assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo'])
426+
427+
# The predictions should be in original labels
428+
assert_equal(sorted(np.unique(lr_str.predict(X_ref))),
429+
['bar', 'baz', 'foo'])
430+
assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))),
431+
['bar', 'baz', 'foo'])
432+
433+
# Make sure class weights can be given with string labels
434+
lr_cv_str = LogisticRegression(
435+
solver='lbfgs', class_weight={'bar': 1, 'baz': 2, 'foo': 0},
436+
multi_class='multinomial').fit(X_ref, y_str)
437+
assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
438+
439+
401440
def test_logistic_cv_sparse():
402441
X, y = make_classification(n_samples=50, n_features=5,
403442
random_state=0)

0 commit comments

Comments
 (0)
0