ENH Encode y and class_weight dict · scikit-learn/scikit-learn@934493c · GitHub
[go: up one dir, main page]

Skip to content

Commit 934493c

Browse files
committed
ENH Encode y and class_weight dict
1 parent 58e5cf2 commit 934493c

File tree

1 file changed

+41
-37
lines changed

1 file changed

+41
-37
lines changed

sklearn/linear_model/logistic.py

Lines changed: 41 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
"""
32
Logistic Regression
43
"""
@@ -28,7 +27,6 @@
2827
from ..utils.extmath import row_norms
2928
from ..utils.optimize import newton_cg
3029
from ..utils.validation import check_X_y
31-
from ..exceptions import DataConversionWarning
3230
from ..exceptions import NotFittedError
3331
from ..utils.fixes import expit
3432
from ..utils.multiclass import check_classification_targets
@@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
925923
y_test = np.ones(y_test.shape, dtype=np.float64)
926924
y_test[~mask] = -1.
927925

928-
# To deal with object dtypes, we need to convert into an array of floats.
929-
y_test = check_array(y_test, dtype=np.float64, ensure_2d=False)
930-
931926
scores = list()
932927

933928
if isinstance(scoring, six.string_types):
@@ -1561,64 +1556,67 @@ def fit(self, X, y, sample_weight=None):
15611556

15621557
X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
15631558
order="C")
1564-
1559+
check_classification_targets(y)
15651560
if self.solver == 'sag':
15661561
max_squared_sum = row_norms(X, squared=True).max()
15671562
else:
15681563
max_squared_sum = None
15691564

1570-
check_classification_targets(y)
1565+
# Encode for string labels
1566+
label_encoder = LabelEncoder().fit(y)
1567+
y = label_encoder.transform(y)
1568+
self.classes_ = label_encoder.classes_
15711569

1572-
if y.ndim == 2 and y.shape[1] == 1:
1573-
warnings.warn(
1574-
"A column-vector y was passed when a 1d array was"
1575-
" expected. Please change the shape of y to "
1576-
"(n_samples, ), for example using ravel().",
1577-
DataConversionWarning)
1578-
y = np.ravel(y)
1570+
enc_labels = label_encoder.transform(label_encoder.classes_)
1571+
cls_labels = self.classes_ # The original class labels
15791572

1580-
check_consistent_length(X, y)
1573+
class_weight = self.class_weight
1574+
if isinstance(class_weight, dict):
1575+
old_keys = list(class_weight.keys())
1576+
new_keys = label_encoder.transform(old_keys)
1577+
# Don't modify the original class_weight dict.
1578+
class_weight = dict()
1579+
for new_key, old_key in zip(new_keys, old_keys):
1580+
class_weight[new_key] = self.class_weight[old_key]
15811581

15821582
# init cross-validation generator
15831583
cv = check_cv(self.cv, y, classifier=True)
15841584
folds = list(cv.split(X, y))
15851585

1586-
self._enc = LabelEncoder()
1587-
self._enc.fit(y)
1588-
1589-
labels = self.classes_ = np.unique(y)
1590-
n_classes = len(labels)
1586+
# Use the label encoded classes
1587+
n_classes = len(enc_labels)
15911588

15921589
if n_classes < 2:
15931590
raise ValueError("This solver needs samples of at least 2 classes"
15941591
" in the data, but the data contains only one"
15951592
" class: %r" % self.classes_[0])
1593+
15961594
if n_classes == 2:
15971595
# OvR in case of binary problems is as good as fitting
15981596
# the higher label
15991597
n_classes = 1
1600-
labels = labels[1:]
1598+
enc_labels = enc_labels[1:]
1599+
cls_labels = cls_labels[1:]
16011600

16021601
# We need this hack to iterate only once over labels, in the case of
16031602
# multi_class = multinomial, without changing the value of the labels.
1604-
iter_labels = labels
16051603
if self.multi_class == 'multinomial':
1606-
iter_labels = [None]
1604+
iter_labels = iter_classes = [None]
1605+
else:
1606+
iter_labels = enc_labels
1607+
iter_classes = cls_labels
16071608

1608-
if self.class_weight and not(isinstance(self.class_weight, dict) or
1609-
self.class_weight in
1610-
['balanced', 'auto']):
1609+
if class_weight and not(isinstance(class_weight, dict) or
1610+
class_weight in ['balanced', 'auto']):
16111611
# 'auto' is deprecated and will be removed in 0.19
16121612
raise ValueError("class_weight provided should be a "
16131613
"dict or 'balanced'")
16141614

16151615
# compute the class weights for the entire dataset y
1616-
if self.class_weight in ("auto", "balanced"):
1617-
classes = np.unique(y)
1618-
class_weight = compute_class_weight(self.class_weight, classes, y)
1616+
if class_weight in ("auto", "balanced"):
1617+
classes = np.arange(len(self.classes_))
1618+
class_weight = compute_class_weight(class_weight, classes, y)
16191619
class_weight = dict(zip(classes, class_weight))
1620-
else:
1621-
class_weight = self.class_weight
16221620

16231621
path_func = delayed(_log_reg_scoring_path)
16241622

@@ -1669,9 +1667,9 @@ def fit(self, X, y, sample_weight=None):
16691667
self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds),
16701668
len(self.Cs_)))
16711669

1672-
self.coefs_paths_ = dict(zip(labels, coefs_paths))
1670+
self.coefs_paths_ = dict(zip(cls_labels, coefs_paths))
16731671
scores = np.reshape(scores, (n_classes, len(folds), -1))
1674-
self.scores_ = dict(zip(labels, scores))
1672+
self.scores_ = dict(zip(cls_labels, scores))
16751673

16761674
self.C_ = list()
16771675
self.coef_ = np.empty((n_classes, X.shape[1]))
@@ -1682,10 +1680,14 @@ def fit(self, X, y, sample_weight=None):
16821680
scores = multi_scores
16831681
coefs_paths = multi_coefs_paths
16841682

1685-
for index, label in enumerate(iter_labels):
1683+
for index, (cls_lbl, enc_lbl) in enumerate(
1684+
zip(iter_classes, iter_labels)):
1685+
16861686
if self.multi_class == 'ovr':
1687-
scores = self.scores_[label]
1688-
coefs_paths = self.coefs_paths_[label]
1687+
# The scores_ / coefs_paths_ dict have unencoded class
1688+
# labels as their keys
1689+
scores = self.scores_[cls_lbl]
1690+
coefs_paths = self.coefs_paths_[cls_lbl]
16891691

16901692
if self.refit:
16911693
best_index = scores.sum(axis=0).argmax()
@@ -1698,8 +1700,10 @@ def fit(self, X, y, sample_weight=None):
16981700
else:
16991701
coef_init = np.mean(coefs_paths[:, best_index, :], axis=0)
17001702

1703+
# Note that y is label encoded and hence pos_class must be
1704+
# the encoded label / None (for 'multinomial')
17011705
w, _, _ = logistic_regression_path(
1702-
X, y, pos_class=label, Cs=[C_], solver=self.solver,
1706+
X, y, pos_class=enc_lbl, Cs=[C_], solver=self.solver,
17031707
fit_intercept=self.fit_intercept, coef=coef_init,
17041708
max_iter=self.max_iter, tol=self.tol,
17051709
penalty=self.penalty, copy=False,

0 commit comments

Comments (0)