[MRG + 2] FIX LogisticRegressionCV to correctly handle string labels #5874
Changes from all commits: 7e13f9a, c9777e2, 3306f76, 9bd4c76, 599aa10, 41e4e19, f8e3e96, 026e97f, 427eb41, 7f3795e, cc70ff9, 4961d97
All hunks below are from `sklearn/linear_model/logistic.py`.

```diff
@@ -1,4 +1,3 @@
 """
 Logistic Regression
 """
```
```diff
@@ -28,7 +27,6 @@
 from ..utils.extmath import row_norms
 from ..utils.optimize import newton_cg
 from ..utils.validation import check_X_y
-from ..exceptions import DataConversionWarning
 from ..exceptions import NotFittedError
 from ..utils.fixes import expit
 from ..utils.multiclass import check_classification_targets
```
```diff
@@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
         y_test = np.ones(y_test.shape, dtype=np.float64)
         y_test[~mask] = -1.
 
-    # To deal with object dtypes, we need to convert into an array of floats.
-    y_test = check_array(y_test, dtype=np.float64, ensure_2d=False)
-
     scores = list()
 
     if isinstance(scoring, six.string_types):
```
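The removed conversion makes sense in context: once `y` is label encoded before it reaches `_log_reg_scoring_path`, the `pos_class` masking just above already yields a clean float array, with no object-dtype detour. A minimal standalone sketch of that masking (values made up for illustration):

```python
import numpy as np

# Hypothetical encoded targets: after LabelEncoder, y_test holds small
# integers, never an object-dtype array of strings.
y_test = np.array([0, 2, 1, 2])
pos_class = 2                      # an encoded label

# Same masking as in _log_reg_scoring_path: +1 for pos_class, -1 otherwise
mask = (y_test == pos_class)
y_test = np.ones(y_test.shape, dtype=np.float64)
y_test[~mask] = -1.
print(y_test)                      # [-1.  1. -1.  1.]
```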
```diff
@@ -1561,64 +1556,64 @@ def fit(self, X, y, sample_weight=None):
 
         X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
                          order="C")
+        check_classification_targets(y)
+
+        class_weight = self.class_weight
+        if class_weight and not(isinstance(class_weight, dict) or
+                                class_weight in ['balanced', 'auto']):
+            # 'auto' is deprecated and will be removed in 0.19
+            raise ValueError("class_weight provided should be a "
+                             "dict or 'balanced'")
+
+        # Encode for string labels
+        label_encoder = LabelEncoder().fit(y)
+        y = label_encoder.transform(y)
+        if isinstance(class_weight, dict):
+            class_weight = dict((label_encoder.transform([cls])[0], v)
+                                for cls, v in class_weight.items())
+
+        # The original class labels
+        classes = self.classes_ = label_encoder.classes_
+        encoded_labels = label_encoder.transform(label_encoder.classes_)
 
         if self.solver == 'sag':
             max_squared_sum = row_norms(X, squared=True).max()
         else:
             max_squared_sum = None
 
-        check_classification_targets(y)
-
-        if y.ndim == 2 and y.shape[1] == 1:
-            warnings.warn(
-                "A column-vector y was passed when a 1d array was"
-                " expected. Please change the shape of y to "
-                "(n_samples, ), for example using ravel().",
-                DataConversionWarning)
-            y = np.ravel(y)
-
-        check_consistent_length(X, y)
-
         # init cross-validation generator
         cv = check_cv(self.cv, y, classifier=True)
         folds = list(cv.split(X, y))
```
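The encoding block added above is the heart of the fix. A small standalone sketch of what it does, with hypothetical labels and weights:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Hypothetical string targets and a user-supplied class_weight dict
y = np.array(['spam', 'ham', 'spam', 'eggs'])
class_weight = {'ham': 1.0, 'spam': 2.0, 'eggs': 0.5}

label_encoder = LabelEncoder().fit(y)
print(label_encoder.classes_)      # ['eggs' 'ham' 'spam'] (sorted)
print(label_encoder.transform(y))  # [2 1 2 0]

# Re-key class_weight onto the encoded labels, as the patch does
class_weight = dict((label_encoder.transform([cls])[0], v)
                    for cls, v in class_weight.items())
print(class_weight)                # weights now keyed by 0, 1, 2
```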
> **Review comment:** Surely `self._enc` must be used somewhere?
>
> **Reply:** It was added here (67585f6), but not used anywhere... I am unable to understand why though...
```diff
-        self._enc = LabelEncoder()
-        self._enc.fit(y)
-
-        labels = self.classes_ = np.unique(y)
-        n_classes = len(labels)
+        # Use the label encoded classes
+        n_classes = len(encoded_labels)
 
         if n_classes < 2:
             raise ValueError("This solver needs samples of at least 2 classes"
                              " in the data, but the data contains only one"
-                             " class: %r" % self.classes_[0])
+                             " class: %r" % classes[0])
 
         if n_classes == 2:
             # OvR in case of binary problems is as good as fitting
             # the higher label
             n_classes = 1
-            labels = labels[1:]
+            encoded_labels = encoded_labels[1:]
+            classes = classes[1:]
 
         # We need this hack to iterate only once over labels, in the case of
         # multi_class = multinomial, without changing the value of the labels.
-        iter_labels = labels
         if self.multi_class == 'multinomial':
-            iter_labels = [None]
-
-        if self.class_weight and not(isinstance(self.class_weight, dict) or
-                                     self.class_weight in
-                                     ['balanced', 'auto']):
-            # 'auto' is deprecated and will be removed in 0.19
-            raise ValueError("class_weight provided should be a "
-                             "dict or 'balanced'")
+            iter_encoded_labels = iter_classes = [None]
+        else:
+            iter_encoded_labels = encoded_labels
+            iter_classes = classes
```

> **Review comment:** I think clarity could be improved by moving this ...
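The binary shortcut in the hunk above relies on OvR symmetry: the decision function for one class is the negation of the other's, so fitting the higher label alone suffices. A toy sketch with assumed labels:

```python
import numpy as np

# Hypothetical binary problem after encoding
classes = np.array(['ham', 'spam'])       # original labels
encoded_labels = np.array([0, 1])         # their codes

n_classes = len(encoded_labels)
if n_classes == 2:
    # One OvR fit fully determines the other class, so keep only the
    # higher label, as the patch does.
    n_classes = 1
    encoded_labels = encoded_labels[1:]   # array([1])
    classes = classes[1:]                 # array(['spam'])
```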
```diff
         # compute the class weights for the entire dataset y
-        if self.class_weight in ("auto", "balanced"):
-            classes = np.unique(y)
-            class_weight = compute_class_weight(self.class_weight, classes, y)
-            class_weight = dict(zip(classes, class_weight))
-        else:
-            class_weight = self.class_weight
+        if class_weight in ("auto", "balanced"):
+            class_weight = compute_class_weight(class_weight,
+                                                np.arange(len(self.classes_)),
+                                                y)
+            class_weight = dict(enumerate(class_weight))
 
         path_func = delayed(_log_reg_scoring_path)
```

> **Review comment:** @jnothman does this seem okay? I could use ...
>
> **Reply:** Sure this is fine. I don't get why you'd choose ...
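The 'balanced' branch now computes weights on the encoded `y`, so `np.arange(len(self.classes_))` enumerates exactly the encoded labels and `dict(enumerate(...))` keys the result by them. A standalone sketch (keyword arguments used here for compatibility with newer scikit-learn; the positional call in the diff matches the signature of that era):

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0, 0, 0, 1])   # already label encoded: 3 of class 0, 1 of class 1
weights = compute_class_weight('balanced', classes=np.arange(2), y=y)

# 'balanced' gives n_samples / (n_classes * bincount): here [2/3, 2.0]
class_weight = dict(enumerate(weights))
print(class_weight)          # {0: 0.666..., 1: 2.0}
```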
```diff
@@ -1638,7 +1633,7 @@ def fit(self, X, y, sample_weight=None):
                 max_squared_sum=max_squared_sum,
                 sample_weight=sample_weight
             )
-            for label in iter_labels
+            for label in iter_encoded_labels
             for train, test in folds)
 
         if self.multi_class == 'multinomial':
```
```diff
@@ -1669,9 +1664,9 @@ def fit(self, X, y, sample_weight=None):
             self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds),
                                                 len(self.Cs_)))
 
-        self.coefs_paths_ = dict(zip(labels, coefs_paths))
+        self.coefs_paths_ = dict(zip(classes, coefs_paths))
         scores = np.reshape(scores, (n_classes, len(folds), -1))
-        self.scores_ = dict(zip(labels, scores))
+        self.scores_ = dict(zip(classes, scores))
 
         self.C_ = list()
         self.coef_ = np.empty((n_classes, X.shape[1]))
```
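Keying `coefs_paths_` and `scores_` by the original classes is the user-visible payoff. A hedged end-to-end sketch of the intended behaviour with a scikit-learn that includes this fix (synthetic data; exact scores will vary):

```python
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

rng = np.random.RandomState(0)
X = rng.randn(60, 3)
y = np.where(X[:, 0] > 0, 'spam', 'ham')   # string labels

clf = LogisticRegressionCV(cv=3).fit(X, y)
print(clf.classes_)        # ['ham' 'spam']
print(list(clf.scores_))   # ['spam'] -- binary: keyed by the higher class
print(clf.predict(X[:2]))  # predictions come back as strings
```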
```diff
@@ -1682,10 +1677,14 @@ def fit(self, X, y, sample_weight=None):
             scores = multi_scores
             coefs_paths = multi_coefs_paths
 
-        for index, label in enumerate(iter_labels):
+        for index, (cls, encoded_label) in enumerate(
+                zip(iter_classes, iter_encoded_labels)):
+
             if self.multi_class == 'ovr':
-                scores = self.scores_[label]
-                coefs_paths = self.coefs_paths_[label]
+                # The scores_ / coefs_paths_ dict have unencoded class
+                # labels as their keys
+                scores = self.scores_[cls]
+                coefs_paths = self.coefs_paths_[cls]
 
             if self.refit:
                 best_index = scores.sum(axis=0).argmax()
```
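This lockstep iteration is the central bookkeeping of the refactor: the original label keys the per-class dicts while the encoded label is what the solver receives. A toy illustration with hypothetical labels:

```python
# Toy version of the paired loop: original labels key the dicts,
# encoded labels go to the solver.
iter_classes = ['eggs', 'ham', 'spam']   # hypothetical original labels
iter_encoded_labels = [0, 1, 2]          # their LabelEncoder codes

for index, (cls, encoded_label) in enumerate(
        zip(iter_classes, iter_encoded_labels)):
    print(index, cls, encoded_label)     # e.g. 0 eggs 0
```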
```diff
@@ -1698,8 +1697,10 @@ def fit(self, X, y, sample_weight=None):
             else:
                 coef_init = np.mean(coefs_paths[:, best_index, :], axis=0)
 
+                # Note that y is label encoded and hence pos_class must be
+                # the encoded label / None (for 'multinomial')
                 w, _, _ = logistic_regression_path(
-                    X, y, pos_class=label, Cs=[C_], solver=self.solver,
+                    X, y, pos_class=encoded_label, Cs=[C_], solver=self.solver,
                     fit_intercept=self.fit_intercept, coef=coef_init,
                     max_iter=self.max_iter, tol=self.tol,
                     penalty=self.penalty, copy=False,
```
> **Review comment:** I also don't understand the significance of `enc_labels`. That's `range(len(classes))`, right? Why don't you call `cls_labels` `classes`?
>
> **Reply:** `enc_labels` (now `encoded_labels`) refers to the encoded labels that are being considered (by "being considered" I mean the ones that are finally used to train the model; for binary problems the higher label alone is retained, hence it differs from `range(len(classes))`). Ref this line.
>
> **Reply:** `classes_labels` was done to match the naming of `encoded_labels`. I'll refactor it back to `classes` if it seems clearer...