diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 08fbfedc79c92..d72e9ad8a40b1 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -97,6 +97,10 @@ Bug fixes attribute in `transform()`. :issue:`7553` by :user:`Ekaterina Krivich `. + - :class:`sklearn.linear_model.LogisticRegressionCV` now correctly handles + string labels. :issue:`5874` by `Raghav RV`_. + + .. _changes_0_18_1: Version 0.18.1 diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 98a7b5a558bc2..e792371383228 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -1,4 +1,3 @@ - """ Logistic Regression """ @@ -28,7 +27,6 @@ from ..utils.extmath import row_norms from ..utils.optimize import newton_cg from ..utils.validation import check_X_y -from ..exceptions import DataConversionWarning from ..exceptions import NotFittedError from ..utils.fixes import expit from ..utils.multiclass import check_classification_targets @@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, y_test = np.ones(y_test.shape, dtype=np.float64) y_test[~mask] = -1. - # To deal with object dtypes, we need to convert into an array of floats. - y_test = check_array(y_test, dtype=np.float64, ensure_2d=False) - scores = list() if isinstance(scoring, six.string_types): @@ -1561,64 +1556,64 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, order="C") + check_classification_targets(y) + + class_weight = self.class_weight + if class_weight and not(isinstance(class_weight, dict) or + class_weight in ['balanced', 'auto']): + # 'auto' is deprecated and will be removed in 0.19 + raise ValueError("class_weight provided should be a " + "dict or 'balanced'") + + # Encode for string labels + label_encoder = LabelEncoder().fit(y) + y = label_encoder.transform(y) + if isinstance(class_weight, dict): + class_weight = dict((label_encoder.transform([cls])[0], v) + for cls, v in class_weight.items()) + + # The original class labels + classes = self.classes_ = label_encoder.classes_ + encoded_labels = label_encoder.transform(label_encoder.classes_) if self.solver == 'sag': max_squared_sum = row_norms(X, squared=True).max() else: max_squared_sum = None - check_classification_targets(y) - - if y.ndim == 2 and y.shape[1] == 1: - warnings.warn( - "A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples, ), for example using ravel().", - DataConversionWarning) - y = np.ravel(y) - - check_consistent_length(X, y) - # init cross-validation generator cv = check_cv(self.cv, y, classifier=True) folds = list(cv.split(X, y)) - self._enc = LabelEncoder() - self._enc.fit(y) - - labels = self.classes_ = np.unique(y) - n_classes = len(labels) + # Use the label encoded classes + n_classes = len(encoded_labels) if n_classes < 2: raise ValueError("This solver needs samples of at least 2 classes" " in the data, but the data contains only one" - " class: %r" % self.classes_[0]) + " class: %r" % classes[0]) + if n_classes == 2: # OvR in case of binary problems is as good as fitting # the higher label n_classes = 1 - labels = labels[1:] + encoded_labels = encoded_labels[1:] + classes = classes[1:] # We need this hack to iterate only once over labels, in the case of # multi_class = multinomial, without changing the value of the labels. - iter_labels = labels if self.multi_class == 'multinomial': - iter_labels = [None] - - if self.class_weight and not(isinstance(self.class_weight, dict) or - self.class_weight in - ['balanced', 'auto']): - # 'auto' is deprecated and will be removed in 0.19 - raise ValueError("class_weight provided should be a " - "dict or 'balanced'") + iter_encoded_labels = iter_classes = [None] + else: + iter_encoded_labels = encoded_labels + iter_classes = classes # compute the class weights for the entire dataset y - if self.class_weight in ("auto", "balanced"): - classes = np.unique(y) - class_weight = compute_class_weight(self.class_weight, classes, y) - class_weight = dict(zip(classes, class_weight)) - else: - class_weight = self.class_weight + if class_weight in ("auto", "balanced"): + class_weight = compute_class_weight(class_weight, + np.arange(len(self.classes_)), + y) + class_weight = dict(enumerate(class_weight)) path_func = delayed(_log_reg_scoring_path) @@ -1638,7 +1633,7 @@ def fit(self, X, y, sample_weight=None): max_squared_sum=max_squared_sum, sample_weight=sample_weight ) - for label in iter_labels + for label in iter_encoded_labels for train, test in folds) if self.multi_class == 'multinomial': @@ -1669,9 +1664,9 @@ def fit(self, X, y, sample_weight=None): self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds), len(self.Cs_))) - self.coefs_paths_ = dict(zip(labels, coefs_paths)) + self.coefs_paths_ = dict(zip(classes, coefs_paths)) scores = np.reshape(scores, (n_classes, len(folds), -1)) - self.scores_ = dict(zip(labels, scores)) + self.scores_ = dict(zip(classes, scores)) self.C_ = list() self.coef_ = np.empty((n_classes, X.shape[1])) @@ -1682,10 +1677,14 @@ def fit(self, X, y, sample_weight=None): scores = multi_scores coefs_paths = multi_coefs_paths - for index, label in enumerate(iter_labels): + for index, (cls, encoded_label) in enumerate( + zip(iter_classes, iter_encoded_labels)): + if self.multi_class == 'ovr': - scores = self.scores_[label] - coefs_paths = self.coefs_paths_[label] + # The scores_ / coefs_paths_ dict have unencoded class + # labels as their keys + scores = self.scores_[cls] + coefs_paths = self.coefs_paths_[cls] if self.refit: best_index = scores.sum(axis=0).argmax() @@ -1698,8 +1697,10 @@ def fit(self, X, y, sample_weight=None): else: coef_init = np.mean(coefs_paths[:, best_index, :], axis=0) + # Note that y is label encoded and hence pos_class must be + # the encoded label / None (for 'multinomial') w, _, _ = logistic_regression_path( - X, y, pos_class=label, Cs=[C_], solver=self.solver, + X, y, pos_class=encoded_label, Cs=[C_], solver=self.solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, penalty=self.penalty, copy=False, diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 8d35bb220c958..a5e9e212c7cf7 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -27,6 +27,7 @@ from sklearn.model_selection import StratifiedKFold from sklearn.datasets import load_iris, make_classification from sklearn.metrics import log_loss +from sklearn.preprocessing import LabelEncoder X = [[-1, 0], [0, 1], [1, 1]] X_sp = sp.csr_matrix(X) @@ -398,6 +399,44 @@ def test_logistic_cv(): assert_array_equal(scores.shape, (1, 3, 1)) +def test_multinomial_logistic_regression_string_inputs(): + # Test with string labels for LogisticRegression(CV) + n_samples, n_features, n_classes = 50, 5, 3 + X_ref, y = make_classification(n_samples=n_samples, n_features=n_features, + n_classes=n_classes, n_informative=3) + y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y) + # For numerical labels, let y values be taken from set (-1, 0, 1) + y = np.array(y) - 1 + # Test for string labels + lr = LogisticRegression(solver='lbfgs', multi_class='multinomial') + lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial') + lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial') + lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial') + + lr.fit(X_ref, y) + lr_cv.fit(X_ref, y) + lr_str.fit(X_ref, y_str) + lr_cv_str.fit(X_ref, y_str) + + assert_array_almost_equal(lr.coef_, lr_str.coef_) + assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo']) + assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_) + assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo']) + assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo']) + + # The predictions should be in original labels + assert_equal(sorted(np.unique(lr_str.predict(X_ref))), + ['bar', 'baz', 'foo']) + assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), + ['bar', 'baz', 'foo']) + + # Make sure class weights can be given with string labels + lr_cv_str = LogisticRegression( + solver='lbfgs', class_weight={'bar': 1, 'baz': 2, 'foo': 0}, + multi_class='multinomial').fit(X_ref, y_str) + assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz']) + + def test_logistic_cv_sparse(): X, y = make_classification(n_samples=50, n_features=5, random_state=0)