From 59f9e40fb8444befa5e0e228da326241787a5cc4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 22:04:21 -0700 Subject: [PATCH 01/41] remove mean for logisticregression lbfgs --- sklearn/linear_model/_logistic.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7921150e0fa01..9f0738a76020b 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -649,7 +649,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, - l1_ratio=None): + l1_ratio=None, precondition=False): """Compute a Logistic Regression model for a list of regularization parameters. @@ -908,6 +908,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, hess = _multinomial_grad_hess warm_start_sag = {'coef': w0.T} else: + # binary logistic regression target = y_bin if solver == 'lbfgs': func = _logistic_loss_and_grad @@ -919,17 +920,24 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) + X_pre = X + if precondition: + X_mean = X.mean(axis=0) + X_pre = X - X_mean for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose)] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, - args=(X, target, 1. / C, sample_weight), + args=(X_pre, target, 1. / C, sample_weight), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) w0, loss = opt_res.x, opt_res.fun + if precondition: + # adjust intercept for mean subtraction + w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) elif solver == 'newton-cg': args = (X, target, 1. 
/ C, sample_weight) w0, n_iter_i = _newton_cg(hess, func, grad, w0, args=args, @@ -1428,7 +1436,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None): + l1_ratio=None, precondition=False): self.penalty = penalty self.dual = dual @@ -1445,6 +1453,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, self.warm_start = warm_start self.n_jobs = n_jobs self.l1_ratio = l1_ratio + self.precondition = precondition def fit(self, X, y, sample_weight=None): """ @@ -1587,7 +1596,8 @@ def fit(self, X, y, sample_weight=None): class_weight=self.class_weight, check_input=False, random_state=self.random_state, coef=warm_start_coef_, penalty=penalty, max_squared_sum=max_squared_sum, - sample_weight=sample_weight) + sample_weight=sample_weight, + precondition=self.precondition) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) fold_coefs_, _, n_iter_ = zip(*fold_coefs_) From 697112ae7459e4c1fd747ee8cea1ae5e686d250c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 22:13:29 -0700 Subject: [PATCH 02/41] add test that preconditioning works for offsets in X --- sklearn/linear_model/tests/test_logistic.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 894040c2053bd..6d97f791c0f7e 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1824,3 +1824,22 @@ def test_scores_attribute_layout_elasticnet(): avg_score_lr = cross_val_score(lr, X, y, cv=cv).mean() assert avg_scores_lrcv[i, j] == pytest.approx(avg_score_lr) + + +def test_illconditioned_lbfgs(): + # check that lbfgs converges even with ill-conditioned X + X, y = make_classification(n_samples=100, n_features=60, random_state=0) + X[:, 1] += 10000 + lr_pre = LogisticRegression(random_state=0, precondition=True) + with pytest.warns(None) as record: + lr_pre.fit(X, y) + assert len(record) == 0 + loss_pre = _logistic_loss(np.hstack([lr_pre.coef_.ravel(), lr_pre.intercept_]), + X, 2 * y - 1, 1) + + lr = LogisticRegression(random_state=0, precondition=False) + with pytest.warns(ConvergenceWarning): + lr.fit(X, y) + loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr_pre.intercept_]), + X, 2 * y - 1, 1) + assert loss_pre < loss From 5a1431cfe82ec73b8efa65c7200876715c3526d8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 22:19:59 -0700 Subject: [PATCH 03/41] add precondition option temporarily to log_reg_scoring_path --- sklearn/linear_model/_logistic.py | 5 +++-- sklearn/utils/sparsefuncs.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9f0738a76020b..3bc785b60ef49 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1001,7 +1001,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, dual=False, intercept_scaling=1., multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, - l1_ratio=None): + l1_ratio=None, precondition=None): """Computes scores across logistic_regression_path Parameters @@ -1147,7 +1147,8 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, multi_class=multi_class, tol=tol, verbose=verbose, dual=dual, penalty=penalty, 
intercept_scaling=intercept_scaling, random_state=random_state, check_input=False, - max_squared_sum=max_squared_sum, sample_weight=sample_weight) + max_squared_sum=max_squared_sum, sample_weight=sample_weight, + precondition=precondition) log_reg = LogisticRegression(solver=solver, multi_class=multi_class) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 383de6d9f23c8..0698e5d963df8 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -62,7 +62,7 @@ def inplace_csr_row_scale(X, scale): def mean_variance_axis(X, axis): - """Compute mean and variance along an axix on a CSR or CSC matrix + """Compute mean and variance along an axis on a CSR or CSC matrix Parameters ---------- From bf2e452f5671d4e88c86a5679487316414bf680a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:10:36 -0700 Subject: [PATCH 04/41] fix gradients, add test --- sklearn/linear_model/_logistic.py | 39 ++++++++++++++++----- sklearn/linear_model/tests/test_logistic.py | 26 +++++++++++--- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3bc785b60ef49..11f8faee6681c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -20,7 +20,7 @@ from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver -from ..preprocessing import LabelEncoder, LabelBinarizer +from ..preprocessing import LabelEncoder, LabelBinarizer, normalize from ..svm._base import _fit_liblinear from ..utils import check_array, check_consistent_length, compute_class_weight from ..utils import check_random_state @@ -77,7 +77,7 @@ def _intercept_dot(w, X, y): return w, c, yz -def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): +def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): """Computes the logistic loss and gradient. Parameters @@ -115,12 +115,20 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): sample_weight = np.ones(n_samples) # Logistic loss is the negative of the log of the logistic function. - out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w) + v = w + grad_scale = 1 + if X_scale is not None: + v = w / X_scale + + out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(v, v) z = expit(yz) z0 = sample_weight * (z - 1) * y + if X_scale is not None: + grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * (w / X_scale ** 2) + else: + grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w - grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w # Case where we fit the intercept. if grad.shape[0] > n_features: @@ -128,7 +136,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): return out, grad -def _logistic_loss(w, X, y, alpha, sample_weight=None): +def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None): """Computes the logistic loss. Parameters @@ -149,6 +157,9 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None): Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. + X_scale : array-like, shape (n_features,) optional + When using preconditioning, rescaling of features. + Returns ------- out : float @@ -160,7 +171,10 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None): sample_weight = np.ones(y.shape[0]) # Logistic loss is the negative of the log of the logistic function. 
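For context, a minimal NumPy sketch of the reparametrization behind the `v = w / X_scale` penalty term above (not part of the patch; variable names are illustrative): dividing the columns of X by a per-feature scale `s` multiplies the matching coefficients by `s`, so the original ridge penalty must be evaluated on the rescaled-back coefficients `w_pre / s`.

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(20, 3)) * np.array([1.0, 100.0, 0.01])
    s = X.std(axis=0)             # per-feature scale used for preconditioning
    X_pre = X / s                 # rescaled design matrix seen by the solver
    w_pre = rng.normal(size=3)    # coefficients in the preconditioned space
    w = w_pre / s                 # equivalent coefficients for the original X

    # identical linear predictions, hence the penalty alpha * ||w||^2 on the
    # original problem equals alpha * ||w_pre / s||^2 in the scaled problem
    assert np.allclose(X @ w, X_pre @ w_pre)
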
- out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w) + v = w + if X_scale is not None: + v = w / X_scale + out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(v, v) return out @@ -241,7 +255,7 @@ def Hs(s): return grad, Hs -def _multinomial_loss(w, X, Y, alpha, sample_weight): +def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): """Computes multinomial loss and class probabilities. Parameters @@ -278,6 +292,8 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. (Chapter 4.3.4) """ + if X_scale is not None: + raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = w.size == (n_classes * (n_features + 1)) @@ -297,7 +313,7 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): return loss, p, w -def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): +def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): """Computes the multinomial loss, gradient and class probabilities. Parameters @@ -335,6 +351,8 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. (Chapter 4.3.4) """ + if X_scale is not None: + raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = (w.size == n_classes * (n_features + 1)) @@ -921,21 +939,24 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) X_pre = X + X_scale = None if precondition: X_mean = X.mean(axis=0) X_pre = X - X_mean + X_pre, X_scale = normalize(X_pre, axis=0, copy=False, return_norm=True) for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose)] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, - args=(X_pre, target, 1. / C, sample_weight), + args=(X_pre, target, 1. 
/ C, sample_weight, X_scale), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) w0, loss = opt_res.x, opt_res.fun if precondition: + w0[:-1] = w0[:-1] / X_scale # adjust intercept for mean subtraction w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) elif solver == 'newton-cg': diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 6d97f791c0f7e..76e4408c96a7e 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1830,16 +1830,34 @@ def test_illconditioned_lbfgs(): # check that lbfgs converges even with ill-conditioned X X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 + X[:, 0] *= 10000 lr_pre = LogisticRegression(random_state=0, precondition=True) with pytest.warns(None) as record: lr_pre.fit(X, y) assert len(record) == 0 - loss_pre = _logistic_loss(np.hstack([lr_pre.coef_.ravel(), lr_pre.intercept_]), - X, 2 * y - 1, 1) + loss_pre = _logistic_loss( + np.hstack([lr_pre.coef_.ravel(), lr_pre.intercept_]), + X, 2 * y - 1, 1) lr = LogisticRegression(random_state=0, precondition=False) with pytest.warns(ConvergenceWarning): lr.fit(X, y) - loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr_pre.intercept_]), - X, 2 * y - 1, 1) + loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr.intercept_]), + X, 2 * y - 1, 1) assert loss_pre < loss + + +def test_logistic_loss_preconditioning(): + # check _logistic_loss and _logistic_loss_grad with preconditioning + X, y = make_classification(n_samples=100, n_features=60, random_state=0) + X[:, 1] += 10000 + lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) + lr.fit(X, y) + loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr.intercept_]), + X, 2 * y - 1, 1) + X_std = X.std(axis=0) + X_pre = X / X_std + loss_pre = _logistic_loss( + np.hstack([lr.coef_.ravel() * X_std, lr.intercept_]), + X_pre, 2 * y - 1, 1, X_scale=X_std) + assert_almost_equal(loss, loss_pre) \ No newline at end of file From 86f7520c0e31f3e032348eb04ac9b5fdc997cd5a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:18:43 -0700 Subject: [PATCH 05/41] remove unused grad_scale --- sklearn/linear_model/_logistic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 11f8faee6681c..704dccc3496d7 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -116,7 +116,6 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): # Logistic loss is the negative of the log of the logistic function. 
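The intercept correction `w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean)` used above rests on a simple identity: for centered features, w.(x - mu) + b_centered equals w.x + (b_centered - w.mu). A small self-contained check (not part of the patch; names are illustrative):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(10, 4)) + 50.0   # features with a large common offset
    mu = X.mean(axis=0)
    w = rng.normal(size=4)
    b_centered = 0.3                      # intercept learned on the centered X - mu

    b = b_centered - np.inner(w, mu)      # corrected intercept for the raw X
    assert np.allclose((X - mu) @ w + b_centered, X @ w + b)
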
v = w - grad_scale = 1 if X_scale is not None: v = w / X_scale From 01c2c98eb3e1a79553638c01d64a0670bdcf0fc2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:19:45 -0700 Subject: [PATCH 06/41] pep8 --- sklearn/linear_model/_logistic.py | 2 +- sklearn/linear_model/tests/test_logistic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 704dccc3496d7..7bc0b452ee396 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -124,7 +124,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): z = expit(yz) z0 = sample_weight * (z - 1) * y if X_scale is not None: - grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * (w / X_scale ** 2) + grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * (w / X_scale**2) else: grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 76e4408c96a7e..6ce0fc8f6714a 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1860,4 +1860,4 @@ def test_logistic_loss_preconditioning(): loss_pre = _logistic_loss( np.hstack([lr.coef_.ravel() * X_std, lr.intercept_]), X_pre, 2 * y - 1, 1, X_scale=X_std) - assert_almost_equal(loss, loss_pre) \ No newline at end of file + assert_almost_equal(loss, loss_pre) From 2bfeba4c3384857411a5ec7372957b4d8d628503 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:49:53 -0700 Subject: [PATCH 07/41] fix intercept for multinomial loss --- sklearn/linear_model/_logistic.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7bc0b452ee396..b937dd011c11d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -291,8 +291,6 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. 
(Chapter 4.3.4) """ - if X_scale is not None: - raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = w.size == (n_classes * (n_features + 1)) @@ -303,7 +301,10 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): w = w[:, :-1] else: intercept = 0 - p = safe_sparse_dot(X, w.T) + v = w + if X_scale is not None: + v = w / X_scale + p = safe_sparse_dot(X, v.T) p += intercept p -= logsumexp(p, axis=1)[:, np.newaxis] loss = -(sample_weight * Y * p).sum() @@ -954,7 +955,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) w0, loss = opt_res.x, opt_res.fun - if precondition: + if precondition and multi_class != 'multinomial': + # adjust weight scale for rescaling w0[:-1] = w0[:-1] / X_scale # adjust intercept for mean subtraction w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) @@ -1002,6 +1004,15 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if multi_class == 'multinomial': n_classes = max(2, classes.size) multi_w0 = np.reshape(w0, (n_classes, -1)) + if solver == 'lbfgs' and precondition: + if fit_intercept: + multi_w0[:, :-1] = multi_w0[:, :-1] / X_scale + # adjust intercept for preconditioning + multi_w0[:, -1] = (multi_w0[:, -1] + - np.dot(multi_w0[:, :-1], X_mean)) + else: + multi_w0 = multi_w0 / X_scale + if n_classes == 2: multi_w0 = multi_w0[1][np.newaxis, :] coefs.append(multi_w0.copy()) From e72d27131d61ee9eaa400f0742ac8efa56c77ba3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:54:05 -0700 Subject: [PATCH 08/41] fix loss for multinomial --- sklearn/linear_model/_logistic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index b937dd011c11d..a96db03d3006c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -301,14 +301,15 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): w = w[:, :-1] else: intercept = 0 - v = w - if X_scale is not None: - v = w / X_scale - p = safe_sparse_dot(X, v.T) + + p = safe_sparse_dot(X, w.T) p += intercept p -= logsumexp(p, axis=1)[:, np.newaxis] loss = -(sample_weight * Y * p).sum() - loss += 0.5 * alpha * squared_norm(w) + v = w + if X_scale is not None: + v = w / X_scale + loss += 0.5 * alpha * squared_norm(v) p = np.exp(p, p) return loss, p, w From 7d71afb2f83edca6965977e6abe3929794daeac1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 00:12:34 -0700 Subject: [PATCH 09/41] add multinomial logistic regression preconditioning with lbfgs --- sklearn/linear_model/_logistic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a96db03d3006c..ba7e7662870fc 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -352,18 +352,19 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. 
(Chapter 4.3.4) """ - if X_scale is not None: - raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = (w.size == n_classes * (n_features + 1)) grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype) - loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight) + loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=X_scale) sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) - grad[:, :n_features] += alpha * w + if X_scale is not None: + grad[:, :n_features] += alpha * (w / X_scale**2) + else: + grad[:, :n_features] += alpha * w if fit_intercept: grad[:, -1] = diff.sum(axis=0) return loss, grad.ravel(), p From f15332118831f7f79b0bd494b2cdba107164eb1c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 00:19:12 -0700 Subject: [PATCH 10/41] pep8 --- sklearn/linear_model/_logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index ba7e7662870fc..e8ec84cc5f22f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -128,7 +128,6 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): else: grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w - # Case where we fit the intercept. if grad.shape[0] > n_features: grad[-1] = z0.sum() @@ -357,7 +356,8 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): fit_intercept = (w.size == n_classes * (n_features + 1)) grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype) - loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=X_scale) + loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, + X_scale=X_scale) sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) From 6da875d20e793edfc98d84b2308a7a4d2d1afbca Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 13:55:32 -0700 Subject: [PATCH 11/41] hack around with sparse stuff, set precondition=True everywhere for consistency --- sklearn/linear_model/_logistic.py | 40 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e8ec84cc5f22f..8deb846ef650a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -20,7 +20,7 @@ from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver -from ..preprocessing import LabelEncoder, LabelBinarizer, normalize +from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear from ..utils import check_array, check_consistent_length, compute_class_weight from ..utils import check_random_state @@ -34,6 +34,7 @@ from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args +from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale from ..model_selection import check_cv from ..metrics import get_scorer @@ -669,7 +670,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=False): + l1_ratio=None, precondition=True): 
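With the default flipped to `precondition=True`, the behaviour added by this branch is exercised as in `test_illconditioned_lbfgs` above. A sketch of the intended usage (note the `precondition` keyword exists only on this branch, not in released scikit-learn):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=100, n_features=60, random_state=0)
    X[:, 1] += 10000   # large offset
    X[:, 0] *= 10000   # large scale
    # On this branch, lbfgs standardizes X internally and maps the solution
    # back, so convergence warnings are not expected for this ill-conditioned X.
    clf = LogisticRegression(precondition=True, random_state=0).fit(X, y)
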
"""Compute a Logistic Regression model for a list of regularization parameters. @@ -943,9 +944,29 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X X_scale = None if precondition: - X_mean = X.mean(axis=0) - X_pre = X - X_mean - X_pre, X_scale = normalize(X_pre, axis=0, copy=False, return_norm=True) + # FIXME this duplicates come code from _preprocess_data + # and should be refactored + if sparse.issparse(X): + X_mean, X_var = mean_variance_axis(X, axis=0) + X_scale = np.sqrt(X_var, X_var) + X_scale[X_scale == 0] = 1 + + del X_var + X_pre = X.toarray() + if fit_intercept: + X_pre = X_pre - X_mean # FIXME + # can we actually do inplace here? + # inplace_column_scale(X_pre, 1 / X_scale) + X_pre = X_pre / X_scale + + else: + X_mean = X.mean(axis=0) + if fit_intercept: + X_pre = X - X_mean + X_scale = X.std(axis=0) + X_scale[X_scale == 0] = 1 + X_pre = X_pre / X_scale + for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ @@ -959,9 +980,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0, loss = opt_res.x, opt_res.fun if precondition and multi_class != 'multinomial': # adjust weight scale for rescaling - w0[:-1] = w0[:-1] / X_scale + w0[:n_features] = w0[:n_features] / X_scale # adjust intercept for mean subtraction - w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) + if fit_intercept: + w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) elif solver == 'newton-cg': args = (X, target, 1. / C, sample_weight) w0, n_iter_i = _newton_cg(hess, func, grad, w0, args=args, @@ -1034,7 +1056,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, dual=False, intercept_scaling=1., multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=None): + l1_ratio=None, precondition=True): """Computes scores across logistic_regression_path Parameters @@ -1470,7 +1492,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None, precondition=False): + l1_ratio=None, precondition=True): self.penalty = penalty self.dual = dual From 3859d576e52cc9afaad0b35c2495078301dc254f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 15:01:39 -0700 Subject: [PATCH 12/41] fixing warmstarting --- sklearn/linear_model/_logistic.py | 62 ++++++++++++++++++------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 8deb846ef650a..db58fd7fb841d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -886,6 +886,34 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0 = np.zeros((classes.size, n_features + int(fit_intercept)), order='F', dtype=X.dtype) + # preconditioning + X_pre = X + X_scale = None + if precondition and solver == 'lbfgs': + # FIXME this duplicates come code from _preprocess_data + # and should be refactored + if sparse.issparse(X): + X_mean, X_var = mean_variance_axis(X, axis=0) + X_scale = np.sqrt(X_var, X_var) + X_scale[X_scale == 0] = 1 + + del X_var + X_pre = X.toarray() + if fit_intercept: + X_pre = X_pre - X_mean # FIXME + # can we actually do inplace here? 
+ # inplace_column_scale(X_pre, 1 / X_scale) + X_pre = X_pre / X_scale + + else: + X_mean = X.mean(axis=0) + if fit_intercept: + X_pre = X - X_mean + X_scale = X.std(axis=0) + X_scale[X_scale == 0] = 1 + X_pre = X_pre / X_scale + + # warm starting if coef is not None: # it must work both giving the bias term and not if multi_class == 'ovr': @@ -894,6 +922,11 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, 'Initialization coef is of shape %d, expected shape ' '%d or %d' % (coef.size, n_features, w0.size)) w0[:coef.size] = coef + if solver == 'lbfgs' and precondition: + if fit_intercept: + w0[-1] += np.inner(w0[:n_features], X_mean) + w0[:n_features] *= X_scale + else: # For binary problems coef.shape[0] should be 1, otherwise it # should be classes.size. @@ -914,6 +947,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0[1, :coef.shape[1]] = coef else: w0[:, :coef.shape[1]] = coef + if solver == 'lbfgs' and precondition: + if fit_intercept: + w0[:, -1] += np.dot(w0[:, :n_features], X_mean) + w0[:, :n_features] *= X_scale if multi_class == 'multinomial': # scipy.optimize.minimize and newton-cg accepts only @@ -941,31 +978,6 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) - X_pre = X - X_scale = None - if precondition: - # FIXME this duplicates come code from _preprocess_data - # and should be refactored - if sparse.issparse(X): - X_mean, X_var = mean_variance_axis(X, axis=0) - X_scale = np.sqrt(X_var, X_var) - X_scale[X_scale == 0] = 1 - - del X_var - X_pre = X.toarray() - if fit_intercept: - X_pre = X_pre - X_mean # FIXME - # can we actually do inplace here? - # inplace_column_scale(X_pre, 1 / X_scale) - X_pre = X_pre / X_scale - - else: - X_mean = X.mean(axis=0) - if fit_intercept: - X_pre = X - X_mean - X_scale = X.std(axis=0) - X_scale[X_scale == 0] = 1 - X_pre = X_pre / X_scale for i, C in enumerate(Cs): if solver == 'lbfgs': From 5dd503d9a47762d45a316274c990aba5dd422627 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 22:07:00 -0700 Subject: [PATCH 13/41] starting on sparse offset support --- sklearn/linear_model/_logistic.py | 33 +++++++++++++++++++------------ 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index db58fd7fb841d..6ed3a46483dfc 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -40,7 +40,7 @@ # .. some helper functions for logistic_regression_path .. -def _intercept_dot(w, X, y): +def _intercept_dot(w, X, y, X_offset=None): """Computes y * np.dot(X, w). It takes into consideration if the intercept should be fit or not. @@ -74,11 +74,14 @@ def _intercept_dot(w, X, y): w = w[:-1] z = safe_sparse_dot(X, w) + c + if X_offset is not None: + z += np.dot(X_offset, w) yz = y * z return w, c, yz -def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): +def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, + X_offset=None): """Computes the logistic loss and gradient. 
Parameters @@ -110,7 +113,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): n_samples, n_features = X.shape grad = np.empty_like(w) - w, c, yz = _intercept_dot(w, X, y) + w, c, yz = _intercept_dot(w, X, y, X_offset) if sample_weight is None: sample_weight = np.ones(n_samples) @@ -135,7 +138,8 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): return out, grad -def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None): +def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None, + X_offset=None): """Computes the logistic loss. Parameters @@ -164,7 +168,7 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None): out : float Logistic loss. """ - w, c, yz = _intercept_dot(w, X, y) + w, c, yz = _intercept_dot(w, X, y, X_offset) if sample_weight is None: sample_weight = np.ones(y.shape[0]) @@ -254,7 +258,8 @@ def Hs(s): return grad, Hs -def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): +def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None, + X_offset=None): """Computes multinomial loss and class probabilities. Parameters @@ -304,6 +309,8 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): p = safe_sparse_dot(X, w.T) p += intercept + if X_offset is not None: + p += np.dot(X_offset, w.T) p -= logsumexp(p, axis=1)[:, np.newaxis] loss = -(sample_weight * Y * p).sum() v = w @@ -314,7 +321,8 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): return loss, p, w -def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): +def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None, + X_offset=None): """Computes the multinomial loss, gradient and class probabilities. Parameters @@ -358,7 +366,7 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype) loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, - X_scale=X_scale) + X_scale=X_scale, X_offset=X_offset) sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) @@ -889,6 +897,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, # preconditioning X_pre = X X_scale = None + X_offset = None if precondition and solver == 'lbfgs': # FIXME this duplicates come code from _preprocess_data # and should be refactored @@ -898,12 +907,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_scale[X_scale == 0] = 1 del X_var - X_pre = X.toarray() if fit_intercept: - X_pre = X_pre - X_mean # FIXME + X_offset = -X_mean # can we actually do inplace here? - # inplace_column_scale(X_pre, 1 / X_scale) - X_pre = X_pre / X_scale + inplace_column_scale(X_pre, 1 / X_scale) else: X_mean = X.mean(axis=0) @@ -985,7 +992,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, np.searchsorted(np.array([0, 1, 2, 3]), verbose)] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, - args=(X_pre, target, 1. / C, sample_weight, X_scale), + args=(X_pre, target, 1. 
/ C, sample_weight, X_scale, X_offset), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) From 090e540bc2f312b5938d2dec7897b2b88b4d01d9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 10:59:13 -0800 Subject: [PATCH 14/41] Update sklearn/linear_model/_logistic.py Co-Authored-By: Adrin Jalali --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 6ed3a46483dfc..8ba0619f3544b 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -899,7 +899,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_scale = None X_offset = None if precondition and solver == 'lbfgs': - # FIXME this duplicates come code from _preprocess_data + # FIXME this duplicates some code from _preprocess_data # and should be refactored if sparse.issparse(X): X_mean, X_var = mean_variance_axis(X, axis=0) From 2e83f7e79417b9d4d12896ab904c25f7c2feafc6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 17:11:55 -0500 Subject: [PATCH 15/41] don't change sparse matrix inplace --- sklearn/linear_model/_logistic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 8ba0619f3544b..cf1839ea82df0 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -909,8 +909,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, del X_var if fit_intercept: X_offset = -X_mean - # can we actually do inplace here? - inplace_column_scale(X_pre, 1 / X_scale) + X_pre = X_pre.multiply(1 / X_scale) else: X_mean = X.mean(axis=0) From 9656c2f28edbe9465e4070d2f8cca0f3cf931418 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 17:26:24 -0500 Subject: [PATCH 16/41] fix sparse offset --- sklearn/linear_model/_logistic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index cf1839ea82df0..f191703be15e2 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -34,7 +34,7 @@ from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args -from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale +from ..utils.sparsefuncs import mean_variance_axis from ..model_selection import check_cv from ..metrics import get_scorer @@ -905,10 +905,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_mean, X_var = mean_variance_axis(X, axis=0) X_scale = np.sqrt(X_var, X_var) X_scale[X_scale == 0] = 1 - del X_var if fit_intercept: - X_offset = -X_mean + X_offset = -X_mean/X_scale X_pre = X_pre.multiply(1 / X_scale) else: From fcfc54ebeb1867d414bbac4a1659bc6d7e0c0016 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 18:41:43 -0500 Subject: [PATCH 17/41] fix gradient error --- sklearn/linear_model/_logistic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f191703be15e2..4b18f4d5b4317 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -132,6 +132,9 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, 
else: grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w + if X_offset is not None: + grad[:n_features] += X_offset * z0.sum() + # Case where we fit the intercept. if grad.shape[0] > n_features: grad[-1] = z0.sum() From 626ad4426a36a40925f616449faafe55963ccce7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 19:06:57 -0500 Subject: [PATCH 18/41] stricter gradient test, add gradient test for multinomial loss --- sklearn/linear_model/tests/test_logistic.py | 38 ++++++++++++++++----- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 6ce0fc8f6714a..26be2789e896a 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -14,7 +14,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score -from sklearn.preprocessing import LabelEncoder, StandardScaler +from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize from sklearn.utils import compute_class_weight, _IS_32BIT from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_allclose @@ -38,7 +38,7 @@ _logistic_regression_path, LogisticRegressionCV, _logistic_loss_and_grad, _logistic_grad_hess, _multinomial_grad_hess, _logistic_loss, - _log_reg_scoring_path) + _log_reg_scoring_path, _multinomial_loss_grad) X = [[-1, 0], [0, 1], [1, 1]] X_sp = sp.csr_matrix(X) @@ -417,13 +417,12 @@ def test_liblinear_dual_random_state(): def test_logistic_loss_and_grad(): X_ref, y = make_classification(n_samples=20, random_state=0) - n_features = X_ref.shape[1] - X_sp = X_ref.copy() X_sp[X_sp < .1] = 0 X_sp = sp.csr_matrix(X_sp) + clf = LogisticRegression(random_state=0).fit(X_ref, y) for X in (X_ref, X_sp): - w = np.zeros(n_features) + w = clf.coef_.copy().ravel() # First check that our derivation of the grad is correct loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.) @@ -433,12 +432,10 @@ def test_logistic_loss_and_grad(): assert_array_almost_equal(grad, approx_grad, decimal=2) # Second check that our intercept implementation is good - w = np.zeros(n_features + 1) + w = np.hstack([clf.coef_.copy().ravel(), clf.intercept_]) loss_interp, grad_interp = _logistic_loss_and_grad( w, X, y, alpha=1. 
) - assert_array_almost_equal(loss, loss_interp) - approx_grad = optimize.approx_fprime( w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3 ) @@ -494,6 +491,31 @@ def test_logistic_grad_hess(): assert_array_almost_equal(grad_interp, grad_interp_2) +def test_multinomial_loss_grad(): + n_features = 10 + n_classes = 3 + X_ref, y = make_classification(n_features=n_features, n_classes=n_classes, + random_state=0, n_informative=6) + + X_sp = X_ref.copy() + X_sp[X_sp < .1] = 0 + X_sp = sp.csr_matrix(X_sp) + sample_weight = np.ones(X_ref.shape[0]) + Y = label_binarize(y, [0, 1, 2]) + lr = LogisticRegression(random_state=0).fit(X_ref, y) + for X in (X_ref, X_sp): + + w = np.hstack([lr.coef_, lr.intercept_.reshape(-1, 1)]) + loss, grad, p = _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, sample_weight=sample_weight) + approx_grad = optimize.approx_fprime( + w.ravel(), lambda w: _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, + sample_weight=sample_weight)[0], 1e-5 + ) + assert_array_almost_equal(grad, approx_grad, decimal=3) + + def test_logistic_cv(): # test for LogisticRegressionCV object n_samples, n_features = 50, 5 From 8cc16333668ce857901cde094d06781f43fdcd12 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 19:31:18 -0500 Subject: [PATCH 19/41] use sample weights in averages --- sklearn/linear_model/_logistic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4b18f4d5b4317..f4a31fa760506 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -914,10 +914,12 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X_pre.multiply(1 / X_scale) else: - X_mean = X.mean(axis=0) + X_mean = np.average(X, weights=sample_weight, axis=0) if fit_intercept: X_pre = X - X_mean - X_scale = X.std(axis=0) + # weighted version of std + X_scale = np.sqrt(np.average((X_pre)**2, weights=sample_weight, + axis=0)) X_scale[X_scale == 0] = 1 X_pre = X_pre / X_scale From a312bc92ac9b2c5e3d980fe5d5f12958c934a755 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 10:31:28 -0500 Subject: [PATCH 20/41] fix doctest --- sklearn/feature_selection/_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 674127f06acd7..63baab1f030b2 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -146,7 +146,7 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252302 , 0.83462377, 0.49750423]]) + array([[-0.3252319 , 0.83462631, 0.49750495]]) >>> selector.threshold_ 0.55245... 
>>> selector.get_support() From d8ec9d1c02c94937d27f94fcb9815654d2cabd67 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 11:00:31 -0500 Subject: [PATCH 21/41] increase tolerance in coefficient equality test :-/ --- sklearn/linear_model/tests/test_logistic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 26be2789e896a..ac3627c642ad3 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -966,9 +966,10 @@ def test_logistic_regression_multinomial(): assert clf_w.coef_.shape == (n_classes, n_features) # Compare solutions between lbfgs and the other solvers - assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-2) - assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-2) - assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-2) + assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-1, atol=1e-4) + assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-1, atol=1e-4) + assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-1, + atol=1e-4) # Test that the path give almost the same results. However since in this # case we take the average of the coefs after fitting across all the From 954da5afc9d443da1edd8dbc580a5cf0627b46a4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 14:11:02 -0500 Subject: [PATCH 22/41] store loss value in logistic regression when using l-bfgs --- sklearn/linear_model/_logistic.py | 15 +++++++++------ sklearn/linear_model/tests/test_logistic.py | 6 +++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f4a31fa760506..e4356873536d4 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -671,7 +671,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, tol=1e-4, verbose=0, solver='lbfgs', coef=None, class_weight=None, dual=False, penalty='l2', intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, - sample_weight=None, l1_ratio=None) + sample_weight=None, l1_ratio=None)[:3] def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, @@ -989,6 +989,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) + loss_value = None for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ @@ -999,7 +1000,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) - w0, loss = opt_res.x, opt_res.fun + w0, loss_value = opt_res.x, opt_res.fun if precondition and multi_class != 'multinomial': # adjust weight scale for rescaling w0[:n_features] = w0[:n_features] / X_scale @@ -1067,7 +1068,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, n_iter[i] = n_iter_i - return np.array(coefs), np.array(Cs), n_iter + return np.array(coefs), np.array(Cs), n_iter, loss_value # helper function for LogisticCV @@ -1217,7 +1218,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, sample_weight = _check_sample_weight(sample_weight, X) sample_weight = sample_weight[train] - coefs, Cs, n_iter = _logistic_regression_path( + coefs, Cs, n_iter, loss_value = 
_logistic_regression_path( X_train, y_train, Cs=Cs, l1_ratio=l1_ratio, fit_intercept=fit_intercept, solver=solver, max_iter=max_iter, class_weight=class_weight, pos_class=pos_class, @@ -1678,15 +1679,17 @@ def fit(self, X, y, sample_weight=None): precondition=self.precondition) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) - fold_coefs_, _, n_iter_ = zip(*fold_coefs_) + fold_coefs_, _, n_iter_, loss_values_ = zip(*fold_coefs_) self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] if multi_class == 'multinomial': self.coef_ = fold_coefs_[0][0] + self.loss_values_ = loss_values_[0] else: self.coef_ = np.asarray(fold_coefs_) self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept)) + self.loss_values_ = loss_values_[0] if self.fit_intercept: self.intercept_ = self.coef_[:, -1] @@ -2246,7 +2249,7 @@ def fit(self, X, y, sample_weight=None): # Note that y is label encoded and hence pos_class must be # the encoded label / None (for 'multinomial') - w, _, _ = _logistic_regression_path( + w, _, _, _ = _logistic_regression_path( X, y, pos_class=encoded_label, Cs=[C_], solver=solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index ac3627c642ad3..f80dfc74cb2ef 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -358,7 +358,7 @@ def test_consistency_path(): # can't test with fit_intercept=True since LIBLINEAR # penalizes the intercept for solver in ['sag', 'saga']: - coefs, Cs, _ = f(_logistic_regression_path)( + coefs, Cs, _, _ = f(_logistic_regression_path)( X, y, Cs=Cs, fit_intercept=False, tol=1e-5, solver=solver, max_iter=1000, multi_class='ovr', random_state=0) for i, C in enumerate(Cs): @@ -373,7 +373,7 @@ def test_consistency_path(): # test for fit_intercept=True for solver in ('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'): Cs = [1e3] - coefs, Cs, _ = f(_logistic_regression_path)( + coefs, Cs, _, _ = f(_logistic_regression_path)( X, y, Cs=Cs, tol=1e-6, solver=solver, intercept_scaling=10000., random_state=0, multi_class='ovr') lr = LogisticRegression(C=Cs[0], tol=1e-4, @@ -1689,7 +1689,7 @@ def test_logistic_regression_path_coefs_multinomial(): n_redundant=0, n_clusters_per_class=1, random_state=0, n_features=2) Cs = [.00001, 1, 10000] - coefs, _, _ = _logistic_regression_path(X, y, penalty='l1', Cs=Cs, + coefs, _, _, _ = _logistic_regression_path(X, y, penalty='l1', Cs=Cs, solver='saga', random_state=0, multi_class='multinomial') From 9c704aaffbca9755cbf72ade2549252a526ac649 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 14:22:24 -0500 Subject: [PATCH 23/41] fix gradient, add gradient test --- sklearn/linear_model/_logistic.py | 2 ++ sklearn/linear_model/tests/test_logistic.py | 22 +++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e4356873536d4..b2ff1838f5cea 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -373,6 +373,8 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None, sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) + if X_offset is not None: + grad[:, :n_features] += np.outer(diff.T.sum(axis=1), X_offset) if X_scale is not None: grad[:, :n_features] += alpha * (w / 
X_scale**2) else: diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index f80dfc74cb2ef..2bda7c7c9c482 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -504,16 +504,18 @@ def test_multinomial_loss_grad(): Y = label_binarize(y, [0, 1, 2]) lr = LogisticRegression(random_state=0).fit(X_ref, y) for X in (X_ref, X_sp): - - w = np.hstack([lr.coef_, lr.intercept_.reshape(-1, 1)]) - loss, grad, p = _multinomial_loss_grad( - w, X, Y, alpha=1., X_scale=None, sample_weight=sample_weight) - approx_grad = optimize.approx_fprime( - w.ravel(), lambda w: _multinomial_loss_grad( - w, X, Y, alpha=1., X_scale=None, - sample_weight=sample_weight)[0], 1e-5 - ) - assert_array_almost_equal(grad, approx_grad, decimal=3) + for X_offset in (None, X.mean(axis=0)): + + w = np.hstack([lr.coef_, lr.intercept_.reshape(-1, 1)]) + loss, grad, p = _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, sample_weight=sample_weight, + X_offset=X_offset) + approx_grad = optimize.approx_fprime( + w.ravel(), lambda w: _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, X_offset=X_offset, + sample_weight=sample_weight)[0], 1e-5 + ) + assert_array_almost_equal(grad, approx_grad, decimal=3) def test_logistic_cv(): From 1d61a941570a33370fbc5ed0b0717a05d40cce90 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 14:42:39 -0500 Subject: [PATCH 24/41] offset gradient test for logistic loss --- sklearn/linear_model/_logistic.py | 2 +- sklearn/linear_model/tests/test_logistic.py | 38 ++++++++++++--------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index b2ff1838f5cea..b7869bb407a5a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -75,7 +75,7 @@ def _intercept_dot(w, X, y, X_offset=None): z = safe_sparse_dot(X, w) + c if X_offset is not None: - z += np.dot(X_offset, w) + z += np.inner(X_offset, w) yz = y * z return w, c, yz diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 2bda7c7c9c482..20b8e59434cde 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -416,30 +416,34 @@ def test_liblinear_dual_random_state(): def test_logistic_loss_and_grad(): - X_ref, y = make_classification(n_samples=20, random_state=0) + X_ref, y = make_classification(n_samples=21, random_state=0) X_sp = X_ref.copy() X_sp[X_sp < .1] = 0 X_sp = sp.csr_matrix(X_sp) clf = LogisticRegression(random_state=0).fit(X_ref, y) for X in (X_ref, X_sp): - w = clf.coef_.copy().ravel() + for X_offset in (None, np.asarray(X.mean(axis=0)).squeeze()): + w = clf.coef_.copy().ravel() - # First check that our derivation of the grad is correct - loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.) 
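The `X_offset` plumbing tested here exists so that sparse inputs never have to be densified by centering: the centered, scaled product can be computed from the still-sparse scaled matrix plus a separately carried offset. A standalone sketch of that identity (illustrative names, not part of the patch):

    import numpy as np
    import scipy.sparse as sparse

    rng = np.random.RandomState(0)
    X = sparse.random(30, 5, density=0.3, random_state=0, format="csr")
    mu = np.asarray(X.mean(axis=0)).ravel()
    s = np.sqrt(np.asarray(X.multiply(X).mean(axis=0)).ravel() - mu ** 2)
    s[s == 0] = 1
    w = rng.normal(size=5)

    X_pre = X.multiply(sparse.csr_matrix(1.0 / s)).tocsr()   # stays sparse
    X_offset = -mu / s                                        # carried separately
    dense_centered = (X.toarray() - mu) / s
    assert np.allclose(dense_centered @ w, X_pre @ w + np.inner(X_offset, w))
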
- approx_grad = optimize.approx_fprime( - w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3 - ) - assert_array_almost_equal(grad, approx_grad, decimal=2) + # First check that our derivation of the grad is correct + loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1., + X_offset=X_offset) + approx_grad = optimize.approx_fprime( + w, lambda w: _logistic_loss_and_grad( + w, X, y, alpha=1., X_offset=X_offset)[0], 1e-3 + ) + assert_array_almost_equal(grad, approx_grad, decimal=2) - # Second check that our intercept implementation is good - w = np.hstack([clf.coef_.copy().ravel(), clf.intercept_]) - loss_interp, grad_interp = _logistic_loss_and_grad( - w, X, y, alpha=1. - ) - approx_grad = optimize.approx_fprime( - w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3 - ) - assert_array_almost_equal(grad_interp, approx_grad, decimal=2) + # Second check that our intercept implementation is good + w = np.hstack([clf.coef_.copy().ravel(), clf.intercept_]) + loss_interp, grad_interp = _logistic_loss_and_grad( + w, X, y, alpha=1., X_offset=X_offset + ) + approx_grad = optimize.approx_fprime( + w, lambda w: _logistic_loss_and_grad( + w, X, y, alpha=1., X_offset=X_offset)[0], 1e-3 + ) + assert_array_almost_equal(grad_interp, approx_grad, decimal=2) def test_logistic_grad_hess(): From a82edc165690ba532d3ee642cd81b86f27a4512e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 16:29:37 -0500 Subject: [PATCH 25/41] cast matrix to csr for pointwise multiplication?! --- sklearn/linear_model/_logistic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index b7869bb407a5a..4d6b1fe4155ee 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -913,7 +913,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, del X_var if fit_intercept: X_offset = -X_mean/X_scale - X_pre = X_pre.multiply(1 / X_scale) + # FIXME old scipy requires conversion to sparse matrix + # before calling multiply + X_pre = X_pre.multiply(sparse.csr_matrix(1 / X_scale)) else: X_mean = np.average(X, weights=sample_weight, axis=0) From a2947abada2bc8ad1586682022510581f4ef81a3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 16:43:52 -0500 Subject: [PATCH 26/41] add docstrings and some explanation --- sklearn/linear_model/_logistic.py | 57 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4d6b1fe4155ee..0bccec07f5f63 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -56,6 +56,11 @@ def _intercept_dot(w, X, y, X_offset=None): y : ndarray, shape (n_samples,) Array of labels. + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- w : ndarray, shape (n_features,) @@ -102,6 +107,15 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. 
+ + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- out : float @@ -118,11 +132,11 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, if sample_weight is None: sample_weight = np.ones(n_samples) - # Logistic loss is the negative of the log of the logistic function. v = w if X_scale is not None: v = w / X_scale + # Logistic loss is the negative of the log of the logistic function. out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(v, v) z = expit(yz) @@ -163,8 +177,14 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None, Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - X_scale : array-like, shape (n_features,) optional - When using preconditioning, rescaling of features. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. + + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. Returns ------- @@ -283,6 +303,15 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None, sample_weight : array-like, shape (n_samples,) Array of weights that are assigned to individual samples. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. + + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- loss : float @@ -346,6 +375,15 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None, sample_weight : array-like, shape (n_samples,) Array of weights that are assigned to individual samples. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. + + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- loss : float @@ -899,7 +937,14 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0 = np.zeros((classes.size, n_features + int(fit_intercept)), order='F', dtype=X.dtype) - # preconditioning + # preconditioning for lbfgs + # Subtract mean, divide by standard deviation but keep scaling and + # mean to allow solving the original problem. + # The scaling is required in the gradient computation for the penalty + # Both scaling and mean are used later used to transform + # optimization results back to the original space. + # In the sparse case, the mean can not be subtracted and the + # correction is carried along as X_offset. 
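For the multinomial case the same back-transformation is applied row-wise to the coefficient matrix (coefficients divided by `X_scale`, intercepts reduced by `W @ X_mean`, as in the earlier multinomial patch). A compact check of that identity (not part of the patch; illustrative names):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(15, 4)) * 100.0 + 7.0
    mu, s = X.mean(axis=0), X.std(axis=0)
    W_pre = rng.normal(size=(3, 4))   # per-class coefficients, scaled space
    b_pre = rng.normal(size=3)

    W = W_pre / s                     # back to the original feature scale
    b = b_pre - W @ mu                # intercepts absorb the centering
    assert np.allclose(((X - mu) / s) @ W_pre.T + b_pre, X @ W.T + b)
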
X_pre = X X_scale = None X_offset = None @@ -1575,8 +1620,8 @@ def fit(self, X, y, sample_weight=None): if self.penalty == 'elasticnet': if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be between 0 and 1;" - " got (l1_ratio=%r)" % self.l1_ratio) + raise ValueError("l1_ratio must be between 0 and 1;" + " got (l1_ratio=%r)" % self.l1_ratio) elif self.l1_ratio is not None: warnings.warn("l1_ratio parameter is only used when penalty is " "'elasticnet'. Got " From 9ad4f93d99f982697d1d4a7eb73620fea5fe5443 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:44:58 -0500 Subject: [PATCH 27/41] add helper function for weighted mean and std --- sklearn/linear_model/_logistic.py | 7 +------ sklearn/utils/extmath.py | 28 ++++++++++++++++++++++++++++ sklearn/utils/tests/test_extmath.py | 23 +++++++++++++++++++++++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 0bccec07f5f63..c7b6a470736bd 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -951,9 +951,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if precondition and solver == 'lbfgs': # FIXME this duplicates some code from _preprocess_data # and should be refactored + X_mean, X_scale = _weighted_mean_var(X, sample_weight) if sparse.issparse(X): - X_mean, X_var = mean_variance_axis(X, axis=0) - X_scale = np.sqrt(X_var, X_var) X_scale[X_scale == 0] = 1 del X_var if fit_intercept: @@ -963,12 +962,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X_pre.multiply(sparse.csr_matrix(1 / X_scale)) else: - X_mean = np.average(X, weights=sample_weight, axis=0) if fit_intercept: X_pre = X - X_mean - # weighted version of std - X_scale = np.sqrt(np.average((X_pre)**2, weights=sample_weight, - axis=0)) X_scale[X_scale == 0] = 1 X_pre = X_pre / X_scale diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index b0c28897a8ef1..be33cb9843a15 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -830,3 +830,31 @@ def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 'its last element does not correspond to sum', RuntimeWarning) return out + + +def _weighted_mean_std(X, sample_weight): + """Compute weighted mean and standard deviation for ndarrays and sparse matrices. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + input array. + sample_weight : ndarray, shape (n_samples,) + Weights. + + Returns + ------- + mean : ndarray, shape (n_features,) + Weighted mean. + std : ndarray, shape (n_features,) + Weighted std. 
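+
+    Examples
+    --------
+    A small illustrative check (the second row gets three times the
+    weight of the first):
+
+    >>> import numpy as np
+    >>> X = np.array([[1., 2.], [3., 4.]])
+    >>> mean, std = _weighted_mean_std(X, np.array([1., 3.]))
+    >>> mean
+    array([2.5, 3.5])
+    >>> std  # doctest: +ELLIPSIS
+    array([0.866..., 0.866...])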
+ """ + if sparse.issparse(X): + normed_weights = sample_weight / sample_weight.sum() + sq_sum = safe_sparse_dot(normed_weights, X.multiply(X)) + mean = safe_sparse_dot(normed_weights, X) + var = sq_sum - mean ** 2 + else: + mean = np.average(X, weights=sample_weight, axis=0) + var = np.average(X**2, weights=sample_weight, axis=0) - mean ** 2 + return mean, np.sqrt(var) \ No newline at end of file diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index fdca303e15d8b..cd92c4edb56b2 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -34,6 +34,7 @@ from sklearn.utils.extmath import stable_cumsum from sklearn.utils.extmath import safe_min from sklearn.utils.extmath import safe_sparse_dot +from sklearn.utils.extmath import _weighted_mean_std from sklearn.datasets import make_low_rank_matrix @@ -727,3 +728,25 @@ def test_safe_sparse_dot_dense_output(dense_output): if dense_output: expected = expected.toarray() assert_allclose_dense_sparse(actual, expected) + + +def test_weighted_mean_std(): + rng = np.random.RandomState(0) + X = rng.normal(size=(100, 10)) + weights = rng.uniform(size=(100,)) + mean_dense, std_dense = _weighted_mean_std(X, weights) + mean_sparse, std_sparse = _weighted_mean_std( + sparse.csr_matrix(X), weights) + assert_allclose_dense_sparse(mean_dense, mean_sparse) + assert_allclose_dense_sparse(std_dense, std_sparse) + # with ones + weights = np.ones(100) + mean_dense, std_dense = _weighted_mean_std(X, weights) + mean_sparse, std_sparse = _weighted_mean_std( + sparse.csr_matrix(X), weights) + mean_expected = X.mean(axis=0) + std_expected = X.std(axis=0) + assert_allclose_dense_sparse(mean_dense, mean_expected) + assert_allclose_dense_sparse(std_dense, std_expected) + assert_allclose_dense_sparse(mean_sparse, mean_expected) + assert_allclose_dense_sparse(std_sparse, std_expected) From 150b5a312c35091b075f70f3c7b9a02aaa9c7a63 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:48:13 -0500 Subject: [PATCH 28/41] typos --- sklearn/linear_model/_logistic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c7b6a470736bd..7979d2971077f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -26,7 +26,7 @@ from ..utils import check_random_state from ..utils.extmath import (log_logistic, safe_sparse_dot, softmax, squared_norm) -from ..utils.extmath import row_norms +from ..utils.extmath import row_norms, _weighted_mean_std from ..utils.fixes import logsumexp from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_X_y @@ -34,7 +34,6 @@ from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args -from ..utils.sparsefuncs import mean_variance_axis from ..model_selection import check_cv from ..metrics import get_scorer @@ -951,10 +950,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if precondition and solver == 'lbfgs': # FIXME this duplicates some code from _preprocess_data # and should be refactored - X_mean, X_scale = _weighted_mean_var(X, sample_weight) + X_mean, X_scale = _weighted_mean_std(X, sample_weight) if sparse.issparse(X): X_scale[X_scale == 0] = 1 - del X_var if fit_intercept: X_offset = -X_mean/X_scale # FIXME old scipy requires conversion to sparse matrix From 
68dfe1b17e3666e1ea903290f84c6becfe8b85c4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:53:19 -0500 Subject: [PATCH 29/41] shorter docstring to placate pep8 --- sklearn/utils/extmath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index be33cb9843a15..159466b423d30 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -833,7 +833,7 @@ def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): def _weighted_mean_std(X, sample_weight): - """Compute weighted mean and standard deviation for ndarrays and sparse matrices. + """Compute weighted mean and standard deviation. Parameters ---------- From 763b0106e1b93dc03911bad573cb14df74f81724 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:53:53 -0500 Subject: [PATCH 30/41] more pep8 --- sklearn/utils/extmath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 159466b423d30..c6c44773f8e08 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -857,4 +857,4 @@ def _weighted_mean_std(X, sample_weight): else: mean = np.average(X, weights=sample_weight, axis=0) var = np.average(X**2, weights=sample_weight, axis=0) - mean ** 2 - return mean, np.sqrt(var) \ No newline at end of file + return mean, np.sqrt(var) From 71e563aa774b0a834213d291d63ee214975005cb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 10:45:04 -0500 Subject: [PATCH 31/41] rename loss_value_ to objective_value_ add docstring --- sklearn/linear_model/_logistic.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7979d2971077f..fe59a0ba30140 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1487,6 +1487,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). + objective_value_ : float + Objective function value (penalized loss). Lower is better. + n_iter_ : array, shape (n_classes,) or (1, ) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum @@ -1721,17 +1724,17 @@ def fit(self, X, y, sample_weight=None): precondition=self.precondition) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) - fold_coefs_, _, n_iter_, loss_values_ = zip(*fold_coefs_) + fold_coefs_, _, n_iter_, objective_value_ = zip(*fold_coefs_) self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] if multi_class == 'multinomial': self.coef_ = fold_coefs_[0][0] - self.loss_values_ = loss_values_[0] + self.objective_value_ = objective_value_[0] else: self.coef_ = np.asarray(fold_coefs_) self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept)) - self.loss_values_ = loss_values_[0] + self.objective_value_ = objective_value_[0] if self.fit_intercept: self.intercept_ = self.coef_[:, -1] @@ -2003,6 +2006,10 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``. + objective_value_ : float + Objective function value (penalized loss). Lower is better. + Only present if `refit=True`. 
+ scores_ : dict dict with classes as the keys, and the values as the grid of scores obtained during cross-validating each fold, after doing @@ -2291,7 +2298,7 @@ def fit(self, X, y, sample_weight=None): # Note that y is label encoded and hence pos_class must be # the encoded label / None (for 'multinomial') - w, _, _, _ = _logistic_regression_path( + w, _, _, objective_value = _logistic_regression_path( X, y, pos_class=encoded_label, Cs=[C_], solver=solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, @@ -2304,6 +2311,7 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight, l1_ratio=l1_ratio_) w = w[0] + self.objective_value_ = objective_value[0] else: # Take the best scores across every fold and the average of From ca44e1c35c4301e425cb80652702be47a0f013be Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:01:26 -0500 Subject: [PATCH 32/41] Update sklearn/linear_model/_logistic.py Co-Authored-By: Olivier Grisel --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index fe59a0ba30140..f5a0c43df3f03 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -183,7 +183,7 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None, X_offset : ndarray, shape (n_features,) or None Offset to use for X to avoid subtracting mean from sparse matrices if preconditioning. Should be None in the dense case - as the mean was actually subtracted. + as the mean was actually subtracted. Returns ------- From 962a190b9dc62751cb9a2394a93d5c636fe9cd80 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:02:11 -0500 Subject: [PATCH 33/41] Update sklearn/linear_model/_logistic.py Co-Authored-By: Olivier Grisel --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f5a0c43df3f03..9fdf63048af1a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -309,7 +309,7 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None, X_offset : ndarray, shape (n_features,) or None Offset to use for X to avoid subtracting mean from sparse matrices if preconditioning. Should be None in the dense case - as the mean was actually subtracted. + as the mean was actually subtracted. Returns ------- From af16e99e270ddd5a3e2d5d58240b75e21c0cf580 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:09:35 -0500 Subject: [PATCH 34/41] address some of oliviers comments --- sklearn/feature_selection/_from_model.py | 2 +- sklearn/linear_model/tests/test_logistic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 63baab1f030b2..fb5b44f4cb155 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -146,7 +146,7 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252319 , 0.83462631, 0.49750495]]) + array([[-0.3252... , 0.8346..., 0.4950...]]) >>> selector.threshold_ 0.55245... 
>>> selector.get_support() diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 20b8e59434cde..23c719fe70037 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1877,7 +1877,7 @@ def test_illconditioned_lbfgs(): def test_logistic_loss_preconditioning(): - # check _logistic_loss and _logistic_loss_grad with preconditioning + # check that _logistic_loss is invariant wrt whether we precondition or not. X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) From 74ed6e98f96e8bdb574033c566ac391c2d713782 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:29:25 -0500 Subject: [PATCH 35/41] pep8 --- sklearn/linear_model/tests/test_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 23c719fe70037..d88449676ddb0 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1877,7 +1877,7 @@ def test_illconditioned_lbfgs(): def test_logistic_loss_preconditioning(): - # check that _logistic_loss is invariant wrt whether we precondition or not. + # check that _logistic_loss is invariant wrt whether we precondition. X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) From c7c1689f991f9e6594f350f7cbfd8a2566bee8cf Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:46:21 -0500 Subject: [PATCH 36/41] improve invariance test for logistic_loss --- sklearn/linear_model/tests/test_logistic.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index d88449676ddb0..81c2bf85ed5e9 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1880,13 +1880,16 @@ def test_logistic_loss_preconditioning(): # check that _logistic_loss is invariant wrt whether we precondition. 
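+    # Sketch of the identity being exercised: with
+    #     X_pre = (X - X_mean) / X_std
+    #     w_pre = coef * X_std
+    #     b_pre = intercept + np.inner(coef, X_mean)
+    # the penalized loss is unchanged, i.e.
+    #     _logistic_loss([w_pre, b_pre], X_pre, y, alpha, X_scale=X_std)
+    #         == _logistic_loss([coef, intercept], X, y, alpha)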
X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 - lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) + lr = LogisticRegression(random_state=0, precondition=True, max_iter=1000) lr.fit(X, y) loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr.intercept_]), X, 2 * y - 1, 1) + assert_almost_equal(loss, lr.objective_value_) + # do full preconditioning + X_mean = X.mean(axis=0) X_std = X.std(axis=0) - X_pre = X / X_std - loss_pre = _logistic_loss( - np.hstack([lr.coef_.ravel() * X_std, lr.intercept_]), - X_pre, 2 * y - 1, 1, X_scale=X_std) + X_pre = (X - X_mean) / X_std + w_scaled = lr.coef_.ravel() * X_std + w_pre = np.hstack([w_scaled, lr.intercept_ + np.inner(lr.coef_, X_mean)]) + loss_pre = _logistic_loss(w_pre, X_pre, 2 * y - 1, 1, X_scale=X_std) assert_almost_equal(loss, loss_pre) From 1a1bcfd5fa5e200854b58e0713a4861b23cc0db3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 12:07:51 -0500 Subject: [PATCH 37/41] fix objective value assignment --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9fdf63048af1a..770da2401fa66 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2311,7 +2311,7 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight, l1_ratio=l1_ratio_) w = w[0] - self.objective_value_ = objective_value[0] + self.objective_value_ = objective_value else: # Take the best scores across every fold and the average of From 82b09493d150adc97c9533beb6438beccca33824 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 13:42:56 -0500 Subject: [PATCH 38/41] why do we suddenly need more dots? hum --- sklearn/feature_selection/_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index fb5b44f4cb155..98c70ddc64bca 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -146,7 +146,7 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252... , 0.8346..., 0.4950...]]) + array([[-0.32... , 0.83..., 0.49...]]) >>> selector.threshold_ 0.55245... >>> selector.get_support() From 01a5aa2d6497c29eafb2a27fc8b7888221e4642e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 13:48:53 -0500 Subject: [PATCH 39/41] add auto option, docstring --- sklearn/linear_model/_logistic.py | 38 +++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 770da2401fa66..22e98e17bf465 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -520,7 +520,6 @@ def _check_solver(solver, penalty, dual): raise ValueError( "penalty='none' is not supported for the liblinear solver" ) - return solver @@ -847,6 +846,12 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + precondition : boolean or 'auto', default='auto' + Whether to use preconditioning for solving the optimization problem. 
+ A diagonal preconditioning based on the data standard deviation is + used. If 'auto', preconditioning is used when ``solver='lbfgs'``, which + is the only solver that currently supports it. + Returns ------- coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) @@ -875,6 +880,12 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, solver = _check_solver(solver, penalty, dual) + if precondition == 'auto': + precondition = solver == 'lbfgs' + if precondition == 'True' and solver != 'lbfgs': + raise ValueError("precondition=True only supported with" + " solver='lbfgs'") + # Preprocessing. if check_input: X = check_array(X, accept_sparse='csr', dtype=np.float64, @@ -947,7 +958,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X X_scale = None X_offset = None - if precondition and solver == 'lbfgs': + if precondition: # FIXME this duplicates some code from _preprocess_data # and should be refactored X_mean, X_scale = _weighted_mean_std(X, sample_weight) @@ -974,7 +985,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, 'Initialization coef is of shape %d, expected shape ' '%d or %d' % (coef.size, n_features, w0.size)) w0[:coef.size] = coef - if solver == 'lbfgs' and precondition: + if precondition: if fit_intercept: w0[-1] += np.inner(w0[:n_features], X_mean) w0[:n_features] *= X_scale @@ -999,7 +1010,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0[1, :coef.shape[1]] = coef else: w0[:, :coef.shape[1]] = coef - if solver == 'lbfgs' and precondition: + if precondition: if fit_intercept: w0[:, -1] += np.dot(w0[:, :n_features], X_mean) w0[:, :n_features] *= X_scale @@ -1093,7 +1104,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if multi_class == 'multinomial': n_classes = max(2, classes.size) multi_w0 = np.reshape(w0, (n_classes, -1)) - if solver == 'lbfgs' and precondition: + if precondition: if fit_intercept: multi_w0[:, :-1] = multi_w0[:, :-1] / X_scale # adjust intercept for preconditioning @@ -1465,6 +1476,12 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + precondition : boolean or 'auto', default='auto' + Whether to use preconditioning for solving the optimization problem. + A diagonal preconditioning based on the data standard deviation is + used. If 'auto', preconditioning is used when ``solver='lbfgs'``, which + is the only solver that currently supports it. + Attributes ---------- @@ -1560,7 +1577,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None, precondition=True): + l1_ratio=None, precondition='auto'): self.penalty = penalty self.dual = dual @@ -1968,6 +1985,12 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + precondition : boolean or 'auto', default='auto' + Whether to use preconditioning for solving the optimization problem. + A diagonal preconditioning based on the data standard deviation is + used. If 'auto', preconditioning is used when ``solver='lbfgs'``, which + is the only solver that currently supports it. 
+ Attributes ---------- classes_ : array, shape (n_classes, ) @@ -2060,7 +2083,7 @@ def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1., multi_class='auto', - random_state=None, l1_ratios=None): + random_state=None, l1_ratios=None, precondition='auto'): self.Cs = Cs self.fit_intercept = fit_intercept self.cv = cv @@ -2078,6 +2101,7 @@ def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, self.multi_class = multi_class self.random_state = random_state self.l1_ratios = l1_ratios + self.precondition = precondition def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. From 77e5c9951a5fbe8dc57d1590b91b346bddcc8816 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 13:57:57 -0500 Subject: [PATCH 40/41] typo --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 22e98e17bf465..daf8b42883d18 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -882,7 +882,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if precondition == 'auto': precondition = solver == 'lbfgs' - if precondition == 'True' and solver != 'lbfgs': + if precondition and solver != 'lbfgs': raise ValueError("precondition=True only supported with" " solver='lbfgs'") From d1109be2a1f40d5478f02d121b6eb712ae2b0edd Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 14:21:38 -0500 Subject: [PATCH 41/41] fix default value to 'auto' --- sklearn/linear_model/_logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index daf8b42883d18..979927cbbab3c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -719,7 +719,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=True): + l1_ratio=None, precondition='auto'): """Compute a Logistic Regression model for a list of regularization parameters. @@ -1132,7 +1132,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, dual=False, intercept_scaling=1., multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=True): + l1_ratio=None, precondition='auto'): """Computes scores across logistic_regression_path Parameters
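Illustrative usage of the new `precondition` option (a sketch mirroring the tests added in this series; `objective_value_` is the attribute introduced in PATCH 31):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # A badly scaled column makes plain lbfgs slow to converge.
    X, y = make_classification(n_samples=100, n_features=60, random_state=0)
    X[:, 1] += 10000

    # With precondition='auto' (the default), lbfgs optimizes the rescaled
    # problem and maps the solution back to the original feature space.
    lr = LogisticRegression(precondition='auto').fit(X, y)
    print(lr.coef_.shape, lr.objective_value_)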