From 59f9e40fb8444befa5e0e228da326241787a5cc4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 22:04:21 -0700 Subject: [PATCH 01/41] remove mean for logisticregression lbfgs --- sklearn/linear_model/_logistic.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7921150e0fa01..9f0738a76020b 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -649,7 +649,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, - l1_ratio=None): + l1_ratio=None, precondition=False): """Compute a Logistic Regression model for a list of regularization parameters. @@ -908,6 +908,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, hess = _multinomial_grad_hess warm_start_sag = {'coef': w0.T} else: + # binary logistic regression target = y_bin if solver == 'lbfgs': func = _logistic_loss_and_grad @@ -919,17 +920,24 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) + X_pre = X + if precondition: + X_mean = X.mean(axis=0) + X_pre = X - X_mean for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose)] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, - args=(X, target, 1. / C, sample_weight), + args=(X_pre, target, 1. / C, sample_weight), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) w0, loss = opt_res.x, opt_res.fun + if precondition: + # adjust intercept for mean subtraction + w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) elif solver == 'newton-cg': args = (X, target, 1. 
/ C, sample_weight) w0, n_iter_i = _newton_cg(hess, func, grad, w0, args=args, @@ -1428,7 +1436,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None): + l1_ratio=None, precondition=False): self.penalty = penalty self.dual = dual @@ -1445,6 +1453,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, self.warm_start = warm_start self.n_jobs = n_jobs self.l1_ratio = l1_ratio + self.precondition = precondition def fit(self, X, y, sample_weight=None): """ @@ -1587,7 +1596,8 @@ def fit(self, X, y, sample_weight=None): class_weight=self.class_weight, check_input=False, random_state=self.random_state, coef=warm_start_coef_, penalty=penalty, max_squared_sum=max_squared_sum, - sample_weight=sample_weight) + sample_weight=sample_weight, + precondition=self.precondition) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) fold_coefs_, _, n_iter_ = zip(*fold_coefs_) From 697112ae7459e4c1fd747ee8cea1ae5e686d250c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 22:13:29 -0700 Subject: [PATCH 02/41] add test that preconditioning works for offsets in X --- sklearn/linear_model/tests/test_logistic.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 894040c2053bd..6d97f791c0f7e 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1824,3 +1824,22 @@ def test_scores_attribute_layout_elasticnet(): avg_score_lr = cross_val_score(lr, X, y, cv=cv).mean() assert avg_scores_lrcv[i, j] == pytest.approx(avg_score_lr) + + +def test_illconditioned_lbfgs(): + # check that lbfgs converges even with ill-conditioned X + X, y = make_classification(n_samples=100, n_features=60, random_state=0) + X[:, 1] += 10000 + lr_pre = LogisticRegression(random_state=0, precondition=True) + with pytest.warns(None) as record: + lr_pre.fit(X, y) + assert len(record) == 0 + loss_pre = _logistic_loss(np.hstack([lr_pre.coef_.ravel(), lr_pre.intercept_]), + X, 2 * y - 1, 1) + + lr = LogisticRegression(random_state=0, precondition=False) + with pytest.warns(ConvergenceWarning): + lr.fit(X, y) + loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr_pre.intercept_]), + X, 2 * y - 1, 1) + assert loss_pre < loss From 5a1431cfe82ec73b8efa65c7200876715c3526d8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 22:19:59 -0700 Subject: [PATCH 03/41] add precondition option temporarily to log_reg_scoring_path --- sklearn/linear_model/_logistic.py | 5 +++-- sklearn/utils/sparsefuncs.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9f0738a76020b..3bc785b60ef49 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1001,7 +1001,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, dual=False, intercept_scaling=1., multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, - l1_ratio=None): + l1_ratio=None, precondition=None): """Computes scores across logistic_regression_path Parameters @@ -1147,7 +1147,8 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, multi_class=multi_class, tol=tol, verbose=verbose, dual=dual, penalty=penalty, 
intercept_scaling=intercept_scaling, random_state=random_state, check_input=False, - max_squared_sum=max_squared_sum, sample_weight=sample_weight) + max_squared_sum=max_squared_sum, sample_weight=sample_weight, + precondition=precondition) log_reg = LogisticRegression(solver=solver, multi_class=multi_class) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 383de6d9f23c8..0698e5d963df8 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -62,7 +62,7 @@ def inplace_csr_row_scale(X, scale): def mean_variance_axis(X, axis): - """Compute mean and variance along an axix on a CSR or CSC matrix + """Compute mean and variance along an axis on a CSR or CSC matrix Parameters ---------- From bf2e452f5671d4e88c86a5679487316414bf680a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:10:36 -0700 Subject: [PATCH 04/41] fix gradients, add test --- sklearn/linear_model/_logistic.py | 39 ++++++++++++++++----- sklearn/linear_model/tests/test_logistic.py | 26 +++++++++++--- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3bc785b60ef49..11f8faee6681c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -20,7 +20,7 @@ from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver -from ..preprocessing import LabelEncoder, LabelBinarizer +from ..preprocessing import LabelEncoder, LabelBinarizer, normalize from ..svm._base import _fit_liblinear from ..utils import check_array, check_consistent_length, compute_class_weight from ..utils import check_random_state @@ -77,7 +77,7 @@ def _intercept_dot(w, X, y): return w, c, yz -def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): +def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): """Computes the logistic loss and gradient. Parameters @@ -115,12 +115,20 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): sample_weight = np.ones(n_samples) # Logistic loss is the negative of the log of the logistic function. - out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w) + v = w + grad_scale = 1 + if X_scale is not None: + v = w / X_scale + + out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(v, v) z = expit(yz) z0 = sample_weight * (z - 1) * y + if X_scale is not None: + grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * (w / X_scale ** 2) + else: + grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w - grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w # Case where we fit the intercept. if grad.shape[0] > n_features: @@ -128,7 +136,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None): return out, grad -def _logistic_loss(w, X, y, alpha, sample_weight=None): +def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None): """Computes the logistic loss. Parameters @@ -149,6 +157,9 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None): Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. + X_scale : array-like, shape (n_features,) optional + When using preconditioning, rescaling of features. + Returns ------- out : float @@ -160,7 +171,10 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None): sample_weight = np.ones(y.shape[0]) # Logistic loss is the negative of the log of the logistic function. 
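For context, a minimal NumPy sketch of the reparametrization behind the `v = w / X_scale` penalty term above (not part of the patch; variable names are illustrative): dividing the columns of X by a per-feature scale `s` multiplies the matching coefficients by `s`, so the original ridge penalty must be evaluated on the rescaled-back coefficients `w_pre / s`.

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(20, 3)) * np.array([1.0, 100.0, 0.01])
    s = X.std(axis=0)             # per-feature scale used for preconditioning
    X_pre = X / s                 # rescaled design matrix seen by the solver
    w_pre = rng.normal(size=3)    # coefficients in the preconditioned space
    w = w_pre / s                 # equivalent coefficients for the original X

    # identical linear predictions, hence the penalty alpha * ||w||^2 on the
    # original problem equals alpha * ||w_pre / s||^2 in the scaled problem
    assert np.allclose(X @ w, X_pre @ w_pre)
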
- out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w) + v = w + if X_scale is not None: + v = w / X_scale + out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(v, v) return out @@ -241,7 +255,7 @@ def Hs(s): return grad, Hs -def _multinomial_loss(w, X, Y, alpha, sample_weight): +def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): """Computes multinomial loss and class probabilities. Parameters @@ -278,6 +292,8 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. (Chapter 4.3.4) """ + if X_scale is not None: + raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = w.size == (n_classes * (n_features + 1)) @@ -297,7 +313,7 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight): return loss, p, w -def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): +def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): """Computes the multinomial loss, gradient and class probabilities. Parameters @@ -335,6 +351,8 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. (Chapter 4.3.4) """ + if X_scale is not None: + raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = (w.size == n_classes * (n_features + 1)) @@ -921,21 +939,24 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) X_pre = X + X_scale = None if precondition: X_mean = X.mean(axis=0) X_pre = X - X_mean + X_pre, X_scale = normalize(X_pre, axis=0, copy=False, return_norm=True) for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose)] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, - args=(X_pre, target, 1. / C, sample_weight), + args=(X_pre, target, 1. 
/ C, sample_weight, X_scale), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) w0, loss = opt_res.x, opt_res.fun if precondition: + w0[:-1] = w0[:-1] / X_scale # adjust intercept for mean subtraction w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) elif solver == 'newton-cg': diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 6d97f791c0f7e..76e4408c96a7e 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1830,16 +1830,34 @@ def test_illconditioned_lbfgs(): # check that lbfgs converges even with ill-conditioned X X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 + X[:, 0] *= 10000 lr_pre = LogisticRegression(random_state=0, precondition=True) with pytest.warns(None) as record: lr_pre.fit(X, y) assert len(record) == 0 - loss_pre = _logistic_loss(np.hstack([lr_pre.coef_.ravel(), lr_pre.intercept_]), - X, 2 * y - 1, 1) + loss_pre = _logistic_loss( + np.hstack([lr_pre.coef_.ravel(), lr_pre.intercept_]), + X, 2 * y - 1, 1) lr = LogisticRegression(random_state=0, precondition=False) with pytest.warns(ConvergenceWarning): lr.fit(X, y) - loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr_pre.intercept_]), - X, 2 * y - 1, 1) + loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr.intercept_]), + X, 2 * y - 1, 1) assert loss_pre < loss + + +def test_logistic_loss_preconditioning(): + # check _logistic_loss and _logistic_loss_grad with preconditioning + X, y = make_classification(n_samples=100, n_features=60, random_state=0) + X[:, 1] += 10000 + lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) + lr.fit(X, y) + loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr.intercept_]), + X, 2 * y - 1, 1) + X_std = X.std(axis=0) + X_pre = X / X_std + loss_pre = _logistic_loss( + np.hstack([lr.coef_.ravel() * X_std, lr.intercept_]), + X_pre, 2 * y - 1, 1, X_scale=X_std) + assert_almost_equal(loss, loss_pre) \ No newline at end of file From 86f7520c0e31f3e032348eb04ac9b5fdc997cd5a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:18:43 -0700 Subject: [PATCH 05/41] remove unused grad_scale --- sklearn/linear_model/_logistic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 11f8faee6681c..704dccc3496d7 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -116,7 +116,6 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): # Logistic loss is the negative of the log of the logistic function. 
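The intercept correction `w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean)` used above rests on a simple identity: for centered features, w.(x - mu) + b_centered equals w.x + (b_centered - w.mu). A small self-contained check (not part of the patch; names are illustrative):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(10, 4)) + 50.0   # features with a large common offset
    mu = X.mean(axis=0)
    w = rng.normal(size=4)
    b_centered = 0.3                      # intercept learned on the centered X - mu

    b = b_centered - np.inner(w, mu)      # corrected intercept for the raw X
    assert np.allclose((X - mu) @ w + b_centered, X @ w + b)
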
v = w - grad_scale = 1 if X_scale is not None: v = w / X_scale From 01c2c98eb3e1a79553638c01d64a0670bdcf0fc2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:19:45 -0700 Subject: [PATCH 06/41] pep8 --- sklearn/linear_model/_logistic.py | 2 +- sklearn/linear_model/tests/test_logistic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 704dccc3496d7..7bc0b452ee396 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -124,7 +124,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): z = expit(yz) z0 = sample_weight * (z - 1) * y if X_scale is not None: - grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * (w / X_scale ** 2) + grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * (w / X_scale**2) else: grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 76e4408c96a7e..6ce0fc8f6714a 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1860,4 +1860,4 @@ def test_logistic_loss_preconditioning(): loss_pre = _logistic_loss( np.hstack([lr.coef_.ravel() * X_std, lr.intercept_]), X_pre, 2 * y - 1, 1, X_scale=X_std) - assert_almost_equal(loss, loss_pre) \ No newline at end of file + assert_almost_equal(loss, loss_pre) From 2bfeba4c3384857411a5ec7372957b4d8d628503 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:49:53 -0700 Subject: [PATCH 07/41] fix intercept for multinomial loss --- sklearn/linear_model/_logistic.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7bc0b452ee396..b937dd011c11d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -291,8 +291,6 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. 
(Chapter 4.3.4) """ - if X_scale is not None: - raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = w.size == (n_classes * (n_features + 1)) @@ -303,7 +301,10 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): w = w[:, :-1] else: intercept = 0 - p = safe_sparse_dot(X, w.T) + v = w + if X_scale is not None: + v = w / X_scale + p = safe_sparse_dot(X, v.T) p += intercept p -= logsumexp(p, axis=1)[:, np.newaxis] loss = -(sample_weight * Y * p).sum() @@ -954,7 +955,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) w0, loss = opt_res.x, opt_res.fun - if precondition: + if precondition and multi_class != 'multinomial': + # adjust weight scale for rescaling w0[:-1] = w0[:-1] / X_scale # adjust intercept for mean subtraction w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) @@ -1002,6 +1004,15 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if multi_class == 'multinomial': n_classes = max(2, classes.size) multi_w0 = np.reshape(w0, (n_classes, -1)) + if solver == 'lbfgs' and precondition: + if fit_intercept: + multi_w0[:, :-1] = multi_w0[:, :-1] / X_scale + # adjust intercept for preconditioning + multi_w0[:, -1] = (multi_w0[:, -1] + - np.dot(multi_w0[:, :-1], X_mean)) + else: + multi_w0 = multi_w0 / X_scale + if n_classes == 2: multi_w0 = multi_w0[1][np.newaxis, :] coefs.append(multi_w0.copy()) From e72d27131d61ee9eaa400f0742ac8efa56c77ba3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 9 Nov 2019 23:54:05 -0700 Subject: [PATCH 08/41] fix loss for multinomial --- sklearn/linear_model/_logistic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index b937dd011c11d..a96db03d3006c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -301,14 +301,15 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): w = w[:, :-1] else: intercept = 0 - v = w - if X_scale is not None: - v = w / X_scale - p = safe_sparse_dot(X, v.T) + + p = safe_sparse_dot(X, w.T) p += intercept p -= logsumexp(p, axis=1)[:, np.newaxis] loss = -(sample_weight * Y * p).sum() - loss += 0.5 * alpha * squared_norm(w) + v = w + if X_scale is not None: + v = w / X_scale + loss += 0.5 * alpha * squared_norm(v) p = np.exp(p, p) return loss, p, w From 7d71afb2f83edca6965977e6abe3929794daeac1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 00:12:34 -0700 Subject: [PATCH 09/41] add multinomial logistic regression preconditioning with lbfgs --- sklearn/linear_model/_logistic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a96db03d3006c..ba7e7662870fc 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -352,18 +352,19 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. 
(Chapter 4.3.4) """ - if X_scale is not None: - raise NotImplementedError n_classes = Y.shape[1] n_features = X.shape[1] fit_intercept = (w.size == n_classes * (n_features + 1)) grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype) - loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight) + loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=X_scale) sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) - grad[:, :n_features] += alpha * w + if X_scale is not None: + grad[:, :n_features] += alpha * (w / X_scale**2) + else: + grad[:, :n_features] += alpha * w if fit_intercept: grad[:, -1] = diff.sum(axis=0) return loss, grad.ravel(), p From f15332118831f7f79b0bd494b2cdba107164eb1c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 00:19:12 -0700 Subject: [PATCH 10/41] pep8 --- sklearn/linear_model/_logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index ba7e7662870fc..e8ec84cc5f22f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -128,7 +128,6 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): else: grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w - # Case where we fit the intercept. if grad.shape[0] > n_features: grad[-1] = z0.sum() @@ -357,7 +356,8 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): fit_intercept = (w.size == n_classes * (n_features + 1)) grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype) - loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=X_scale) + loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, + X_scale=X_scale) sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) From 6da875d20e793edfc98d84b2308a7a4d2d1afbca Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 13:55:32 -0700 Subject: [PATCH 11/41] hack around with sparse stuff, set precondition=True everywhere for consistency --- sklearn/linear_model/_logistic.py | 40 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e8ec84cc5f22f..8deb846ef650a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -20,7 +20,7 @@ from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver -from ..preprocessing import LabelEncoder, LabelBinarizer, normalize +from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear from ..utils import check_array, check_consistent_length, compute_class_weight from ..utils import check_random_state @@ -34,6 +34,7 @@ from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args +from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale from ..model_selection import check_cv from ..metrics import get_scorer @@ -669,7 +670,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=False): + l1_ratio=None, precondition=True): 
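With the default flipped to `precondition=True`, the behaviour added by this branch is exercised as in `test_illconditioned_lbfgs` above. A sketch of the intended usage (note the `precondition` keyword exists only on this branch, not in released scikit-learn):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=100, n_features=60, random_state=0)
    X[:, 1] += 10000   # large offset
    X[:, 0] *= 10000   # large scale
    # On this branch, lbfgs standardizes X internally and maps the solution
    # back, so convergence warnings are not expected for this ill-conditioned X.
    clf = LogisticRegression(precondition=True, random_state=0).fit(X, y)
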
"""Compute a Logistic Regression model for a list of regularization parameters. @@ -943,9 +944,29 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X X_scale = None if precondition: - X_mean = X.mean(axis=0) - X_pre = X - X_mean - X_pre, X_scale = normalize(X_pre, axis=0, copy=False, return_norm=True) + # FIXME this duplicates come code from _preprocess_data + # and should be refactored + if sparse.issparse(X): + X_mean, X_var = mean_variance_axis(X, axis=0) + X_scale = np.sqrt(X_var, X_var) + X_scale[X_scale == 0] = 1 + + del X_var + X_pre = X.toarray() + if fit_intercept: + X_pre = X_pre - X_mean # FIXME + # can we actually do inplace here? + # inplace_column_scale(X_pre, 1 / X_scale) + X_pre = X_pre / X_scale + + else: + X_mean = X.mean(axis=0) + if fit_intercept: + X_pre = X - X_mean + X_scale = X.std(axis=0) + X_scale[X_scale == 0] = 1 + X_pre = X_pre / X_scale + for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ @@ -959,9 +980,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0, loss = opt_res.x, opt_res.fun if precondition and multi_class != 'multinomial': # adjust weight scale for rescaling - w0[:-1] = w0[:-1] / X_scale + w0[:n_features] = w0[:n_features] / X_scale # adjust intercept for mean subtraction - w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) + if fit_intercept: + w0[-1] = w0[-1] - np.inner(w0[:-1], X_mean) elif solver == 'newton-cg': args = (X, target, 1. / C, sample_weight) w0, n_iter_i = _newton_cg(hess, func, grad, w0, args=args, @@ -1034,7 +1056,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, dual=False, intercept_scaling=1., multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=None): + l1_ratio=None, precondition=True): """Computes scores across logistic_regression_path Parameters @@ -1470,7 +1492,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None, precondition=False): + l1_ratio=None, precondition=True): self.penalty = penalty self.dual = dual From 3859d576e52cc9afaad0b35c2495078301dc254f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 15:01:39 -0700 Subject: [PATCH 12/41] fixing warmstarting --- sklearn/linear_model/_logistic.py | 62 ++++++++++++++++++------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 8deb846ef650a..db58fd7fb841d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -886,6 +886,34 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0 = np.zeros((classes.size, n_features + int(fit_intercept)), order='F', dtype=X.dtype) + # preconditioning + X_pre = X + X_scale = None + if precondition and solver == 'lbfgs': + # FIXME this duplicates come code from _preprocess_data + # and should be refactored + if sparse.issparse(X): + X_mean, X_var = mean_variance_axis(X, axis=0) + X_scale = np.sqrt(X_var, X_var) + X_scale[X_scale == 0] = 1 + + del X_var + X_pre = X.toarray() + if fit_intercept: + X_pre = X_pre - X_mean # FIXME + # can we actually do inplace here? 
+ # inplace_column_scale(X_pre, 1 / X_scale) + X_pre = X_pre / X_scale + + else: + X_mean = X.mean(axis=0) + if fit_intercept: + X_pre = X - X_mean + X_scale = X.std(axis=0) + X_scale[X_scale == 0] = 1 + X_pre = X_pre / X_scale + + # warm starting if coef is not None: # it must work both giving the bias term and not if multi_class == 'ovr': @@ -894,6 +922,11 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, 'Initialization coef is of shape %d, expected shape ' '%d or %d' % (coef.size, n_features, w0.size)) w0[:coef.size] = coef + if solver == 'lbfgs' and precondition: + if fit_intercept: + w0[-1] += np.inner(w0[:n_features], X_mean) + w0[:n_features] *= X_scale + else: # For binary problems coef.shape[0] should be 1, otherwise it # should be classes.size. @@ -914,6 +947,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0[1, :coef.shape[1]] = coef else: w0[:, :coef.shape[1]] = coef + if solver == 'lbfgs' and precondition: + if fit_intercept: + w0[:, -1] += np.dot(w0[:, :n_features], X_mean) + w0[:, :n_features] *= X_scale if multi_class == 'multinomial': # scipy.optimize.minimize and newton-cg accepts only @@ -941,31 +978,6 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) - X_pre = X - X_scale = None - if precondition: - # FIXME this duplicates come code from _preprocess_data - # and should be refactored - if sparse.issparse(X): - X_mean, X_var = mean_variance_axis(X, axis=0) - X_scale = np.sqrt(X_var, X_var) - X_scale[X_scale == 0] = 1 - - del X_var - X_pre = X.toarray() - if fit_intercept: - X_pre = X_pre - X_mean # FIXME - # can we actually do inplace here? - # inplace_column_scale(X_pre, 1 / X_scale) - X_pre = X_pre / X_scale - - else: - X_mean = X.mean(axis=0) - if fit_intercept: - X_pre = X - X_mean - X_scale = X.std(axis=0) - X_scale[X_scale == 0] = 1 - X_pre = X_pre / X_scale for i, C in enumerate(Cs): if solver == 'lbfgs': From 5dd503d9a47762d45a316274c990aba5dd422627 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sun, 10 Nov 2019 22:07:00 -0700 Subject: [PATCH 13/41] starting on sparse offset support --- sklearn/linear_model/_logistic.py | 33 +++++++++++++++++++------------ 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index db58fd7fb841d..6ed3a46483dfc 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -40,7 +40,7 @@ # .. some helper functions for logistic_regression_path .. -def _intercept_dot(w, X, y): +def _intercept_dot(w, X, y, X_offset=None): """Computes y * np.dot(X, w). It takes into consideration if the intercept should be fit or not. @@ -74,11 +74,14 @@ def _intercept_dot(w, X, y): w = w[:-1] z = safe_sparse_dot(X, w) + c + if X_offset is not None: + z += np.dot(X_offset, w) yz = y * z return w, c, yz -def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): +def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, + X_offset=None): """Computes the logistic loss and gradient. 
Parameters @@ -110,7 +113,7 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): n_samples, n_features = X.shape grad = np.empty_like(w) - w, c, yz = _intercept_dot(w, X, y) + w, c, yz = _intercept_dot(w, X, y, X_offset) if sample_weight is None: sample_weight = np.ones(n_samples) @@ -135,7 +138,8 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None): return out, grad -def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None): +def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None, + X_offset=None): """Computes the logistic loss. Parameters @@ -164,7 +168,7 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None): out : float Logistic loss. """ - w, c, yz = _intercept_dot(w, X, y) + w, c, yz = _intercept_dot(w, X, y, X_offset) if sample_weight is None: sample_weight = np.ones(y.shape[0]) @@ -254,7 +258,8 @@ def Hs(s): return grad, Hs -def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): +def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None, + X_offset=None): """Computes multinomial loss and class probabilities. Parameters @@ -304,6 +309,8 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): p = safe_sparse_dot(X, w.T) p += intercept + if X_offset is not None: + p += np.dot(X_offset, w.T) p -= logsumexp(p, axis=1)[:, np.newaxis] loss = -(sample_weight * Y * p).sum() v = w @@ -314,7 +321,8 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None): return loss, p, w -def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): +def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None, + X_offset=None): """Computes the multinomial loss, gradient and class probabilities. Parameters @@ -358,7 +366,7 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None): grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype) loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight, - X_scale=X_scale) + X_scale=X_scale, X_offset=X_offset) sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) @@ -889,6 +897,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, # preconditioning X_pre = X X_scale = None + X_offset = None if precondition and solver == 'lbfgs': # FIXME this duplicates come code from _preprocess_data # and should be refactored @@ -898,12 +907,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_scale[X_scale == 0] = 1 del X_var - X_pre = X.toarray() if fit_intercept: - X_pre = X_pre - X_mean # FIXME + X_offset = -X_mean # can we actually do inplace here? - # inplace_column_scale(X_pre, 1 / X_scale) - X_pre = X_pre / X_scale + inplace_column_scale(X_pre, 1 / X_scale) else: X_mean = X.mean(axis=0) @@ -985,7 +992,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, np.searchsorted(np.array([0, 1, 2, 3]), verbose)] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, - args=(X_pre, target, 1. / C, sample_weight, X_scale), + args=(X_pre, target, 1. 
/ C, sample_weight, X_scale, X_offset), options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) From 090e540bc2f312b5938d2dec7897b2b88b4d01d9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 10:59:13 -0800 Subject: [PATCH 14/41] Update sklearn/linear_model/_logistic.py Co-Authored-By: Adrin Jalali --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 6ed3a46483dfc..8ba0619f3544b 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -899,7 +899,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_scale = None X_offset = None if precondition and solver == 'lbfgs': - # FIXME this duplicates come code from _preprocess_data + # FIXME this duplicates some code from _preprocess_data # and should be refactored if sparse.issparse(X): X_mean, X_var = mean_variance_axis(X, axis=0) From 2e83f7e79417b9d4d12896ab904c25f7c2feafc6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 17:11:55 -0500 Subject: [PATCH 15/41] don't change sparse matrix inplace --- sklearn/linear_model/_logistic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 8ba0619f3544b..cf1839ea82df0 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -909,8 +909,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, del X_var if fit_intercept: X_offset = -X_mean - # can we actually do inplace here? - inplace_column_scale(X_pre, 1 / X_scale) + X_pre = X_pre.multiply(1 / X_scale) else: X_mean = X.mean(axis=0) From 9656c2f28edbe9465e4070d2f8cca0f3cf931418 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 17:26:24 -0500 Subject: [PATCH 16/41] fix sparse offset --- sklearn/linear_model/_logistic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index cf1839ea82df0..f191703be15e2 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -34,7 +34,7 @@ from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args -from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale +from ..utils.sparsefuncs import mean_variance_axis from ..model_selection import check_cv from ..metrics import get_scorer @@ -905,10 +905,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_mean, X_var = mean_variance_axis(X, axis=0) X_scale = np.sqrt(X_var, X_var) X_scale[X_scale == 0] = 1 - del X_var if fit_intercept: - X_offset = -X_mean + X_offset = -X_mean/X_scale X_pre = X_pre.multiply(1 / X_scale) else: From fcfc54ebeb1867d414bbac4a1659bc6d7e0c0016 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 18:41:43 -0500 Subject: [PATCH 17/41] fix gradient error --- sklearn/linear_model/_logistic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f191703be15e2..4b18f4d5b4317 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -132,6 +132,9 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, 
else: grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w + if X_offset is not None: + grad[:n_features] += X_offset * z0.sum() + # Case where we fit the intercept. if grad.shape[0] > n_features: grad[-1] = z0.sum() From 626ad4426a36a40925f616449faafe55963ccce7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 19:06:57 -0500 Subject: [PATCH 18/41] stricter gradient test, add gradient test for multinomial loss --- sklearn/linear_model/tests/test_logistic.py | 38 ++++++++++++++++----- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 6ce0fc8f6714a..26be2789e896a 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -14,7 +14,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score -from sklearn.preprocessing import LabelEncoder, StandardScaler +from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize from sklearn.utils import compute_class_weight, _IS_32BIT from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_allclose @@ -38,7 +38,7 @@ _logistic_regression_path, LogisticRegressionCV, _logistic_loss_and_grad, _logistic_grad_hess, _multinomial_grad_hess, _logistic_loss, - _log_reg_scoring_path) + _log_reg_scoring_path, _multinomial_loss_grad) X = [[-1, 0], [0, 1], [1, 1]] X_sp = sp.csr_matrix(X) @@ -417,13 +417,12 @@ def test_liblinear_dual_random_state(): def test_logistic_loss_and_grad(): X_ref, y = make_classification(n_samples=20, random_state=0) - n_features = X_ref.shape[1] - X_sp = X_ref.copy() X_sp[X_sp < .1] = 0 X_sp = sp.csr_matrix(X_sp) + clf = LogisticRegression(random_state=0).fit(X_ref, y) for X in (X_ref, X_sp): - w = np.zeros(n_features) + w = clf.coef_.copy().ravel() # First check that our derivation of the grad is correct loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.) @@ -433,12 +432,10 @@ def test_logistic_loss_and_grad(): assert_array_almost_equal(grad, approx_grad, decimal=2) # Second check that our intercept implementation is good - w = np.zeros(n_features + 1) + w = np.hstack([clf.coef_.copy().ravel(), clf.intercept_]) loss_interp, grad_interp = _logistic_loss_and_grad( w, X, y, alpha=1. 
) - assert_array_almost_equal(loss, loss_interp) - approx_grad = optimize.approx_fprime( w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3 ) @@ -494,6 +491,31 @@ def test_logistic_grad_hess(): assert_array_almost_equal(grad_interp, grad_interp_2) +def test_multinomial_loss_grad(): + n_features = 10 + n_classes = 3 + X_ref, y = make_classification(n_features=n_features, n_classes=n_classes, + random_state=0, n_informative=6) + + X_sp = X_ref.copy() + X_sp[X_sp < .1] = 0 + X_sp = sp.csr_matrix(X_sp) + sample_weight = np.ones(X_ref.shape[0]) + Y = label_binarize(y, [0, 1, 2]) + lr = LogisticRegression(random_state=0).fit(X_ref, y) + for X in (X_ref, X_sp): + + w = np.hstack([lr.coef_, lr.intercept_.reshape(-1, 1)]) + loss, grad, p = _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, sample_weight=sample_weight) + approx_grad = optimize.approx_fprime( + w.ravel(), lambda w: _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, + sample_weight=sample_weight)[0], 1e-5 + ) + assert_array_almost_equal(grad, approx_grad, decimal=3) + + def test_logistic_cv(): # test for LogisticRegressionCV object n_samples, n_features = 50, 5 From 8cc16333668ce857901cde094d06781f43fdcd12 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 11 Nov 2019 19:31:18 -0500 Subject: [PATCH 19/41] use sample weights in averages --- sklearn/linear_model/_logistic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4b18f4d5b4317..f4a31fa760506 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -914,10 +914,12 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X_pre.multiply(1 / X_scale) else: - X_mean = X.mean(axis=0) + X_mean = np.average(X, weights=sample_weight, axis=0) if fit_intercept: X_pre = X - X_mean - X_scale = X.std(axis=0) + # weighted version of std + X_scale = np.sqrt(np.average((X_pre)**2, weights=sample_weight, + axis=0)) X_scale[X_scale == 0] = 1 X_pre = X_pre / X_scale From a312bc92ac9b2c5e3d980fe5d5f12958c934a755 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 10:31:28 -0500 Subject: [PATCH 20/41] fix doctest --- sklearn/feature_selection/_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 674127f06acd7..63baab1f030b2 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -146,7 +146,7 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252302 , 0.83462377, 0.49750423]]) + array([[-0.3252319 , 0.83462631, 0.49750495]]) >>> selector.threshold_ 0.55245... 
>>> selector.get_support() From d8ec9d1c02c94937d27f94fcb9815654d2cabd67 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 11:00:31 -0500 Subject: [PATCH 21/41] increase tolerance in coefficient equality test :-/ --- sklearn/linear_model/tests/test_logistic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 26be2789e896a..ac3627c642ad3 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -966,9 +966,10 @@ def test_logistic_regression_multinomial(): assert clf_w.coef_.shape == (n_classes, n_features) # Compare solutions between lbfgs and the other solvers - assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-2) - assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-2) - assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-2) + assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-1, atol=1e-4) + assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-1, atol=1e-4) + assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-1, + atol=1e-4) # Test that the path give almost the same results. However since in this # case we take the average of the coefs after fitting across all the From 954da5afc9d443da1edd8dbc580a5cf0627b46a4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 14:11:02 -0500 Subject: [PATCH 22/41] store loss value in logistic regression when using l-bfgs --- sklearn/linear_model/_logistic.py | 15 +++++++++------ sklearn/linear_model/tests/test_logistic.py | 6 +++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f4a31fa760506..e4356873536d4 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -671,7 +671,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, tol=1e-4, verbose=0, solver='lbfgs', coef=None, class_weight=None, dual=False, penalty='l2', intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, - sample_weight=None, l1_ratio=None) + sample_weight=None, l1_ratio=None)[:3] def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, @@ -989,6 +989,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) + loss_value = None for i, C in enumerate(Cs): if solver == 'lbfgs': iprint = [-1, 50, 1, 100, 101][ @@ -999,7 +1000,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} ) n_iter_i = _check_optimize_result(solver, opt_res, max_iter) - w0, loss = opt_res.x, opt_res.fun + w0, loss_value = opt_res.x, opt_res.fun if precondition and multi_class != 'multinomial': # adjust weight scale for rescaling w0[:n_features] = w0[:n_features] / X_scale @@ -1067,7 +1068,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, n_iter[i] = n_iter_i - return np.array(coefs), np.array(Cs), n_iter + return np.array(coefs), np.array(Cs), n_iter, loss_value # helper function for LogisticCV @@ -1217,7 +1218,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, sample_weight = _check_sample_weight(sample_weight, X) sample_weight = sample_weight[train] - coefs, Cs, n_iter = _logistic_regression_path( + coefs, Cs, n_iter, loss_value = 
_logistic_regression_path( X_train, y_train, Cs=Cs, l1_ratio=l1_ratio, fit_intercept=fit_intercept, solver=solver, max_iter=max_iter, class_weight=class_weight, pos_class=pos_class, @@ -1678,15 +1679,17 @@ def fit(self, X, y, sample_weight=None): precondition=self.precondition) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) - fold_coefs_, _, n_iter_ = zip(*fold_coefs_) + fold_coefs_, _, n_iter_, loss_values_ = zip(*fold_coefs_) self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] if multi_class == 'multinomial': self.coef_ = fold_coefs_[0][0] + self.loss_values_ = loss_values_[0] else: self.coef_ = np.asarray(fold_coefs_) self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept)) + self.loss_values_ = loss_values_[0] if self.fit_intercept: self.intercept_ = self.coef_[:, -1] @@ -2246,7 +2249,7 @@ def fit(self, X, y, sample_weight=None): # Note that y is label encoded and hence pos_class must be # the encoded label / None (for 'multinomial') - w, _, _ = _logistic_regression_path( + w, _, _, _ = _logistic_regression_path( X, y, pos_class=encoded_label, Cs=[C_], solver=solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index ac3627c642ad3..f80dfc74cb2ef 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -358,7 +358,7 @@ def test_consistency_path(): # can't test with fit_intercept=True since LIBLINEAR # penalizes the intercept for solver in ['sag', 'saga']: - coefs, Cs, _ = f(_logistic_regression_path)( + coefs, Cs, _, _ = f(_logistic_regression_path)( X, y, Cs=Cs, fit_intercept=False, tol=1e-5, solver=solver, max_iter=1000, multi_class='ovr', random_state=0) for i, C in enumerate(Cs): @@ -373,7 +373,7 @@ def test_consistency_path(): # test for fit_intercept=True for solver in ('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'): Cs = [1e3] - coefs, Cs, _ = f(_logistic_regression_path)( + coefs, Cs, _, _ = f(_logistic_regression_path)( X, y, Cs=Cs, tol=1e-6, solver=solver, intercept_scaling=10000., random_state=0, multi_class='ovr') lr = LogisticRegression(C=Cs[0], tol=1e-4, @@ -1689,7 +1689,7 @@ def test_logistic_regression_path_coefs_multinomial(): n_redundant=0, n_clusters_per_class=1, random_state=0, n_features=2) Cs = [.00001, 1, 10000] - coefs, _, _ = _logistic_regression_path(X, y, penalty='l1', Cs=Cs, + coefs, _, _, _ = _logistic_regression_path(X, y, penalty='l1', Cs=Cs, solver='saga', random_state=0, multi_class='multinomial') From 9c704aaffbca9755cbf72ade2549252a526ac649 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 14:22:24 -0500 Subject: [PATCH 23/41] fix gradient, add gradient test --- sklearn/linear_model/_logistic.py | 2 ++ sklearn/linear_model/tests/test_logistic.py | 22 +++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e4356873536d4..b2ff1838f5cea 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -373,6 +373,8 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None, sample_weight = sample_weight[:, np.newaxis] diff = sample_weight * (p - Y) grad[:, :n_features] = safe_sparse_dot(diff.T, X) + if X_offset is not None: + grad[:, :n_features] += np.outer(diff.T.sum(axis=1), X_offset) if X_scale is not None: grad[:, :n_features] += alpha * (w / 
X_scale**2) else: diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index f80dfc74cb2ef..2bda7c7c9c482 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -504,16 +504,18 @@ def test_multinomial_loss_grad(): Y = label_binarize(y, [0, 1, 2]) lr = LogisticRegression(random_state=0).fit(X_ref, y) for X in (X_ref, X_sp): - - w = np.hstack([lr.coef_, lr.intercept_.reshape(-1, 1)]) - loss, grad, p = _multinomial_loss_grad( - w, X, Y, alpha=1., X_scale=None, sample_weight=sample_weight) - approx_grad = optimize.approx_fprime( - w.ravel(), lambda w: _multinomial_loss_grad( - w, X, Y, alpha=1., X_scale=None, - sample_weight=sample_weight)[0], 1e-5 - ) - assert_array_almost_equal(grad, approx_grad, decimal=3) + for X_offset in (None, X.mean(axis=0)): + + w = np.hstack([lr.coef_, lr.intercept_.reshape(-1, 1)]) + loss, grad, p = _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, sample_weight=sample_weight, + X_offset=X_offset) + approx_grad = optimize.approx_fprime( + w.ravel(), lambda w: _multinomial_loss_grad( + w, X, Y, alpha=1., X_scale=None, X_offset=X_offset, + sample_weight=sample_weight)[0], 1e-5 + ) + assert_array_almost_equal(grad, approx_grad, decimal=3) def test_logistic_cv(): From 1d61a941570a33370fbc5ed0b0717a05d40cce90 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 14:42:39 -0500 Subject: [PATCH 24/41] offset gradient test for logistic loss --- sklearn/linear_model/_logistic.py | 2 +- sklearn/linear_model/tests/test_logistic.py | 38 ++++++++++++--------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index b2ff1838f5cea..b7869bb407a5a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -75,7 +75,7 @@ def _intercept_dot(w, X, y, X_offset=None): z = safe_sparse_dot(X, w) + c if X_offset is not None: - z += np.dot(X_offset, w) + z += np.inner(X_offset, w) yz = y * z return w, c, yz diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 2bda7c7c9c482..20b8e59434cde 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -416,30 +416,34 @@ def test_liblinear_dual_random_state(): def test_logistic_loss_and_grad(): - X_ref, y = make_classification(n_samples=20, random_state=0) + X_ref, y = make_classification(n_samples=21, random_state=0) X_sp = X_ref.copy() X_sp[X_sp < .1] = 0 X_sp = sp.csr_matrix(X_sp) clf = LogisticRegression(random_state=0).fit(X_ref, y) for X in (X_ref, X_sp): - w = clf.coef_.copy().ravel() + for X_offset in (None, np.asarray(X.mean(axis=0)).squeeze()): + w = clf.coef_.copy().ravel() - # First check that our derivation of the grad is correct - loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.) 
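The `X_offset` plumbing tested here exists so that sparse inputs never have to be densified by centering: the centered, scaled product can be computed from the still-sparse scaled matrix plus a separately carried offset. A standalone sketch of that identity (illustrative names, not part of the patch):

    import numpy as np
    import scipy.sparse as sparse

    rng = np.random.RandomState(0)
    X = sparse.random(30, 5, density=0.3, random_state=0, format="csr")
    mu = np.asarray(X.mean(axis=0)).ravel()
    s = np.sqrt(np.asarray(X.multiply(X).mean(axis=0)).ravel() - mu ** 2)
    s[s == 0] = 1
    w = rng.normal(size=5)

    X_pre = X.multiply(sparse.csr_matrix(1.0 / s)).tocsr()   # stays sparse
    X_offset = -mu / s                                        # carried separately
    dense_centered = (X.toarray() - mu) / s
    assert np.allclose(dense_centered @ w, X_pre @ w + np.inner(X_offset, w))
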
- approx_grad = optimize.approx_fprime( - w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3 - ) - assert_array_almost_equal(grad, approx_grad, decimal=2) + # First check that our derivation of the grad is correct + loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1., + X_offset=X_offset) + approx_grad = optimize.approx_fprime( + w, lambda w: _logistic_loss_and_grad( + w, X, y, alpha=1., X_offset=X_offset)[0], 1e-3 + ) + assert_array_almost_equal(grad, approx_grad, decimal=2) - # Second check that our intercept implementation is good - w = np.hstack([clf.coef_.copy().ravel(), clf.intercept_]) - loss_interp, grad_interp = _logistic_loss_and_grad( - w, X, y, alpha=1. - ) - approx_grad = optimize.approx_fprime( - w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3 - ) - assert_array_almost_equal(grad_interp, approx_grad, decimal=2) + # Second check that our intercept implementation is good + w = np.hstack([clf.coef_.copy().ravel(), clf.intercept_]) + loss_interp, grad_interp = _logistic_loss_and_grad( + w, X, y, alpha=1., X_offset=X_offset + ) + approx_grad = optimize.approx_fprime( + w, lambda w: _logistic_loss_and_grad( + w, X, y, alpha=1., X_offset=X_offset)[0], 1e-3 + ) + assert_array_almost_equal(grad_interp, approx_grad, decimal=2) def test_logistic_grad_hess(): From a82edc165690ba532d3ee642cd81b86f27a4512e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 16:29:37 -0500 Subject: [PATCH 25/41] cast matrix to csr for pointwise multiplication?! --- sklearn/linear_model/_logistic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index b7869bb407a5a..4d6b1fe4155ee 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -913,7 +913,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, del X_var if fit_intercept: X_offset = -X_mean/X_scale - X_pre = X_pre.multiply(1 / X_scale) + # FIXME old scipy requires conversion to sparse matrix + # before calling multiply + X_pre = X_pre.multiply(sparse.csr_matrix(1 / X_scale)) else: X_mean = np.average(X, weights=sample_weight, axis=0) From a2947abada2bc8ad1586682022510581f4ef81a3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 16:43:52 -0500 Subject: [PATCH 26/41] add docstrings and some explanation --- sklearn/linear_model/_logistic.py | 57 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4d6b1fe4155ee..0bccec07f5f63 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -56,6 +56,11 @@ def _intercept_dot(w, X, y, X_offset=None): y : ndarray, shape (n_samples,) Array of labels. + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- w : ndarray, shape (n_features,) @@ -102,6 +107,15 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. 
+ + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- out : float @@ -118,11 +132,11 @@ def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, X_scale=None, if sample_weight is None: sample_weight = np.ones(n_samples) - # Logistic loss is the negative of the log of the logistic function. v = w if X_scale is not None: v = w / X_scale + # Logistic loss is the negative of the log of the logistic function. out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(v, v) z = expit(yz) @@ -163,8 +177,14 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None, Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. - X_scale : array-like, shape (n_features,) optional - When using preconditioning, rescaling of features. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. + + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. Returns ------- @@ -283,6 +303,15 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None, sample_weight : array-like, shape (n_samples,) Array of weights that are assigned to individual samples. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. + + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- loss : float @@ -346,6 +375,15 @@ def _multinomial_loss_grad(w, X, Y, alpha, sample_weight, X_scale=None, sample_weight : array-like, shape (n_samples,) Array of weights that are assigned to individual samples. + X_scale : ndarray, shape (n_features,) or None + Rescaling that was applied to X for preconditioning. + Needed to correctly compute penalty term. + + X_offset : ndarray, shape (n_features,) or None + Offset to use for X to avoid subtracting mean from sparse + matrices if preconditioning. Should be None in the dense case + as the mean was actually subtracted. + Returns ------- loss : float @@ -899,7 +937,14 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0 = np.zeros((classes.size, n_features + int(fit_intercept)), order='F', dtype=X.dtype) - # preconditioning + # preconditioning for lbfgs + # Subtract mean, divide by standard deviation but keep scaling and + # mean to allow solving the original problem. + # The scaling is required in the gradient computation for the penalty + # Both scaling and mean are used later used to transform + # optimization results back to the original space. + # In the sparse case, the mean can not be subtracted and the + # correction is carried along as X_offset. 
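For the multinomial case the same back-transformation is applied row-wise to the coefficient matrix (coefficients divided by `X_scale`, intercepts reduced by `W @ X_mean`, as in the earlier multinomial patch). A compact check of that identity (not part of the patch; illustrative names):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(15, 4)) * 100.0 + 7.0
    mu, s = X.mean(axis=0), X.std(axis=0)
    W_pre = rng.normal(size=(3, 4))   # per-class coefficients, scaled space
    b_pre = rng.normal(size=3)

    W = W_pre / s                     # back to the original feature scale
    b = b_pre - W @ mu                # intercepts absorb the centering
    assert np.allclose(((X - mu) / s) @ W_pre.T + b_pre, X @ W.T + b)
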
X_pre = X X_scale = None X_offset = None @@ -1575,8 +1620,8 @@ def fit(self, X, y, sample_weight=None): if self.penalty == 'elasticnet': if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be between 0 and 1;" - " got (l1_ratio=%r)" % self.l1_ratio) + raise ValueError("l1_ratio must be between 0 and 1;" + " got (l1_ratio=%r)" % self.l1_ratio) elif self.l1_ratio is not None: warnings.warn("l1_ratio parameter is only used when penalty is " "'elasticnet'. Got " From 9ad4f93d99f982697d1d4a7eb73620fea5fe5443 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:44:58 -0500 Subject: [PATCH 27/41] add helper function for weighted mean and std --- sklearn/linear_model/_logistic.py | 7 +------ sklearn/utils/extmath.py | 28 ++++++++++++++++++++++++++++ sklearn/utils/tests/test_extmath.py | 23 +++++++++++++++++++++++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 0bccec07f5f63..c7b6a470736bd 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -951,9 +951,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if precondition and solver == 'lbfgs': # FIXME this duplicates some code from _preprocess_data # and should be refactored + X_mean, X_scale = _weighted_mean_var(X, sample_weight) if sparse.issparse(X): - X_mean, X_var = mean_variance_axis(X, axis=0) - X_scale = np.sqrt(X_var, X_var) X_scale[X_scale == 0] = 1 del X_var if fit_intercept: @@ -963,12 +962,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X_pre.multiply(sparse.csr_matrix(1 / X_scale)) else: - X_mean = np.average(X, weights=sample_weight, axis=0) if fit_intercept: X_pre = X - X_mean - # weighted version of std - X_scale = np.sqrt(np.average((X_pre)**2, weights=sample_weight, - axis=0)) X_scale[X_scale == 0] = 1 X_pre = X_pre / X_scale diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index b0c28897a8ef1..be33cb9843a15 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -830,3 +830,31 @@ def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 'its last element does not correspond to sum', RuntimeWarning) return out + + +def _weighted_mean_std(X, sample_weight): + """Compute weighted mean and standard deviation for ndarrays and sparse matrices. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + input array. + sample_weight : ndarray, shape (n_samples,) + Weights. + + Returns + ------- + mean : ndarray, shape (n_features,) + Weighted mean. + std : ndarray, shape (n_features,) + Weighted std. 
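+
+    Examples
+    --------
+    A small illustrative check (the second row gets three times the
+    weight of the first):
+
+    >>> import numpy as np
+    >>> X = np.array([[1., 2.], [3., 4.]])
+    >>> mean, std = _weighted_mean_std(X, np.array([1., 3.]))
+    >>> mean
+    array([2.5, 3.5])
+    >>> std  # doctest: +ELLIPSIS
+    array([0.866..., 0.866...])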
+ """ + if sparse.issparse(X): + normed_weights = sample_weight / sample_weight.sum() + sq_sum = safe_sparse_dot(normed_weights, X.multiply(X)) + mean = safe_sparse_dot(normed_weights, X) + var = sq_sum - mean ** 2 + else: + mean = np.average(X, weights=sample_weight, axis=0) + var = np.average(X**2, weights=sample_weight, axis=0) - mean ** 2 + return mean, np.sqrt(var) \ No newline at end of file diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index fdca303e15d8b..cd92c4edb56b2 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -34,6 +34,7 @@ from sklearn.utils.extmath import stable_cumsum from sklearn.utils.extmath import safe_min from sklearn.utils.extmath import safe_sparse_dot +from sklearn.utils.extmath import _weighted_mean_std from sklearn.datasets import make_low_rank_matrix @@ -727,3 +728,25 @@ def test_safe_sparse_dot_dense_output(dense_output): if dense_output: expected = expected.toarray() assert_allclose_dense_sparse(actual, expected) + + +def test_weighted_mean_std(): + rng = np.random.RandomState(0) + X = rng.normal(size=(100, 10)) + weights = rng.uniform(size=(100,)) + mean_dense, std_dense = _weighted_mean_std(X, weights) + mean_sparse, std_sparse = _weighted_mean_std( + sparse.csr_matrix(X), weights) + assert_allclose_dense_sparse(mean_dense, mean_sparse) + assert_allclose_dense_sparse(std_dense, std_sparse) + # with ones + weights = np.ones(100) + mean_dense, std_dense = _weighted_mean_std(X, weights) + mean_sparse, std_sparse = _weighted_mean_std( + sparse.csr_matrix(X), weights) + mean_expected = X.mean(axis=0) + std_expected = X.std(axis=0) + assert_allclose_dense_sparse(mean_dense, mean_expected) + assert_allclose_dense_sparse(std_dense, std_expected) + assert_allclose_dense_sparse(mean_sparse, mean_expected) + assert_allclose_dense_sparse(std_sparse, std_expected) From 150b5a312c35091b075f70f3c7b9a02aaa9c7a63 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:48:13 -0500 Subject: [PATCH 28/41] typos --- sklearn/linear_model/_logistic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c7b6a470736bd..7979d2971077f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -26,7 +26,7 @@ from ..utils import check_random_state from ..utils.extmath import (log_logistic, safe_sparse_dot, softmax, squared_norm) -from ..utils.extmath import row_norms +from ..utils.extmath import row_norms, _weighted_mean_std from ..utils.fixes import logsumexp from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_X_y @@ -34,7 +34,6 @@ from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args -from ..utils.sparsefuncs import mean_variance_axis from ..model_selection import check_cv from ..metrics import get_scorer @@ -951,10 +950,9 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if precondition and solver == 'lbfgs': # FIXME this duplicates some code from _preprocess_data # and should be refactored - X_mean, X_scale = _weighted_mean_var(X, sample_weight) + X_mean, X_scale = _weighted_mean_std(X, sample_weight) if sparse.issparse(X): X_scale[X_scale == 0] = 1 - del X_var if fit_intercept: X_offset = -X_mean/X_scale # FIXME old scipy requires conversion to sparse matrix From 
68dfe1b17e3666e1ea903290f84c6becfe8b85c4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:53:19 -0500 Subject: [PATCH 29/41] shorter docstring to placate pep8 --- sklearn/utils/extmath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index be33cb9843a15..159466b423d30 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -833,7 +833,7 @@ def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): def _weighted_mean_std(X, sample_weight): - """Compute weighted mean and standard deviation for ndarrays and sparse matrices. + """Compute weighted mean and standard deviation. Parameters ---------- From 763b0106e1b93dc03911bad573cb14df74f81724 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 12 Nov 2019 17:53:53 -0500 Subject: [PATCH 30/41] more pep8 --- sklearn/utils/extmath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 159466b423d30..c6c44773f8e08 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -857,4 +857,4 @@ def _weighted_mean_std(X, sample_weight): else: mean = np.average(X, weights=sample_weight, axis=0) var = np.average(X**2, weights=sample_weight, axis=0) - mean ** 2 - return mean, np.sqrt(var) \ No newline at end of file + return mean, np.sqrt(var) From 71e563aa774b0a834213d291d63ee214975005cb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 10:45:04 -0500 Subject: [PATCH 31/41] rename loss_value_ to objective_value_ add docstring --- sklearn/linear_model/_logistic.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7979d2971077f..fe59a0ba30140 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1487,6 +1487,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). + objective_value_ : float + Objective function value (penalized loss). Lower is better. + n_iter_ : array, shape (n_classes,) or (1, ) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum @@ -1721,17 +1724,17 @@ def fit(self, X, y, sample_weight=None): precondition=self.precondition) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) - fold_coefs_, _, n_iter_, loss_values_ = zip(*fold_coefs_) + fold_coefs_, _, n_iter_, objective_value_ = zip(*fold_coefs_) self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] if multi_class == 'multinomial': self.coef_ = fold_coefs_[0][0] - self.loss_values_ = loss_values_[0] + self.objective_value_ = objective_value_[0] else: self.coef_ = np.asarray(fold_coefs_) self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept)) - self.loss_values_ = loss_values_[0] + self.objective_value_ = objective_value_[0] if self.fit_intercept: self.intercept_ = self.coef_[:, -1] @@ -2003,6 +2006,10 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``. + objective_value_ : float + Objective function value (penalized loss). Lower is better. + Only present if `refit=True`. 
+ scores_ : dict dict with classes as the keys, and the values as the grid of scores obtained during cross-validating each fold, after doing @@ -2291,7 +2298,7 @@ def fit(self, X, y, sample_weight=None): # Note that y is label encoded and hence pos_class must be # the encoded label / None (for 'multinomial') - w, _, _, _ = _logistic_regression_path( + w, _, _, objective_value = _logistic_regression_path( X, y, pos_class=encoded_label, Cs=[C_], solver=solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, @@ -2304,6 +2311,7 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight, l1_ratio=l1_ratio_) w = w[0] + self.objective_value_ = objective_value[0] else: # Take the best scores across every fold and the average of From ca44e1c35c4301e425cb80652702be47a0f013be Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:01:26 -0500 Subject: [PATCH 32/41] Update sklearn/linear_model/_logistic.py Co-Authored-By: Olivier Grisel --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index fe59a0ba30140..f5a0c43df3f03 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -183,7 +183,7 @@ def _logistic_loss(w, X, y, alpha, sample_weight=None, X_scale=None, X_offset : ndarray, shape (n_features,) or None Offset to use for X to avoid subtracting mean from sparse matrices if preconditioning. Should be None in the dense case - as the mean was actually subtracted. + as the mean was actually subtracted. Returns ------- From 962a190b9dc62751cb9a2394a93d5c636fe9cd80 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:02:11 -0500 Subject: [PATCH 33/41] Update sklearn/linear_model/_logistic.py Co-Authored-By: Olivier Grisel --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f5a0c43df3f03..9fdf63048af1a 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -309,7 +309,7 @@ def _multinomial_loss(w, X, Y, alpha, sample_weight, X_scale=None, X_offset : ndarray, shape (n_features,) or None Offset to use for X to avoid subtracting mean from sparse matrices if preconditioning. Should be None in the dense case - as the mean was actually subtracted. + as the mean was actually subtracted. Returns ------- From af16e99e270ddd5a3e2d5d58240b75e21c0cf580 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:09:35 -0500 Subject: [PATCH 34/41] address some of oliviers comments --- sklearn/feature_selection/_from_model.py | 2 +- sklearn/linear_model/tests/test_logistic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 63baab1f030b2..fb5b44f4cb155 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -146,7 +146,7 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252319 , 0.83462631, 0.49750495]]) + array([[-0.3252... , 0.8346..., 0.4950...]]) >>> selector.threshold_ 0.55245... 
>>> selector.get_support() diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 20b8e59434cde..23c719fe70037 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1877,7 +1877,7 @@ def test_illconditioned_lbfgs(): def test_logistic_loss_preconditioning(): - # check _logistic_loss and _logistic_loss_grad with preconditioning + # check that _logistic_loss is invariant wrt whether we precondition or not. X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) From 74ed6e98f96e8bdb574033c566ac391c2d713782 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:29:25 -0500 Subject: [PATCH 35/41] pep8 --- sklearn/linear_model/tests/test_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 23c719fe70037..d88449676ddb0 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1877,7 +1877,7 @@ def test_illconditioned_lbfgs(): def test_logistic_loss_preconditioning(): - # check that _logistic_loss is invariant wrt whether we precondition or not. + # check that _logistic_loss is invariant wrt whether we precondition. X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) From c7c1689f991f9e6594f350f7cbfd8a2566bee8cf Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 11:46:21 -0500 Subject: [PATCH 36/41] improve invariance test for logistic_loss --- sklearn/linear_model/tests/test_logistic.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index d88449676ddb0..81c2bf85ed5e9 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1880,13 +1880,16 @@ def test_logistic_loss_preconditioning(): # check that _logistic_loss is invariant wrt whether we precondition. 
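+    # Sketch of the identity being exercised: with
+    #     X_pre = (X - X_mean) / X_std
+    #     w_pre = coef * X_std
+    #     b_pre = intercept + np.inner(coef, X_mean)
+    # the penalized loss is unchanged, i.e.
+    #     _logistic_loss([w_pre, b_pre], X_pre, y, alpha, X_scale=X_std)
+    #         == _logistic_loss([coef, intercept], X, y, alpha)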
X, y = make_classification(n_samples=100, n_features=60, random_state=0) X[:, 1] += 10000 - lr = LogisticRegression(random_state=0, precondition=False, max_iter=1000) + lr = LogisticRegression(random_state=0, precondition=True, max_iter=1000) lr.fit(X, y) loss = _logistic_loss(np.hstack([lr.coef_.ravel(), lr.intercept_]), X, 2 * y - 1, 1) + assert_almost_equal(loss, lr.objective_value_) + # do full preconditioning + X_mean = X.mean(axis=0) X_std = X.std(axis=0) - X_pre = X / X_std - loss_pre = _logistic_loss( - np.hstack([lr.coef_.ravel() * X_std, lr.intercept_]), - X_pre, 2 * y - 1, 1, X_scale=X_std) + X_pre = (X - X_mean) / X_std + w_scaled = lr.coef_.ravel() * X_std + w_pre = np.hstack([w_scaled, lr.intercept_ + np.inner(lr.coef_, X_mean)]) + loss_pre = _logistic_loss(w_pre, X_pre, 2 * y - 1, 1, X_scale=X_std) assert_almost_equal(loss, loss_pre) From 1a1bcfd5fa5e200854b58e0713a4861b23cc0db3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 12:07:51 -0500 Subject: [PATCH 37/41] fix objective value assignment --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9fdf63048af1a..770da2401fa66 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2311,7 +2311,7 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight, l1_ratio=l1_ratio_) w = w[0] - self.objective_value_ = objective_value[0] + self.objective_value_ = objective_value else: # Take the best scores across every fold and the average of From 82b09493d150adc97c9533beb6438beccca33824 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 13:42:56 -0500 Subject: [PATCH 38/41] why do we suddenly need more dots? hum --- sklearn/feature_selection/_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index fb5b44f4cb155..98c70ddc64bca 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -146,7 +146,7 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252... , 0.8346..., 0.4950...]]) + array([[-0.32... , 0.83..., 0.49...]]) >>> selector.threshold_ 0.55245... >>> selector.get_support() From 01a5aa2d6497c29eafb2a27fc8b7888221e4642e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 13:48:53 -0500 Subject: [PATCH 39/41] add auto option, docstring --- sklearn/linear_model/_logistic.py | 38 +++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 770da2401fa66..22e98e17bf465 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -520,7 +520,6 @@ def _check_solver(solver, penalty, dual): raise ValueError( "penalty='none' is not supported for the liblinear solver" ) - return solver @@ -847,6 +846,12 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + precondition : boolean or 'auto', default='auto' + Whether to use preconditioning for solving the optimization problem. 
+ A diagonal preconditioning based on the data standard deviation is + used. If 'auto', preconditioning is used when ``solver='lbfgs'``, which + is the only solver that currently supports it. + Returns ------- coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) @@ -875,6 +880,12 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, solver = _check_solver(solver, penalty, dual) + if precondition == 'auto': + precondition = solver == 'lbfgs' + if precondition == 'True' and solver != 'lbfgs': + raise ValueError("precondition=True only supported with" + " solver='lbfgs'") + # Preprocessing. if check_input: X = check_array(X, accept_sparse='csr', dtype=np.float64, @@ -947,7 +958,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, X_pre = X X_scale = None X_offset = None - if precondition and solver == 'lbfgs': + if precondition: # FIXME this duplicates some code from _preprocess_data # and should be refactored X_mean, X_scale = _weighted_mean_std(X, sample_weight) @@ -974,7 +985,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, 'Initialization coef is of shape %d, expected shape ' '%d or %d' % (coef.size, n_features, w0.size)) w0[:coef.size] = coef - if solver == 'lbfgs' and precondition: + if precondition: if fit_intercept: w0[-1] += np.inner(w0[:n_features], X_mean) w0[:n_features] *= X_scale @@ -999,7 +1010,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0[1, :coef.shape[1]] = coef else: w0[:, :coef.shape[1]] = coef - if solver == 'lbfgs' and precondition: + if precondition: if fit_intercept: w0[:, -1] += np.dot(w0[:, :n_features], X_mean) w0[:, :n_features] *= X_scale @@ -1093,7 +1104,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if multi_class == 'multinomial': n_classes = max(2, classes.size) multi_w0 = np.reshape(w0, (n_classes, -1)) - if solver == 'lbfgs' and precondition: + if precondition: if fit_intercept: multi_w0[:, :-1] = multi_w0[:, :-1] / X_scale # adjust intercept for preconditioning @@ -1465,6 +1476,12 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + precondition : boolean or 'auto', default='auto' + Whether to use preconditioning for solving the optimization problem. + A diagonal preconditioning based on the data standard deviation is + used. If 'auto', preconditioning is used when ``solver='lbfgs'``, which + is the only solver that currently supports it. + Attributes ---------- @@ -1560,7 +1577,7 @@ def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None, precondition=True): + l1_ratio=None, precondition='auto'): self.penalty = penalty self.dual = dual @@ -1968,6 +1985,12 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + precondition : boolean or 'auto', default='auto' + Whether to use preconditioning for solving the optimization problem. + A diagonal preconditioning based on the data standard deviation is + used. If 'auto', preconditioning is used when ``solver='lbfgs'``, which + is the only solver that currently supports it. 
+ Attributes ---------- classes_ : array, shape (n_classes, ) @@ -2060,7 +2083,7 @@ def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1., multi_class='auto', - random_state=None, l1_ratios=None): + random_state=None, l1_ratios=None, precondition='auto'): self.Cs = Cs self.fit_intercept = fit_intercept self.cv = cv @@ -2078,6 +2101,7 @@ def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, self.multi_class = multi_class self.random_state = random_state self.l1_ratios = l1_ratios + self.precondition = precondition def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. From 77e5c9951a5fbe8dc57d1590b91b346bddcc8816 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 13:57:57 -0500 Subject: [PATCH 40/41] typo --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 22e98e17bf465..daf8b42883d18 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -882,7 +882,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, if precondition == 'auto': precondition = solver == 'lbfgs' - if precondition == 'True' and solver != 'lbfgs': + if precondition and solver != 'lbfgs': raise ValueError("precondition=True only supported with" " solver='lbfgs'") From d1109be2a1f40d5478f02d121b6eb712ae2b0edd Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 13 Nov 2019 14:21:38 -0500 Subject: [PATCH 41/41] fix default value to 'auto' --- sklearn/linear_model/_logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index daf8b42883d18..979927cbbab3c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -719,7 +719,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, intercept_scaling=1., multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=True): + l1_ratio=None, precondition='auto'): """Compute a Logistic Regression model for a list of regularization parameters. @@ -1132,7 +1132,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, dual=False, intercept_scaling=1., multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, - l1_ratio=None, precondition=True): + l1_ratio=None, precondition='auto'): """Computes scores across logistic_regression_path Parameters
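Illustrative usage of the new `precondition` option (a sketch mirroring the tests added in this series; `objective_value_` is the attribute introduced in PATCH 31):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # A badly scaled column makes plain lbfgs slow to converge.
    X, y = make_classification(n_samples=100, n_features=60, random_state=0)
    X[:, 1] += 10000

    # With precondition='auto' (the default), lbfgs optimizes the rescaled
    # problem and maps the solution back to the original feature space.
    lr = LogisticRegression(precondition='auto').fit(X, y)
    print(lr.coef_.shape, lr.objective_value_)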