refactor sag_logistic and sag_ridge in sag_solver · scikit-learn/scikit-learn@4a3d751 · GitHub

Commit 4a3d751

refactor sag_logistic and sag_ridge in sag_solver
1 parent 4417be7 commit 4a3d751

8 files changed: +440 −539 lines changed

doc/modules/linear_model.rst

Lines changed: 3 additions & 4 deletions
@@ -692,13 +692,12 @@ Setting `multi_class` to "multinomial" with the "lbfgs" or "newton-cg" solver
 in :class:`LogisticRegression` learns a true multinomial logistic
 regression model, which means that its probability estimates should
 be better calibrated than the default "one-vs-rest" setting.
-"lbfgs" and "newton-cg" solvers cannot optimize L1-penalized models, though,
-so the "multinomial" setting does not learn sparse models.
+"lbfgs", "newton-cg" and "sag" solvers cannot optimize L1-penalized models, though, so the "multinomial" setting does not learn sparse models.
 
 The solver "sag" uses a Stochastic Average Gradient descent [3]_. It does not
 handle "multinomial" case, and is limited to L2-penalized models, yet it is
-faster than other solvers for large datasets, when both the number of samples
-and the number of features are large.
+often faster than other solvers for large datasets, when both the number of
+samples and the number of features are large.
 
 In a nutshell, one may choose the solver with the following rules:
 
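The trade-off described in this documentation change is easiest to see at the estimator level. Below is a minimal sketch, not part of this commit; the dataset and regularization values are illustrative only:

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

digits = load_digits()
X, y = digits.data, digits.target

# "multinomial" requires an L2-capable solver such as "lbfgs" or "newton-cg";
# it fits one joint model whose probabilities are usually better calibrated.
clf_multi = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                               C=1.0).fit(X, y)

# An L1 penalty rules those solvers out, so a sparse model falls back to
# "liblinear" and the default one-vs-rest scheme.
clf_sparse = LogisticRegression(solver='liblinear', penalty='l1',
                                C=0.1).fit(X, y)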
sklearn/decomposition/sparse_pca.py

Lines changed: 2 additions & 2 deletions
@@ -159,8 +159,8 @@ def transform(self, X, ridge_alpha=None):
 
         X = check_array(X)
         ridge_alpha = self.ridge_alpha if ridge_alpha is None else ridge_alpha
-        U, _ = ridge_regression(self.components_.T, X.T, ridge_alpha,
-                                solver='cholesky')
+        U = ridge_regression(self.components_.T, X.T, ridge_alpha,
+                             solver='cholesky')
         s = np.sqrt((U ** 2).sum(axis=0))
         s[s == 0] = 1
         U /= s
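This call site follows the new return convention of ridge_regression introduced in this commit: by default only the coefficients are returned, and the iteration count is returned only when return_n_iter=True. A minimal sketch of the two calling conventions, with made-up data:

import numpy as np
from sklearn.linear_model import ridge_regression

rng = np.random.RandomState(0)
X, y = rng.randn(100, 5), rng.randn(100)

# Default: only the weight vector comes back, so no tuple unpacking is needed.
coef = ridge_regression(X, y, alpha=1.0, solver='cholesky')

# Opt in to the iteration count (meaningful for iterative solvers such as
# 'sag' and 'lsqr'; direct solvers report None).
coef, n_iter = ridge_regression(X, y, alpha=1.0, solver='sag',
                                return_n_iter=True)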

sklearn/linear_model/logistic.py

Lines changed: 8 additions & 4 deletions
@@ -16,7 +16,7 @@
 from scipy import optimize, sparse
 
 from .base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator
-from .sag import sag_logistic
+from .sag import sag_solver
 from .sag_fast import get_max_squared_sum
 from ..feature_selection.from_model import _LearntSelectorMixin
 from ..preprocessing import LabelEncoder, LabelBinarizer
@@ -694,9 +694,10 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
             w0 = coef_.ravel()
 
         elif solver == 'sag':
-            warm_start_sag, n_iter_i = sag_logistic(
-                X, target, sample_weight, 1. / C, max_iter, tol, verbose,
-                random_state, False, max_squared_sum, warm_start_sag)
+            w0, n_iter_i, warm_start_sag = sag_solver(
+                X, target, sample_weight, 'log', 1. / C, max_iter, tol,
+                verbose, random_state, False, max_squared_sum,
+                warm_start_sag)
             w0 = warm_start_sag['coef']
         else:
             raise ValueError("solver must be one of {'liblinear', 'lbfgs', "
@@ -965,6 +966,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
          multinomial loss; 'sag' and 'liblinear' are limited to
          one-versus-rest schemes.
        - 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty.
+          Note that 'sag' fast convergence is only guaranteed on features with
+          approximately the same scale. You can preprocess the data with a
+          scaler from sklearn.preprocessing.
 
     tol : float, optional
         Tolerance for stopping criteria.
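The docstring note added above is user-facing advice rather than internal API. A minimal sketch of following it, not part of the commit; the estimator and scaler names come from the public scikit-learn API:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

iris = load_iris()
X, y = iris.data, iris.target

# 'sag' converges quickly only when features share roughly the same scale,
# so pair it with a scaler from sklearn.preprocessing.
model = make_pipeline(StandardScaler(),
                      LogisticRegression(solver='sag', max_iter=100))
model.fit(X, y)

# Number of iterations the solver actually ran.
print(model.named_steps['logisticregression'].n_iter_)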

sklearn/linear_model/ridge.py

Lines changed: 31 additions & 14 deletions
@@ -18,7 +18,7 @@
 from scipy.sparse import linalg as sp_linalg
 
 from .base import LinearClassifierMixin, LinearModel, _rescale_data
-from .sag import sag_ridge
+from .sag import sag_solver
 from .sag_fast import get_max_squared_sum
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
@@ -193,7 +193,8 @@ def _solve_svd(X, y, alpha):
 
 
 def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
-                     max_iter=None, tol=1e-3, verbose=0, random_state=None):
+                     max_iter=None, tol=1e-3, verbose=0, random_state=None,
+                     return_n_iter=False):
     """Solve the ridge equation by the method of normal equations.
 
     Read more in the :ref:`User Guide <ridge_regression>`.
@@ -244,8 +245,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
          in old scipy versions. It also uses an iterative procedure.
 
        - 'sag' uses a Stochastic Average Gradient descent. It also uses an
-          iterative procedure, and is faster than other solvers when both
-          n_samples and n_features are large.
+          iterative procedure, and is often faster than other solvers when
+          both n_samples and n_features are large. Note that 'sag' fast
+          convergence is only guaranteed on features with approximately the
+          same scale. You can preprocess the data with a scaler from
+          sklearn.preprocessing.
 
        All last four solvers support both dense and sparse data.
 
@@ -260,11 +264,19 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
        The seed of the pseudo random number generator to use when
        shuffling the data. Used in 'sag' solver.
 
+    return_n_iter : boolean, default False
+        If True, the method also returns `n_iter`, the actual number of
+        iteration performed by the solver.
+
     Returns
     -------
     coef : array, shape = [n_features] or [n_targets, n_features]
        Weight vector(s).
 
+    n_iter : int, optional
+        The actual number of iteration performed by the solver.
+        Only returned if `return_n_iter` is True.
+
     Notes
     -----
     This function won't compute the intercept.
@@ -364,9 +376,10 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
        coef = np.empty((y.shape[1], n_features))
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
-            coef_, n_iter_ = sag_ridge(
-                X, target.ravel(), sample_weight, alpha_i, max_iter, tol,
-                verbose, random_state, False, max_squared_sum)
+            coef_, n_iter_, _ = sag_solver(
+                X, target.ravel(), sample_weight, 'squared', alpha_i,
+                max_iter, tol, verbose, random_state, False, max_squared_sum,
+                dict())
            coef[i] = coef_
            n_iter[i] = n_iter_
 
@@ -382,7 +395,10 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
        # When y was passed as a 1d-array, we flatten the coefficients.
        coef = coef.ravel()
 
-    return coef, n_iter
+    if return_n_iter:
+        return coef, n_iter
+    else:
+        return coef
 
 
 class _BaseRidge(six.with_metaclass(ABCMeta, LinearModel)):
@@ -415,7 +431,7 @@ def fit(self, X, y, sample_weight=None):
        self.coef_, self.n_iter_ = ridge_regression(
            X, y, alpha=self.alpha, sample_weight=sample_weight,
            max_iter=self.max_iter, tol=self.tol, solver=self.solver,
-            random_state=self.random_state)
+            random_state=self.random_state, return_n_iter=True)
 
        self._set_intercept(X_mean, y_mean, X_std)
        return self
@@ -479,8 +495,11 @@ class Ridge(_BaseRidge, RegressorMixin):
          in old scipy versions. It also uses an iterative procedure.
 
        - 'sag' uses a Stochastic Average Gradient descent. It also uses an
-          iterative procedure, and is faster than other solvers when both
-          n_samples and n_features are large.
+          iterative procedure, and is often faster than other solvers when
+          both n_samples and n_features are large. Note that 'sag' fast
+          convergence is only guaranteed on features with approximately the
+          same scale. You can preprocess the data with a scaler from
+          sklearn.preprocessing.
 
        All last four solvers support both dense and sparse data.
 
@@ -624,15 +643,13 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
    coef_ : array, shape (n_features,) or (n_classes, n_features)
        Weight vector(s).
 
-<<<<<<< HEAD
    intercept_ : float | array, shape = (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.
-=======
+
    n_iter_ : array or None, shape (n_targets,)
        Actual number of iterations for each target. Available only for
        sag and lsqr solvers. Other solvers will return None.
->>>>>>> ENH add n_iter in ridge
 
    See also
    --------
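At the estimator level, these changes mean that Ridge (and RidgeClassifier, as documented above) fitted with an iterative solver reports its iteration count through n_iter_. A minimal sketch, not part of the commit; data and parameter values are illustrative:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.randn(2000, 50) * rng.uniform(1, 100, size=50)  # badly scaled features
y = X.dot(rng.randn(50)) + rng.randn(2000)

# As the docstring warns, 'sag' wants features on a comparable scale.
X_scaled = StandardScaler().fit_transform(X)

reg = Ridge(alpha=1.0, solver='sag', tol=1e-3).fit(X_scaled, y)
print(reg.n_iter_)  # iterations per target for 'sag'/'lsqr'; None for direct solvers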

0 commit comments
