refactor sag_logistic and sag_ridge in sag_solver · scikit-learn/scikit-learn@4a3d751 · GitHub

Commit 4a3d751

refactor sag_logistic and sag_ridge in sag_solver
1 parent 4417be7 commit 4a3d751

8 files changed: +440 −539 lines changed

doc/modules/linear_model.rst

Lines changed: 3 additions & 4 deletions
@@ -692,13 +692,12 @@ Setting `multi_class` to "multinomial" with the "lbfgs" or "newton-cg" solver
 in :class:`LogisticRegression` learns a true multinomial logistic
 regression model, which means that its probability estimates should
 be better calibrated than the default "one-vs-rest" setting.
-"lbfgs" and "newton-cg" solvers cannot optimize L1-penalized models, though,
-so the "multinomial" setting does not learn sparse models.
+"lbfgs", "newton-cg" and "sag" solvers cannot optimize L1-penalized models, though, so the "multinomial" setting does not learn sparse models.
 
 The solver "sag" uses a Stochastic Average Gradient descent [3]_. It does not
 handle "multinomial" case, and is limited to L2-penalized models, yet it is
-faster than other solvers for large datasets, when both the number of samples
-and the number of features are large.
+often faster than other solvers for large datasets, when both the number of
+samples and the number of features are large.
 
 In a nutshell, one may choose the solver with the following rules:
 
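The trade-off described in this documentation change is easiest to see at the estimator level. Below is a minimal sketch, not part of this commit; the dataset and regularization values are illustrative only:

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

digits = load_digits()
X, y = digits.data, digits.target

# "multinomial" requires an L2-capable solver such as "lbfgs" or "newton-cg";
# it fits one joint model whose probabilities are usually better calibrated.
clf_multi = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                               C=1.0).fit(X, y)

# An L1 penalty rules those solvers out, so a sparse model falls back to
# "liblinear" and the default one-vs-rest scheme.
clf_sparse = LogisticRegression(solver='liblinear', penalty='l1',
                                C=0.1).fit(X, y)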
sklearn/decomposition/sparse_pca.py

Lines changed: 2 additions & 2 deletions
@@ -159,8 +159,8 @@ def transform(self, X, ridge_alpha=None):
 
         X = check_array(X)
         ridge_alpha = self.ridge_alpha if ridge_alpha is None else ridge_alpha
-        U, _ = ridge_regression(self.components_.T, X.T, ridge_alpha,
-                                solver='cholesky')
+        U = ridge_regression(self.components_.T, X.T, ridge_alpha,
+                             solver='cholesky')
         s = np.sqrt((U ** 2).sum(axis=0))
         s[s == 0] = 1
         U /= s
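This call site follows the new return convention of ridge_regression introduced in this commit: by default only the coefficients are returned, and the iteration count is returned only when return_n_iter=True. A minimal sketch of the two calling conventions, with made-up data:

import numpy as np
from sklearn.linear_model import ridge_regression

rng = np.random.RandomState(0)
X, y = rng.randn(100, 5), rng.randn(100)

# Default: only the weight vector comes back, so no tuple unpacking is needed.
coef = ridge_regression(X, y, alpha=1.0, solver='cholesky')

# Opt in to the iteration count (meaningful for iterative solvers such as
# 'sag' and 'lsqr'; direct solvers report None).
coef, n_iter = ridge_regression(X, y, alpha=1.0, solver='sag',
                                return_n_iter=True)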

sklearn/linear_model/logistic.py

Lines changed: 8 additions & 4 deletions
@@ -16,7 +16,7 @@
 from scipy import optimize, sparse
 
 from .base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator
-from .sag import sag_logistic
+from .sag import sag_solver
 from .sag_fast import get_max_squared_sum
 from ..feature_selection.from_model import _LearntSelectorMixin
 from ..preprocessing import LabelEncoder, LabelBinarizer
@@ -694,9 +694,10 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
             w0 = coef_.ravel()
 
         elif solver == 'sag':
-            warm_start_sag, n_iter_i = sag_logistic(
-                X, target, sample_weight, 1. / C, max_iter, tol, verbose,
-                random_state, False, max_squared_sum, warm_start_sag)
+            w0, n_iter_i, warm_start_sag = sag_solver(
+                X, target, sample_weight, 'log', 1. / C, max_iter, tol,
+                verbose, random_state, False, max_squared_sum,
+                warm_start_sag)
             w0 = warm_start_sag['coef']
         else:
             raise ValueError("solver must be one of {'liblinear', 'lbfgs', "
@@ -965,6 +966,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
          multinomial loss; 'sag' and 'liblinear' are limited to
          one-versus-rest schemes.
        - 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty.
+          Note that 'sag' fast convergence is only guaranteed on features with
+          approximately the same scale. You can preprocess the data with a
+          scaler from sklearn.preprocessing.
 
     tol : float, optional
         Tolerance for stopping criteria.
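The docstring note added above is user-facing advice rather than internal API. A minimal sketch of following it, not part of the commit; the estimator and scaler names come from the public scikit-learn API:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

iris = load_iris()
X, y = iris.data, iris.target

# 'sag' converges quickly only when features share roughly the same scale,
# so pair it with a scaler from sklearn.preprocessing.
model = make_pipeline(StandardScaler(),
                      LogisticRegression(solver='sag', max_iter=100))
model.fit(X, y)

# Number of iterations the solver actually ran.
print(model.named_steps['logisticregression'].n_iter_)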

sklearn/linear_model/ridge.py

Lines changed: 31 additions & 14 deletions
@@ -18,7 +18,7 @@
 from scipy.sparse import linalg as sp_linalg
 
 from .base import LinearClassifierMixin, LinearModel, _rescale_data
-from .sag import sag_ridge
+from .sag import sag_solver
 from .sag_fast import get_max_squared_sum
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
@@ -193,7 +193,8 @@ def _solve_svd(X, y, alpha):
 
 
 def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
-                     max_iter=None, tol=1e-3, verbose=0, random_state=None):
+                     max_iter=None, tol=1e-3, verbose=0, random_state=None,
+                     return_n_iter=False):
     """Solve the ridge equation by the method of normal equations.
 
     Read more in the :ref:`User Guide <ridge_regression>`.
@@ -244,8 +245,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
          in old scipy versions. It also uses an iterative procedure.
 
        - 'sag' uses a Stochastic Average Gradient descent. It also uses an
-          iterative procedure, and is faster than other solvers when both
-          n_samples and n_features are large.
+          iterative procedure, and is often faster than other solvers when
+          both n_samples and n_features are large. Note that 'sag' fast
+          convergence is only guaranteed on features with approximately the
+          same scale. You can preprocess the data with a scaler from
+          sklearn.preprocessing.
 
        All last four solvers support both dense and sparse data.
 
@@ -260,11 +264,19 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
        The seed of the pseudo random number generator to use when
        shuffling the data. Used in 'sag' solver.
 
+    return_n_iter : boolean, default False
+        If True, the method also returns `n_iter`, the actual number of
+        iteration performed by the solver.
+
     Returns
     -------
     coef : array, shape = [n_features] or [n_targets, n_features]
        Weight vector(s).
 
+    n_iter : int, optional
+        The actual number of iteration performed by the solver.
+        Only returned if `return_n_iter` is True.
+
     Notes
     -----
     This function won't compute the intercept.
@@ -364,9 +376,10 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
        coef = np.empty((y.shape[1], n_features))
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
-            coef_, n_iter_ = sag_ridge(
-                X, target.ravel(), sample_weight, alpha_i, max_iter, tol,
-                verbose, random_state, False, max_squared_sum)
+            coef_, n_iter_, _ = sag_solver(
+                X, target.ravel(), sample_weight, 'squared', alpha_i,
+                max_iter, tol, verbose, random_state, False, max_squared_sum,
+                dict())
            coef[i] = coef_
            n_iter[i] = n_iter_
 
@@ -382,7 +395,10 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
        # When y was passed as a 1d-array, we flatten the coefficients.
        coef = coef.ravel()
 
-    return coef, n_iter
+    if return_n_iter:
+        return coef, n_iter
+    else:
+        return coef
 
 
 class _BaseRidge(six.with_metaclass(ABCMeta, LinearModel)):
@@ -415,7 +431,7 @@ def fit(self, X, y, sample_weight=None):
        self.coef_, self.n_iter_ = ridge_regression(
            X, y, alpha=self.alpha, sample_weight=sample_weight,
            max_iter=self.max_iter, tol=self.tol, solver=self.solver,
-            random_state=self.random_state)
+            random_state=self.random_state, return_n_iter=True)
 
        self._set_intercept(X_mean, y_mean, X_std)
        return self
@@ -479,8 +495,11 @@ class Ridge(_BaseRidge, RegressorMixin):
          in old scipy versions. It also uses an iterative procedure.
 
        - 'sag' uses a Stochastic Average Gradient descent. It also uses an
-          iterative procedure, and is faster than other solvers when both
-          n_samples and n_features are large.
+          iterative procedure, and is often faster than other solvers when
+          both n_samples and n_features are large. Note that 'sag' fast
+          convergence is only guaranteed on features with approximately the
+          same scale. You can preprocess the data with a scaler from
+          sklearn.preprocessing.
 
        All last four solvers support both dense and sparse data.
 
@@ -624,15 +643,13 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
    coef_ : array, shape (n_features,) or (n_classes, n_features)
        Weight vector(s).
 
-<<<<<<< HEAD
    intercept_ : float | array, shape = (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.
-=======
+
    n_iter_ : array or None, shape (n_targets,)
        Actual number of iterations for each target. Available only for
        sag and lsqr solvers. Other solvers will return None.
->>>>>>> ENH add n_iter in ridge
 
    See also
    --------
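At the estimator level, these changes mean that Ridge (and RidgeClassifier, as documented above) fitted with an iterative solver reports its iteration count through n_iter_. A minimal sketch, not part of the commit; data and parameter values are illustrative:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.randn(2000, 50) * rng.uniform(1, 100, size=50)  # badly scaled features
y = X.dot(rng.randn(50)) + rng.randn(2000)

# As the docstring warns, 'sag' wants features on a comparable scale.
X_scaled = StandardScaler().fit_transform(X)

reg = Ridge(alpha=1.0, solver='sag', tol=1e-3).fit(X_scaled, y)
print(reg.n_iter_)  # iterations per target for 'sag'/'lsqr'; None for direct solvers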

0 commit comments
