ENH implement sag_ridge · scikit-learn/scikit-learn@13921bd · GitHub
ENH implement sag_ridge
1 parent 578bd5b commit 13921bd

File tree: 6 files changed, +286 -200 lines

sklearn/linear_model/ridge.py

Lines changed: 32 additions & 13 deletions
@@ -18,6 +18,7 @@
 from scipy.sparse import linalg as sp_linalg
 
 from .base import LinearClassifierMixin, LinearModel
+from .sag import sag_ridge
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
 from ..utils import check_X_y
@@ -220,8 +221,9 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
         The default value is determined by scipy.sparse.linalg.
 
     sample_weight : float or numpy array of shape [n_samples]
-        Individual weights for each sample. If sample_weight is set, then
-        the solver will automatically be set to 'cholesky'
+        Individual weights for each sample. If sample_weight is set, and
+        if the solver is not in {'cholesky', 'sag'}, then the solver will
+        automatically be set to 'cholesky'.
 
     solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'}
         Solver to use in the computational routines:
@@ -245,7 +247,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
          scipy.sparse.linalg.lsqr. It is the fatest but may not be available
          in old scipy versions. It also uses an iterative procedure.
 
-        All three solvers support both dense and sparse data.
+        - 'sag' uses a Stochastic Average Gradient descent. It also uses an
+          iterative procedure, and is faster than other solvers when both
+          n_samples and n_features are large.
+
+        All last four solvers support both dense and sparse data.
 
     tol : float
         Precision of the solution.
@@ -285,7 +291,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
     if solver == 'auto':
         # cholesky if it's a dense array and cg in
         # any other case
-        if not sparse.issparse(X) or has_sw:
+        if not sparse.issparse(X) or (has_sw and solver != 'sag'):
             solver = 'cholesky'
         else:
             solver = 'sparse_cg'
@@ -299,8 +305,9 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
         if np.atleast_1d(sample_weight).ndim > 1:
             raise ValueError("Sample weights must be 1D array or scalar")
 
-        # Sample weight can be implemented via a simple rescaling.
-        X, y = _rescale_data(X, y, sample_weight)
+        if solver == 'cholesky':
+            # Sample weight can be implemented via a simple rescaling.
+            X, y = _rescale_data(X, y, sample_weight)
 
     # There should be either 1 or n_targets penalties
     alpha = np.asarray(alpha).ravel()
@@ -312,13 +319,13 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
     if alpha.size == 1 and n_targets > 1:
         alpha = np.repeat(alpha, n_targets)
 
-    if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr'):
+    if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag'):
         raise ValueError('Solver %s not understood' % solver)
 
     if solver == 'sparse_cg':
         coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose)
 
-    elif solver == "lsqr":
+    elif solver == 'lsqr':
         coef = _solve_lsqr(X, y, alpha, max_iter, tol)
 
     elif solver == 'cholesky':
@@ -339,6 +346,12 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                 # use SVD solver if matrix is singular
                 solver = 'svd'
 
+    elif solver == 'sag':
+        coef = [sag_ridge(X, target.ravel(), alpha_i, sample_weight,
+                          max_iter, tol, verbose)
+                for alpha_i, target in zip(alpha, y.T)]
+        coef = np.asarray(coef)
+
     if solver == 'svd':
         if sparse.issparse(X):
             raise TypeError('SVD solver does not support sparse'
@@ -414,9 +427,10 @@ class Ridge(_BaseRidge, RegressorMixin):
         to false, no intercept will be used in calculations
         (e.g. data is expected to be already centered).
 
-    max_iter : int, optional
+    max_iter : int, optional for 'sparse_cg' and 'lsqr' solvers
         Maximum number of iterations for conjugate gradient solver.
-        The default value is determined by scipy.sparse.linalg.
+        For 'sparse_cg' and 'lsqr' solvers, the default value is determined
+        by scipy.sparse.linalg. For 'sag' solver, the default value is 1000.
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
@@ -442,7 +456,11 @@ class Ridge(_BaseRidge, RegressorMixin):
          scipy.sparse.linalg.lsqr. It is the fatest but may not be available
          in old scipy versions. It also uses an iterative procedure.
 
-        All three solvers support both dense and sparse data.
+        - 'sag' uses a Stochastic Average Gradient descent. It also uses an
+          iterative procedure, and is faster than other solvers when both
+          n_samples and n_features are large.
+
+        All last four solvers support both dense and sparse data.
 
     tol : float
         Precision of the solution.
@@ -527,15 +545,16 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
 
-    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'}
+    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'}
        Solver to use in the computational
        routines. 'svd' will use a Singular value decomposition to obtain
        the solution, 'cholesky' will use the standard
        scipy.linalg.solve function, 'sparse_cg' will use the
        conjugate gradient solver as found in
        scipy.sparse.linalg.cg while 'auto' will chose the most
        appropriate depending on the matrix X. 'lsqr' uses
-        a direct regularized least-squares routine provided by scipy.
+        a direct regularized least-squares routine provided by scipy,
+        and 'sag' uses a Stochastic Average Gradient descent.
 
     tol : float
         Precision of the solution.
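
Editorial note: for context, a minimal usage sketch of the option the updated docstrings describe. With this change, solver='sag' can be requested either through the ridge_regression function or through the Ridge estimator, and a sample_weight no longer forces the solver back to 'cholesky'. This is illustrative only, assumes this branch is installed, and uses made-up synthetic data.

import numpy as np
from sklearn.linear_model import Ridge, ridge_regression

rng = np.random.RandomState(0)
X = rng.randn(500, 100)                            # largish n_samples and n_features
y = X.dot(rng.randn(100)) + 0.1 * rng.randn(500)
sample_weight = rng.rand(500)

# Function API: ridge_regression now dispatches to sag_ridge once per
# (alpha, target) pair, passing sample_weight straight through.
coef = ridge_regression(X, y, alpha=1.0, solver='sag',
                        sample_weight=sample_weight, max_iter=1000, tol=0.001)

# Estimator API: the same solver choice through the Ridge class.
reg = Ridge(alpha=1.0, solver='sag', max_iter=1000, tol=0.001).fit(X, y)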

sklearn/linear_model/sag.py

Lines changed: 94 additions & 27 deletions
@@ -11,7 +11,7 @@
 from ..utils.seq_dataset import ArrayDataset, CSRDataset
 from ..externals import six
 from ..externals.joblib import Parallel, delayed
-from .sag_fast import Log, SquaredLoss
+from .sag_fast import LogLoss, SquaredLoss
 from .sag_fast import sag_sparse, get_auto_eta
 
 MAX_INT = np.iinfo(np.int32).max
@@ -21,15 +21,94 @@
 SPARSE_INTERCEPT_DECAY = 0.01
 
 
+def make_dataset(X, y, sample_weight, random_state):
+    # check which type of Sequential Dataset is needed
+    if sp.issparse(X):
+        dataset = CSRDataset(X.data, X.indptr, X.indices,
+                             y, sample_weight,
+                             seed=random_state.randint(MAX_INT))
+        intercept_decay = SPARSE_INTERCEPT_DECAY
+    else:
+        dataset = ArrayDataset(X, y, sample_weight,
+                               seed=random_state.randint(MAX_INT))
+        intercept_decay = 1.0
+
+    return dataset, intercept_decay
+
+
+def sag_ridge(X, y, alpha=1e-4, sample_weight=None, max_iter=1000, tol=0.001,
+              verbose=0):
+    """SAG solver for Ridge regression"""
+
+    # TODO
+    if max_iter is None:
+        warnings.warn("sag solver requires 'max_iter' to be not None. "
+                      "max_iter is set to 1000", RuntimeWarning)
+        max_iter = 1000
+
+    n_samples, n_features = X.shape[0], X.shape[1]
+    alpha = float(alpha) / n_samples
+    fit_intercept = False
+
+    # initialization
+    if sample_weight is None:
+        sample_weight = np.ones(n_samples, dtype=np.float64, order='C')
+    coef_init = np.zeros(n_features, dtype=np.float64, order='C')
+    intercept_init = 0.0
+    weight_pos = 1
+    weight_neg = 1
+
+    # TODO: *_init (with a boolean warm-start) as parameters ?
+    intercept_sum_gradient_init = 0.0
+    sum_gradient_init = np.zeros(n_features, dtype=np.float64, order='C')
+    gradient_memory_init = np.zeros(n_samples, dtype=np.float64, order='C')
+    seen_init = np.zeros(n_samples, dtype=np.int32, order='C')
+    num_seen_init = 0
+
+    # TODO: add a random_state in parameters ?
+    random_state = check_random_state(42)
+
+    dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)
+
+    # set the eta0 at 1 / 4L where L is the max sum of
+    # squares for over all samples
+    step_size = get_auto_eta(dataset, alpha, n_samples, SquaredLoss(),
+                             fit_intercept)
+    if step_size * alpha == 1:
+        raise ZeroDivisionError("Current sag implementation does not handle "
+                                "the case step_size * alpha == 1")
+    print alpha
+    intercept_, num_seen, max_iter_reached, intercept_sum_gradient = \
+        sag_sparse(dataset, coef_init.ravel(),
+                   intercept_init, n_samples,
+                   n_features, tol,
+                   max_iter,
+                   SquaredLoss(),
+                   step_size, alpha,
+                   sum_gradient_init.ravel(),
+                   gradient_memory_init.ravel(),
+                   seen_init.ravel(),
+                   num_seen_init,
+                   weight_pos, weight_neg,
+                   fit_intercept,
+                   intercept_sum_gradient_init,
+                   intercept_decay,
+                   verbose)
+
+    if max_iter_reached:
+        warnings.warn("The max_iter was reached which means "
+                      "the coef_ did not converge", ConvergenceWarning)
+
+    return coef_init
+
+
 def sag_logistic(X, y, coef_init, alpha=1e-4, sample_weight=None,
                  max_iter=1000, tol=0.001, verbose=0, random_state=None):
     """SAG solver for LogisticRegression"""
 
     n_samples, n_features = X.shape[0], X.shape[1]
+    alpha = float(alpha) / n_samples
 
-    alpha = alpha / n_samples
-
-    # initialize all parameters if there is no init
     if sample_weight is None:
         sample_weight = np.ones(n_samples, dtype=np.float64, order='C')
 
@@ -47,32 +126,28 @@ def sag_logistic(X, y, coef_init, alpha=1e-4, sample_weight=None,
     gradient_memory_init = np.zeros(n_samples, dtype=np.float64, order='C')
     seen_init = np.zeros(n_samples, dtype=np.int32, order='C')
     num_seen_init = 0
+
     weight_pos = 1
     weight_neg = 1
 
     random_state = check_random_state(random_state)
 
-    # check which type of Sequential Dataset is needed
-    if sp.issparse(X):
-        dataset = CSRDataset(X.data, X.indptr, X.indices,
-                             y, sample_weight,
-                             seed=random_state.randint(MAX_INT))
-        intercept_decay = SPARSE_INTERCEPT_DECAY
-    else:
-        dataset = ArrayDataset(X, y, sample_weight,
-                               seed=random_state.randint(MAX_INT))
-        intercept_decay = 1.0
+    dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)
 
     # set the eta0 at 1 / 4L where L is the max sum of
     # squares for over all samples
-    step_size = get_auto_eta(dataset, alpha, n_samples, Log(), fit_intercept)
+    step_size = get_auto_eta(dataset, alpha, n_samples, LogLoss(),
+                             fit_intercept)
+    if step_size * alpha == 1.:
+        raise ZeroDivisionError("Current sag implementation does not handle "
+                                "the case step_size * alpha == 1")
 
     intercept_, num_seen, max_iter_reached, intercept_sum_gradient = \
         sag_sparse(dataset, coef_init.ravel(),
                    intercept_init, n_samples,
                    n_features, tol,
                    max_iter,
-                   Log(),
+                   LogLoss(),
                    step_size, alpha,
                    sum_gradient_init.ravel(),
                    gradient_memory_init.ravel(),
@@ -163,16 +238,8 @@ def _fit(self, X, y, coef_init=None, intercept_init=None,
 
         random_state = check_random_state(self.random_state)
 
-        # check which type of Sequential Dataset is needed
-        if sp.issparse(X):
-            dataset = CSRDataset(X.data, X.indptr, X.indices,
-                                 y, sample_weight,
-                                 seed=random_state.randint(MAX_INT))
-            intercept_decay = SPARSE_INTERCEPT_DECAY
-        else:
-            dataset = ArrayDataset(X, y, sample_weight,
-                                   seed=random_state.randint(MAX_INT))
-            intercept_decay = 1.0
+        dataset, intercept_decay = make_dataset(X, y, sample_weight,
+                                                random_state)
 
         # set the eta0 if needed, 'auto' is 1 / 4L where L is the max sum of
         # squares for over all samples
@@ -317,7 +384,7 @@ def __init__(self, alpha=0.0001,
                  eta0='auto', class_weight=None, warm_start=False):
         self.n_jobs = n_jobs
         self.class_weight = class_weight
-        self.loss_function = Log()
+        self.loss_function = LogLoss()
         super(SAGClassifier, self).__init__(alpha=alpha,
                                             fit_intercept=fit_intercept,
                                             max_iter=max_iter,
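
Editorial note: the dataset-construction block that used to be duplicated in sag_logistic and SAGBase._fit is now the single make_dataset helper added above. A small sketch of how it is meant to be called, assuming this branch is installed; the module path and names come from the diff, the data is made up.

import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_random_state
from sklearn.linear_model.sag import make_dataset

random_state = check_random_state(0)
X_dense = random_state.randn(100, 20)
y = random_state.randn(100)
sample_weight = np.ones(100)

# Dense input yields an ArrayDataset and an undamped intercept update (decay 1.0).
dataset, intercept_decay = make_dataset(X_dense, y, sample_weight, random_state)

# Sparse input yields a CSRDataset and the damped SPARSE_INTERCEPT_DECAY (0.01).
X_sparse = sp.csr_matrix(X_dense)
dataset, intercept_decay = make_dataset(X_sparse, y, sample_weight, random_state)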
