From 1b182c6a2e12e90db70085ebb1d5a199078146b2 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Thu, 10 Oct 2019 16:25:49 -0400 Subject: [PATCH 01/19] Adding jitter to LassoLars fit --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/least_angle.py | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d663cc0ce3dca..194801f6c5b50 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -480,7 +480,7 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.717157..., 0. ]) + array([0.717..., 0. ]) .. topic:: Examples: diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index d6b9609efb383..296f329825293 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -809,6 +809,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=0.0001 + Uniform noise parameter, added to the y values, to satisfy \ + the model's assumption of one-at-a-time computations \ + (Efron et al. 2004). + Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -855,7 +860,8 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True): + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + jitter=10e-5): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -864,6 +870,7 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.eps = eps self.copy_X = copy_X self.fit_path = fit_path + self.jitter = jitter @staticmethod def _get_gram(precompute, X, y): @@ -963,6 +970,13 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter + noise = np.random.uniform(high=self.jitter, size=len(y)) + + if y.ndim == 2: + noise = noise.reshape(-1,1) + + y = y + noise + self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1030,6 +1044,11 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=0.0001 + Uniform noise parameter, added to the y values, to satisfy \ + the model's assumption of one-at-a-time computations \ + (Efron et al. 2004). + positive : bool, default=False Restrict coefficients to be >= 0. Be aware that you might want to remove fit_intercept which is set True by default. @@ -1074,7 +1093,7 @@ class LassoLars(Lars): >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) LassoLars(alpha=0.01) >>> print(reg.coef_) - [ 0. -0.963257...] + [ 0. -0.9632...] 
See also -------- @@ -1092,7 +1111,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False): + positive=False, jitter=10e-5): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1103,6 +1122,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.copy_X = copy_X self.eps = eps self.fit_path = fit_path + self.jitter = jitter ############################################################################### From 7db0301213fe5ba38f01114b9d285a1d94d77cbe Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Thu, 10 Oct 2019 21:09:53 -0400 Subject: [PATCH 02/19] CircleCI fail --- sklearn/linear_model/least_angle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 296f329825293..36ebf271fb04c 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -973,7 +973,7 @@ def fit(self, X, y, Xy=None): noise = np.random.uniform(high=self.jitter, size=len(y)) if y.ndim == 2: - noise = noise.reshape(-1,1) + noise = noise.reshape(-1, 1) y = y + noise From d075950ec41960c256cdf979d75299e07837f0fc Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Mon, 14 Oct 2019 20:24:58 -0400 Subject: [PATCH 03/19] MR comments --- sklearn/linear_model/least_angle.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 36ebf271fb04c..e98817d98adc6 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -24,6 +24,7 @@ from ..exceptions import ConvergenceWarning SOLVE_TRIANGULAR_ARGS = {'check_finite': False} +JITTER = 10e-4 def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, @@ -809,9 +810,9 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.0001 - Uniform noise parameter, added to the y values, to satisfy \ - the model's assumption of one-at-a-time computations \ + jitter : float, default=0.001 + Uniform noise parameter, added to the y values, to satisfy + the model's assumption of one-at-a-time computations (Efron et al. 2004). Attributes @@ -861,7 +862,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=10e-5): + jitter=JITTER): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -970,7 +971,7 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter - noise = np.random.uniform(high=self.jitter, size=len(y)) + noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) @@ -1044,7 +1045,7 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.0001 + jitter : float, default=0.001 Uniform noise parameter, added to the y values, to satisfy \ the model's assumption of one-at-a-time computations \ (Efron et al. 2004). 
@@ -1111,7 +1112,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=10e-5): + positive=False, jitter=JITTER): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter From 7b43af873e94865b1abfab9c2111ee1d45f3cfe1 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 10:51:44 -0400 Subject: [PATCH 04/19] Jitter becomes default, added test based on issue description --- sklearn/linear_model/least_angle.py | 17 ++++++------- .../linear_model/tests/test_least_angle.py | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index e98817d98adc6..384c45df0ffbd 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -24,7 +24,7 @@ from ..exceptions import ConvergenceWarning SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -JITTER = 10e-4 +DEFAULT_JITTER = None def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, @@ -862,7 +862,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=JITTER): + jitter=DEFAULT_JITTER): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -971,12 +971,11 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter - noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) - - if y.ndim == 2: - noise = noise.reshape(-1, 1) - - y = y + noise + if self.jitter: + noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) + if y.ndim == 2: + noise = noise.reshape(-1, 1) + y = y + noise self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1112,7 +1111,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=JITTER): + positive=False, jitter=DEFAULT_JITTER): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index f913f87c2fdbe..c0138d5b93dac 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -730,3 +730,28 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): y = X[:, 2] lasso_lars.fit(X, y, copy_X=copy_X) assert copy_X == np.array_equal(X, X_copy) + +def test_lars_with_jitter(): + """ + Test that user input of a small amount of jitter, using example provided in issue #2746 + + """ + + A = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + b = np.array([-2.5, -2.5]) + expected_output = np.array([0, 2.5, 0, 2.5, 0]) + alpha = 0.001 + fit_intercept = False + + lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) + lars_with_jiggle = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, + jitter=10e-5) + + lars.fit(A, b) + lars_with_jiggle.fit(A, b) + + w_nojiggle = lars.coef_ + w_jiggle = lars_with_jiggle.coef_ + + assert np.array_equal(w_jiggle, w_nojiggle) == False + assert_array_almost_equal(w_jiggle, expected_output, decimal=2) From 
b6c11fa7d91aa5c3addbc5a05a753419338b29c8 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 12:48:55 -0400 Subject: [PATCH 05/19] flake8 fixes --- sklearn/linear_model/sag_fast.pyx | 1357 +++++++++++++++++ .../linear_model/tests/test_least_angle.py | 9 +- sklearn/utils/seq_dataset.pxd | 116 ++ sklearn/utils/seq_dataset.pyx | 653 ++++++++ 4 files changed, 2132 insertions(+), 3 deletions(-) create mode 100644 sklearn/linear_model/sag_fast.pyx create mode 100644 sklearn/utils/seq_dataset.pxd create mode 100644 sklearn/utils/seq_dataset.pyx diff --git a/sklearn/linear_model/sag_fast.pyx b/sklearn/linear_model/sag_fast.pyx new file mode 100644 index 0000000000000..6d48b65bda560 --- /dev/null +++ b/sklearn/linear_model/sag_fast.pyx @@ -0,0 +1,1357 @@ + +#------------------------------------------------------------------------------ + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# +# Authors: Danny Sullivan +# Tom Dupre la Tour +# Arthur Mensch y: + return x + return y + +cdef inline float fmax32(float x, float y) nogil: + if x > y: + return x + return y + +cdef double _logsumexp64(double* arr, int n_classes) nogil: + """Computes the sum of arr assuming arr is in the log domain. + + Returns log(sum(exp(arr))) while minimizing the possibility of + over/underflow. + """ + # Use the max to normalize, as with the log this is what accumulates + # the less errors + cdef double vmax = arr[0] + cdef double out = 0.0 + cdef int i + + for i in range(1, n_classes): + if vmax < arr[i]: + vmax = arr[i] + + for i in range(n_classes): + out += exp(arr[i] - vmax) + + return log(out) + vmax + +cdef float _logsumexp32(float* arr, int n_classes) nogil: + """Computes the sum of arr assuming arr is in the log domain. + + Returns log(sum(exp(arr))) while minimizing the possibility of + over/underflow. + """ + # Use the max to normalize, as with the log this is what accumulates + # the less errors + cdef float vmax = arr[0] + cdef float out = 0.0 + cdef int i + + for i in range(1, n_classes): + if vmax < arr[i]: + vmax = arr[i] + + for i in range(n_classes): + out += exp(arr[i] - vmax) + + return log(out) + vmax + +cdef class MultinomialLogLoss64: + cdef double _loss(self, double* prediction, double y, int n_classes, + double sample_weight) nogil: + r"""Multinomial Logistic regression loss. + + The multinomial logistic loss for one sample is: + loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) + = sw (logsumexp(prediction) - prediction[y]) + + where: + prediction = dot(x_sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Parameters + ---------- + prediction : pointer to a np.ndarray[double] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + Returns + ------- + loss : double + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef double loss + + # y is the indice of the correct class of current sample. 
+ loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, double* prediction, double y, int n_classes, + double sample_weight, double* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. + + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[double] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[double] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss64, () + +cdef class MultinomialLogLoss32: + cdef float _loss(self, float* prediction, float y, int n_classes, + float sample_weight) nogil: + r"""Multinomial Logistic regression loss. + + The multinomial logistic loss for one sample is: + loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) + = sw (logsumexp(prediction) - prediction[y]) + + where: + prediction = dot(x_sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + Returns + ------- + loss : float + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef float loss + + # y is the indice of the correct class of current sample. + loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, float* prediction, float y, int n_classes, + float sample_weight, float* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. 
+ + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[float] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss32, () + +cdef inline double _soft_thresholding64(double x, double shrinkage) nogil: + return fmax64(x - shrinkage, 0) - fmax64(- x - shrinkage, 0) + +cdef inline float _soft_thresholding32(float x, float shrinkage) nogil: + return fmax32(x - shrinkage, 0) - fmax32(- x - shrinkage, 0) + +def sag64(SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[double, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[double, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[double, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. 
(2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef double *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef double y + # the sample weight + cdef double sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef double max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef double max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef double wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef double cum_sum + + # the pointer to the coef_ or weights + cdef double* weights = weights_array.data + # the pointer to the intercept_array + cdef double* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef double* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef double* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef double* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float64, order="c") + cdef double* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[double, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float64, order="c") + cdef double* previous_weights = previous_weights_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef double val + + # Bias correction term in saga + cdef double gradient_correction + + # the scalar used for multiplying z + cdef double wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_prox_array + cdef double* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to 
optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss64 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss64() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float64, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update64(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. / num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. 
/ num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite64(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights64( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights64(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax64(max_weight, fabs(weights[idx])) + max_change = fmax64(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +def sag32(SequentialDataset32 dataset, + np.ndarray[float, ndim=2, mode='c'] weights_array, + np.ndarray[float, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[float, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[float, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[float, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. (2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef float *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef float y + # the sample weight + cdef float sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef float max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef float max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef float wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef float cum_sum + + # the pointer to the coef_ or weights + cdef float* weights = weights_array.data + # the pointer to the intercept_array + cdef float* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef float* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef float* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef float* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float32, order="c") + cdef float* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = 
feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[float, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float32, order="c") + cdef float* previous_weights = previous_weights_array.data + + cdef np.ndarray[float, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* prediction = prediction_array.data + + cdef np.ndarray[float, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef float val + + # Bias correction term in saga + cdef float gradient_correction + + # the scalar used for multiplying z + cdef float wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_prox_array + cdef float* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss32 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss32() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float32, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update32(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample32(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. 
/ num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. / num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite32(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights32( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights32(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax32(max_weight, fabs(weights[idx])) + max_change = fmax32(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +cdef int scale_weights64(double* weights, double* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update64(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int scale_weights32(float* weights, float* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update32(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int lagged_update64(double* weights, double wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef double cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite64(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding64(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding64(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite64(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef int lagged_update32(float* weights, float wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef float cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite32(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding32(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding32(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite32(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef void predict_sample64(double* x_data_ptr, int* x_ind_ptr, int xnnz, + double* w_data_ptr, double wscale, + double* intercept, double* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : double + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. 
+ + """ + cdef int feature_ind, class_ind, j + cdef double innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + +cdef void predict_sample32(float* x_data_ptr, int* x_ind_ptr, int xnnz, + float* w_data_ptr, float wscale, + float* intercept, float* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : float + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. + + """ + cdef int feature_ind, class_ind, j + cdef float innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + + +def _multinomial_grad_loss_all_samples( + SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, int n_features, int n_classes): + """Compute multinomial gradient and loss across all samples. + + Used for testing purpose only. 
+ """ + cdef double* weights = weights_array.data + cdef double* intercept = intercept_array.data + + cdef double *x_data_ptr = NULL + cdef int *x_ind_ptr = NULL + cdef int xnnz = -1 + cdef double y + cdef double sample_weight + + cdef double wscale = 1.0 + cdef int i, j, class_ind, feature_ind + cdef double val + cdef double sum_loss = 0.0 + + cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64() + + cdef np.ndarray[double, ndim=2] sum_gradient_array = \ + np.zeros((n_features, n_classes), dtype=np.double, order="c") + cdef double* sum_gradient = sum_gradient_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* gradient = gradient_array.data + + with nogil: + for i in range(n_samples): + # get next sample on the dataset + dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # prediction of the multinomial classifier for the sample + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + multiloss._dloss(prediction, y, n_classes, sample_weight, gradient) + + # compute the loss for this sample, given the prediction + sum_loss += multiloss._loss(prediction, y, n_classes, sample_weight) + + # update the sum of the gradient + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + for class_ind in range(n_classes): + sum_gradient[feature_ind * n_classes + class_ind] += \ + gradient[class_ind] * val + + return sum_loss, sum_gradient_array diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 160a334c7f833..d3fb5a31fbe8d 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -731,9 +731,11 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): lasso_lars.fit(X, y, copy_X=copy_X) assert copy_X == np.array_equal(X, X_copy) + def test_lars_with_jitter(): """ - Test that user input of a small amount of jitter, using example provided in issue #2746 + Test that user input of a small amount of jitter, + using example provided in issue #2746 """ @@ -744,7 +746,8 @@ def test_lars_with_jitter(): fit_intercept = False lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) - lars_with_jiggle = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, + lars_with_jiggle = linear_model.LassoLars(alpha=alpha, + fit_intercept=fit_intercept, jitter=10e-5) lars.fit(A, b) @@ -753,5 +756,5 @@ def test_lars_with_jitter(): w_nojiggle = lars.coef_ w_jiggle = lars_with_jiggle.coef_ - assert np.array_equal(w_jiggle, w_nojiggle) == False + assert not np.array_equal(w_jiggle, w_nojiggle) assert_array_almost_equal(w_jiggle, expected_output, decimal=2) diff --git a/sklearn/utils/seq_dataset.pxd b/sklearn/utils/seq_dataset.pxd new file mode 100644 index 0000000000000..67ce3b68b4474 --- /dev/null +++ b/sklearn/utils/seq_dataset.pxd @@ -0,0 +1,116 @@ + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. 
+WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp +""" + +cimport numpy as np + +# SequentialDataset and its two concrete subclasses are (optionally randomized) +# iterators over the rows of a matrix X and corresponding target values y. + + +cdef class SequentialDataset64: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + + +cdef class ArrayDataset64(SequentialDataset64): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef double *X_data_ptr + cdef double *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef double *sample_weight_data + + +cdef class CSRDataset64(SequentialDataset64): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef double *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef double *Y_data_ptr + cdef double *sample_weight_data + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp +""" + +cimport numpy as np + +# SequentialDataset and its two concrete subclasses are (optionally randomized) +# iterators over the rows of a matrix X and corresponding target values y. 
+ + +cdef class SequentialDataset32: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + + +cdef class ArrayDataset32(SequentialDataset32): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef float *X_data_ptr + cdef float *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef float *sample_weight_data + + +cdef class CSRDataset32(SequentialDataset32): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef float *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef float *Y_data_ptr + cdef float *sample_weight_data diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx new file mode 100644 index 0000000000000..6fa274771defe --- /dev/null +++ b/sklearn/utils/seq_dataset.pyx @@ -0,0 +1,653 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset64: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. 
+ + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get a random example ``x`` from the dataset. + + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. + """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef double* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef double y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[double, ndim=1] x_data = np.empty(nnz, + dtype=np.float64) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset64(SequentialDataset64): + """Dataset backed by a two-dimensional numpy array. 
+ + The dtype of the numpy array is expected to be ``np.float64`` (double) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[double, ndim=2, mode='c'] X, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. + + Parameters + ---------- + X : ndarray, dtype=double, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." + % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset64(SequentialDataset64): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[double, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=double, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample. 
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset32: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get a random example ``x`` from the dataset. 
+ + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. + """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef float* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef float y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[float, ndim=1] x_data = np.empty(nnz, + dtype=np.float32) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset32(SequentialDataset32): + """Dataset backed by a two-dimensional numpy array. + + The dtype of the numpy array is expected to be ``np.float32`` (float) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[float, ndim=2, mode='c'] X, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. 
+ + Parameters + ---------- + X : ndarray, dtype=float, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." + % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset32(SequentialDataset32): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[float, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=float, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample. 
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + From 9c77fbea1b021898a9c888587468ff22443cb87b Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 14:07:30 -0400 Subject: [PATCH 06/19] Removing unexpected cython files --- sklearn/linear_model/_least_angle.py | 7 +- sklearn/linear_model/sag_fast.pyx | 1357 -------------------------- sklearn/utils/seq_dataset.pxd | 116 --- sklearn/utils/seq_dataset.pyx | 653 ------------- 4 files changed, 4 insertions(+), 2129 deletions(-) delete mode 100644 sklearn/linear_model/sag_fast.pyx delete mode 100644 sklearn/utils/seq_dataset.pxd delete mode 100644 sklearn/utils/seq_dataset.pyx diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 9012268978a57..ed17b8b7f30c4 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -810,7 +810,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.001 + jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy the model's assumption of one-at-a-time computations (Efron et al. 2004). @@ -972,7 +972,8 @@ def fit(self, X, y, Xy=None): max_iter = self.max_iter if self.jitter: - noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) + noise = np.random.RandomState(0).uniform(high=self.jitter, + size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) y = y + noise @@ -1044,7 +1045,7 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.001 + jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy \ the model's assumption of one-at-a-time computations \ (Efron et al. 2004). 
diff --git a/sklearn/linear_model/sag_fast.pyx b/sklearn/linear_model/sag_fast.pyx deleted file mode 100644 index 6d48b65bda560..0000000000000 --- a/sklearn/linear_model/sag_fast.pyx +++ /dev/null @@ -1,1357 +0,0 @@ - -#------------------------------------------------------------------------------ - -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# -# Authors: Danny Sullivan -# Tom Dupre la Tour -# Arthur Mensch y: - return x - return y - -cdef inline float fmax32(float x, float y) nogil: - if x > y: - return x - return y - -cdef double _logsumexp64(double* arr, int n_classes) nogil: - """Computes the sum of arr assuming arr is in the log domain. - - Returns log(sum(exp(arr))) while minimizing the possibility of - over/underflow. - """ - # Use the max to normalize, as with the log this is what accumulates - # the less errors - cdef double vmax = arr[0] - cdef double out = 0.0 - cdef int i - - for i in range(1, n_classes): - if vmax < arr[i]: - vmax = arr[i] - - for i in range(n_classes): - out += exp(arr[i] - vmax) - - return log(out) + vmax - -cdef float _logsumexp32(float* arr, int n_classes) nogil: - """Computes the sum of arr assuming arr is in the log domain. - - Returns log(sum(exp(arr))) while minimizing the possibility of - over/underflow. - """ - # Use the max to normalize, as with the log this is what accumulates - # the less errors - cdef float vmax = arr[0] - cdef float out = 0.0 - cdef int i - - for i in range(1, n_classes): - if vmax < arr[i]: - vmax = arr[i] - - for i in range(n_classes): - out += exp(arr[i] - vmax) - - return log(out) + vmax - -cdef class MultinomialLogLoss64: - cdef double _loss(self, double* prediction, double y, int n_classes, - double sample_weight) nogil: - r"""Multinomial Logistic regression loss. - - The multinomial logistic loss for one sample is: - loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) - = sw (logsumexp(prediction) - prediction[y]) - - where: - prediction = dot(x_sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Parameters - ---------- - prediction : pointer to a np.ndarray[double] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : double, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded). - - n_classes : integer - Total number of classes. - - sample_weight : double - Weight of current sample. - - Returns - ------- - loss : double - Multinomial loss for current sample. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. (Chapter 4.3.4) - """ - cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) - cdef double loss - - # y is the indice of the correct class of current sample. - loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight - return loss - - cdef void _dloss(self, double* prediction, double y, int n_classes, - double sample_weight, double* gradient_ptr) nogil: - r"""Multinomial Logistic regression gradient of the loss. - - The gradient of the multinomial logistic loss with respect to a class c, - and for one sample is: - grad_c = - sw * (p[c] - \delta_{y,c}) - - where: - p[c] = exp(logsumexp(prediction) - prediction[c]) - prediction = dot(sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Note that to obtain the true gradient, this value has to be multiplied - by the sample vector x. 
- - Parameters - ---------- - prediction : pointer to a np.ndarray[double] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : double, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded) - - n_classes : integer - Total number of classes. - - sample_weight : double - Weight of current sample. - - gradient_ptr : pointer to a np.ndarray[double] of shape (n_classes,) - Gradient vector to be filled. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. (Chapter 4.3.4) - """ - cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) - cdef int class_ind - - for class_ind in range(n_classes): - gradient_ptr[class_ind] = exp(prediction[class_ind] - - logsumexp_prediction) - - # y is the indice of the correct class of current sample. - if class_ind == y: - gradient_ptr[class_ind] -= 1.0 - - gradient_ptr[class_ind] *= sample_weight - - def __reduce__(self): - return MultinomialLogLoss64, () - -cdef class MultinomialLogLoss32: - cdef float _loss(self, float* prediction, float y, int n_classes, - float sample_weight) nogil: - r"""Multinomial Logistic regression loss. - - The multinomial logistic loss for one sample is: - loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) - = sw (logsumexp(prediction) - prediction[y]) - - where: - prediction = dot(x_sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Parameters - ---------- - prediction : pointer to a np.ndarray[float] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : float, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded). - - n_classes : integer - Total number of classes. - - sample_weight : float - Weight of current sample. - - Returns - ------- - loss : float - Multinomial loss for current sample. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. (Chapter 4.3.4) - """ - cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) - cdef float loss - - # y is the indice of the correct class of current sample. - loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight - return loss - - cdef void _dloss(self, float* prediction, float y, int n_classes, - float sample_weight, float* gradient_ptr) nogil: - r"""Multinomial Logistic regression gradient of the loss. - - The gradient of the multinomial logistic loss with respect to a class c, - and for one sample is: - grad_c = - sw * (p[c] - \delta_{y,c}) - - where: - p[c] = exp(logsumexp(prediction) - prediction[c]) - prediction = dot(sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Note that to obtain the true gradient, this value has to be multiplied - by the sample vector x. - - Parameters - ---------- - prediction : pointer to a np.ndarray[float] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : float, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded) - - n_classes : integer - Total number of classes. - - sample_weight : float - Weight of current sample. - - gradient_ptr : pointer to a np.ndarray[float] of shape (n_classes,) - Gradient vector to be filled. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. 
(Chapter 4.3.4) - """ - cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) - cdef int class_ind - - for class_ind in range(n_classes): - gradient_ptr[class_ind] = exp(prediction[class_ind] - - logsumexp_prediction) - - # y is the indice of the correct class of current sample. - if class_ind == y: - gradient_ptr[class_ind] -= 1.0 - - gradient_ptr[class_ind] *= sample_weight - - def __reduce__(self): - return MultinomialLogLoss32, () - -cdef inline double _soft_thresholding64(double x, double shrinkage) nogil: - return fmax64(x - shrinkage, 0) - fmax64(- x - shrinkage, 0) - -cdef inline float _soft_thresholding32(float x, float shrinkage) nogil: - return fmax32(x - shrinkage, 0) - fmax32(- x - shrinkage, 0) - -def sag64(SequentialDataset64 dataset, - np.ndarray[double, ndim=2, mode='c'] weights_array, - np.ndarray[double, ndim=1, mode='c'] intercept_array, - int n_samples, - int n_features, - int n_classes, - double tol, - int max_iter, - str loss_function, - double step_size, - double alpha, - double beta, - np.ndarray[double, ndim=2, mode='c'] sum_gradient_init, - np.ndarray[double, ndim=2, mode='c'] gradient_memory_init, - np.ndarray[bint, ndim=1, mode='c'] seen_init, - int num_seen, - bint fit_intercept, - np.ndarray[double, ndim=1, mode='c'] intercept_sum_gradient_init, - double intercept_decay, - bint saga, - bint verbose): - """Stochastic Average Gradient (SAG) and SAGA solvers. - - Used in Ridge and LogisticRegression. - - Reference - --------- - Schmidt, M., Roux, N. L., & Bach, F. (2013). - Minimizing finite sums with the stochastic average gradient - https://hal.inria.fr/hal-00860051/document - (section 4.3) - - Defazio, A., Bach, F., Lacoste-Julien, S. (2014), - SAGA: A Fast Incremental Gradient Method With Support - for Non-Strongly Convex Composite Objectives - https://arxiv.org/abs/1407.0202 - - """ - # the data pointer for x, the current sample - cdef double *x_data_ptr = NULL - # the index pointer for the column of the data - cdef int *x_ind_ptr = NULL - # the number of non-zero features for current sample - cdef int xnnz = -1 - # the label value for current sample - # the label value for curent sample - cdef double y - # the sample weight - cdef double sample_weight - - # helper variable for indexes - cdef int f_idx, s_idx, feature_ind, class_ind, j - # the number of pass through all samples - cdef int n_iter = 0 - # helper to track iterations through samples - cdef int sample_itr - # the index (row number) of the current sample - cdef int sample_ind - - # the maximum change in weights, used to compute stopping criteria - cdef double max_change - # a holder variable for the max weight, used to compute stopping criteria - cdef double max_weight - - # the start time of the fit - cdef time_t start_time - # the end time of the fit - cdef time_t end_time - - # precomputation since the step size does not change in this implementation - cdef double wscale_update = 1.0 - step_size * alpha - - # vector of booleans indicating whether this sample has been seen - cdef bint* seen = seen_init.data - - # helper for cumulative sum - cdef double cum_sum - - # the pointer to the coef_ or weights - cdef double* weights = weights_array.data - # the pointer to the intercept_array - cdef double* intercept = intercept_array.data - - # the pointer to the intercept_sum_gradient - cdef double* intercept_sum_gradient = \ - intercept_sum_gradient_init.data - - # the sum of gradients for each feature - cdef double* sum_gradient = sum_gradient_init.data - # the previously seen 
gradient for each sample - cdef double* gradient_memory = gradient_memory_init.data - - # the cumulative sums needed for JIT params - cdef np.ndarray[double, ndim=1] cumulative_sums_array = \ - np.empty(n_samples, dtype=np.float64, order="c") - cdef double* cumulative_sums = cumulative_sums_array.data - - # the index for the last time this feature was updated - cdef np.ndarray[int, ndim=1] feature_hist_array = \ - np.zeros(n_features, dtype=np.int32, order="c") - cdef int* feature_hist = feature_hist_array.data - - # the previous weights to use to compute stopping criteria - cdef np.ndarray[double, ndim=2] previous_weights_array = \ - np.zeros((n_features, n_classes), dtype=np.float64, order="c") - cdef double* previous_weights = previous_weights_array.data - - cdef np.ndarray[double, ndim=1] prediction_array = \ - np.zeros(n_classes, dtype=np.float64, order="c") - cdef double* prediction = prediction_array.data - - cdef np.ndarray[double, ndim=1] gradient_array = \ - np.zeros(n_classes, dtype=np.float64, order="c") - cdef double* gradient = gradient_array.data - - # Intermediate variable that need declaration since cython cannot infer when templating - cdef double val - - # Bias correction term in saga - cdef double gradient_correction - - # the scalar used for multiplying z - cdef double wscale = 1.0 - - # return value (-1 if an error occurred, 0 otherwise) - cdef int status = 0 - - # the cumulative sums for each iteration for the sparse implementation - cumulative_sums[0] = 0.0 - - # the multipliative scale needed for JIT params - cdef np.ndarray[double, ndim=1] cumulative_sums_prox_array - cdef double* cumulative_sums_prox - - cdef bint prox = beta > 0 and saga - - # Loss function to optimize - cdef LossFunction loss - # Wether the loss function is multinomial - cdef bint multinomial = False - # Multinomial loss function - cdef MultinomialLogLoss64 multiloss - - if loss_function == "multinomial": - multinomial = True - multiloss = MultinomialLogLoss64() - elif loss_function == "log": - loss = Log() - elif loss_function == "squared": - loss = SquaredLoss() - else: - raise ValueError("Invalid loss parameter: got %s instead of " - "one of ('log', 'squared', 'multinomial')" - % loss_function) - - if prox: - cumulative_sums_prox_array = np.empty(n_samples, - dtype=np.float64, order="c") - cumulative_sums_prox = cumulative_sums_prox_array.data - else: - cumulative_sums_prox = NULL - - with nogil: - start_time = time(NULL) - for n_iter in range(max_iter): - for sample_itr in range(n_samples): - # extract a random sample - sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, - &y, &sample_weight) - - # cached index for gradient_memory - s_idx = sample_ind * n_classes - - # update the number of samples seen and the seen array - if seen[sample_ind] == 0: - num_seen += 1 - seen[sample_ind] = 1 - - # make the weight updates - if sample_itr > 0: - status = lagged_update64(weights, wscale, xnnz, - n_samples, n_classes, - sample_itr, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - x_ind_ptr, - False, - n_iter) - if status == -1: - break - - # find the current prediction - predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, - intercept, prediction, n_classes) - - # compute the gradient for this sample, given the prediction - if multinomial: - multiloss._dloss(prediction, y, n_classes, sample_weight, - gradient) - else: - gradient[0] = loss._dloss(prediction[0], y) * sample_weight - - # L2 regularization by simply rescaling the weights - wscale *= 
wscale_update - - # make the updates to the sum of gradients - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - val = x_data_ptr[j] - f_idx = feature_ind * n_classes - for class_ind in range(n_classes): - gradient_correction = \ - val * (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - if saga: - weights[f_idx + class_ind] -= \ - (gradient_correction * step_size - * (1 - 1. / num_seen) / wscale) - sum_gradient[f_idx + class_ind] += gradient_correction - - # fit the intercept - if fit_intercept: - for class_ind in range(n_classes): - gradient_correction = (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - intercept_sum_gradient[class_ind] += gradient_correction - gradient_correction *= step_size * (1. - 1. / num_seen) - if saga: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) + gradient_correction - else: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) - - # check to see that the intercept is not inf or NaN - if not skl_isfinite64(intercept[class_ind]): - status = -1 - break - # Break from the n_samples outer loop if an error happened - # in the fit_intercept n_classes inner loop - if status == -1: - break - - # update the gradient memory for this sample - for class_ind in range(n_classes): - gradient_memory[s_idx + class_ind] = gradient[class_ind] - - if sample_itr == 0: - cumulative_sums[0] = step_size / (wscale * num_seen) - if prox: - cumulative_sums_prox[0] = step_size * beta / wscale - else: - cumulative_sums[sample_itr] = \ - (cumulative_sums[sample_itr - 1] + - step_size / (wscale * num_seen)) - if prox: - cumulative_sums_prox[sample_itr] = \ - (cumulative_sums_prox[sample_itr - 1] + - step_size * beta / wscale) - # If wscale gets too small, we need to reset the scale. - if wscale < 1e-9: - if verbose: - with gil: - print("rescaling...") - status = scale_weights64( - weights, &wscale, n_features, n_samples, n_classes, - sample_itr, cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - if status == -1: - break - - # Break from the n_iter outer loop if an error happened in the - # n_samples inner loop - if status == -1: - break - - # we scale the weights every n_samples iterations and reset the - # just-in-time update system for numerical stability. - status = scale_weights64(weights, &wscale, n_features, - n_samples, - n_classes, n_samples - 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - - if status == -1: - break - # check if the stopping criteria is reached - max_change = 0.0 - max_weight = 0.0 - for idx in range(n_features * n_classes): - max_weight = fmax64(max_weight, fabs(weights[idx])) - max_change = fmax64(max_change, - fabs(weights[idx] - - previous_weights[idx])) - previous_weights[idx] = weights[idx] - if ((max_weight != 0 and max_change / max_weight <= tol) - or max_weight == 0 and max_change == 0): - if verbose: - end_time = time(NULL) - with gil: - print("convergence after %d epochs took %d seconds" % - (n_iter + 1, end_time - start_time)) - break - elif verbose: - printf('Epoch %d, change: %.8f\n', n_iter + 1, - max_change / max_weight) - n_iter += 1 - # We do the error treatment here based on error code in status to avoid - # re-acquiring the GIL within the cython code, which slows the computation - # when the sag/saga solver is used concurrently in multiple Python threads. 
- if status == -1: - raise ValueError(("Floating-point under-/overflow occurred at epoch" - " #%d. Scaling input data with StandardScaler or" - " MinMaxScaler might help.") % n_iter) - - if verbose and n_iter >= max_iter: - end_time = time(NULL) - print(("max_iter reached after %d seconds") % - (end_time - start_time)) - - return num_seen, n_iter - -def sag32(SequentialDataset32 dataset, - np.ndarray[float, ndim=2, mode='c'] weights_array, - np.ndarray[float, ndim=1, mode='c'] intercept_array, - int n_samples, - int n_features, - int n_classes, - double tol, - int max_iter, - str loss_function, - double step_size, - double alpha, - double beta, - np.ndarray[float, ndim=2, mode='c'] sum_gradient_init, - np.ndarray[float, ndim=2, mode='c'] gradient_memory_init, - np.ndarray[bint, ndim=1, mode='c'] seen_init, - int num_seen, - bint fit_intercept, - np.ndarray[float, ndim=1, mode='c'] intercept_sum_gradient_init, - double intercept_decay, - bint saga, - bint verbose): - """Stochastic Average Gradient (SAG) and SAGA solvers. - - Used in Ridge and LogisticRegression. - - Reference - --------- - Schmidt, M., Roux, N. L., & Bach, F. (2013). - Minimizing finite sums with the stochastic average gradient - https://hal.inria.fr/hal-00860051/document - (section 4.3) - - Defazio, A., Bach, F., Lacoste-Julien, S. (2014), - SAGA: A Fast Incremental Gradient Method With Support - for Non-Strongly Convex Composite Objectives - https://arxiv.org/abs/1407.0202 - - """ - # the data pointer for x, the current sample - cdef float *x_data_ptr = NULL - # the index pointer for the column of the data - cdef int *x_ind_ptr = NULL - # the number of non-zero features for current sample - cdef int xnnz = -1 - # the label value for current sample - # the label value for curent sample - cdef float y - # the sample weight - cdef float sample_weight - - # helper variable for indexes - cdef int f_idx, s_idx, feature_ind, class_ind, j - # the number of pass through all samples - cdef int n_iter = 0 - # helper to track iterations through samples - cdef int sample_itr - # the index (row number) of the current sample - cdef int sample_ind - - # the maximum change in weights, used to compute stopping criteria - cdef float max_change - # a holder variable for the max weight, used to compute stopping criteria - cdef float max_weight - - # the start time of the fit - cdef time_t start_time - # the end time of the fit - cdef time_t end_time - - # precomputation since the step size does not change in this implementation - cdef float wscale_update = 1.0 - step_size * alpha - - # vector of booleans indicating whether this sample has been seen - cdef bint* seen = seen_init.data - - # helper for cumulative sum - cdef float cum_sum - - # the pointer to the coef_ or weights - cdef float* weights = weights_array.data - # the pointer to the intercept_array - cdef float* intercept = intercept_array.data - - # the pointer to the intercept_sum_gradient - cdef float* intercept_sum_gradient = \ - intercept_sum_gradient_init.data - - # the sum of gradients for each feature - cdef float* sum_gradient = sum_gradient_init.data - # the previously seen gradient for each sample - cdef float* gradient_memory = gradient_memory_init.data - - # the cumulative sums needed for JIT params - cdef np.ndarray[float, ndim=1] cumulative_sums_array = \ - np.empty(n_samples, dtype=np.float32, order="c") - cdef float* cumulative_sums = cumulative_sums_array.data - - # the index for the last time this feature was updated - cdef np.ndarray[int, ndim=1] feature_hist_array 
= \ - np.zeros(n_features, dtype=np.int32, order="c") - cdef int* feature_hist = feature_hist_array.data - - # the previous weights to use to compute stopping criteria - cdef np.ndarray[float, ndim=2] previous_weights_array = \ - np.zeros((n_features, n_classes), dtype=np.float32, order="c") - cdef float* previous_weights = previous_weights_array.data - - cdef np.ndarray[float, ndim=1] prediction_array = \ - np.zeros(n_classes, dtype=np.float32, order="c") - cdef float* prediction = prediction_array.data - - cdef np.ndarray[float, ndim=1] gradient_array = \ - np.zeros(n_classes, dtype=np.float32, order="c") - cdef float* gradient = gradient_array.data - - # Intermediate variable that need declaration since cython cannot infer when templating - cdef float val - - # Bias correction term in saga - cdef float gradient_correction - - # the scalar used for multiplying z - cdef float wscale = 1.0 - - # return value (-1 if an error occurred, 0 otherwise) - cdef int status = 0 - - # the cumulative sums for each iteration for the sparse implementation - cumulative_sums[0] = 0.0 - - # the multipliative scale needed for JIT params - cdef np.ndarray[float, ndim=1] cumulative_sums_prox_array - cdef float* cumulative_sums_prox - - cdef bint prox = beta > 0 and saga - - # Loss function to optimize - cdef LossFunction loss - # Wether the loss function is multinomial - cdef bint multinomial = False - # Multinomial loss function - cdef MultinomialLogLoss32 multiloss - - if loss_function == "multinomial": - multinomial = True - multiloss = MultinomialLogLoss32() - elif loss_function == "log": - loss = Log() - elif loss_function == "squared": - loss = SquaredLoss() - else: - raise ValueError("Invalid loss parameter: got %s instead of " - "one of ('log', 'squared', 'multinomial')" - % loss_function) - - if prox: - cumulative_sums_prox_array = np.empty(n_samples, - dtype=np.float32, order="c") - cumulative_sums_prox = cumulative_sums_prox_array.data - else: - cumulative_sums_prox = NULL - - with nogil: - start_time = time(NULL) - for n_iter in range(max_iter): - for sample_itr in range(n_samples): - # extract a random sample - sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, - &y, &sample_weight) - - # cached index for gradient_memory - s_idx = sample_ind * n_classes - - # update the number of samples seen and the seen array - if seen[sample_ind] == 0: - num_seen += 1 - seen[sample_ind] = 1 - - # make the weight updates - if sample_itr > 0: - status = lagged_update32(weights, wscale, xnnz, - n_samples, n_classes, - sample_itr, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - x_ind_ptr, - False, - n_iter) - if status == -1: - break - - # find the current prediction - predict_sample32(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, - intercept, prediction, n_classes) - - # compute the gradient for this sample, given the prediction - if multinomial: - multiloss._dloss(prediction, y, n_classes, sample_weight, - gradient) - else: - gradient[0] = loss._dloss(prediction[0], y) * sample_weight - - # L2 regularization by simply rescaling the weights - wscale *= wscale_update - - # make the updates to the sum of gradients - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - val = x_data_ptr[j] - f_idx = feature_ind * n_classes - for class_ind in range(n_classes): - gradient_correction = \ - val * (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - if saga: - weights[f_idx + class_ind] -= \ - (gradient_correction * step_size - * (1 - 1. 
/ num_seen) / wscale) - sum_gradient[f_idx + class_ind] += gradient_correction - - # fit the intercept - if fit_intercept: - for class_ind in range(n_classes): - gradient_correction = (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - intercept_sum_gradient[class_ind] += gradient_correction - gradient_correction *= step_size * (1. - 1. / num_seen) - if saga: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) + gradient_correction - else: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) - - # check to see that the intercept is not inf or NaN - if not skl_isfinite32(intercept[class_ind]): - status = -1 - break - # Break from the n_samples outer loop if an error happened - # in the fit_intercept n_classes inner loop - if status == -1: - break - - # update the gradient memory for this sample - for class_ind in range(n_classes): - gradient_memory[s_idx + class_ind] = gradient[class_ind] - - if sample_itr == 0: - cumulative_sums[0] = step_size / (wscale * num_seen) - if prox: - cumulative_sums_prox[0] = step_size * beta / wscale - else: - cumulative_sums[sample_itr] = \ - (cumulative_sums[sample_itr - 1] + - step_size / (wscale * num_seen)) - if prox: - cumulative_sums_prox[sample_itr] = \ - (cumulative_sums_prox[sample_itr - 1] + - step_size * beta / wscale) - # If wscale gets too small, we need to reset the scale. - if wscale < 1e-9: - if verbose: - with gil: - print("rescaling...") - status = scale_weights32( - weights, &wscale, n_features, n_samples, n_classes, - sample_itr, cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - if status == -1: - break - - # Break from the n_iter outer loop if an error happened in the - # n_samples inner loop - if status == -1: - break - - # we scale the weights every n_samples iterations and reset the - # just-in-time update system for numerical stability. - status = scale_weights32(weights, &wscale, n_features, - n_samples, - n_classes, n_samples - 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - - if status == -1: - break - # check if the stopping criteria is reached - max_change = 0.0 - max_weight = 0.0 - for idx in range(n_features * n_classes): - max_weight = fmax32(max_weight, fabs(weights[idx])) - max_change = fmax32(max_change, - fabs(weights[idx] - - previous_weights[idx])) - previous_weights[idx] = weights[idx] - if ((max_weight != 0 and max_change / max_weight <= tol) - or max_weight == 0 and max_change == 0): - if verbose: - end_time = time(NULL) - with gil: - print("convergence after %d epochs took %d seconds" % - (n_iter + 1, end_time - start_time)) - break - elif verbose: - printf('Epoch %d, change: %.8f\n', n_iter + 1, - max_change / max_weight) - n_iter += 1 - # We do the error treatment here based on error code in status to avoid - # re-acquiring the GIL within the cython code, which slows the computation - # when the sag/saga solver is used concurrently in multiple Python threads. - if status == -1: - raise ValueError(("Floating-point under-/overflow occurred at epoch" - " #%d. 
Scaling input data with StandardScaler or" - " MinMaxScaler might help.") % n_iter) - - if verbose and n_iter >= max_iter: - end_time = time(NULL) - print(("max_iter reached after %d seconds") % - (end_time - start_time)) - - return num_seen, n_iter - -cdef int scale_weights64(double* weights, double* wscale, - int n_features, - int n_samples, int n_classes, int sample_itr, - double* cumulative_sums, - double* cumulative_sums_prox, - int* feature_hist, - bint prox, - double* sum_gradient, - int n_iter) nogil: - """Scale the weights with wscale for numerical stability. - - wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) - can become very small, so we reset it every n_samples iterations to 1.0 for - numerical stability. To be able to scale, we first need to update every - coefficients and reset the just-in-time update system. - This also limits the size of `cumulative_sums`. - """ - - cdef int status - status = lagged_update64(weights, wscale[0], n_features, - n_samples, n_classes, sample_itr + 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - NULL, - True, - n_iter) - # if lagged update succeeded, reset wscale to 1.0 - if status == 0: - wscale[0] = 1.0 - return status - -cdef int scale_weights32(float* weights, float* wscale, - int n_features, - int n_samples, int n_classes, int sample_itr, - float* cumulative_sums, - float* cumulative_sums_prox, - int* feature_hist, - bint prox, - float* sum_gradient, - int n_iter) nogil: - """Scale the weights with wscale for numerical stability. - - wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) - can become very small, so we reset it every n_samples iterations to 1.0 for - numerical stability. To be able to scale, we first need to update every - coefficients and reset the just-in-time update system. - This also limits the size of `cumulative_sums`. - """ - - cdef int status - status = lagged_update32(weights, wscale[0], n_features, - n_samples, n_classes, sample_itr + 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - NULL, - True, - n_iter) - # if lagged update succeeded, reset wscale to 1.0 - if status == 0: - wscale[0] = 1.0 - return status - -cdef int lagged_update64(double* weights, double wscale, int xnnz, - int n_samples, int n_classes, int sample_itr, - double* cumulative_sums, - double* cumulative_sums_prox, - int* feature_hist, - bint prox, - double* sum_gradient, - int* x_ind_ptr, - bint reset, - int n_iter) nogil: - """Hard perform the JIT updates for non-zero features of present sample. - The updates that awaits are kept in memory using cumulative_sums, - cumulative_sums_prox, wscale and feature_hist. See original SAGA paper - (Defazio et al. 2014) for details. If reset=True, we also reset wscale to - 1 (this is done at the end of each epoch). 
- """ - cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind - cdef double cum_sum, grad_step, prox_step, cum_sum_prox - for feature_ind in range(xnnz): - if not reset: - feature_ind = x_ind_ptr[feature_ind] - f_idx = feature_ind * n_classes - - cum_sum = cumulative_sums[sample_itr - 1] - if prox: - cum_sum_prox = cumulative_sums_prox[sample_itr - 1] - if feature_hist[feature_ind] != 0: - cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] - if prox: - cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] - if not prox: - for class_ind in range(n_classes): - idx = f_idx + class_ind - weights[idx] -= cum_sum * sum_gradient[idx] - if reset: - weights[idx] *= wscale - if not skl_isfinite64(weights[idx]): - # returning here does not require the gil as the return - # type is a C integer - return -1 - else: - for class_ind in range(n_classes): - idx = f_idx + class_ind - if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: - # In this case, we can perform all the gradient steps and - # all the proximal steps in this order, which is more - # efficient than unrolling all the lagged updates. - # Idea taken from scikit-learn-contrib/lightning. - weights[idx] -= cum_sum * sum_gradient[idx] - weights[idx] = _soft_thresholding64(weights[idx], - cum_sum_prox) - else: - last_update_ind = feature_hist[feature_ind] - if last_update_ind == -1: - last_update_ind = sample_itr - 1 - for lagged_ind in range(sample_itr - 1, - last_update_ind - 1, -1): - if lagged_ind > 0: - grad_step = (cumulative_sums[lagged_ind] - - cumulative_sums[lagged_ind - 1]) - prox_step = (cumulative_sums_prox[lagged_ind] - - cumulative_sums_prox[lagged_ind - 1]) - else: - grad_step = cumulative_sums[lagged_ind] - prox_step = cumulative_sums_prox[lagged_ind] - weights[idx] -= sum_gradient[idx] * grad_step - weights[idx] = _soft_thresholding64(weights[idx], - prox_step) - - if reset: - weights[idx] *= wscale - # check to see that the weight is not inf or NaN - if not skl_isfinite64(weights[idx]): - return -1 - if reset: - feature_hist[feature_ind] = sample_itr % n_samples - else: - feature_hist[feature_ind] = sample_itr - - if reset: - cumulative_sums[sample_itr - 1] = 0.0 - if prox: - cumulative_sums_prox[sample_itr - 1] = 0.0 - - return 0 - -cdef int lagged_update32(float* weights, float wscale, int xnnz, - int n_samples, int n_classes, int sample_itr, - float* cumulative_sums, - float* cumulative_sums_prox, - int* feature_hist, - bint prox, - float* sum_gradient, - int* x_ind_ptr, - bint reset, - int n_iter) nogil: - """Hard perform the JIT updates for non-zero features of present sample. - The updates that awaits are kept in memory using cumulative_sums, - cumulative_sums_prox, wscale and feature_hist. See original SAGA paper - (Defazio et al. 2014) for details. If reset=True, we also reset wscale to - 1 (this is done at the end of each epoch). 
- """ - cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind - cdef float cum_sum, grad_step, prox_step, cum_sum_prox - for feature_ind in range(xnnz): - if not reset: - feature_ind = x_ind_ptr[feature_ind] - f_idx = feature_ind * n_classes - - cum_sum = cumulative_sums[sample_itr - 1] - if prox: - cum_sum_prox = cumulative_sums_prox[sample_itr - 1] - if feature_hist[feature_ind] != 0: - cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] - if prox: - cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] - if not prox: - for class_ind in range(n_classes): - idx = f_idx + class_ind - weights[idx] -= cum_sum * sum_gradient[idx] - if reset: - weights[idx] *= wscale - if not skl_isfinite32(weights[idx]): - # returning here does not require the gil as the return - # type is a C integer - return -1 - else: - for class_ind in range(n_classes): - idx = f_idx + class_ind - if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: - # In this case, we can perform all the gradient steps and - # all the proximal steps in this order, which is more - # efficient than unrolling all the lagged updates. - # Idea taken from scikit-learn-contrib/lightning. - weights[idx] -= cum_sum * sum_gradient[idx] - weights[idx] = _soft_thresholding32(weights[idx], - cum_sum_prox) - else: - last_update_ind = feature_hist[feature_ind] - if last_update_ind == -1: - last_update_ind = sample_itr - 1 - for lagged_ind in range(sample_itr - 1, - last_update_ind - 1, -1): - if lagged_ind > 0: - grad_step = (cumulative_sums[lagged_ind] - - cumulative_sums[lagged_ind - 1]) - prox_step = (cumulative_sums_prox[lagged_ind] - - cumulative_sums_prox[lagged_ind - 1]) - else: - grad_step = cumulative_sums[lagged_ind] - prox_step = cumulative_sums_prox[lagged_ind] - weights[idx] -= sum_gradient[idx] * grad_step - weights[idx] = _soft_thresholding32(weights[idx], - prox_step) - - if reset: - weights[idx] *= wscale - # check to see that the weight is not inf or NaN - if not skl_isfinite32(weights[idx]): - return -1 - if reset: - feature_hist[feature_ind] = sample_itr % n_samples - else: - feature_hist[feature_ind] = sample_itr - - if reset: - cumulative_sums[sample_itr - 1] = 0.0 - if prox: - cumulative_sums_prox[sample_itr - 1] = 0.0 - - return 0 - -cdef void predict_sample64(double* x_data_ptr, int* x_ind_ptr, int xnnz, - double* w_data_ptr, double wscale, - double* intercept, double* prediction, - int n_classes) nogil: - """Compute the prediction given sparse sample x and dense weight w. - - Parameters - ---------- - x_data_ptr : pointer - Pointer to the data of the sample x - - x_ind_ptr : pointer - Pointer to the indices of the sample x - - xnnz : int - Number of non-zero element in the sample x - - w_data_ptr : pointer - Pointer to the data of the weights w - - wscale : double - Scale of the weights w - - intercept : pointer - Pointer to the intercept - - prediction : pointer - Pointer to store the resulting prediction - - n_classes : int - Number of classes in multinomial case. Equals 1 in binary case. 
- - """ - cdef int feature_ind, class_ind, j - cdef double innerprod - - for class_ind in range(n_classes): - innerprod = 0.0 - # Compute the dot product only on non-zero elements of x - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * - x_data_ptr[j]) - - prediction[class_ind] = wscale * innerprod + intercept[class_ind] - - -cdef void predict_sample32(float* x_data_ptr, int* x_ind_ptr, int xnnz, - float* w_data_ptr, float wscale, - float* intercept, float* prediction, - int n_classes) nogil: - """Compute the prediction given sparse sample x and dense weight w. - - Parameters - ---------- - x_data_ptr : pointer - Pointer to the data of the sample x - - x_ind_ptr : pointer - Pointer to the indices of the sample x - - xnnz : int - Number of non-zero element in the sample x - - w_data_ptr : pointer - Pointer to the data of the weights w - - wscale : float - Scale of the weights w - - intercept : pointer - Pointer to the intercept - - prediction : pointer - Pointer to store the resulting prediction - - n_classes : int - Number of classes in multinomial case. Equals 1 in binary case. - - """ - cdef int feature_ind, class_ind, j - cdef float innerprod - - for class_ind in range(n_classes): - innerprod = 0.0 - # Compute the dot product only on non-zero elements of x - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * - x_data_ptr[j]) - - prediction[class_ind] = wscale * innerprod + intercept[class_ind] - - - -def _multinomial_grad_loss_all_samples( - SequentialDataset64 dataset, - np.ndarray[double, ndim=2, mode='c'] weights_array, - np.ndarray[double, ndim=1, mode='c'] intercept_array, - int n_samples, int n_features, int n_classes): - """Compute multinomial gradient and loss across all samples. - - Used for testing purpose only. 
- """ - cdef double* weights = weights_array.data - cdef double* intercept = intercept_array.data - - cdef double *x_data_ptr = NULL - cdef int *x_ind_ptr = NULL - cdef int xnnz = -1 - cdef double y - cdef double sample_weight - - cdef double wscale = 1.0 - cdef int i, j, class_ind, feature_ind - cdef double val - cdef double sum_loss = 0.0 - - cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64() - - cdef np.ndarray[double, ndim=2] sum_gradient_array = \ - np.zeros((n_features, n_classes), dtype=np.double, order="c") - cdef double* sum_gradient = sum_gradient_array.data - - cdef np.ndarray[double, ndim=1] prediction_array = \ - np.zeros(n_classes, dtype=np.double, order="c") - cdef double* prediction = prediction_array.data - - cdef np.ndarray[double, ndim=1] gradient_array = \ - np.zeros(n_classes, dtype=np.double, order="c") - cdef double* gradient = gradient_array.data - - with nogil: - for i in range(n_samples): - # get next sample on the dataset - dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, - &y, &sample_weight) - - # prediction of the multinomial classifier for the sample - predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, - intercept, prediction, n_classes) - - # compute the gradient for this sample, given the prediction - multiloss._dloss(prediction, y, n_classes, sample_weight, gradient) - - # compute the loss for this sample, given the prediction - sum_loss += multiloss._loss(prediction, y, n_classes, sample_weight) - - # update the sum of the gradient - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - val = x_data_ptr[j] - for class_ind in range(n_classes): - sum_gradient[feature_ind * n_classes + class_ind] += \ - gradient[class_ind] * val - - return sum_loss, sum_gradient_array diff --git a/sklearn/utils/seq_dataset.pxd b/sklearn/utils/seq_dataset.pxd deleted file mode 100644 index 67ce3b68b4474..0000000000000 --- a/sklearn/utils/seq_dataset.pxd +++ /dev/null @@ -1,116 +0,0 @@ - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. -WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp -""" - -cimport numpy as np - -# SequentialDataset and its two concrete subclasses are (optionally randomized) -# iterators over the rows of a matrix X and corresponding target values y. 
- - -cdef class SequentialDataset64: - cdef int current_index - cdef np.ndarray index - cdef int *index_data_ptr - cdef Py_ssize_t n_samples - cdef np.uint32_t seed - - cdef void shuffle(self, np.uint32_t seed) nogil - cdef int _get_next_index(self) nogil - cdef int _get_random_index(self) nogil - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil - cdef void next(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil - cdef int random(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil - - -cdef class ArrayDataset64(SequentialDataset64): - cdef np.ndarray X - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef Py_ssize_t n_features - cdef np.npy_intp X_stride - cdef double *X_data_ptr - cdef double *Y_data_ptr - cdef np.ndarray feature_indices - cdef int *feature_indices_ptr - cdef double *sample_weight_data - - -cdef class CSRDataset64(SequentialDataset64): - cdef np.ndarray X_data - cdef np.ndarray X_indptr - cdef np.ndarray X_indices - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef double *X_data_ptr - cdef int *X_indptr_ptr - cdef int *X_indices_ptr - cdef double *Y_data_ptr - cdef double *sample_weight_data - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. -WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp -""" - -cimport numpy as np - -# SequentialDataset and its two concrete subclasses are (optionally randomized) -# iterators over the rows of a matrix X and corresponding target values y. - - -cdef class SequentialDataset32: - cdef int current_index - cdef np.ndarray index - cdef int *index_data_ptr - cdef Py_ssize_t n_samples - cdef np.uint32_t seed - - cdef void shuffle(self, np.uint32_t seed) nogil - cdef int _get_next_index(self) nogil - cdef int _get_random_index(self) nogil - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil - cdef void next(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil - cdef int random(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil - - -cdef class ArrayDataset32(SequentialDataset32): - cdef np.ndarray X - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef Py_ssize_t n_features - cdef np.npy_intp X_stride - cdef float *X_data_ptr - cdef float *Y_data_ptr - cdef np.ndarray feature_indices - cdef int *feature_indices_ptr - cdef float *sample_weight_data - - -cdef class CSRDataset32(SequentialDataset32): - cdef np.ndarray X_data - cdef np.ndarray X_indptr - cdef np.ndarray X_indices - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef float *X_data_ptr - cdef int *X_indptr_ptr - cdef int *X_indices_ptr - cdef float *Y_data_ptr - cdef float *sample_weight_data diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx deleted file mode 100644 index 6fa274771defe..0000000000000 --- a/sklearn/utils/seq_dataset.pyx +++ /dev/null @@ -1,653 +0,0 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. 
-WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp -""" - -cimport cython -from libc.limits cimport INT_MAX -cimport numpy as np -import numpy as np - -np.import_array() - -from ._random cimport our_rand_r - -cdef class SequentialDataset64: - """Base class for datasets with sequential data access. - - SequentialDataset is used to iterate over the rows of a matrix X and - corresponding target values y, i.e. to iterate over samples. - There are two methods to get the next sample: - - next : Iterate sequentially (optionally randomized) - - random : Iterate randomly (with replacement) - - Attributes - ---------- - index : np.ndarray - Index array for fast shuffling. - - index_data_ptr : int - Pointer to the index array. - - current_index : int - Index of current sample in ``index``. - The index of current sample in the data is given by - index_data_ptr[current_index]. - - n_samples : Py_ssize_t - Number of samples in the dataset. - - seed : np.uint32_t - Seed used for random sampling. - - """ - - cdef void next(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil: - """Get the next example ``x`` from the dataset. - - This method gets the next sample looping sequentially over all samples. - The order can be shuffled with the method ``shuffle``. - Shuffling once before iterating over all samples corresponds to a - random draw without replacement. It is used for instance in SGD solver. - - Parameters - ---------- - x_data_ptr : double** - A pointer to the double array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : double* - The target value of the next example. - - sample_weight : double* - The weight of the next example. - """ - cdef int current_index = self._get_next_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - - cdef int random(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil: - """Get a random example ``x`` from the dataset. - - This method gets next sample chosen randomly over a uniform - distribution. It corresponds to a random draw with replacement. - It is used for instance in SAG solver. - - Parameters - ---------- - x_data_ptr : double** - A pointer to the double array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : double* - The target value of the next example. - - sample_weight : double* - The weight of the next example. - - Returns - ------- - current_index : int - Index of current sample. 
- """ - cdef int current_index = self._get_random_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - return current_index - - cdef void shuffle(self, np.uint32_t seed) nogil: - """Permutes the ordering of examples.""" - # Fisher-Yates shuffle - cdef int *ind = self.index_data_ptr - cdef int n = self.n_samples - cdef unsigned i, j - for i in range(n - 1): - j = i + our_rand_r(&seed) % (n - i) - ind[i], ind[j] = ind[j], ind[i] - - cdef int _get_next_index(self) nogil: - cdef int current_index = self.current_index - if current_index >= (self.n_samples - 1): - current_index = -1 - - current_index += 1 - self.current_index = current_index - return self.current_index - - cdef int _get_random_index(self) nogil: - cdef int n = self.n_samples - cdef int current_index = our_rand_r(&self.seed) % n - self.current_index = current_index - return current_index - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil: - pass - - def _shuffle_py(self, np.uint32_t seed): - """python function used for easy testing""" - self.shuffle(seed) - - def _next_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_next_index() - return self._sample_py(current_index) - - def _random_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_random_index() - return self._sample_py(current_index) - - def _sample_py(self, int current_index): - """python function used for easy testing""" - cdef double* x_data_ptr - cdef int* x_indices_ptr - cdef int nnz, j - cdef double y, sample_weight - - # call _sample in cython - self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, - current_index) - - # transform the pointed data in numpy CSR array - cdef np.ndarray[double, ndim=1] x_data = np.empty(nnz, - dtype=np.float64) - cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) - cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], - dtype=np.int32) - - for j in range(nnz): - x_data[j] = x_data_ptr[j] - x_indices[j] = x_indices_ptr[j] - - cdef int sample_idx = self.index_data_ptr[current_index] - - return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx - - -cdef class ArrayDataset64(SequentialDataset64): - """Dataset backed by a two-dimensional numpy array. - - The dtype of the numpy array is expected to be ``np.float64`` (double) - and C-style memory layout. - """ - - def __cinit__(self, np.ndarray[double, ndim=2, mode='c'] X, - np.ndarray[double, ndim=1, mode='c'] Y, - np.ndarray[double, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """A ``SequentialDataset`` backed by a two-dimensional numpy array. - - Parameters - ---------- - X : ndarray, dtype=double, ndim=2, mode='c' - The sample array, of shape(n_samples, n_features) - - Y : ndarray, dtype=double, ndim=1, mode='c' - The target array, of shape(n_samples, ) - - sample_weights : ndarray, dtype=double, ndim=1, mode='c' - The weight of each sample, of shape(n_samples,) - """ - if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: - raise ValueError("More than %d samples or features not supported;" - " got (%d, %d)." 
- % (INT_MAX, X.shape[0], X.shape[1])) - - # keep a reference to the data to prevent garbage collection - self.X = X - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = X.shape[0] - self.n_features = X.shape[1] - - cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ - np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices - self.feature_indices_ptr = feature_indices.data - - self.current_index = -1 - self.X_stride = X.strides[0] / X.itemsize - self.X_data_ptr = X.data - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] index = \ - np.arange(0, self.n_samples, dtype=np.intc) - self.index = index - self.index_data_ptr = index.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = sample_idx * self.X_stride - - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.feature_indices_ptr - nnz[0] = self.n_features - sample_weight[0] = self.sample_weight_data[sample_idx] - - -cdef class CSRDataset64(SequentialDataset64): - """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ - - def __cinit__(self, np.ndarray[double, ndim=1, mode='c'] X_data, - np.ndarray[int, ndim=1, mode='c'] X_indptr, - np.ndarray[int, ndim=1, mode='c'] X_indices, - np.ndarray[double, ndim=1, mode='c'] Y, - np.ndarray[double, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """Dataset backed by a scipy sparse CSR matrix. - - The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. - The corresponding feature values are given by - x_data_ptr[0:nnz]. - - Parameters - ---------- - X_data : ndarray, dtype=double, ndim=1, mode='c' - The data array of the CSR features matrix. - - X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' - The index pointer array of the CSR features matrix. - - X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' - The column indices array of the CSR features matrix. - - Y : ndarray, dtype=double, ndim=1, mode='c' - The target values. - - sample_weights : ndarray, dtype=double, ndim=1, mode='c' - The weight of each sample. 
- """ - # keep a reference to the data to prevent garbage collection - self.X_data = X_data - self.X_indptr = X_indptr - self.X_indices = X_indices - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = Y.shape[0] - self.current_index = -1 - self.X_data_ptr = X_data.data - self.X_indptr_ptr = X_indptr.data - self.X_indices_ptr = X_indices.data - - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, - dtype=np.intc) - self.index = idx - self.index_data_ptr = idx.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = self.X_indptr_ptr[sample_idx] - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.X_indices_ptr + offset - nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset - sample_weight[0] = self.sample_weight_data[sample_idx] - - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. -WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp -""" - -cimport cython -from libc.limits cimport INT_MAX -cimport numpy as np -import numpy as np - -np.import_array() - -from ._random cimport our_rand_r - -cdef class SequentialDataset32: - """Base class for datasets with sequential data access. - - SequentialDataset is used to iterate over the rows of a matrix X and - corresponding target values y, i.e. to iterate over samples. - There are two methods to get the next sample: - - next : Iterate sequentially (optionally randomized) - - random : Iterate randomly (with replacement) - - Attributes - ---------- - index : np.ndarray - Index array for fast shuffling. - - index_data_ptr : int - Pointer to the index array. - - current_index : int - Index of current sample in ``index``. - The index of current sample in the data is given by - index_data_ptr[current_index]. - - n_samples : Py_ssize_t - Number of samples in the dataset. - - seed : np.uint32_t - Seed used for random sampling. - - """ - - cdef void next(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil: - """Get the next example ``x`` from the dataset. - - This method gets the next sample looping sequentially over all samples. - The order can be shuffled with the method ``shuffle``. - Shuffling once before iterating over all samples corresponds to a - random draw without replacement. It is used for instance in SGD solver. - - Parameters - ---------- - x_data_ptr : float** - A pointer to the float array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : float* - The target value of the next example. - - sample_weight : float* - The weight of the next example. - """ - cdef int current_index = self._get_next_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - - cdef int random(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil: - """Get a random example ``x`` from the dataset. 
- - This method gets next sample chosen randomly over a uniform - distribution. It corresponds to a random draw with replacement. - It is used for instance in SAG solver. - - Parameters - ---------- - x_data_ptr : float** - A pointer to the float array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : float* - The target value of the next example. - - sample_weight : float* - The weight of the next example. - - Returns - ------- - current_index : int - Index of current sample. - """ - cdef int current_index = self._get_random_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - return current_index - - cdef void shuffle(self, np.uint32_t seed) nogil: - """Permutes the ordering of examples.""" - # Fisher-Yates shuffle - cdef int *ind = self.index_data_ptr - cdef int n = self.n_samples - cdef unsigned i, j - for i in range(n - 1): - j = i + our_rand_r(&seed) % (n - i) - ind[i], ind[j] = ind[j], ind[i] - - cdef int _get_next_index(self) nogil: - cdef int current_index = self.current_index - if current_index >= (self.n_samples - 1): - current_index = -1 - - current_index += 1 - self.current_index = current_index - return self.current_index - - cdef int _get_random_index(self) nogil: - cdef int n = self.n_samples - cdef int current_index = our_rand_r(&self.seed) % n - self.current_index = current_index - return current_index - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil: - pass - - def _shuffle_py(self, np.uint32_t seed): - """python function used for easy testing""" - self.shuffle(seed) - - def _next_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_next_index() - return self._sample_py(current_index) - - def _random_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_random_index() - return self._sample_py(current_index) - - def _sample_py(self, int current_index): - """python function used for easy testing""" - cdef float* x_data_ptr - cdef int* x_indices_ptr - cdef int nnz, j - cdef float y, sample_weight - - # call _sample in cython - self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, - current_index) - - # transform the pointed data in numpy CSR array - cdef np.ndarray[float, ndim=1] x_data = np.empty(nnz, - dtype=np.float32) - cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) - cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], - dtype=np.int32) - - for j in range(nnz): - x_data[j] = x_data_ptr[j] - x_indices[j] = x_indices_ptr[j] - - cdef int sample_idx = self.index_data_ptr[current_index] - - return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx - - -cdef class ArrayDataset32(SequentialDataset32): - """Dataset backed by a two-dimensional numpy array. - - The dtype of the numpy array is expected to be ``np.float32`` (float) - and C-style memory layout. - """ - - def __cinit__(self, np.ndarray[float, ndim=2, mode='c'] X, - np.ndarray[float, ndim=1, mode='c'] Y, - np.ndarray[float, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """A ``SequentialDataset`` backed by a two-dimensional numpy array. 
- - Parameters - ---------- - X : ndarray, dtype=float, ndim=2, mode='c' - The sample array, of shape(n_samples, n_features) - - Y : ndarray, dtype=float, ndim=1, mode='c' - The target array, of shape(n_samples, ) - - sample_weights : ndarray, dtype=float, ndim=1, mode='c' - The weight of each sample, of shape(n_samples,) - """ - if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: - raise ValueError("More than %d samples or features not supported;" - " got (%d, %d)." - % (INT_MAX, X.shape[0], X.shape[1])) - - # keep a reference to the data to prevent garbage collection - self.X = X - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = X.shape[0] - self.n_features = X.shape[1] - - cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ - np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices - self.feature_indices_ptr = feature_indices.data - - self.current_index = -1 - self.X_stride = X.strides[0] / X.itemsize - self.X_data_ptr = X.data - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] index = \ - np.arange(0, self.n_samples, dtype=np.intc) - self.index = index - self.index_data_ptr = index.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = sample_idx * self.X_stride - - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.feature_indices_ptr - nnz[0] = self.n_features - sample_weight[0] = self.sample_weight_data[sample_idx] - - -cdef class CSRDataset32(SequentialDataset32): - """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ - - def __cinit__(self, np.ndarray[float, ndim=1, mode='c'] X_data, - np.ndarray[int, ndim=1, mode='c'] X_indptr, - np.ndarray[int, ndim=1, mode='c'] X_indices, - np.ndarray[float, ndim=1, mode='c'] Y, - np.ndarray[float, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """Dataset backed by a scipy sparse CSR matrix. - - The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. - The corresponding feature values are given by - x_data_ptr[0:nnz]. - - Parameters - ---------- - X_data : ndarray, dtype=float, ndim=1, mode='c' - The data array of the CSR features matrix. - - X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' - The index pointer array of the CSR features matrix. - - X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' - The column indices array of the CSR features matrix. - - Y : ndarray, dtype=float, ndim=1, mode='c' - The target values. - - sample_weights : ndarray, dtype=float, ndim=1, mode='c' - The weight of each sample. 
- """ - # keep a reference to the data to prevent garbage collection - self.X_data = X_data - self.X_indptr = X_indptr - self.X_indices = X_indices - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = Y.shape[0] - self.current_index = -1 - self.X_data_ptr = X_data.data - self.X_indptr_ptr = X_indptr.data - self.X_indices_ptr = X_indices.data - - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, - dtype=np.intc) - self.index = idx - self.index_data_ptr = idx.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = self.X_indptr_ptr[sample_idx] - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.X_indices_ptr + offset - nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset - sample_weight[0] = self.sample_weight_data[sample_idx] - From 038b7d1fc3aa3d209bd5fe8efc2fad42e8d574a8 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 15:27:21 -0400 Subject: [PATCH 07/19] Better coverage --- .../linear_model/tests/test_least_angle.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index d3fb5a31fbe8d..ec151637b758b 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -732,29 +732,33 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -def test_lars_with_jitter(): +@pytest.mark.parametrize('y_list, expected_y', [ + ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), + ([[-2.5, -2.5], [-2.5, -2.5]], + [[0, 2.5, 0, 2.5, 0], [0, 2.5, 0, 2.5, 0]])]) +def test_lars_with_jitter(y_list, expected_y): """ Test that user input of a small amount of jitter, using example provided in issue #2746 """ - A = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) - b = np.array([-2.5, -2.5]) - expected_output = np.array([0, 2.5, 0, 2.5, 0]) + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = np.array(y_list) + expected_output = np.array(expected_y) alpha = 0.001 fit_intercept = False lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) - lars_with_jiggle = linear_model.LassoLars(alpha=alpha, + lars_with_jitter = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, jitter=10e-5) - lars.fit(A, b) - lars_with_jiggle.fit(A, b) + lars.fit(X, y) + lars_with_jitter.fit(X, y) - w_nojiggle = lars.coef_ - w_jiggle = lars_with_jiggle.coef_ + w_nojitter = lars.coef_ + w_jitter = lars_with_jitter.coef_ - assert not np.array_equal(w_jiggle, w_nojiggle) - assert_array_almost_equal(w_jiggle, expected_output, decimal=2) + assert not np.array_equal(w_jitter, w_nojitter) + assert_array_almost_equal(w_jitter, expected_output, decimal=2) From f1cefce833e88b73bd20cfadf623c4e95d76f46f Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Sat, 2 Nov 2019 18:42:37 -0400 Subject: [PATCH 08/19] PR comments --- sklearn/linear_model/_least_angle.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py 
index ed17b8b7f30c4..9a4b2f4746eb1 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -24,7 +24,6 @@ from ..exceptions import ConvergenceWarning SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -DEFAULT_JITTER = None def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, @@ -862,7 +861,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=DEFAULT_JITTER): + jitter=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -1046,8 +1045,8 @@ class LassoLars(Lars): with a small alpha. jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy \ - the model's assumption of one-at-a-time computations \ + Uniform noise parameter, added to the y values, to satisfy + the model's assumption of one-at-a-time computations (Efron et al. 2004). positive : bool, default=False @@ -1094,7 +1093,7 @@ class LassoLars(Lars): >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) LassoLars(alpha=0.01) >>> print(reg.coef_) - [ 0. -0.9632...] + [ 0. -0.963257...] See also -------- @@ -1112,7 +1111,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=DEFAULT_JITTER): + positive=False, jitter=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter From d82ce51a570aff77484eed7a2bd971a6c1aca629 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 11:04:09 -0500 Subject: [PATCH 09/19] PR comments --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/_least_angle.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 330e6a1008c5c..c0799528ecd07 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -511,7 +511,7 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.717..., 0. ]) + array([0.0.717157..., 0. ]) .. topic:: Examples: diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 3bc0f42a93b4c..ffe05de762664 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,7 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin, MultiOutputMixin -from ..utils import arrayfuncs, as_float_array, check_X_y +from ..utils import arrayfuncs, as_float_array, check_X_y, check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning @@ -814,6 +814,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): the model's assumption of one-at-a-time computations (Efron et al. 2004). + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -861,7 +866,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=None): + jitter=None, random_state=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -871,6 +876,7 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.copy_X = copy_X self.fit_path = fit_path self.jitter = jitter + self.random_state = random_state @staticmethod def _get_gram(precompute, X, y): @@ -971,7 +977,9 @@ def fit(self, X, y, Xy=None): max_iter = self.max_iter if self.jitter: - noise = np.random.RandomState(0).uniform(high=self.jitter, + generator = check_random_state(self.random_state) + + noise = generator.uniform(high=self.jitter, size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) @@ -1740,7 +1748,8 @@ class LassoLarsIC(LassoLars): """ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, - eps=np.finfo(np.float).eps, copy_X=True, positive=False): + eps=np.finfo(np.float).eps, copy_X=True, positive=False, + random_state=None): self.criterion = criterion self.fit_intercept = fit_intercept self.positive = positive @@ -1751,6 +1760,7 @@ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, self.precompute = precompute self.eps = eps self.fit_path = True + self.random_state = random_state def _more_tags(self): return {'multioutput': False} From ec5156c4d2602aea88a741e278288e2f2ea1252a Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 11:29:13 -0500 Subject: [PATCH 10/19] PR comments --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/_least_angle.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index c0799528ecd07..19205385f311b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -511,7 +511,7 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.0.717157..., 0. ]) + array([0.717157..., 0. ]) .. 
topic:: Examples: diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index ffe05de762664..0b33de6f6619c 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -979,8 +979,7 @@ def fit(self, X, y, Xy=None): if self.jitter: generator = check_random_state(self.random_state) - noise = generator.uniform(high=self.jitter, - size=len(y)) + noise = generator.uniform(high=self.jitter, size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) y = y + noise From b817abf1bdc516745ccdafcf8ca29b9e2237ddb5 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 12:36:01 -0500 Subject: [PATCH 11/19] PR comments --- sklearn/linear_model/_least_angle.py | 20 +++++++++++-------- .../linear_model/tests/test_least_angle.py | 5 +++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 0b33de6f6619c..757ed1cd06372 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -980,8 +980,6 @@ def fit(self, X, y, Xy=None): generator = check_random_state(self.random_state) noise = generator.uniform(high=self.jitter, size=len(y)) - if y.ndim == 2: - noise = noise.reshape(-1, 1) y = y + noise self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, @@ -1051,11 +1049,6 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations - (Efron et al. 2004). - positive : bool, default=False Restrict coefficients to be >= 0. Be aware that you might want to remove fit_intercept which is set True by default. @@ -1066,6 +1059,16 @@ class LassoLars(Lars): algorithm are typically in congruence with the solution of the coordinate descent Lasso estimator. + jitter : float, default=None + Uniform noise parameter, added to the y values, to satisfy + the model's assumption of one-at-a-time computations + (Efron et al. 2004). + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -1118,7 +1121,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=None): + positive=False, jitter=None, random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1130,6 +1133,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.eps = eps self.fit_path = fit_path self.jitter = jitter + self.random_state = random_state ############################################################################### diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 979f5edf335bc..8967f017480f4 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -735,7 +735,7 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): @pytest.mark.parametrize('y_list, expected_y', [ ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), ([[-2.5, -2.5], [-2.5, -2.5]], - [[0, 2.5, 0, 2.5, 0], [0, 2.5, 0, 2.5, 0]])]) + [[0, 5, 0, 2.5, 0], [0, 5, 0, 2.5, 0]])]) def test_lars_with_jitter(y_list, expected_y): """ Test that user input of a small amount of jitter, @@ -752,7 +752,8 @@ def test_lars_with_jitter(y_list, expected_y): lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) lars_with_jitter = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, - jitter=10e-5) + jitter=10e-8, + random_state=0) lars.fit(X, y) lars_with_jitter.fit(X, y) From 6c3a24f3748e29d3838052544d804aef1484df90 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 15:10:20 -0500 Subject: [PATCH 12/19] PR comments --- sklearn/linear_model/_least_angle.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 757ed1cd06372..7aa9a3e61fc2f 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -811,8 +811,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations - (Efron et al. 2004). + the model's assumption of one-at-a-time computations. random_state : int, RandomState instance or None (default) Determines random number generation for dataset creation. Pass an int @@ -976,10 +975,10 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter - if self.jitter: - generator = check_random_state(self.random_state) + if self.jitter is not None: + rng = check_random_state(self.random_state) - noise = generator.uniform(high=self.jitter, size=len(y)) + noise = rng.uniform(high=self.jitter, size=len(y)) y = y + noise self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, @@ -1061,8 +1060,7 @@ class LassoLars(Lars): jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations - (Efron et al. 2004). + the model's assumption of one-at-a-time computations. random_state : int, RandomState instance or None (default) Determines random number generation for dataset creation. 
Pass an int From f6e7aa59dced51438cd0c180b54dfe7c9406096b Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 17 Jan 2020 15:44:17 -0500 Subject: [PATCH 13/19] Linting --- sklearn/linear_model/tests/test_least_angle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 08348a8c56032..a848df6149586 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -770,4 +770,3 @@ def test_X_none_gram_not_none(): with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): lars_path(X=None, y=[1], Gram='not None') - From 0a1922567f5c9e0cd9fa5e8d627c38c04949ce00 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:04:14 -0400 Subject: [PATCH 14/19] Apply suggestions from code review --- sklearn/linear_model/_least_angle.py | 18 ++++++++++-------- sklearn/linear_model/tests/test_least_angle.py | 11 ++++------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 5c75d3cd45ef8..a61ffcadc77c5 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -800,13 +800,14 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): with a small alpha. jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations. + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. random_state : int, RandomState instance or None (default) - Determines random number generation for dataset creation. Pass an int + Determines random number generation for jittering. Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. + See :term:`Glossary `. Ignored if `jitter` is None. Attributes ---------- @@ -1049,13 +1050,14 @@ class LassoLars(Lars): coordinate descent Lasso estimator. jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations. + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. random_state : int, RandomState instance or None (default) - Determines random number generation for dataset creation. Pass an int + Determines random number generation for jittering. Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. + See :term:`Glossary `. Ignored if `jitter` is None. 
Attributes ---------- diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index a848df6149586..481b44740e3ac 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -733,16 +733,13 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -@pytest.mark.parametrize('y_list, expected_y', [ +@pytest.mark.parametrize('y, expected_coef', [ ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), ([[-2.5, -2.5], [-2.5, -2.5]], [[0, 5, 0, 2.5, 0], [0, 5, 0, 2.5, 0]])]) def test_lars_with_jitter(y_list, expected_y): - """ - Test that user input of a small amount of jitter, - using example provided in issue #2746 - - """ + # Test that a small amount of jitter helps stability, + # using example provided in issue #2746 X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) y = np.array(y_list) @@ -763,7 +760,7 @@ def test_lars_with_jitter(y_list, expected_y): w_jitter = lars_with_jitter.coef_ assert not np.array_equal(w_jitter, w_nojitter) - assert_array_almost_equal(w_jitter, expected_output, decimal=2) + np.testing.assert_allclose(w_jitter, expected_coeff)``` def test_X_none_gram_not_none(): From 58e807bd88e1893cc04c1def332b29fe7fedd451 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:22:47 -0400 Subject: [PATCH 15/19] addressed comments --- .../linear_model/tests/test_least_angle.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 1fe12d1cf9293..fdcbb96c56cf9 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -733,16 +733,16 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -@pytest.mark.parametrize('y, expected_coef', [ - ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), - ([[-2.5, -2.5], [-2.5, -2.5]], - [[0, 5, 0, 2.5, 0], [0, 5, 0, 2.5, 0]])]) -def test_lars_with_jitter(y, expected_coef): +def test_lars_with_jitter(): # Test that a small amount of jitter helps stability, # using example provided in issue #2746 - X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], + [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] alpha = 0.001 + # set to False since target is constant and we check the value of coef fit_intercept = False lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) @@ -754,11 +754,9 @@ def test_lars_with_jitter(y, expected_coef): lars.fit(X, y) lars_with_jitter.fit(X, y) - w_nojitter = lars.coef_ - w_jitter = lars_with_jitter.coef_ - - assert not np.array_equal(w_jitter, w_nojitter) - np.testing.assert_allclose(w_jitter, expected_coef, rtol=1e-3) + assert np.mean((lars.coef_ - lars_with_jitter.coef_)**2) > .1 + np.testing.assert_allclose(lars_with_jitter.coef_, expected_coef, + rtol=1e-3) def test_X_none_gram_not_none(): From 547d4167296e8c746aebee64553c78211c056bcd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:24:50 -0400 Subject: [PATCH 16/19] added whatnew entry --- doc/whats_new/v0.23.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4c489c1887815..e5e0999e20557 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -298,6 +298,10 @@ Changelog of strictly inferior 
for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`linear_model.LassoLars` now support a `jitter` + parameter that adds random noise to the target. This might help with + stability in some edge cases. :pr:`15179` by :user:`angelaambroz`. + :mod:`sklearn.metrics` ...................... From e9839f59e303b559e34ddb98148ea2500c3c7dfc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:32:34 -0400 Subject: [PATCH 17/19] test both estimators --- .../linear_model/tests/test_least_angle.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index fdcbb96c56cf9..e198dfb15e323 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -6,6 +6,7 @@ import pytest from scipy import linalg +from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -17,6 +18,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path +from sklearn.linear_model import Lars, LassoLars # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -733,7 +735,8 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -def test_lars_with_jitter(): +@pytest.mark.parametrize('est', (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): # Test that a small amount of jitter helps stability, # using example provided in issue #2746 @@ -741,22 +744,17 @@ def test_lars_with_jitter(): [0.0, -1.0, 0.0, 0.0, 0.0]]) y = [-2.5, -2.5] expected_coef = [0, 2.5, 0, 2.5, 0] - alpha = 0.001 - # set to False since target is constant and we check the value of coef - fit_intercept = False - lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) - lars_with_jitter = linear_model.LassoLars(alpha=alpha, - fit_intercept=fit_intercept, - jitter=10e-8, - random_state=0) + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. + est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) - lars.fit(X, y) - lars_with_jitter.fit(X, y) + est.fit(X, y) + est_jitter.fit(X, y) - assert np.mean((lars.coef_ - lars_with_jitter.coef_)**2) > .1 - np.testing.assert_allclose(lars_with_jitter.coef_, expected_coef, - rtol=1e-3) + assert np.mean((est.coef_ - est_jitter.coef_)**2) > .1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) def test_X_none_gram_not_none(): From 09e3cc3b03e679528f92cd90021e15c5e80984de Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:33:39 -0400 Subject: [PATCH 18/19] update whatsnew --- doc/whats_new/v0.23.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e5e0999e20557..c4fa3818aa1bc 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -298,9 +298,10 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. 
-- |Enhancement| :class:`linear_model.LassoLars` now support a `jitter` - parameter that adds random noise to the target. This might help with - stability in some edge cases. :pr:`15179` by :user:`angelaambroz`. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. :mod:`sklearn.metrics` ...................... From 24e7b81a04f6eb012e4cb2aa467711d5989bbece Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Apr 2020 10:00:09 -0400 Subject: [PATCH 19/19] removed random_state for lassolarsIC --- sklearn/linear_model/_least_angle.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 8e940153ed145..bc71d7a1fccbd 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -1743,8 +1743,7 @@ class LassoLarsIC(LassoLars): """ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, - eps=np.finfo(np.float).eps, copy_X=True, positive=False, - random_state=None): + eps=np.finfo(np.float).eps, copy_X=True, positive=False): self.criterion = criterion self.fit_intercept = fit_intercept self.positive = positive @@ -1755,7 +1754,6 @@ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, self.precompute = precompute self.eps = eps self.fit_path = True - self.random_state = random_state def _more_tags(self): return {'multioutput': False}
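
For reference, a minimal usage sketch of the `jitter`/`random_state` API that this patch series settles on. It is illustrative only: it assumes a scikit-learn build with these patches applied, and it simply re-runs the degenerate example from issue #2746 that the regression test in this series exercises.

    import numpy as np
    from sklearn.linear_model import LassoLars

    # Degenerate design from issue #2746: duplicated targets violate the
    # one-at-a-time assumption of LARS unless a little noise is added.
    X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0],
                  [0.0, -1.0, 0.0, 0.0, 0.0]])
    y = np.array([-2.5, -2.5])

    # `jitter` is the upper bound of the uniform noise added to y before
    # fitting; `random_state` makes that noise, and hence coef_, reproducible.
    reg = LassoLars(alpha=0.001, fit_intercept=False,
                    jitter=10e-8, random_state=0)
    reg.fit(X, y)
    print(reg.coef_)  # close to [0, 2.5, 0, 2.5, 0] per the regression test

Leaving `jitter=None` (the default) skips the added noise entirely, so existing callers see no change in behaviour.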