From 55ff4698a3f2229a8e81965913c5c653d7fa0391 Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Date: Tue, 26 Feb 2019 18:22:41 +0100
Subject: [PATCH 1/5] use pytest and not unitest in test_sgd.py

---
 sklearn/linear_model/tests/test_sgd.py | 2222 +++++++++++++-----------
 1 file changed, 1164 insertions(+), 1058 deletions(-)

diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
index 629933a3bc8e9..236d3a9bea426 100644
--- a/sklearn/linear_model/tests/test_sgd.py
+++ b/sklearn/linear_model/tests/test_sgd.py
@@ -1,6 +1,5 @@
 from distutils.version import LooseVersion
 import pickle
-import unittest
 import pytest
 
 import numpy as np
@@ -21,7 +20,6 @@
 
 from sklearn import linear_model, datasets, metrics
 from sklearn.base import clone, is_classifier
-from sklearn.linear_model import SGDClassifier, SGDRegressor
 from sklearn.preprocessing import LabelEncoder, scale, MinMaxScaler
 from sklearn.preprocessing import StandardScaler
 from sklearn.exceptions import ConvergenceWarning
@@ -38,8 +36,17 @@
     "ignore:max_iter and tol parameters have been")
 
 
-class SparseSGDClassifier(SGDClassifier):
+def _update_kwargs(kwargs):
+    if "random_state" not in kwargs:
+        kwargs["random_state"] = 42
 
+    if "tol" not in kwargs:
+        kwargs["tol"] = None
+    if "max_iter" not in kwargs:
+        kwargs["max_iter"] = 5
+
+
+class _SparseSGDClassifier(linear_model.SGDClassifier):
     def fit(self, X, y, *args, **kw):
         X = sp.csr_matrix(X)
         return super().fit(X, y, *args, **kw)
@@ -57,19 +64,39 @@ def predict_proba(self, X):
         return super().predict_proba(X)
 
 
-class SparseSGDRegressor(SGDRegressor):
-
+class _SparseSGDRegressor(linear_model.SGDRegressor):
     def fit(self, X, y, *args, **kw):
         X = sp.csr_matrix(X)
-        return SGDRegressor.fit(self, X, y, *args, **kw)
+        return linear_model.SGDRegressor.fit(self, X, y, *args, **kw)
 
     def partial_fit(self, X, y, *args, **kw):
         X = sp.csr_matrix(X)
-        return SGDRegressor.partial_fit(self, X, y, *args, **kw)
+        return linear_model.SGDRegressor.partial_fit(self, X, y, *args, **kw)
 
     def decision_function(self, X, *args, **kw):
         X = sp.csr_matrix(X)
-        return SGDRegressor.decision_function(self, X, *args, **kw)
+        return linear_model.SGDRegressor.decision_function(self, X, *args,
+                                                           **kw)
+
+
+def SGDClassifier(**kwargs):
+    _update_kwargs(kwargs)
+    return linear_model.SGDClassifier(**kwargs)
+
+
+def SGDRegressor(**kwargs):
+    _update_kwargs(kwargs)
+    return linear_model.SGDRegressor(**kwargs)
+
+
+def SparseSGDClassifier(**kwargs):
+    _update_kwargs(kwargs)
+    return _SparseSGDClassifier(**kwargs)
+
+
+def SparseSGDRegressor(**kwargs):
+    _update_kwargs(kwargs)
+    return _SparseSGDRegressor(**kwargs)
 
 
 # Test Data
@@ -113,1102 +140,1181 @@ def decision_function(self, X, *args, **kw):
 ###############################################################################
 # Tests common to classification and regression
 
-class CommonTest:
-
-    def factory(self, **kwargs):
-        if "random_state" not in kwargs:
-            kwargs["random_state"] = 42
+# a simple implementation of ASGD to use for testing
+# uses squared loss to find the gradient
+def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0):
+    if weight_init is None:
+        weights = np.zeros(X.shape[1])
+    else:
+        weights = weight_init
+
+    average_weights = np.zeros(X.shape[1])
+    intercept = intercept_init
+    average_intercept = 0.0
+    decay = 1.0
 
-        if "tol" not in kwargs:
-            kwargs["tol"] = None
-        if "max_iter" not in kwargs:
-            kwargs["max_iter"] = 5
+    # sparse data has a fixed decay of .01
+    if klass in (SparseSGDClassifier, SparseSGDRegressor):
+        decay = .01
 
-        return self.factory_class(**kwargs)
+    for i, entry in enumerate(X):
+        p = np.dot(entry, weights)
+        p += intercept
+        gradient = p - y[i]
+        weights *= 1.0 - (eta * alpha)
+        weights += -(eta * gradient * entry)
+        intercept += -(eta * gradient) * decay
+
+        average_weights *= i
+        average_weights += weights
+        average_weights /= i + 1.0
+
+        average_intercept *= i
+        average_intercept += intercept
+        average_intercept /= i + 1.0
+
+    return average_weights, average_intercept
+
+
+def _test_warm_start(klass, X, Y, lr):
+    # Test that explicit warm restart...
+    clf = klass(alpha=0.01, eta0=0.01, shuffle=False,
+                learning_rate=lr)
+    clf.fit(X, Y)
+
+    clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False,
+                 learning_rate=lr)
+    clf2.fit(X, Y,
+             coef_init=clf.coef_.copy(),
+             intercept_init=clf.intercept_.copy())
+
+    # ... and implicit warm restart are equivalent.
+    clf3 = klass(alpha=0.01, eta0=0.01, shuffle=False,
+                 warm_start=True, learning_rate=lr)
+    clf3.fit(X, Y)
+
+    assert_equal(clf3.t_, clf.t_)
+    assert_array_almost_equal(clf3.coef_, clf.coef_)
+
+    clf3.set_params(alpha=0.001)
+    clf3.fit(X, Y)
+
+    assert_equal(clf3.t_, clf2.t_)
+    assert_array_almost_equal(clf3.coef_, clf2.coef_)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+@pytest.mark.parametrize('lr',
+                         ["constant", "optimal", "invscaling", "adaptive"])
+def test_warm_start(klass, lr):
+    _test_warm_start(klass, X, Y, lr)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_input_format(klass):
+    # Input format tests.
+    clf = klass(alpha=0.01, shuffle=False)
+    clf.fit(X, Y)
+    Y_ = np.array(Y)[:, np.newaxis]
+
+    Y_ = np.c_[Y_, Y_]
+    assert_raises(ValueError, clf.fit, X, Y_)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_clone(klass):
+    # Test whether clone works ok.
+    clf = klass(alpha=0.01, penalty='l1')
+    clf = clone(clf)
+    clf.set_params(penalty='l2')
+    clf.fit(X, Y)
+
+    clf2 = klass(alpha=0.01, penalty='l2')
+    clf2.fit(X, Y)
+
+    assert_array_equal(clf.coef_, clf2.coef_)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_plain_has_no_average_attr(klass):
+    clf = klass(average=True, eta0=.01)
+    clf.fit(X, Y)
+
+    assert hasattr(clf, 'average_coef_')
+    assert hasattr(clf, 'average_intercept_')
+    assert hasattr(clf, 'standard_intercept_')
+    assert hasattr(clf, 'standard_coef_')
+
+    clf = klass()
+    clf.fit(X, Y)
+
+    assert not hasattr(clf, 'average_coef_')
+    assert not hasattr(clf, 'average_intercept_')
+    assert not hasattr(clf, 'standard_intercept_')
+    assert not hasattr(clf, 'standard_coef_')
 
-    # a simple implementation of ASGD to use for testing
-    # uses squared loss to find the gradient
-    def asgd(self, X, y, eta, alpha, weight_init=None, intercept_init=0.0):
-        if weight_init is None:
-            weights = np.zeros(X.shape[1])
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_late_onset_averaging_not_reached(klass):
+    clf1 = klass(average=600)
+    clf2 = klass()
+    for _ in range(100):
+        if is_classifier(clf1):
+            clf1.partial_fit(X, Y, classes=np.unique(Y))
+            clf2.partial_fit(X, Y, classes=np.unique(Y))
         else:
-            weights = weight_init
-
-        average_weights = np.zeros(X.shape[1])
-        intercept = intercept_init
-        average_intercept = 0.0
-        decay = 1.0
-
-        # sparse data has a fixed decay of .01
-        if (isinstance(self, SparseSGDClassifierTestCase) or
-                isinstance(self, SparseSGDRegressorTestCase)):
-            decay = .01
-
-        for i, entry in enumerate(X):
-            p = np.dot(entry, weights)
-            p += intercept
-            gradient = p - y[i]
-            weights *= 1.0 - (eta * alpha)
-            weights += -(eta * gradient * entry)
-            intercept += -(eta * gradient) * decay
-
-            average_weights *= i
-            average_weights += weights
-            average_weights /= i + 1.0
-
-            average_intercept *= i
-            average_intercept += intercept
-            average_intercept /= i + 1.0
-
-        return average_weights, average_intercept
-
-    def _test_warm_start(self, X, Y, lr):
-        # Test that explicit warm restart...
-        clf = self.factory(alpha=0.01, eta0=0.01, shuffle=False,
-                           learning_rate=lr)
-        clf.fit(X, Y)
-
-        clf2 = self.factory(alpha=0.001, eta0=0.01, shuffle=False,
-                            learning_rate=lr)
-        clf2.fit(X, Y,
-                 coef_init=clf.coef_.copy(),
-                 intercept_init=clf.intercept_.copy())
-
-        # ... and implicit warm restart are equivalent.
-        clf3 = self.factory(alpha=0.01, eta0=0.01, shuffle=False,
-                            warm_start=True, learning_rate=lr)
-        clf3.fit(X, Y)
-
-        assert_equal(clf3.t_, clf.t_)
-        assert_array_almost_equal(clf3.coef_, clf.coef_)
-
-        clf3.set_params(alpha=0.001)
-        clf3.fit(X, Y)
-
-        assert_equal(clf3.t_, clf2.t_)
-        assert_array_almost_equal(clf3.coef_, clf2.coef_)
-
-    def test_warm_start_constant(self):
-        self._test_warm_start(X, Y, "constant")
-
-    def test_warm_start_invscaling(self):
-        self._test_warm_start(X, Y, "invscaling")
-
-    def test_warm_start_optimal(self):
-        self._test_warm_start(X, Y, "optimal")
-
-    def test_warm_start_adaptive(self):
-        self._test_warm_start(X, Y, "adaptive")
-
-    def test_input_format(self):
-        # Input format tests.
-        clf = self.factory(alpha=0.01, shuffle=False)
-        clf.fit(X, Y)
-        Y_ = np.array(Y)[:, np.newaxis]
-
-        Y_ = np.c_[Y_, Y_]
-        assert_raises(ValueError, clf.fit, X, Y_)
-
-    def test_clone(self):
-        # Test whether clone works ok.
-        clf = self.factory(alpha=0.01, penalty='l1')
-        clf = clone(clf)
-        clf.set_params(penalty='l2')
-        clf.fit(X, Y)
-
-        clf2 = self.factory(alpha=0.01, penalty='l2')
-        clf2.fit(X, Y)
-
-        assert_array_equal(clf.coef_, clf2.coef_)
-
-    def test_plain_has_no_average_attr(self):
-        clf = self.factory(average=True, eta0=.01)
-        clf.fit(X, Y)
-
-        assert hasattr(clf, 'average_coef_')
-        assert hasattr(clf, 'average_intercept_')
-        assert hasattr(clf, 'standard_intercept_')
-        assert hasattr(clf, 'standard_coef_')
-
-        clf = self.factory()
-        clf.fit(X, Y)
-
-        assert not hasattr(clf, 'average_coef_')
-        assert not hasattr(clf, 'average_intercept_')
-        assert not hasattr(clf, 'standard_intercept_')
-        assert not hasattr(clf, 'standard_coef_')
-
-    def test_late_onset_averaging_not_reached(self):
-        clf1 = self.factory(average=600)
-        clf2 = self.factory()
-        for _ in range(100):
-            if isinstance(clf1, SGDClassifier):
-                clf1.partial_fit(X, Y, classes=np.unique(Y))
-                clf2.partial_fit(X, Y, classes=np.unique(Y))
-            else:
-                clf1.partial_fit(X, Y)
-                clf2.partial_fit(X, Y)
-
-        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16)
-        assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16)
-
-    def test_late_onset_averaging_reached(self):
-        eta0 = .001
-        alpha = .0001
-        Y_encode = np.array(Y)
-        Y_encode[Y_encode == 1] = -1.0
-        Y_encode[Y_encode == 2] = 1.0
-
-        clf1 = self.factory(average=7, learning_rate="constant",
-                            loss='squared_loss', eta0=eta0,
-                            alpha=alpha, max_iter=2, shuffle=False)
-        clf2 = self.factory(average=0, learning_rate="constant",
-                            loss='squared_loss', eta0=eta0,
-                            alpha=alpha, max_iter=1, shuffle=False)
-
-        clf1.fit(X, Y_encode)
-        clf2.fit(X, Y_encode)
-
-        average_weights, average_intercept = \
-            self.asgd(X, Y_encode, eta0, alpha,
-                      weight_init=clf2.coef_.ravel(),
-                      intercept_init=clf2.intercept_)
-
-        assert_array_almost_equal(clf1.coef_.ravel(),
-                                  average_weights.ravel(),
-                                  decimal=16)
-        assert_almost_equal(clf1.intercept_, average_intercept, decimal=16)
-
-    def test_sgd_bad_alpha_for_optimal_learning_rate(self):
-        # Check whether expected ValueError on bad alpha, i.e. 0
-        # since alpha is used to compute the optimal learning rate
-        assert_raises(ValueError, self.factory,
-                      alpha=0, learning_rate="optimal")
-
-    def test_early_stopping(self):
-        X = iris.data[iris.target > 0]
-        Y = iris.target[iris.target > 0]
-        for early_stopping in [True, False]:
-            max_iter = 1000
-            clf = self.factory(early_stopping=early_stopping, tol=1e-3,
-                               max_iter=max_iter).fit(X, Y)
-            assert clf.n_iter_ < max_iter
-
-    def test_adaptive_longer_than_constant(self):
-        clf1 = self.factory(learning_rate="adaptive", eta0=0.01, tol=1e-3,
-                            max_iter=100)
-        clf1.fit(iris.data, iris.target)
-        clf2 = self.factory(learning_rate="constant", eta0=0.01, tol=1e-3,
-                            max_iter=100)
-        clf2.fit(iris.data, iris.target)
-        assert clf1.n_iter_ > clf2.n_iter_
-
-    def test_validation_set_not_used_for_training(self):
-        X, Y = iris.data, iris.target
-        validation_fraction = 0.4
-        seed = 42
-        shuffle = False
-        max_iter = 10
-        clf1 = self.factory(early_stopping=True,
-                            random_state=np.random.RandomState(seed),
-                            validation_fraction=validation_fraction,
-                            learning_rate='constant', eta0=0.01,
-                            tol=None, max_iter=max_iter, shuffle=shuffle)
-        clf1.fit(X, Y)
-        assert clf1.n_iter_ == max_iter
-
-        clf2 = self.factory(early_stopping=False,
-                            random_state=np.random.RandomState(seed),
-                            learning_rate='constant', eta0=0.01,
-                            tol=None, max_iter=max_iter, shuffle=shuffle)
-
-        if is_classifier(clf2):
-            cv = StratifiedShuffleSplit(test_size=validation_fraction,
-                                        random_state=seed)
-        else:
-            cv = ShuffleSplit(test_size=validation_fraction,
-                              random_state=seed)
-        idx_train, idx_val = next(cv.split(X, Y))
-        idx_train = np.sort(idx_train)  # remove shuffling
-        clf2.fit(X[idx_train], Y[idx_train])
-        assert clf2.n_iter_ == max_iter
-
-        assert_array_equal(clf1.coef_, clf2.coef_)
-
-    def test_n_iter_no_change(self):
-        X, Y = iris.data, iris.target
-        # test that n_iter_ increases monotonically with n_iter_no_change
-        for early_stopping in [True, False]:
-            n_iter_list = [self.factory(early_stopping=early_stopping,
-                                        n_iter_no_change=n_iter_no_change,
-                                        tol=1e-4, max_iter=1000
-                                        ).fit(X, Y).n_iter_
-                           for n_iter_no_change in [2, 3, 10]]
-            assert_array_equal(n_iter_list, sorted(n_iter_list))
-
-    def test_not_enough_sample_for_early_stopping(self):
-        # test an error is raised if the training or validation set is empty
-        clf = self.factory(early_stopping=True, validation_fraction=0.99)
-        with pytest.raises(ValueError):
-            clf.fit(X3, Y3)
+            clf1.partial_fit(X, Y)
+            clf2.partial_fit(X, Y)
+
+    assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16)
+    assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_late_onset_averaging_reached(klass):
+    eta0 = .001
+    alpha = .0001
+    Y_encode = np.array(Y)
+    Y_encode[Y_encode == 1] = -1.0
+    Y_encode[Y_encode == 2] = 1.0
+
+    clf1 = klass(average=7, learning_rate="constant",
+                 loss='squared_loss', eta0=eta0,
+                 alpha=alpha, max_iter=2, shuffle=False)
+    clf2 = klass(average=0, learning_rate="constant",
+                 loss='squared_loss', eta0=eta0,
+                 alpha=alpha, max_iter=1, shuffle=False)
+
+    clf1.fit(X, Y_encode)
+    clf2.fit(X, Y_encode)
+
+    average_weights, average_intercept = \
+        asgd(klass, X, Y_encode, eta0, alpha,
+             weight_init=clf2.coef_.ravel(),
+             intercept_init=clf2.intercept_)
+
+    assert_array_almost_equal(clf1.coef_.ravel(),
+                              average_weights.ravel(),
+                              decimal=16)
+    assert_almost_equal(clf1.intercept_, average_intercept, decimal=16)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_sgd_bad_alpha_for_optimal_learning_rate(klass):
+    # Check whether expected ValueError on bad alpha, i.e. 0
+    # since alpha is used to compute the optimal learning rate
+    assert_raises(ValueError, klass,
+                  alpha=0, learning_rate="optimal")
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_early_stopping(klass):
+    X = iris.data[iris.target > 0]
+    Y = iris.target[iris.target > 0]
+    for early_stopping in [True, False]:
+        max_iter = 1000
+        clf = klass(early_stopping=early_stopping, tol=1e-3,
+                    max_iter=max_iter).fit(X, Y)
+        assert clf.n_iter_ < max_iter
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_adaptive_longer_than_constant(klass):
+    clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3,
+                 max_iter=100)
+    clf1.fit(iris.data, iris.target)
+    clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3,
+                 max_iter=100)
+    clf2.fit(iris.data, iris.target)
+    assert clf1.n_iter_ > clf2.n_iter_
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_validation_set_not_used_for_training(klass):
+    X, Y = iris.data, iris.target
+    validation_fraction = 0.4
+    seed = 42
+    shuffle = False
+    max_iter = 10
+    clf1 = klass(early_stopping=True,
+                 random_state=np.random.RandomState(seed),
+                 validation_fraction=validation_fraction,
+                 learning_rate='constant', eta0=0.01,
+                 tol=None, max_iter=max_iter, shuffle=shuffle)
+    clf1.fit(X, Y)
+    assert clf1.n_iter_ == max_iter
+
+    clf2 = klass(early_stopping=False,
+                 random_state=np.random.RandomState(seed),
+                 learning_rate='constant', eta0=0.01,
+                 tol=None, max_iter=max_iter, shuffle=shuffle)
+
+    if is_classifier(clf2):
+        cv = StratifiedShuffleSplit(test_size=validation_fraction,
+                                    random_state=seed)
+    else:
+        cv = ShuffleSplit(test_size=validation_fraction,
+                          random_state=seed)
+    idx_train, idx_val = next(cv.split(X, Y))
+    idx_train = np.sort(idx_train)  # remove shuffling
+    clf2.fit(X[idx_train], Y[idx_train])
+    assert clf2.n_iter_ == max_iter
+
+    assert_array_equal(clf1.coef_, clf2.coef_)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_n_iter_no_change(klass):
+    X, Y = iris.data, iris.target
+    # test that n_iter_ increases monotonically with n_iter_no_change
+    for early_stopping in [True, False]:
+        n_iter_list = [klass(early_stopping=early_stopping,
+                             n_iter_no_change=n_iter_no_change,
+                             tol=1e-4, max_iter=1000
+                             ).fit(X, Y).n_iter_
+                       for n_iter_no_change in [2, 3, 10]]
+        assert_array_equal(n_iter_list, sorted(n_iter_list))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_not_enough_sample_for_early_stopping(klass):
+    # test an error is raised if the training or validation set is empty
+    clf = klass(early_stopping=True, validation_fraction=0.99)
+    with pytest.raises(ValueError):
+        clf.fit(X3, Y3)
 
 
 ###############################################################################
 # Classification Test Case
 
-class DenseSGDClassifierTestCase(unittest.TestCase, CommonTest):
-    """Test suite for the dense representation variant of SGD"""
-    factory_class = SGDClassifier
-
-    def test_sgd(self):
-        # Check that SGD gives any results :-)
-
-        for loss in ("hinge", "squared_hinge", "log", "modified_huber"):
-            clf = self.factory(penalty='l2', alpha=0.01, fit_intercept=True,
-                               loss=loss, max_iter=10, shuffle=True)
-            clf.fit(X, Y)
-            # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7)
-            assert_array_equal(clf.predict(T), true_result)
-
-    def test_sgd_bad_l1_ratio(self):
-        # Check whether expected ValueError on bad l1_ratio
-        assert_raises(ValueError, self.factory, l1_ratio=1.1)
-
-    def test_sgd_bad_learning_rate_schedule(self):
-        # Check whether expected ValueError on bad learning_rate
-        assert_raises(ValueError, self.factory, learning_rate="<unknown>")
-
-    def test_sgd_bad_eta0(self):
-        # Check whether expected ValueError on bad eta0
-        assert_raises(ValueError, self.factory, eta0=0,
-                      learning_rate="constant")
-
-    def test_sgd_bad_alpha(self):
-        # Check whether expected ValueError on bad alpha
-        assert_raises(ValueError, self.factory, alpha=-.1)
-
-    def test_sgd_bad_penalty(self):
-        # Check whether expected ValueError on bad penalty
-        assert_raises(ValueError, self.factory, penalty='foobar',
-                      l1_ratio=0.85)
-
-    def test_sgd_bad_loss(self):
-        # Check whether expected ValueError on bad loss
-        assert_raises(ValueError, self.factory, loss="foobar")
-
-    def test_sgd_max_iter_param(self):
-        # Test parameter validity check
-        assert_raises(ValueError, self.factory, max_iter=-10000)
-
-    def test_sgd_shuffle_param(self):
-        # Test parameter validity check
-        assert_raises(ValueError, self.factory, shuffle="false")
-
-    def test_sgd_early_stopping_param(self):
-        # Test parameter validity check
-        assert_raises(ValueError, self.factory, early_stopping="false")
-
-    def test_sgd_validation_fraction(self):
-        # Test parameter validity check
-        assert_raises(ValueError, self.factory, validation_fraction=-.1)
-
-    def test_sgd_n_iter_no_change(self):
-        # Test parameter validity check
-        assert_raises(ValueError, self.factory, n_iter_no_change=0)
-
-    def test_argument_coef(self):
-        # Checks coef_init not allowed as model argument (only fit)
-        # Provided coef_ does not match dataset
-        assert_raises(TypeError, self.factory, coef_init=np.zeros((3,)))
-
-    def test_provide_coef(self):
-        # Checks coef_init shape for the warm starts
-        # Provided coef_ does not match dataset.
-        assert_raises(ValueError, self.factory().fit,
-                      X, Y, coef_init=np.zeros((3,)))
-
-    def test_set_intercept(self):
-        # Checks intercept_ shape for the warm starts
-        # Provided intercept_ does not match dataset.
-        assert_raises(ValueError, self.factory().fit,
-                      X, Y, intercept_init=np.zeros((3,)))
-
-    def test_sgd_early_stopping_with_partial_fit(self):
-        # Test parameter validity check
-        assert_raises(ValueError,
-                      self.factory(early_stopping=True).partial_fit, X, Y)
-
-    def test_set_intercept_binary(self):
-        # Checks intercept_ shape for the warm starts in binary case
-        self.factory().fit(X5, Y5, intercept_init=0)
-
-    def test_average_binary_computed_correctly(self):
-        # Checks the SGDClassifier correctly computes the average weights
-        eta = .1
-        alpha = 2.
-        n_samples = 20
-        n_features = 10
-        rng = np.random.RandomState(0)
-        X = rng.normal(size=(n_samples, n_features))
-        w = rng.normal(size=n_features)
-
-        clf = self.factory(loss='squared_loss',
-                           learning_rate='constant',
-                           eta0=eta, alpha=alpha,
-                           fit_intercept=True,
-                           max_iter=1, average=True, shuffle=False)
-
-        # simple linear function without noise
-        y = np.dot(X, w)
-        y = np.sign(y)
-
-        clf.fit(X, y)
-
-        average_weights, average_intercept = self.asgd(X, y, eta, alpha)
-        average_weights = average_weights.reshape(1, -1)
-        assert_array_almost_equal(clf.coef_,
-                                  average_weights,
-                                  decimal=14)
-        assert_almost_equal(clf.intercept_, average_intercept, decimal=14)
-
-    def test_set_intercept_to_intercept(self):
-        # Checks intercept_ shape consistency for the warm starts
-        # Inconsistent intercept_ shape.
-        clf = self.factory().fit(X5, Y5)
-        self.factory().fit(X5, Y5, intercept_init=clf.intercept_)
-        clf = self.factory().fit(X, Y)
-        self.factory().fit(X, Y, intercept_init=clf.intercept_)
-
-    def test_sgd_at_least_two_labels(self):
-        # Target must have at least two labels
-        clf = self.factory(alpha=0.01, max_iter=20)
-        assert_raises(ValueError, clf.fit, X2, np.ones(9))
-
-    def test_partial_fit_weight_class_balanced(self):
-        # partial_fit with class_weight='balanced' not supported"""
-        regex = (r"class_weight 'balanced' is not supported for "
-                 r"partial_fit\. In order to use 'balanced' weights, "
-                 r"use compute_class_weight\('balanced', classes, y\). "
-                 r"In place of y you can us a large enough sample "
-                 r"of the full training set target to properly "
-                 r"estimate the class frequency distributions\. "
-                 r"Pass the resulting weights as the class_weight "
-                 r"parameter\.")
-        assert_raises_regexp(ValueError,
-                             regex,
-                             self.factory(class_weight='balanced').partial_fit,
-                             X, Y, classes=np.unique(Y))
-
-    def test_sgd_multiclass(self):
-        # Multi-class test case
-        clf = self.factory(alpha=0.01, max_iter=20).fit(X2, Y2)
-        assert_equal(clf.coef_.shape, (3, 2))
-        assert_equal(clf.intercept_.shape, (3,))
-        assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3))
-        pred = clf.predict(T2)
-        assert_array_equal(pred, true_result2)
-
-    def test_sgd_multiclass_average(self):
-        eta = .001
-        alpha = .01
-        # Multi-class average test case
-        clf = self.factory(loss='squared_loss',
-                           learning_rate='constant',
-                           eta0=eta, alpha=alpha,
-                           fit_intercept=True,
-                           max_iter=1, average=True, shuffle=False)
-
-        np_Y2 = np.array(Y2)
-        clf.fit(X2, np_Y2)
-        classes = np.unique(np_Y2)
-
-        for i, cl in enumerate(classes):
-            y_i = np.ones(np_Y2.shape[0])
-            y_i[np_Y2 != cl] = -1
-            average_coef, average_intercept = self.asgd(X2, y_i, eta, alpha)
-            assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16)
-            assert_almost_equal(average_intercept,
-                                clf.intercept_[i],
-                                decimal=16)
-
-    def test_sgd_multiclass_with_init_coef(self):
-        # Multi-class test case
-        clf = self.factory(alpha=0.01, max_iter=20)
-        clf.fit(X2, Y2, coef_init=np.zeros((3, 2)),
-                intercept_init=np.zeros(3))
-        assert_equal(clf.coef_.shape, (3, 2))
-        assert clf.intercept_.shape, (3,)
-        pred = clf.predict(T2)
-        assert_array_equal(pred, true_result2)
-
-    def test_sgd_multiclass_njobs(self):
-        # Multi-class test case with multi-core support
-        clf = self.factory(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2)
-        assert_equal(clf.coef_.shape, (3, 2))
-        assert_equal(clf.intercept_.shape, (3,))
-        assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3))
-        pred = clf.predict(T2)
-        assert_array_equal(pred, true_result2)
-
-    def test_set_coef_multiclass(self):
-        # Checks coef_init and intercept_init shape for multi-class
-        # problems
-        # Provided coef_ does not match dataset
-        clf = self.factory()
-        assert_raises(ValueError, clf.fit, X2, Y2, coef_init=np.zeros((2, 2)))
-
-        # Provided coef_ does match dataset
-        clf = self.factory().fit(X2, Y2, coef_init=np.zeros((3, 2)))
-
-        # Provided intercept_ does not match dataset
-        clf = self.factory()
-        assert_raises(ValueError, clf.fit, X2, Y2,
-                      intercept_init=np.zeros((1,)))
-
-        # Provided intercept_ does match dataset.
-        clf = self.factory().fit(X2, Y2, intercept_init=np.zeros((3,)))
-
-    def test_sgd_predict_proba_method_access(self):
-        # Checks that SGDClassifier predict_proba and predict_log_proba methods
-        # can either be accessed or raise an appropriate error message
-        # otherwise. See
-        # https://github.com/scikit-learn/scikit-learn/issues/10938 for more
-        # details.
-        for loss in SGDClassifier.loss_functions:
-            clf = SGDClassifier(loss=loss)
-            if loss in ('log', 'modified_huber'):
-                assert hasattr(clf, 'predict_proba')
-                assert hasattr(clf, 'predict_log_proba')
-            else:
-                message = ("probability estimates are not "
-                           "available for loss={!r}".format(loss))
-                assert not hasattr(clf, 'predict_proba')
-                assert not hasattr(clf, 'predict_log_proba')
-                with pytest.raises(AttributeError,
-                                   match=message):
-                    clf.predict_proba
-                with pytest.raises(AttributeError,
-                                   match=message):
-                    clf.predict_log_proba
-
-    def test_sgd_proba(self):
-        # Check SGD.predict_proba
-
-        # Hinge loss does not allow for conditional prob estimate.
-        # We cannot use the factory here, because it defines predict_proba
-        # anyway.
-        clf = SGDClassifier(loss="hinge", alpha=0.01,
-                            max_iter=10, tol=None).fit(X, Y)
-        assert not hasattr(clf, "predict_proba")
-        assert not hasattr(clf, "predict_log_proba")
-
-        # log and modified_huber losses can output probability estimates
-        # binary case
-        for loss in ["log", "modified_huber"]:
-            clf = self.factory(loss=loss, alpha=0.01, max_iter=10)
-            clf.fit(X, Y)
-            p = clf.predict_proba([[3, 2]])
-            assert p[0, 1] > 0.5
-            p = clf.predict_proba([[-1, -1]])
-            assert p[0, 1] < 0.5
-
-            p = clf.predict_log_proba([[3, 2]])
-            assert p[0, 1] > p[0, 0]
-            p = clf.predict_log_proba([[-1, -1]])
-            assert p[0, 1] < p[0, 0]
-
-        # log loss multiclass probability estimates
-        clf = self.factory(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2)
-
-        d = clf.decision_function([[.1, -.1], [.3, .2]])
-        p = clf.predict_proba([[.1, -.1], [.3, .2]])
-        assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
-        assert_almost_equal(p[0].sum(), 1)
-        assert np.all(p[0] >= 0)
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_clf(klass):
+    # Check that SGD gives any results :-)
 
-        p = clf.predict_proba([[-1, -1]])
-        d = clf.decision_function([[-1, -1]])
-        assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))
-
-        l = clf.predict_log_proba([[3, 2]])
-        p = clf.predict_proba([[3, 2]])
-        assert_array_almost_equal(np.log(p), l)
-
-        l = clf.predict_log_proba([[-1, -1]])
-        p = clf.predict_proba([[-1, -1]])
-        assert_array_almost_equal(np.log(p), l)
-
-        # Modified Huber multiclass probability estimates; requires a separate
-        # test because the hard zero/one probabilities may destroy the
-        # ordering present in decision_function output.
-        clf = self.factory(loss="modified_huber", alpha=0.01, max_iter=10)
-        clf.fit(X2, Y2)
-        d = clf.decision_function([[3, 2]])
-        p = clf.predict_proba([[3, 2]])
-        if not isinstance(self, SparseSGDClassifierTestCase):
-            assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
-        else:   # XXX the sparse test gets a different X2 (?)
-            assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))
-
-        # the following sample produces decision_function values < -1,
-        # which would cause naive normalization to fail (see comment
-        # in SGDClassifier.predict_proba)
-        x = X.mean(axis=0)
-        d = clf.decision_function([x])
-        if np.all(d < -1):  # XXX not true in sparse test case (why?)
-            p = clf.predict_proba([x])
-            assert_array_almost_equal(p[0], [1 / 3.] * 3)
-
-    def test_sgd_l1(self):
-        # Test L1 regularization
-        n = len(X4)
-        rng = np.random.RandomState(13)
-        idx = np.arange(n)
-        rng.shuffle(idx)
-
-        X = X4[idx, :]
-        Y = Y4[idx]
-
-        clf = self.factory(penalty='l1', alpha=.2, fit_intercept=False,
-                           max_iter=2000, tol=None, shuffle=False)
-        clf.fit(X, Y)
-        assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,)))
-        pred = clf.predict(X)
-        assert_array_equal(pred, Y)
-
-        # test sparsify with dense inputs
-        clf.sparsify()
-        assert sp.issparse(clf.coef_)
-        pred = clf.predict(X)
-        assert_array_equal(pred, Y)
-
-        # pickle and unpickle with sparse coef_
-        clf = pickle.loads(pickle.dumps(clf))
-        assert sp.issparse(clf.coef_)
-        pred = clf.predict(X)
-        assert_array_equal(pred, Y)
-
-    def test_class_weights(self):
-        # Test class weights.
-        X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
-                      [1.0, 1.0], [1.0, 0.0]])
-        y = [1, 1, 1, -1, -1]
-
-        clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False,
-                           class_weight=None)
-        clf.fit(X, y)
-        assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
-
-        # we give a small weights to class 1
-        clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False,
-                           class_weight={1: 0.001})
-        clf.fit(X, y)
-
-        # now the hyperplane should rotate clock-wise and
-        # the prediction on this point should shift
-        assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
-
-    def test_equal_class_weight(self):
-        # Test if equal class weights approx. equals no class weights.
-        X = [[1, 0], [1, 0], [0, 1], [0, 1]]
-        y = [0, 0, 1, 1]
-        clf = self.factory(alpha=0.1, max_iter=1000, class_weight=None)
-        clf.fit(X, y)
-
-        X = [[1, 0], [0, 1]]
-        y = [0, 1]
-        clf_weighted = self.factory(alpha=0.1, max_iter=1000,
-                                    class_weight={0: 0.5, 1: 0.5})
-        clf_weighted.fit(X, y)
-
-        # should be similar up to some epsilon due to learning rate schedule
-        assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
-
-    def test_wrong_class_weight_label(self):
-        # ValueError due to not existing class label.
-        clf = self.factory(alpha=0.1, max_iter=1000, class_weight={0: 0.5})
-        assert_raises(ValueError, clf.fit, X, Y)
-
-    def test_wrong_class_weight_format(self):
-        # ValueError due to wrong class_weight argument type.
-        clf = self.factory(alpha=0.1, max_iter=1000, class_weight=[0.5])
-        assert_raises(ValueError, clf.fit, X, Y)
-
-    def test_weights_multiplied(self):
-        # Tests that class_weight and sample_weight are multiplicative
-        class_weights = {1: .6, 2: .3}
-        rng = np.random.RandomState(0)
-        sample_weights = rng.random_sample(Y4.shape[0])
-        multiplied_together = np.copy(sample_weights)
-        multiplied_together[Y4 == 1] *= class_weights[1]
-        multiplied_together[Y4 == 2] *= class_weights[2]
-
-        clf1 = self.factory(alpha=0.1, max_iter=20, class_weight=class_weights)
-        clf2 = self.factory(alpha=0.1, max_iter=20)
-
-        clf1.fit(X4, Y4, sample_weight=sample_weights)
-        clf2.fit(X4, Y4, sample_weight=multiplied_together)
-
-        assert_almost_equal(clf1.coef_, clf2.coef_)
-
-    def test_balanced_weight(self):
-        # Test class weights for imbalanced data"""
-        # compute reference metrics on iris dataset that is quite balanced by
-        # default
-        X, y = iris.data, iris.target
-        X = scale(X)
-        idx = np.arange(X.shape[0])
-        rng = np.random.RandomState(6)
-        rng.shuffle(idx)
-        X = X[idx]
-        y = y[idx]
-        clf = self.factory(alpha=0.0001, max_iter=1000,
-                           class_weight=None, shuffle=False).fit(X, y)
-        f1 = metrics.f1_score(y, clf.predict(X), average='weighted')
-        assert_almost_equal(f1, 0.96, decimal=1)
-
-        # make the same prediction using balanced class_weight
-        clf_balanced = self.factory(alpha=0.0001, max_iter=1000,
-                                    class_weight="balanced",
-                                    shuffle=False).fit(X, y)
-        f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted')
-        assert_almost_equal(f1, 0.96, decimal=1)
-
-        # Make sure that in the balanced case it does not change anything
-        # to use "balanced"
-        assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6)
-
-        # build an very very imbalanced dataset out of iris data
-        X_0 = X[y == 0, :]
-        y_0 = y[y == 0]
-
-        X_imbalanced = np.vstack([X] + [X_0] * 10)
-        y_imbalanced = np.concatenate([y] + [y_0] * 10)
-
-        # fit a model on the imbalanced data without class weight info
-        clf = self.factory(max_iter=1000, class_weight=None, shuffle=False)
-        clf.fit(X_imbalanced, y_imbalanced)
-        y_pred = clf.predict(X)
-        assert_less(metrics.f1_score(y, y_pred, average='weighted'), 0.96)
-
-        # fit a model with balanced class_weight enabled
-        clf = self.factory(max_iter=1000, class_weight="balanced",
-                           shuffle=False)
-        clf.fit(X_imbalanced, y_imbalanced)
-        y_pred = clf.predict(X)
-        assert_greater(metrics.f1_score(y, y_pred, average='weighted'), 0.96)
-
-    def test_sample_weights(self):
-        # Test weights on individual samples
-        X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
-                      [1.0, 1.0], [1.0, 0.0]])
-        y = [1, 1, 1, -1, -1]
-
-        clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False)
-        clf.fit(X, y)
-        assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
-
-        # we give a small weights to class 1
-        clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2)
-
-        # now the hyperplane should rotate clock-wise and
-        # the prediction on this point should shift
-        assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
-
-    def test_wrong_sample_weights(self):
-        # Test if ValueError is raised if sample_weight has wrong shape
-        clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False)
-        # provided sample_weight too long
-        assert_raises(ValueError, clf.fit, X, Y, sample_weight=np.arange(7))
-
-    def test_partial_fit_exception(self):
-        clf = self.factory(alpha=0.01)
-        # classes was not specified
-        assert_raises(ValueError, clf.partial_fit, X3, Y3)
-
-    def test_partial_fit_binary(self):
-        third = X.shape[0] // 3
-        clf = self.factory(alpha=0.01)
-        classes = np.unique(Y)
-
-        clf.partial_fit(X[:third], Y[:third], classes=classes)
-        assert_equal(clf.coef_.shape, (1, X.shape[1]))
-        assert_equal(clf.intercept_.shape, (1,))
-        assert_equal(clf.decision_function([[0, 0]]).shape, (1, ))
-        id1 = id(clf.coef_.data)
-
-        clf.partial_fit(X[third:], Y[third:])
-        id2 = id(clf.coef_.data)
-        # check that coef_ haven't been re-allocated
-        assert id1, id2
-
-        y_pred = clf.predict(T)
-        assert_array_equal(y_pred, true_result)
-
-    def test_partial_fit_multiclass(self):
-        third = X2.shape[0] // 3
-        clf = self.factory(alpha=0.01)
-        classes = np.unique(Y2)
-
-        clf.partial_fit(X2[:third], Y2[:third], classes=classes)
-        assert_equal(clf.coef_.shape, (3, X2.shape[1]))
-        assert_equal(clf.intercept_.shape, (3,))
-        assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3))
-        id1 = id(clf.coef_.data)
-
-        clf.partial_fit(X2[third:], Y2[third:])
-        id2 = id(clf.coef_.data)
-        # check that coef_ haven't been re-allocated
-        assert id1, id2
-
-    def test_partial_fit_multiclass_average(self):
-        third = X2.shape[0] // 3
-        clf = self.factory(alpha=0.01, average=X2.shape[0])
-        classes = np.unique(Y2)
-
-        clf.partial_fit(X2[:third], Y2[:third], classes=classes)
-        assert_equal(clf.coef_.shape, (3, X2.shape[1]))
-        assert_equal(clf.intercept_.shape, (3,))
-
-        clf.partial_fit(X2[third:], Y2[third:])
-        assert_equal(clf.coef_.shape, (3, X2.shape[1]))
-        assert_equal(clf.intercept_.shape, (3,))
-
-    def test_fit_then_partial_fit(self):
-        # Partial_fit should work after initial fit in the multiclass case.
-        # Non-regression test for #2496; fit would previously produce a
-        # Fortran-ordered coef_ that subsequent partial_fit couldn't handle.
-        clf = self.factory()
-        clf.fit(X2, Y2)
-        clf.partial_fit(X2, Y2)     # no exception here
-
-    def _test_partial_fit_equal_fit(self, lr):
-        for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)):
-            clf = self.factory(alpha=0.01, eta0=0.01, max_iter=2,
-                               learning_rate=lr, shuffle=False)
-            clf.fit(X_, Y_)
-            y_pred = clf.decision_function(T_)
-            t = clf.t_
-
-            classes = np.unique(Y_)
-            clf = self.factory(alpha=0.01, eta0=0.01, learning_rate=lr,
-                               shuffle=False)
-            for i in range(2):
-                clf.partial_fit(X_, Y_, classes=classes)
-            y_pred2 = clf.decision_function(T_)
-
-            assert_equal(clf.t_, t)
-            assert_array_almost_equal(y_pred, y_pred2, decimal=2)
-
-    def test_partial_fit_equal_fit_constant(self):
-        self._test_partial_fit_equal_fit("constant")
-
-    def test_partial_fit_equal_fit_optimal(self):
-        self._test_partial_fit_equal_fit("optimal")
-
-    def test_partial_fit_equal_fit_invscaling(self):
-        self._test_partial_fit_equal_fit("invscaling")
-
-    def test_partial_fit_equal_fit_adaptive(self):
-        self._test_partial_fit_equal_fit("adaptive")
-
-    def test_regression_losses(self):
-        clf = self.factory(alpha=0.01, learning_rate="constant",
-                           eta0=0.1, loss="epsilon_insensitive")
+    for loss in ("hinge", "squared_hinge", "log", "modified_huber"):
+        clf = klass(penalty='l2', alpha=0.01, fit_intercept=True,
+                    loss=loss, max_iter=10, shuffle=True)
         clf.fit(X, Y)
-        assert_equal(1.0, np.mean(clf.predict(X) == Y))
+        # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7)
+        assert_array_equal(clf.predict(T), true_result)
 
-        clf = self.factory(alpha=0.01, learning_rate="constant",
-                           eta0=0.1, loss="squared_epsilon_insensitive")
-        clf.fit(X, Y)
-        assert_equal(1.0, np.mean(clf.predict(X) == Y))
 
-        clf = self.factory(alpha=0.01, loss="huber")
-        clf.fit(X, Y)
-        assert_equal(1.0, np.mean(clf.predict(X) == Y))
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_bad_l1_ratio(klass):
+    # Check whether expected ValueError on bad l1_ratio
+    assert_raises(ValueError, klass, l1_ratio=1.1)
 
-        clf = self.factory(alpha=0.01, learning_rate="constant", eta0=0.01,
-                           loss="squared_loss")
-        clf.fit(X, Y)
-        assert_equal(1.0, np.mean(clf.predict(X) == Y))
 
-    def test_warm_start_multiclass(self):
-        self._test_warm_start(X2, Y2, "optimal")
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_bad_learning_rate_schedule(klass):
+    # Check whether expected ValueError on bad learning_rate
+    assert_raises(ValueError, klass, learning_rate="<unknown>")
 
-    def test_multiple_fit(self):
-        # Test multiple calls of fit w/ different shaped inputs.
-        clf = self.factory(alpha=0.01, shuffle=False)
-        clf.fit(X, Y)
-        assert hasattr(clf, "coef_")
 
-        # Non-regression test: try fitting with a different label set.
-        y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)]
-        clf.fit(X[:, :-1], y)
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_bad_eta0(klass):
+    # Check whether expected ValueError on bad eta0
+    assert_raises(ValueError, klass, eta0=0,
+                  learning_rate="constant")
 
 
-class SparseSGDClassifierTestCase(DenseSGDClassifierTestCase):
-    """Run exactly the same tests using the sparse representation variant"""
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_sgd_bad_alpha(klass):
+    # Check whether expected ValueError on bad alpha
+    assert_raises(ValueError, klass, alpha=-.1)
 
-    factory_class = SparseSGDClassifier
 
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_sgd_bad_penalty(klass):
+    # Check whether expected ValueError on bad penalty
+    assert_raises(ValueError, klass, penalty='foobar',
+                  l1_ratio=0.85)
 
-###############################################################################
-# Regression Test Case
 
-class DenseSGDRegressorTestCase(unittest.TestCase, CommonTest):
-    """Test suite for the dense representation variant of SGD"""
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_bad_loss(klass):
+    # Check whether expected ValueError on bad loss
+    assert_raises(ValueError, klass, loss="foobar")
 
-    factory_class = SGDRegressor
 
-    def test_sgd(self):
-        # Check that SGD gives any results.
-        clf = self.factory(alpha=0.1, max_iter=2,
-                           fit_intercept=False)
-        clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
-        assert_equal(clf.coef_[0], clf.coef_[1])
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_max_iter_param(klass):
+    # Test parameter validity check
+    assert_raises(ValueError, klass, max_iter=-10000)
 
-    def test_sgd_bad_penalty(self):
-        # Check whether expected ValueError on bad penalty
-        assert_raises(ValueError, self.factory,
-                      penalty='foobar', l1_ratio=0.85)
 
-    def test_sgd_bad_loss(self):
-        # Check whether expected ValueError on bad loss
-        assert_raises(ValueError, self.factory, loss="foobar")
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_shuffle_param(klass):
+    # Test parameter validity check
+    assert_raises(ValueError, klass, shuffle="false")
 
-    def test_sgd_averaged_computed_correctly(self):
-        # Tests the average regressor matches the naive implementation
 
-        eta = .001
-        alpha = .01
-        n_samples = 20
-        n_features = 10
-        rng = np.random.RandomState(0)
-        X = rng.normal(size=(n_samples, n_features))
-        w = rng.normal(size=n_features)
-
-        # simple linear function without noise
-        y = np.dot(X, w)
-
-        clf = self.factory(loss='squared_loss',
-                           learning_rate='constant',
-                           eta0=eta, alpha=alpha,
-                           fit_intercept=True,
-                           max_iter=1, average=True, shuffle=False)
-
-        clf.fit(X, y)
-        average_weights, average_intercept = self.asgd(X, y, eta, alpha)
-
-        assert_array_almost_equal(clf.coef_,
-                                  average_weights,
-                                  decimal=16)
-        assert_almost_equal(clf.intercept_, average_intercept, decimal=16)
-
-    def test_sgd_averaged_partial_fit(self):
-        # Tests whether the partial fit yields the same average as the fit
-        eta = .001
-        alpha = .01
-        n_samples = 20
-        n_features = 10
-        rng = np.random.RandomState(0)
-        X = rng.normal(size=(n_samples, n_features))
-        w = rng.normal(size=n_features)
-
-        # simple linear function without noise
-        y = np.dot(X, w)
-
-        clf = self.factory(loss='squared_loss',
-                           learning_rate='constant',
-                           eta0=eta, alpha=alpha,
-                           fit_intercept=True,
-                           max_iter=1, average=True, shuffle=False)
-
-        clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)])
-        clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):])
-        average_weights, average_intercept = self.asgd(X, y, eta, alpha)
-
-        assert_array_almost_equal(clf.coef_,
-                                  average_weights,
-                                  decimal=16)
-        assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16)
-
-    def test_average_sparse(self):
-        # Checks the average weights on data with 0s
-
-        eta = .001
-        alpha = .01
-        clf = self.factory(loss='squared_loss',
-                           learning_rate='constant',
-                           eta0=eta, alpha=alpha,
-                           fit_intercept=True,
-                           max_iter=1, average=True, shuffle=False)
-
-        n_samples = Y3.shape[0]
-
-        clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)])
-        clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):])
-        average_weights, average_intercept = self.asgd(X3, Y3, eta, alpha)
-
-        assert_array_almost_equal(clf.coef_,
-                                  average_weights,
-                                  decimal=16)
-        assert_almost_equal(clf.intercept_, average_intercept, decimal=16)
-
-    def test_sgd_least_squares_fit(self):
-        xmin, xmax = -5, 5
-        n_samples = 100
-        rng = np.random.RandomState(0)
-        X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_early_stopping_param(klass):
+    # Test parameter validity check
+    assert_raises(ValueError, klass, early_stopping="false")
 
-        # simple linear function without noise
-        y = 0.5 * X.ravel()
 
-        clf = self.factory(loss='squared_loss', alpha=0.1, max_iter=20,
-                           fit_intercept=False)
-        clf.fit(X, y)
-        score = clf.score(X, y)
-        assert_greater(score, 0.99)
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_validation_fraction(klass):
+    # Test parameter validity check
+    assert_raises(ValueError, klass, validation_fraction=-.1)
 
-        # simple linear function with noise
-        y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
 
-        clf = self.factory(loss='squared_loss', alpha=0.1, max_iter=20,
-                           fit_intercept=False)
-        clf.fit(X, y)
-        score = clf.score(X, y)
-        assert_greater(score, 0.5)
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_n_iter_no_change(klass):
+    # Test parameter validity check
+    assert_raises(ValueError, klass, n_iter_no_change=0)
 
-    def test_sgd_epsilon_insensitive(self):
-        xmin, xmax = -5, 5
-        n_samples = 100
-        rng = np.random.RandomState(0)
-        X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
-
-        # simple linear function without noise
-        y = 0.5 * X.ravel()
-
-        clf = self.factory(loss='epsilon_insensitive', epsilon=0.01,
-                           alpha=0.1, max_iter=20,
-                           fit_intercept=False)
-        clf.fit(X, y)
-        score = clf.score(X, y)
-        assert score > 0.99
-
-        # simple linear function with noise
-        y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
-
-        clf = self.factory(loss='epsilon_insensitive', epsilon=0.01,
-                           alpha=0.1, max_iter=20,
-                           fit_intercept=False)
-        clf.fit(X, y)
-        score = clf.score(X, y)
-        assert score > 0.5
-
-    def test_sgd_huber_fit(self):
-        xmin, xmax = -5, 5
-        n_samples = 100
-        rng = np.random.RandomState(0)
-        X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
 
-        # simple linear function without noise
-        y = 0.5 * X.ravel()
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_argument_coef(klass):
+    # Checks coef_init not allowed as model argument (only fit)
+    # Provided coef_ does not match dataset
+    assert_raises(TypeError, klass, coef_init=np.zeros((3,)))
 
-        clf = self.factory(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20,
-                           fit_intercept=False)
-        clf.fit(X, y)
-        score = clf.score(X, y)
-        assert_greater(score, 0.99)
 
-        # simple linear function with noise
-        y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_provide_coef(klass):
+    # Checks coef_init shape for the warm starts
+    # Provided coef_ does not match dataset.
+    assert_raises(ValueError, klass().fit,
+                  X, Y, coef_init=np.zeros((3,)))
 
-        clf = self.factory(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20,
-                           fit_intercept=False)
-        clf.fit(X, y)
-        score = clf.score(X, y)
-        assert_greater(score, 0.5)
 
-    def test_elasticnet_convergence(self):
-        # Check that the SGD output is consistent with coordinate descent
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_set_intercept(klass):
+    # Checks intercept_ shape for the warm starts
+    # Provided intercept_ does not match dataset.
+    assert_raises(ValueError, klass().fit,
+                  X, Y, intercept_init=np.zeros((3,)))
+
 
-        n_samples, n_features = 1000, 5
-        rng = np.random.RandomState(0)
-        X = rng.randn(n_samples, n_features)
-        # ground_truth linear model that generate y from X and to which the
-        # models should converge if the regularizer would be set to 0.0
-        ground_truth_coef = rng.randn(n_features)
-        y = np.dot(X, ground_truth_coef)
-
-        # XXX: alpha = 0.1 seems to cause convergence problems
-        for alpha in [0.01, 0.001]:
-            for l1_ratio in [0.5, 0.8, 1.0]:
-                cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
-                                             fit_intercept=False)
-                cd.fit(X, y)
-                sgd = self.factory(penalty='elasticnet', max_iter=50,
-                                   alpha=alpha, l1_ratio=l1_ratio,
-                                   fit_intercept=False)
-                sgd.fit(X, y)
-                err_msg = ("cd and sgd did not converge to comparable "
-                           "results for alpha=%f and l1_ratio=%f"
-                           % (alpha, l1_ratio))
-                assert_almost_equal(cd.coef_, sgd.coef_, decimal=2,
-                                    err_msg=err_msg)
-
-    @ignore_warnings
-    def test_partial_fit(self):
-        third = X.shape[0] // 3
-        clf = self.factory(alpha=0.01)
-
-        clf.partial_fit(X[:third], Y[:third])
-        assert_equal(clf.coef_.shape, (X.shape[1], ))
-        assert_equal(clf.intercept_.shape, (1,))
-        assert_equal(clf.predict([[0, 0]]).shape, (1, ))
-        id1 = id(clf.coef_.data)
-
-        clf.partial_fit(X[third:], Y[third:])
-        id2 = id(clf.coef_.data)
-        # check that coef_ haven't been re-allocated
-        assert id1, id2
-
-    def _test_partial_fit_equal_fit(self, lr):
-        clf = self.factory(alpha=0.01, max_iter=2, eta0=0.01,
-                           learning_rate=lr, shuffle=False)
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_early_stopping_with_partial_fit(klass):
+    # Test parameter validity check
+    assert_raises(ValueError,
+                  klass(early_stopping=True).partial_fit, X, Y)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_set_intercept_binary(klass):
+    # Checks intercept_ shape for the warm starts in binary case
+    klass().fit(X5, Y5, intercept_init=0)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_average_binary_computed_correctly(klass):
+    # Checks the SGDClassifier correctly computes the average weights
+    eta = .1
+    alpha = 2.
+    n_samples = 20
+    n_features = 10
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(n_samples, n_features))
+    w = rng.normal(size=n_features)
+
+    clf = klass(loss='squared_loss',
+                learning_rate='constant',
+                eta0=eta, alpha=alpha,
+                fit_intercept=True,
+                max_iter=1, average=True, shuffle=False)
+
+    # simple linear function without noise
+    y = np.dot(X, w)
+    y = np.sign(y)
+
+    clf.fit(X, y)
+
+    average_weights, average_intercept = asgd(klass, X, y, eta, alpha)
+    average_weights = average_weights.reshape(1, -1)
+    assert_array_almost_equal(clf.coef_,
+                              average_weights,
+                              decimal=14)
+    assert_almost_equal(clf.intercept_, average_intercept, decimal=14)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_set_intercept_to_intercept(klass):
+    # Checks intercept_ shape consistency for the warm starts
+    # Inconsistent intercept_ shape.
+    clf = klass().fit(X5, Y5)
+    klass().fit(X5, Y5, intercept_init=clf.intercept_)
+    clf = klass().fit(X, Y)
+    klass().fit(X, Y, intercept_init=clf.intercept_)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_at_least_two_labels(klass):
+    # Target must have at least two labels
+    clf = klass(alpha=0.01, max_iter=20)
+    assert_raises(ValueError, clf.fit, X2, np.ones(9))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_partial_fit_weight_class_balanced(klass):
+    # partial_fit with class_weight='balanced' not supported"""
+    regex = (r"class_weight 'balanced' is not supported for "
+             r"partial_fit\. In order to use 'balanced' weights, "
+             r"use compute_class_weight\('balanced', classes, y\). "
+             r"In place of y you can us a large enough sample "
+             r"of the full training set target to properly "
+             r"estimate the class frequency distributions\. "
+             r"Pass the resulting weights as the class_weight "
+             r"parameter\.")
+    assert_raises_regexp(ValueError,
+                         regex,
+                         klass(class_weight='balanced').partial_fit,
+                         X, Y, classes=np.unique(Y))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_multiclass(klass):
+    # Multi-class test case
+    clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2)
+    assert_equal(clf.coef_.shape, (3, 2))
+    assert_equal(clf.intercept_.shape, (3,))
+    assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3))
+    pred = clf.predict(T2)
+    assert_array_equal(pred, true_result2)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_multiclass_average(klass):
+    eta = .001
+    alpha = .01
+    # Multi-class average test case
+    clf = klass(loss='squared_loss',
+                learning_rate='constant',
+                eta0=eta, alpha=alpha,
+                fit_intercept=True,
+                max_iter=1, average=True, shuffle=False)
+
+    np_Y2 = np.array(Y2)
+    clf.fit(X2, np_Y2)
+    classes = np.unique(np_Y2)
+
+    for i, cl in enumerate(classes):
+        y_i = np.ones(np_Y2.shape[0])
+        y_i[np_Y2 != cl] = -1
+        average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha)
+        assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16)
+        assert_almost_equal(average_intercept,
+                            clf.intercept_[i],
+                            decimal=16)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_multiclass_with_init_coef(klass):
+    # Multi-class test case
+    clf = klass(alpha=0.01, max_iter=20)
+    clf.fit(X2, Y2, coef_init=np.zeros((3, 2)),
+            intercept_init=np.zeros(3))
+    assert_equal(clf.coef_.shape, (3, 2))
+    assert clf.intercept_.shape, (3,)
+    pred = clf.predict(T2)
+    assert_array_equal(pred, true_result2)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_multiclass_njobs(klass):
+    # Multi-class test case with multi-core support
+    clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2)
+    assert_equal(clf.coef_.shape, (3, 2))
+    assert_equal(clf.intercept_.shape, (3,))
+    assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3))
+    pred = clf.predict(T2)
+    assert_array_equal(pred, true_result2)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_set_coef_multiclass(klass):
+    # Checks coef_init and intercept_init shape for multi-class
+    # problems
+    # Provided coef_ does not match dataset
+    clf = klass()
+    assert_raises(ValueError, clf.fit, X2, Y2, coef_init=np.zeros((2, 2)))
+
+    # Provided coef_ does match dataset
+    clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2)))
+
+    # Provided intercept_ does not match dataset
+    clf = klass()
+    assert_raises(ValueError, clf.fit, X2, Y2,
+                  intercept_init=np.zeros((1,)))
+
+    # Provided intercept_ does match dataset.
+    clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,)))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_predict_proba_method_access(klass):
+    # Checks that SGDClassifier predict_proba and predict_log_proba methods
+    # can either be accessed or raise an appropriate error message
+    # otherwise. See
+    # https://github.com/scikit-learn/scikit-learn/issues/10938 for more
+    # details.
+    for loss in linear_model.SGDClassifier.loss_functions:
+        clf = SGDClassifier(loss=loss)
+        if loss in ('log', 'modified_huber'):
+            assert hasattr(clf, 'predict_proba')
+            assert hasattr(clf, 'predict_log_proba')
+        else:
+            message = ("probability estimates are not "
+                       "available for loss={!r}".format(loss))
+            assert not hasattr(clf, 'predict_proba')
+            assert not hasattr(clf, 'predict_log_proba')
+            with pytest.raises(AttributeError,
+                               match=message):
+                clf.predict_proba
+            with pytest.raises(AttributeError,
+                               match=message):
+                clf.predict_log_proba
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_proba(klass):
+    # Check SGD.predict_proba
+
+    # Hinge loss does not allow for conditional prob estimate.
+    # We cannot use the factory here, because it defines predict_proba
+    # anyway.
+    clf = SGDClassifier(loss="hinge", alpha=0.01,
+                        max_iter=10, tol=None).fit(X, Y)
+    assert not hasattr(clf, "predict_proba")
+    assert not hasattr(clf, "predict_log_proba")
+
+    # log and modified_huber losses can output probability estimates
+    # binary case
+    for loss in ["log", "modified_huber"]:
+        clf = klass(loss=loss, alpha=0.01, max_iter=10)
         clf.fit(X, Y)
-        y_pred = clf.predict(T)
+        p = clf.predict_proba([[3, 2]])
+        assert p[0, 1] > 0.5
+        p = clf.predict_proba([[-1, -1]])
+        assert p[0, 1] < 0.5
+
+        p = clf.predict_log_proba([[3, 2]])
+        assert p[0, 1] > p[0, 0]
+        p = clf.predict_log_proba([[-1, -1]])
+        assert p[0, 1] < p[0, 0]
+
+    # log loss multiclass probability estimates
+    clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2)
+
+    d = clf.decision_function([[.1, -.1], [.3, .2]])
+    p = clf.predict_proba([[.1, -.1], [.3, .2]])
+    assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
+    assert_almost_equal(p[0].sum(), 1)
+    assert np.all(p[0] >= 0)
+
+    p = clf.predict_proba([[-1, -1]])
+    d = clf.decision_function([[-1, -1]])
+    assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))
+
+    l = clf.predict_log_proba([[3, 2]])
+    p = clf.predict_proba([[3, 2]])
+    assert_array_almost_equal(np.log(p), l)
+
+    l = clf.predict_log_proba([[-1, -1]])
+    p = clf.predict_proba([[-1, -1]])
+    assert_array_almost_equal(np.log(p), l)
+
+    # Modified Huber multiclass probability estimates; requires a separate
+    # test because the hard zero/one probabilities may destroy the
+    # ordering present in decision_function output.
+    clf = klass(loss="modified_huber", alpha=0.01, max_iter=10)
+    clf.fit(X2, Y2)
+    d = clf.decision_function([[3, 2]])
+    p = clf.predict_proba([[3, 2]])
+    if klass != SparseSGDClassifier:
+        assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
+    else:   # XXX the sparse test gets a different X2 (?)
+        assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))
+
+    # the following sample produces decision_function values < -1,
+    # which would cause naive normalization to fail (see comment
+    # in SGDClassifier.predict_proba)
+    x = X.mean(axis=0)
+    d = clf.decision_function([x])
+    if np.all(d < -1):  # XXX not true in sparse test case (why?)
+        p = clf.predict_proba([x])
+        assert_array_almost_equal(p[0], [1 / 3.] * 3)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sgd_l1(klass):
+    # Test L1 regularization
+    n = len(X4)
+    rng = np.random.RandomState(13)
+    idx = np.arange(n)
+    rng.shuffle(idx)
+
+    X = X4[idx, :]
+    Y = Y4[idx]
+
+    clf = klass(penalty='l1', alpha=.2, fit_intercept=False,
+                max_iter=2000, tol=None, shuffle=False)
+    clf.fit(X, Y)
+    assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,)))
+    pred = clf.predict(X)
+    assert_array_equal(pred, Y)
+
+    # test sparsify with dense inputs
+    clf.sparsify()
+    assert sp.issparse(clf.coef_)
+    pred = clf.predict(X)
+    assert_array_equal(pred, Y)
+
+    # pickle and unpickle with sparse coef_
+    clf = pickle.loads(pickle.dumps(clf))
+    assert sp.issparse(clf.coef_)
+    pred = clf.predict(X)
+    assert_array_equal(pred, Y)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_class_weights(klass):
+    # Test class weights.
+    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
+                  [1.0, 1.0], [1.0, 0.0]])
+    y = [1, 1, 1, -1, -1]
+
+    clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False,
+                class_weight=None)
+    clf.fit(X, y)
+    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
+
+    # we give a small weights to class 1
+    clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False,
+                class_weight={1: 0.001})
+    clf.fit(X, y)
+
+    # now the hyperplane should rotate clock-wise and
+    # the prediction on this point should shift
+    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_equal_class_weight(klass):
+    # Test if equal class weights approx. equals no class weights.
+    X = [[1, 0], [1, 0], [0, 1], [0, 1]]
+    y = [0, 0, 1, 1]
+    clf = klass(alpha=0.1, max_iter=1000, class_weight=None)
+    clf.fit(X, y)
+
+    X = [[1, 0], [0, 1]]
+    y = [0, 1]
+    clf_weighted = klass(alpha=0.1, max_iter=1000,
+                         class_weight={0: 0.5, 1: 0.5})
+    clf_weighted.fit(X, y)
+
+    # should be similar up to some epsilon due to learning rate schedule
+    assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_wrong_class_weight_label(klass):
+    # ValueError due to not existing class label.
+    clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5})
+    assert_raises(ValueError, clf.fit, X, Y)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_wrong_class_weight_format(klass):
+    # ValueError due to wrong class_weight argument type.
+    clf = klass(alpha=0.1, max_iter=1000, class_weight=[0.5])
+    assert_raises(ValueError, clf.fit, X, Y)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_weights_multiplied(klass):
+    # Tests that class_weight and sample_weight are multiplicative
+    class_weights = {1: .6, 2: .3}
+    rng = np.random.RandomState(0)
+    sample_weights = rng.random_sample(Y4.shape[0])
+    multiplied_together = np.copy(sample_weights)
+    multiplied_together[Y4 == 1] *= class_weights[1]
+    multiplied_together[Y4 == 2] *= class_weights[2]
+
+    clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights)
+    clf2 = klass(alpha=0.1, max_iter=20)
+
+    clf1.fit(X4, Y4, sample_weight=sample_weights)
+    clf2.fit(X4, Y4, sample_weight=multiplied_together)
+
+    assert_almost_equal(clf1.coef_, clf2.coef_)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_balanced_weight(klass):
+    # Test class weights for imbalanced data"""
+    # compute reference metrics on iris dataset that is quite balanced by
+    # default
+    X, y = iris.data, iris.target
+    X = scale(X)
+    idx = np.arange(X.shape[0])
+    rng = np.random.RandomState(6)
+    rng.shuffle(idx)
+    X = X[idx]
+    y = y[idx]
+    clf = klass(alpha=0.0001, max_iter=1000,
+                class_weight=None, shuffle=False).fit(X, y)
+    f1 = metrics.f1_score(y, clf.predict(X), average='weighted')
+    assert_almost_equal(f1, 0.96, decimal=1)
+
+    # make the same prediction using balanced class_weight
+    clf_balanced = klass(alpha=0.0001, max_iter=1000,
+                         class_weight="balanced",
+                         shuffle=False).fit(X, y)
+    f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted')
+    assert_almost_equal(f1, 0.96, decimal=1)
+
+    # Make sure that in the balanced case it does not change anything
+    # to use "balanced"
+    assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6)
+
+    # build an very very imbalanced dataset out of iris data
+    X_0 = X[y == 0, :]
+    y_0 = y[y == 0]
+
+    X_imbalanced = np.vstack([X] + [X_0] * 10)
+    y_imbalanced = np.concatenate([y] + [y_0] * 10)
+
+    # fit a model on the imbalanced data without class weight info
+    clf = klass(max_iter=1000, class_weight=None, shuffle=False)
+    clf.fit(X_imbalanced, y_imbalanced)
+    y_pred = clf.predict(X)
+    assert_less(metrics.f1_score(y, y_pred, average='weighted'), 0.96)
+
+    # fit a model with balanced class_weight enabled
+    clf = klass(max_iter=1000, class_weight="balanced",
+                shuffle=False)
+    clf.fit(X_imbalanced, y_imbalanced)
+    y_pred = clf.predict(X)
+    assert_greater(metrics.f1_score(y, y_pred, average='weighted'), 0.96)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_sample_weights(klass):
+    # Test weights on individual samples
+    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
+                  [1.0, 1.0], [1.0, 0.0]])
+    y = [1, 1, 1, -1, -1]
+
+    clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False)
+    clf.fit(X, y)
+    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
+
+    # we give a small weights to class 1
+    clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2)
+
+    # now the hyperplane should rotate clock-wise and
+    # the prediction on this point should shift
+    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_wrong_sample_weights(klass):
+    # Test if ValueError is raised if sample_weight has wrong shape
+    clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False)
+    # provided sample_weight too long
+    assert_raises(ValueError, clf.fit, X, Y, sample_weight=np.arange(7))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_partial_fit_exception(klass):
+    clf = klass(alpha=0.01)
+    # classes was not specified
+    assert_raises(ValueError, clf.partial_fit, X3, Y3)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_partial_fit_binary(klass):
+    third = X.shape[0] // 3
+    clf = klass(alpha=0.01)
+    classes = np.unique(Y)
+
+    clf.partial_fit(X[:third], Y[:third], classes=classes)
+    assert_equal(clf.coef_.shape, (1, X.shape[1]))
+    assert_equal(clf.intercept_.shape, (1,))
+    assert_equal(clf.decision_function([[0, 0]]).shape, (1, ))
+    id1 = id(clf.coef_.data)
+
+    clf.partial_fit(X[third:], Y[third:])
+    id2 = id(clf.coef_.data)
+    # check that coef_ haven't been re-allocated
+    assert id1, id2
+
+    y_pred = clf.predict(T)
+    assert_array_equal(y_pred, true_result)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_partial_fit_multiclass(klass):
+    third = X2.shape[0] // 3
+    clf = klass(alpha=0.01)
+    classes = np.unique(Y2)
+
+    clf.partial_fit(X2[:third], Y2[:third], classes=classes)
+    assert_equal(clf.coef_.shape, (3, X2.shape[1]))
+    assert_equal(clf.intercept_.shape, (3,))
+    assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3))
+    id1 = id(clf.coef_.data)
+
+    clf.partial_fit(X2[third:], Y2[third:])
+    id2 = id(clf.coef_.data)
+    # check that coef_ haven't been re-allocated
+    assert id1, id2
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_partial_fit_multiclass_average(klass):
+    third = X2.shape[0] // 3
+    clf = klass(alpha=0.01, average=X2.shape[0])
+    classes = np.unique(Y2)
+
+    clf.partial_fit(X2[:third], Y2[:third], classes=classes)
+    assert_equal(clf.coef_.shape, (3, X2.shape[1]))
+    assert_equal(clf.intercept_.shape, (3,))
+
+    clf.partial_fit(X2[third:], Y2[third:])
+    assert_equal(clf.coef_.shape, (3, X2.shape[1]))
+    assert_equal(clf.intercept_.shape, (3,))
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_fit_then_partial_fit(klass):
+    # Partial_fit should work after initial fit in the multiclass case.
+    # Non-regression test for #2496; fit would previously produce a
+    # Fortran-ordered coef_ that subsequent partial_fit couldn't handle.
+    clf = klass()
+    clf.fit(X2, Y2)
+    clf.partial_fit(X2, Y2)     # no exception here
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+@pytest.mark.parametrize('lr',
+                         ["constant", "optimal", "invscaling", "adaptive"])
+def test_partial_fit_equal_fit_classif(klass, lr):
+    for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)):
+        clf = klass(alpha=0.01, eta0=0.01, max_iter=2,
+                    learning_rate=lr, shuffle=False)
+        clf.fit(X_, Y_)
+        y_pred = clf.decision_function(T_)
         t = clf.t_
 
-        clf = self.factory(alpha=0.01, eta0=0.01,
-                           learning_rate=lr, shuffle=False)
+        classes = np.unique(Y_)
+        clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr,
+                    shuffle=False)
         for i in range(2):
-            clf.partial_fit(X, Y)
-        y_pred2 = clf.predict(T)
+            clf.partial_fit(X_, Y_, classes=classes)
+        y_pred2 = clf.decision_function(T_)
 
         assert_equal(clf.t_, t)
         assert_array_almost_equal(y_pred, y_pred2, decimal=2)
 
-    def test_partial_fit_equal_fit_constant(self):
-        self._test_partial_fit_equal_fit("constant")
 
-    def test_partial_fit_equal_fit_optimal(self):
-        self._test_partial_fit_equal_fit("optimal")
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_regression_losses(klass):
+    clf = klass(alpha=0.01, learning_rate="constant",
+                eta0=0.1, loss="epsilon_insensitive")
+    clf.fit(X, Y)
+    assert_equal(1.0, np.mean(clf.predict(X) == Y))
+
+    clf = klass(alpha=0.01, learning_rate="constant",
+                eta0=0.1, loss="squared_epsilon_insensitive")
+    clf.fit(X, Y)
+    assert_equal(1.0, np.mean(clf.predict(X) == Y))
+
+    clf = klass(alpha=0.01, loss="huber")
+    clf.fit(X, Y)
+    assert_equal(1.0, np.mean(clf.predict(X) == Y))
+
+    clf = klass(alpha=0.01, learning_rate="constant", eta0=0.01,
+                loss="squared_loss")
+    clf.fit(X, Y)
+    assert_equal(1.0, np.mean(clf.predict(X) == Y))
 
-    def test_partial_fit_equal_fit_invscaling(self):
-        self._test_partial_fit_equal_fit("invscaling")
 
-    def test_partial_fit_equal_fit_adaptive(self):
-        self._test_partial_fit_equal_fit("adaptive")
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_warm_start_multiclass(klass):
+    _test_warm_start(klass, X2, Y2, "optimal")
 
-    def test_loss_function_epsilon(self):
-        clf = self.factory(epsilon=0.9)
-        clf.set_params(epsilon=0.1)
-        assert clf.loss_functions['huber'][1] == 0.1
 
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+def test_multiple_fit(klass):
+    # Test multiple calls of fit w/ different shaped inputs.
+    clf = klass(alpha=0.01, shuffle=False)
+    clf.fit(X, Y)
+    assert hasattr(clf, "coef_")
 
-class SparseSGDRegressorTestCase(DenseSGDRegressorTestCase):
-    # Run exactly the same tests using the sparse representation variant
+    # Non-regression test: try fitting with a different label set.
+    y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)]
+    clf.fit(X[:, :-1], y)
+
+
+###############################################################################
+# Regression Test Case
 
-    factory_class = SparseSGDRegressor
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_sgd_reg(klass):
+    # Check that SGD gives any results.
+    clf = klass(alpha=0.1, max_iter=2, fit_intercept=False)
+    clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
+    assert_equal(clf.coef_[0], clf.coef_[1])
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_sgd_averaged_computed_correctly(klass):
+    # Tests the average regressor matches the naive implementation
+
+    eta = .001
+    alpha = .01
+    n_samples = 20
+    n_features = 10
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(n_samples, n_features))
+    w = rng.normal(size=n_features)
+
+    # simple linear function without noise
+    y = np.dot(X, w)
+
+    clf = klass(loss='squared_loss',
+                learning_rate='constant',
+                eta0=eta, alpha=alpha,
+                fit_intercept=True,
+                max_iter=1, average=True, shuffle=False)
+
+    clf.fit(X, y)
+    average_weights, average_intercept = asgd(klass, X, y, eta, alpha)
+
+    assert_array_almost_equal(clf.coef_,
+                              average_weights,
+                              decimal=16)
+    assert_almost_equal(clf.intercept_, average_intercept, decimal=16)
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_sgd_averaged_partial_fit(klass):
+    # Tests whether the partial fit yields the same average as the fit
+    eta = .001
+    alpha = .01
+    n_samples = 20
+    n_features = 10
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(n_samples, n_features))
+    w = rng.normal(size=n_features)
+
+    # simple linear function without noise
+    y = np.dot(X, w)
+
+    clf = klass(loss='squared_loss',
+                learning_rate='constant',
+                eta0=eta, alpha=alpha,
+                fit_intercept=True,
+                max_iter=1, average=True, shuffle=False)
+
+    clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)])
+    clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):])
+    average_weights, average_intercept = asgd(klass, X, y, eta, alpha)
+
+    assert_array_almost_equal(clf.coef_,
+                              average_weights,
+                              decimal=16)
+    assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16)
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_average_sparse(klass):
+    # Checks the average weights on data with 0s
+
+    eta = .001
+    alpha = .01
+    clf = klass(loss='squared_loss',
+                learning_rate='constant',
+                eta0=eta, alpha=alpha,
+                fit_intercept=True,
+                max_iter=1, average=True, shuffle=False)
+
+    n_samples = Y3.shape[0]
+
+    clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)])
+    clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):])
+    average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha)
+
+    assert_array_almost_equal(clf.coef_,
+                              average_weights,
+                              decimal=16)
+    assert_almost_equal(clf.intercept_, average_intercept, decimal=16)
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_sgd_least_squares_fit(klass):
+    xmin, xmax = -5, 5
+    n_samples = 100
+    rng = np.random.RandomState(0)
+    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
+
+    # simple linear function without noise
+    y = 0.5 * X.ravel()
+
+    clf = klass(loss='squared_loss', alpha=0.1, max_iter=20,
+                fit_intercept=False)
+    clf.fit(X, y)
+    score = clf.score(X, y)
+    assert_greater(score, 0.99)
+
+    # simple linear function with noise
+    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
+
+    clf = klass(loss='squared_loss', alpha=0.1, max_iter=20,
+                fit_intercept=False)
+    clf.fit(X, y)
+    score = clf.score(X, y)
+    assert_greater(score, 0.5)
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_sgd_epsilon_insensitive(klass):
+    xmin, xmax = -5, 5
+    n_samples = 100
+    rng = np.random.RandomState(0)
+    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
+
+    # simple linear function without noise
+    y = 0.5 * X.ravel()
+
+    clf = klass(loss='epsilon_insensitive', epsilon=0.01,
+                alpha=0.1, max_iter=20,
+                fit_intercept=False)
+    clf.fit(X, y)
+    score = clf.score(X, y)
+    assert score > 0.99
+
+    # simple linear function with noise
+    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
+
+    clf = klass(loss='epsilon_insensitive', epsilon=0.01,
+                alpha=0.1, max_iter=20,
+                fit_intercept=False)
+    clf.fit(X, y)
+    score = clf.score(X, y)
+    assert score > 0.5
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_sgd_huber_fit(klass):
+    xmin, xmax = -5, 5
+    n_samples = 100
+    rng = np.random.RandomState(0)
+    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
+
+    # simple linear function without noise
+    y = 0.5 * X.ravel()
+
+    clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20,
+                fit_intercept=False)
+    clf.fit(X, y)
+    score = clf.score(X, y)
+    assert_greater(score, 0.99)
+
+    # simple linear function with noise
+    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
+
+    clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20,
+                fit_intercept=False)
+    clf.fit(X, y)
+    score = clf.score(X, y)
+    assert_greater(score, 0.5)
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_elasticnet_convergence(klass):
+    # Check that the SGD output is consistent with coordinate descent
+
+    n_samples, n_features = 1000, 5
+    rng = np.random.RandomState(0)
+    X = rng.randn(n_samples, n_features)
+    # ground_truth linear model that generate y from X and to which the
+    # models should converge if the regularizer would be set to 0.0
+    ground_truth_coef = rng.randn(n_features)
+    y = np.dot(X, ground_truth_coef)
+
+    # XXX: alpha = 0.1 seems to cause convergence problems
+    for alpha in [0.01, 0.001]:
+        for l1_ratio in [0.5, 0.8, 1.0]:
+            cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
+                                         fit_intercept=False)
+            cd.fit(X, y)
+            sgd = klass(penalty='elasticnet', max_iter=50,
+                        alpha=alpha, l1_ratio=l1_ratio,
+                        fit_intercept=False)
+            sgd.fit(X, y)
+            err_msg = ("cd and sgd did not converge to comparable "
+                       "results for alpha=%f and l1_ratio=%f"
+                       % (alpha, l1_ratio))
+            assert_almost_equal(cd.coef_, sgd.coef_, decimal=2,
+                                err_msg=err_msg)
+
+
+@ignore_warnings
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_partial_fit(klass):
+    third = X.shape[0] // 3
+    clf = klass(alpha=0.01)
+
+    clf.partial_fit(X[:third], Y[:third])
+    assert_equal(clf.coef_.shape, (X.shape[1], ))
+    assert_equal(clf.intercept_.shape, (1,))
+    assert_equal(clf.predict([[0, 0]]).shape, (1, ))
+    id1 = id(clf.coef_.data)
+
+    clf.partial_fit(X[third:], Y[third:])
+    id2 = id(clf.coef_.data)
+    # check that coef_ haven't been re-allocated
+    assert id1, id2
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+@pytest.mark.parametrize('lr',
+                         ["constant", "optimal", "invscaling", "adaptive"])
+def _test_partial_fit_equal_fit(klass, lr):
+    clf = klass(alpha=0.01, max_iter=2, eta0=0.01,
+                learning_rate=lr, shuffle=False)
+    clf.fit(X, Y)
+    y_pred = clf.predict(T)
+    t = clf.t_
+
+    clf = klass(alpha=0.01, eta0=0.01,
+                learning_rate=lr, shuffle=False)
+    for i in range(2):
+        clf.partial_fit(X, Y)
+    y_pred2 = clf.predict(T)
+
+    assert_equal(clf.t_, t)
+    assert_array_almost_equal(y_pred, y_pred2, decimal=2)
+
+
+@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
+def test_loss_function_epsilon(klass):
+    clf = klass(epsilon=0.9)
+    clf.set_params(epsilon=0.1)
+    assert clf.loss_functions['huber'][1] == 0.1
 
 
 def test_l1_ratio():
@@ -1352,27 +1458,27 @@ def init(max_iter=None, tol=None, n_iter=None, for_partial_fit=False):
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
 def test_tol_and_max_iter_default_values():
     # Test that the default values are correctly changed
-    est = SGDClassifier()
+    est = linear_model.SGDClassifier()
     est._validate_params()
     assert_equal(est._tol, None)
     assert_equal(est._max_iter, 5)
 
-    est = SGDClassifier(n_iter=42)
+    est = linear_model.SGDClassifier(n_iter=42)
     est._validate_params()
     assert_equal(est._tol, None)
     assert_equal(est._max_iter, 42)
 
-    est = SGDClassifier(tol=1e-2)
+    est = linear_model.SGDClassifier(tol=1e-2)
     est._validate_params()
     assert_equal(est._tol, 1e-2)
     assert_equal(est._max_iter, 1000)
 
-    est = SGDClassifier(max_iter=42)
+    est = linear_model.SGDClassifier(max_iter=42)
     est._validate_params()
     assert_equal(est._tol, None)
     assert_equal(est._max_iter, 42)
 
-    est = SGDClassifier(max_iter=42, tol=1e-2)
+    est = linear_model.SGDClassifier(max_iter=42, tol=1e-2)
     est._validate_params()
     assert_equal(est._tol, 1e-2)
     assert_equal(est._max_iter, 42)
@@ -1521,10 +1627,10 @@ def test_multi_core_gridsearch_and_early_stopping():
 
 
 @pytest.mark.skipif(
-        not hasattr(sp, "random"),
-        reason="this test uses scipy.random, that was introduced in version  "
-        "0.17. This skip condition can be dropped as soon as we drop support "
-        "for scipy versions older than 0.17")
+    not hasattr(sp, "random"),
+    reason="this test uses scipy.random, that was introduced in version  "
+           "0.17. This skip condition can be dropped as soon as we drop "
+           "support for scipy versions older than 0.17")
 @pytest.mark.parametrize("backend",
                          ["loky", "multiprocessing", "threading"])
 def test_SGDClassifier_fit_for_all_backends(backend):

From 25e5ba1170b42ac8493c22175c3bafdd078df43f Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Date: Wed, 27 Feb 2019 10:15:21 +0100
Subject: [PATCH 2/5] review

---
 sklearn/linear_model/tests/test_sgd.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
index 236d3a9bea426..5b8787b9012ca 100644
--- a/sklearn/linear_model/tests/test_sgd.py
+++ b/sklearn/linear_model/tests/test_sgd.py
@@ -445,7 +445,8 @@ def test_sgd_bad_penalty(klass):
                   l1_ratio=0.85)
 
 
-@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
 def test_sgd_bad_loss(klass):
     # Check whether expected ValueError on bad loss
     assert_raises(ValueError, klass, loss="foobar")
@@ -1293,7 +1294,7 @@ def test_partial_fit(klass):
 @pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor])
 @pytest.mark.parametrize('lr',
                          ["constant", "optimal", "invscaling", "adaptive"])
-def _test_partial_fit_equal_fit(klass, lr):
+def test_partial_fit_equal_fit(klass, lr):
     clf = klass(alpha=0.01, max_iter=2, eta0=0.01,
                 learning_rate=lr, shuffle=False)
     clf.fit(X, Y)

From 3dbe40d0bd0c08fa78a875404a9ff0a93e48185d Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Date: Wed, 27 Feb 2019 10:19:15 +0100
Subject: [PATCH 3/5] move together tests that are common between reg and
 classifiers

---
 sklearn/linear_model/tests/test_sgd.py | 45 +++++++++++++-------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
index 5b8787b9012ca..06c696d83550f 100644
--- a/sklearn/linear_model/tests/test_sgd.py
+++ b/sklearn/linear_model/tests/test_sgd.py
@@ -138,7 +138,7 @@ def SparseSGDRegressor(**kwargs):
 
 
 ###############################################################################
-# Tests common to classification and regression
+# Common Test Case to classification and regression
 
 # a simple implementation of ASGD to use for testing
 # uses squared loss to find the gradient
@@ -176,6 +176,27 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0):
     return average_weights, average_intercept
 
 
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_sgd_bad_alpha(klass):
+    # Check whether expected ValueError on bad alpha
+    assert_raises(ValueError, klass, alpha=-.1)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_sgd_bad_penalty(klass):
+    # Check whether expected ValueError on bad penalty
+    assert_raises(ValueError, klass, penalty='foobar',
+                  l1_ratio=0.85)
+
+
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
+                                   SGDRegressor, SparseSGDRegressor])
+def test_sgd_bad_loss(klass):
+    # Check whether expected ValueError on bad loss
+    assert_raises(ValueError, klass, loss="foobar")
+
 def _test_warm_start(klass, X, Y, lr):
     # Test that explicit warm restart...
     clf = klass(alpha=0.01, eta0=0.01, shuffle=False,
@@ -430,28 +451,6 @@ def test_sgd_bad_eta0(klass):
                   learning_rate="constant")
 
 
-@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
-                                   SGDRegressor, SparseSGDRegressor])
-def test_sgd_bad_alpha(klass):
-    # Check whether expected ValueError on bad alpha
-    assert_raises(ValueError, klass, alpha=-.1)
-
-
-@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
-                                   SGDRegressor, SparseSGDRegressor])
-def test_sgd_bad_penalty(klass):
-    # Check whether expected ValueError on bad penalty
-    assert_raises(ValueError, klass, penalty='foobar',
-                  l1_ratio=0.85)
-
-
-@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier,
-                                   SGDRegressor, SparseSGDRegressor])
-def test_sgd_bad_loss(klass):
-    # Check whether expected ValueError on bad loss
-    assert_raises(ValueError, klass, loss="foobar")
-
-
 @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier])
 def test_sgd_max_iter_param(klass):
     # Test parameter validity check

From 33a34554ebf15e904f3261ddef7e2a5baea993db Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Date: Wed, 27 Feb 2019 10:19:26 +0100
Subject: [PATCH 4/5] pep8

---
 sklearn/linear_model/tests/test_sgd.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
index 06c696d83550f..4824c06b7dd7f 100644
--- a/sklearn/linear_model/tests/test_sgd.py
+++ b/sklearn/linear_model/tests/test_sgd.py
@@ -197,6 +197,7 @@ def test_sgd_bad_loss(klass):
     # Check whether expected ValueError on bad loss
     assert_raises(ValueError, klass, loss="foobar")
 
+
 def _test_warm_start(klass, X, Y, lr):
     # Test that explicit warm restart...
     clf = klass(alpha=0.01, eta0=0.01, shuffle=False,

From dac3cbbbac119afd79829591901931a49ba71a72 Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Date: Wed, 27 Feb 2019 15:48:43 +0100
Subject: [PATCH 5/5] lint

---
 sklearn/linear_model/tests/test_sgd.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
index 4824c06b7dd7f..405f7003798ff 100644
--- a/sklearn/linear_model/tests/test_sgd.py
+++ b/sklearn/linear_model/tests/test_sgd.py
@@ -727,13 +727,13 @@ def test_sgd_proba(klass):
     d = clf.decision_function([[-1, -1]])
     assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))
 
-    l = clf.predict_log_proba([[3, 2]])
+    lp = clf.predict_log_proba([[3, 2]])
     p = clf.predict_proba([[3, 2]])
-    assert_array_almost_equal(np.log(p), l)
+    assert_array_almost_equal(np.log(p), lp)
 
-    l = clf.predict_log_proba([[-1, -1]])
+    lp = clf.predict_log_proba([[-1, -1]])
     p = clf.predict_proba([[-1, -1]])
-    assert_array_almost_equal(np.log(p), l)
+    assert_array_almost_equal(np.log(p), lp)
 
     # Modified Huber multiclass probability estimates; requires a separate
     # test because the hard zero/one probabilities may destroy the