8000 [MRG+2] Adding Implementation of SAG - next episode by TomDLT · Pull Request #4738 · scikit-learn/scikit-learn · GitHub

[MRG+2] Adding Implementation of SAG - next episode #4738

Closed
wants to merge 11 commits into from
1 change: 1 addition & 0 deletions .gitattributes
@@ -9,6 +9,7 @@
/sklearn/feature_extraction/_hashing.c -diff
/sklearn/linear_model/cd_fast.c -diff
/sklearn/linear_model/sgd_fast.c -diff
/sklearn/linear_model/sag_fast.c -diff
/sklearn/metrics/pairwise_fast.c -diff
/sklearn/neighbors/ball_tree.c -diff
/sklearn/neighbors/kd_tree.c -diff
5 changes: 3 additions & 2 deletions benchmarks/bench_covertype.py
@@ -53,7 +53,7 @@

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
@@ -105,7 +105,8 @@ def load_data(dtype=np.float32, order='C', random_state=13):
'SGD': SGDClassifier(alpha=0.001, n_iter=2),
'GaussianNB': GaussianNB(),
'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False,
tol=1e-3)
tol=1e-3),
'SAG': LogisticRegression(solver='sag', max_iter=2, C=1000)
}


6 changes: 4 additions & 2 deletions benchmarks/bench_mnist.py
@@ -47,6 +47,7 @@
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
@@ -86,7 +87,8 @@ def load_data(dtype=np.float32, order='F'):
'Nystroem-SVM':
make_pipeline(Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
'SampledRBF-SVM':
make_pipeline(RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100))
make_pipeline(RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4)
}


@@ -120,7 +122,7 @@ def load_data(dtype=np.float32, order='F'):
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25),
X_train.shape[0], int(X_train.nbytes / 1e6)))
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25),
X_test.shape[0], int(X_test.nbytes / 1e6)))
X_test.shape[0], int(X_test.nbytes / 1e6)))

print()
print("Training Classifiers")
236 changes: 236 additions & 0 deletions benchmarks/bench_rcv1_logreg_convergence.py
@@ -0,0 +1,236 @@
# Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
# Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
import gc
import time

from sklearn.externals.joblib import Memory
from sklearn.linear_model import (LogisticRegression, SGDClassifier)
from sklearn.datasets import fetch_rcv1
from sklearn.linear_model.sag import get_auto_step_size
from sklearn.linear_model.sag_fast import get_max_squared_sum


try:
import lightning.classification as lightning_clf
except ImportError:
lightning_clf = None

m = Memory(cachedir='.', verbose=0)


# compute logistic loss
def get_loss(w, intercept, myX, myy, C):
n_samples = myX.shape[0]
w = w.ravel()
p = np.mean(np.log(1. + np.exp(-myy * (myX.dot(w) + intercept))))
print("%f + %f" % (p, w.dot(w) / 2. / C / n_samples))
p += w.dot(w) / 2. / C / n_samples
return p
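
# Up to a constant factor of 1 / (C * n_samples), this is the objective that
# LogisticRegression minimizes: the mean logistic loss
#     mean(log(1 + exp(-y * (X.dot(w) + intercept))))
# plus the L2 term w.dot(w) / (2 * C * n_samples), so every solver below is
# scored on the same quantity.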


# We use joblib to cache individual fits. Note that we do not pass the dataset
# as argument as the hashing would be too slow, so we assume that the dataset
# never changes.
@m.cache()
def bench_one(name, clf_type, clf_params, n_iter):
clf = clf_type(**clf_params)
try:
clf.set_params(max_iter=n_iter, random_state=42)
except:
clf.set_params(n_iter=n_iter, random_state=42)

st = time.time()
clf.fit(X, y)
end = time.time()

try:
C = 1.0 / clf.alpha / n_samples
except:
C = clf.C

try:
intercept = clf.intercept_
except:
intercept = 0.

train_loss = get_loss(clf.coef_, intercept, X, y, C)
train_score = clf.score(X, y)
test_score = clf.score(X_test, y_test)
duration = end - st

return train_loss, train_score, test_score, duration


def bench(clfs):
for (name, clf, iter_range, train_losses, train_scores,
test_scores, durations) in clfs:
print("training %s" % name)
clf_type = type(clf)
clf_params = clf.get_params()

for n_iter in iter_range:
gc.collect()

train_loss, train_score, test_score, duration = bench_one(
name, clf_type, clf_params, n_iter)

train_losses.append(train_loss)
train_scores.append(train_score)
test_scores.append(test_score)
durations.append(duration)
print("classifier: %s" % name)
print("train_loss: %.8f" % train_loss)
print("train_score: %.8f" % train_score)
print("test_score: %.8f" % test_score)
print("time for fit: %.8f seconds" % duration)
print("")

print("")
return clfs


def plot_train_losses(clfs):
plt.figure()
for (name, _, _, train_losses, _, _, durations) in clfs:
plt.plot(durations, train_losses, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("train loss")


def plot_train_scores(clfs):
plt.figure()
for (name, _, _, _, train_scores, _, durations) in clfs:
plt.plot(durations, train_scores, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("train score")
plt.ylim((0.92, 0.96))


def plot_test_scores(clfs):
plt.figure()
for (name, _, _, _, _, test_scores, durations) in clfs:
plt.plot(durations, test_scores, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("test score")
plt.ylim((0.92, 0.96))


def plot_dloss(clfs):
plt.figure()
pobj_final = []
for (name, _, _, train_losses, _, _, durations) in clfs:
pobj_final.append(train_losses[-1])

indices = np.argsort(pobj_final)
pobj_best = pobj_final[indices[0]]

for (name, _, _, train_losses, _, _, durations) in clfs:
log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10)

plt.plot(durations, log_pobj, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("log(best - train_loss)")


rcv1 = fetch_rcv1()
X = rcv1.data
n_samples, n_features = X.shape

# consider the binary classification problem 'CCAT' vs the rest
ccat_idx = rcv1.target_names.tolist().index('CCAT')
y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64)
y[y == 0] = -1

# parameters
C = 1.
fit_intercept = True
tol = 1.0e-14

# max_iter range
sgd_iter_range = list(range(1, 121, 10))
newton_iter_range = list(range(1, 25, 3))
lbfgs_iter_range = list(range(1, 242, 12))
liblinear_iter_range = list(range(1, 37, 3))
liblinear_dual_iter_range = list(range(1, 85, 6))
sag_iter_range = list(range(1, 37, 3))

clfs = [
("LR-liblinear",
LogisticRegression(C=C, tol=tol,
solver="liblinear", fit_intercept=fit_intercept,
intercept_scaling=1),
liblinear_iter_range, [], [], [], []),
("LR-liblinear-dual",
LogisticRegression(C=C, tol=tol, dual=True,
solver="liblinear", fit_intercept=fit_intercept,
intercept_scaling=1),
liblinear_dual_iter_range, [], [], [], []),
("LR-SAG",
LogisticRegression(C=C, tol=tol,
solver="sag", fit_intercept=fit_intercept),
sag_iter_range, [], [], [], []),
("LR-newton-cg",
LogisticRegression(C=C, tol=tol, solver="newton-cg",
fit_intercept=fit_intercept),
newton_iter_range, [], [], [], []),
("LR-lbfgs",
LogisticRegression(C=C, tol=tol,
solver="lbfgs", fit_intercept=fit_intercept),
lbfgs_iter_range, [], [], [], []),
("SGD",
SGDClassifier(alpha=1.0 / C / n_samples, penalty='l2', loss='log',
fit_intercept=fit_intercept, verbose=0),
sgd_iter_range, [], [], [], [])]


if lightning_clf is not None and not fit_intercept:
alpha = 1. / C / n_samples
# compute the same step_size as in LR-sag
max_squared_sum = get_max_squared_sum(X)
step_size = get_auto_step_size(max_squared_sum, alpha, "log",
fit_intercept)
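# (get_auto_step_size derives this step size from the largest squared sample
# norm plus the regularization strength, an estimate of the Lipschitz constant
# of the per-sample log loss, so the lightning solvers below run with roughly
# the same step size as LR-SAG.)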

clfs.append(
("Lightning-SVRG",
lightning_clf.SVRGClassifier(alpha=alpha, eta=step_size,
tol=tol, loss="log"),
sag_iter_range, [], [], [], []))
clfs.append(
("Lightning-SAG",
lightning_clf.SAGClassifier(alpha=alpha, eta=step_size,
tol=tol, loss="log"),
sag_iter_range, [], [], [], []))

# We keep only 200 features, to have a dense dataset,
# and compare to lightning SAG, which seems incorrect in the sparse case.
X_csc = X.tocsc()
nnz_in_each_features = X_csc.indptr[1:] - X_csc.indptr[:-1]
X = X_csc[:, np.argsort(nnz_in_each_features)[-200:]]
X = X.toarray()
print("dataset: %.3f MB" % (X.nbytes / 1e6))


# Split training and testing. Switch train and test subset compared to
# LYRL2004 split, to have a larger training dataset.
n = 23149
X_test = X[:n, :]
y_test = y[:n]
X = X[n:, :]
y = y[n:]

clfs = bench(clfs)

plot_train_scores(clfs)
plot_test_scores(clfs)
plot_train_losses(clfs)
plot_dloss(clfs)
plt.show()
35 changes: 24 additions & 11 deletions doc/modules/linear_model.rst
@@ -104,7 +104,7 @@ its ``coef_`` member::
>>> clf = linear_model.Ridge (alpha = .5)
>>> clf.fit ([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +NORMALIZE_WHITESPACE
Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, solver='auto', tol=0.001)
normalize=False, random_state=None, solver='auto', tol=0.001)
>>> clf.coef_
array([ 0.34545455, 0.34545455])
>>> clf.intercept_ #doctest: +ELLIPSIS
@@ -670,17 +670,12 @@ Similarly, L1 regularized logistic regression solves the following optimization

The solvers implemented in the class :class:`LogisticRegression`
are "liblinear" (which is a wrapper around the C++ library,
LIBLINEAR), "newton-cg" and "lbfgs".
LIBLINEAR), "newton-cg", "lbfgs" and "sag".

The lbfgs and newton-cg solvers only support L2 penalization and are found
The "lbfgs" and "newton-cg" solvers only support L2 penalization and are found
to converge faster for some high dimensional data. L1 penalization yields
sparse predicting weights.

Several estimators are available for logistic regression.

:class:`LogisticRegression` has an option of using three solvers,
"liblinear", "lbfgs" and "newton-cg".

The solver "liblinear" uses a coordinate descent (CD) algorithm based on
Liblinear. For L1 penalization :func:`sklearn.svm.l1_min_c` allows to
calculate the lower bound for C in order to get a non "null" (all feature weights to
@@ -697,8 +692,23 @@ Setting `multi_class` to "multinomial" with the "lbfgs" or "newton-cg" solver
in :class:`LogisticRegression` learns a true multinomial logistic
regression model, which means that its probability estimates should
be better calibrated than the default "one-vs-rest" setting.
L-BFGS and newton-cg cannot optimize L1-penalized models, though,
so the "multinomial" setting does not learn sparse models.
"lbfgs", "newton-cg" and "sag" solvers cannot optimize L1-penalized models, though, so the "multinomial" setting does not learn sparse models.

The solver "sag" uses a Stochastic Average Gradient descent [3]_. It does not
handle "multinomial" case, and is limited to L2-penalized models, yet it is
often faster than other solvers for large datasets, when both the number of
samples and the number of features are large.
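
As an illustration only (a minimal sketch with arbitrary dataset sizes and
parameter values), fitting the "sag" solver could look like::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # many samples and many features: the regime where "sag" tends to pay off
    X, y = make_classification(n_samples=50000, n_features=200, random_state=0)

    # "sag" only supports the L2 penalty; max_iter bounds the number of passes
    clf = LogisticRegression(solver='sag', penalty='l2', max_iter=20, tol=1e-3)
    clf.fit(X, y)
    print(clf.score(X, y))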

In a nutshell, one may choose the solver with the following rules:

=========================== ======================
Case                        Solver
=========================== ======================
Small dataset or L1 penalty "liblinear"
Multinomial loss            "lbfgs" or "newton-cg"
Large dataset               "sag"
=========================== ======================

For large datasets, you may also consider using :class:`SGDClassifier` with
the 'log' loss.
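
As a hedged sketch (reusing ``X``, ``y`` from the previous snippet and the
mapping ``alpha = 1. / (C * n_samples)`` between the two parametrizations, as
in the RCV1 benchmark), the :class:`SGDClassifier` alternative could be::

    from sklearn.linear_model import SGDClassifier

    C = 1.
    n_samples = X.shape[0]
    # SGD's alpha plays the role of 1 / (C * n_samples) in LogisticRegression
    sgd = SGDClassifier(loss='log', penalty='l2', alpha=1. / (C * n_samples),
                        n_iter=20, random_state=0)
    sgd.fit(X, y)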

.. topic:: Examples:

@@ -729,13 +739,16 @@

:class:`LogisticRegressionCV` implements Logistic Regression with
builtin cross-validation to find out the optimal C parameter.
"newton-cg" and "lbfgs" solvers are found to be faster
"newton-cg", "sag" and "lbfgs" solvers are found to be faster
for high-dimensional dense data, due to warm-starting.
For the multiclass case, if `multi_class`
option is set to "ovr", an optimal C is obtained for each class and if
the `multi_class` option is set to "multinomial", an optimal C is
obtained that minimizes the cross-entropy loss.
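
A minimal sketch (parameter values chosen arbitrarily for illustration) of the
cross-validated estimator with the new solver::

    from sklearn.linear_model import LogisticRegressionCV

    # Cs=10 searches ten values of C on a logarithmic grid; warm-starting along
    # this regularization path is what makes the solvers above fast here
    clf_cv = LogisticRegressionCV(Cs=10, cv=3, solver='sag', tol=1e-3)
    clf_cv.fit(X, y)
    print(clf_cv.C_)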

.. topic:: References:

.. [3] Mark Schmidt, Nicolas Le Roux, and Francis Bach: `Minimizing Finite Sums with the Stochastic Average Gradient. <http://hal.inria.fr/hal-00860051/PDF/sag_journal.pdf>`_

Stochastic Gradient Descent - SGD
=================================
11 changes: 10 additions & 1 deletion doc/modules/sgd.rst
@@ -161,6 +161,7 @@ further information.
- :ref:`example_linear_model_plot_sgd_separating_hyperplane.py`,
- :ref:`example_linear_model_plot_sgd_iris.py`
- :ref:`example_linear_model_plot_sgd_weighted_samples.py`
- :ref:`example_linear_model_plot_sgd_comparison.py`
- :ref:`example_svm_plot_separating_hyperplane_unbalanced.py` (See the `Note`)

:class:`SGDClassifier` supports averaged SGD (ASGD). Averaging can be enabled
@@ -169,6 +170,10 @@ of the plain SGD over each iteration over a sample. When using ASGD
the learning rate can be larger and even constant leading on some
datasets to a speed up in training time.

For classification with a logistic loss, another variant of SGD with an
averaging strategy is available with the Stochastic Average Gradient (SAG)
algorithm, offered as a solver in :class:`LogisticRegression`.
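
To make the two averaging strategies concrete, a small sketch (hypothetical
parameter values, any classification arrays ``X``, ``y``) could compare them::

    from sklearn.linear_model import SGDClassifier, LogisticRegression

    # ASGD: plain SGD whose iterates are averaged
    asgd = SGDClassifier(loss='log', penalty='l2', average=True, n_iter=10)

    # SAG: keeps a memory of per-sample gradients and averages those instead
    sag = LogisticRegression(solver='sag', max_iter=10)

    for model in (asgd, sag):
        model.fit(X, y)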

Regression
==========

@@ -192,7 +197,11 @@ specified via the parameter ``epsilon``. This parameter depends on the
scale of the target variables.

:class:`SGDRegressor` supports averaged SGD as :class:`SGDClassifier`.
Averaging can be enabled by setting ```average=True```
Averaging can be enabled by setting ``average=True``.

For regression with a squared loss and an l2 penalty, another variant of
SGD with an averaging strategy is available with the Stochastic Average
Gradient (SAG) algorithm, offered as a solver in :class:`Ridge`.
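
A hedged sketch, assuming the ``solver='sag'`` option for :class:`Ridge`
described above (sizes and penalties are arbitrary)::

    import numpy as np
    from sklearn.linear_model import Ridge, SGDRegressor

    rng = np.random.RandomState(0)
    X_reg = rng.randn(10000, 50)
    y_reg = X_reg[:, 0] + 0.1 * rng.randn(10000)

    # averaged SGD on the squared loss with an l2 penalty ...
    asgd_reg = SGDRegressor(loss='squared_loss', penalty='l2', average=True,
                            n_iter=10)
    # ... and Ridge solved with SAG, minimizing the same kind of objective
    ridge_sag = Ridge(alpha=1.0, solver='sag')

    for model in (asgd_reg, ridge_sag):
        model.fit(X_reg, y_reg)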


Stochastic Gradient Descent for sparse data