8000 [MRG+2] Adding Implementation of SAG - next episode by TomDLT · Pull Request #4738 · scikit-learn/scikit-learn · GitHub

[MRG+2] Adding Implementation of SAG - next episode #4738

Closed
wants to merge 11 commits into from
1 change: 1 addition & 0 deletions .gitattributes
@@ -9,6 +9,7 @@
/sklearn/feature_extraction/_hashing.c -diff
/sklearn/linear_model/cd_fast.c -diff
/sklearn/linear_model/sgd_fast.c -diff
/sklearn/linear_model/sag_fast.c -diff
/sklearn/metrics/pairwise_fast.c -diff
/sklearn/neighbors/ball_tree.c -diff
/sklearn/neighbors/kd_tree.c -diff
5 changes: 3 additions & 2 deletions benchmarks/bench_covertype.py
@@ -53,7 +53,7 @@

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
@@ -105,7 +105,8 @@ def load_data(dtype=np.float32, order='C', random_state=13):
'SGD': SGDClassifier(alpha=0.001, n_iter=2),
'GaussianNB': GaussianNB(),
'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False,
tol=1e-3)
tol=1e-3),
'SAG': LogisticRegression(solver='sag', max_iter=2, C=1000)
}


6 changes: 4 additions & 2 deletions benchmarks/bench_mnist.py
@@ -47,6 +47,7 @@
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
@@ -86,7 +87,8 @@ def load_data(dtype=np.float32, order='F'):
'Nystroem-SVM':
make_pipeline(Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
'SampledRBF-SVM':
make_pipeline(RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100))
make_pipeline(RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4)
}


@@ -120,7 +122,7 @@ def load_data(dtype=np.float32, order='F'):
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25),
X_train.shape[0], int(X_train.nbytes / 1e6)))
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25),
X_test.shape[0], int(X_test.nbytes / 1e6)))
X_test.shape[0], int(X_test.nbytes / 1e6)))

print()
print("Training Classifiers")
236 changes: 236 additions & 0 deletions benchmarks/bench_rcv1_logreg_convergence.py
@@ -0,0 +1,236 @@
# Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
# Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
import gc
import time

from sklearn.externals.joblib import Memory
from sklearn.linear_model import (LogisticRegression, SGDClassifier)
from sklearn.datasets import fetch_rcv1
from sklearn.linear_model.sag import get_auto_step_size
from sklearn.linear_model.sag_fast import get_max_squared_sum


try:
import lightning.classification as lightning_clf
except ImportError:
lightning_clf = None

m = Memory(cachedir='.', verbose=0)


# compute logistic loss
def get_loss(w, intercept, myX, myy, C):
n_samples = myX.shape[0]
w = w.ravel()
p = np.mean(np.log(1. + np.exp(-myy * (myX.dot(w) + intercept))))
print("%f + %f" % (p, w.dot(w) / 2. / C / n_samples))
p += w.dot(w) / 2. / C / n_samples
return p
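
# Up to a constant factor of 1 / (C * n_samples), this is the objective that
# LogisticRegression minimizes: the mean logistic loss
#     mean(log(1 + exp(-y * (X.dot(w) + intercept))))
# plus the L2 term w.dot(w) / (2 * C * n_samples), so every solver below is
# scored on the same quantity.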


# We use joblib to cache individual fits. Note that we do not pass the dataset
# as argument as the hashing would be too slow, so we assume that the dataset
# never changes.
@m.cache()
def bench_one(name, clf_type, clf_params, n_iter):
clf = clf_type(**clf_params)
try:
clf.set_params(max_iter=n_iter, random_state=42)
except:
clf.set_params(n_iter=n_iter, random_state=42)

st = time.time()
clf.fit(X, y)
end = time.time()

try:
C = 1.0 / clf.alpha / n_samples
except:
C = clf.C

try:
intercept = clf.intercept_
except:
intercept = 0.

train_loss = get_loss(clf.coef_, intercept, X, y, C)
train_score = clf.score(X, y)
test_score = clf.score(X_test, y_test)
duration = end - st

return train_loss, train_score, test_score, duration


def bench(clfs):
for (name, clf, iter_range, train_losses, train_scores,
test_scores, durations) in clfs:
print("training %s" % name)
clf_type = type(clf)
clf_params = clf.get_params()

for n_iter in iter_range:
gc.collect()

train_loss, train_score, test_score, duration = bench_one(
name, clf_type, clf_params, n_iter)

train_losses.append(train_loss)
train_scores.append(train_score)
test_scores.append(test_score)
durations.append(duration)
print("classifier: %s" % name)
print("train_loss: %.8f" % train_loss)
print("train_score: %.8f" % train_score)
print("test_score: %.8f" % test_score)
print("time for fit: %.8f seconds" % duration)
print("")

print("")
return clfs


def plot_train_losses(clfs):
plt.figure()
for (name, _, _, train_losses, _, _, durations) in clfs:
plt.plot(durations, train_losses, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("train loss")


def plot_train_scores(clfs):
plt.figure()
for (name, _, _, _, train_scores, _, durations) in clfs:
plt.plot(durations, train_scores, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("train score")
plt.ylim((0.92, 0.96))


def plot_test_scores(clfs):
plt.figure()
for (name, _, _, _, _, test_scores, durations) in clfs:
plt.plot(durations, test_scores, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("test score")
plt.ylim((0.92, 0.96))


def plot_dloss(clfs):
plt.figure()
pobj_final = []
for (name, _, _, train_losses, _, _, durations) in clfs:
pobj_final.append(train_losses[-1])

indices = np.argsort(pobj_final)
pobj_best = pobj_final[indices[0]]

for (name, _, _, train_losses, _, _, durations) in clfs:
log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10)

plt.plot(durations, log_pobj, '-o', label=name)
plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("log(best - train_loss)")


rcv1 = fetch_rcv1()
X = rcv1.data
n_samples, n_features = X.shape

# consider the binary classification problem 'CCAT' vs the rest
ccat_idx = rcv1.target_names.tolist().index('CCAT')
y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64)
y[y == 0] = -1

# parameters
C = 1.
fit_intercept = True
tol = 1.0e-14

# max_iter range
sgd_iter_range = list(range(1, 121, 10))
newton_iter_range = list(range(1, 25, 3))
lbfgs_iter_range = list(range(1, 242, 12))
liblinear_iter_range = list(range(1, 37, 3))
liblinear_dual_iter_range = list(range(1, 85, 6))
sag_iter_range = list(range(1, 37, 3))

clfs = [
("LR-liblinear",
LogisticRegression(C=C, tol=tol,
solver="liblinear", fit_intercept=fit_intercept,
intercept_scaling=1),
liblinear_iter_range, [], [], [], []),
("LR-liblinear-dual",
LogisticRegression(C=C, tol=tol, dual=True,
solver="liblinear", fit_intercept=fit_intercept,
intercept_scaling=1),
liblinear_dual_iter_range, [], [], [], []),
("LR-SAG",
LogisticRegression(C=C, tol=tol,
solver="sag", fit_intercept=fit_intercept),
sag_iter_range, [], [], [], []),
("LR-newton-cg",
LogisticRegression(C=C, tol=tol, solver="newton-cg",
fit_intercept=fit_intercept),
newton_iter_range, [], [], [], []),
("LR-lbfgs",
LogisticRegression(C=C, tol=tol,
solver="lbfgs", fit_intercept=fit_intercept),
lbfgs_iter_range, [], [], [], []),
("SGD",
SGDClassifier(alpha=1.0 / C / n_samples, penalty='l2', loss='log',
fit_intercept=fit_intercept, verbose=0),
sgd_iter_range, [], [], [], [])]


if lightning_clf is not None and not fit_intercept:
alpha = 1. / C / n_samples
# compute the same step_size as in LR-sag
max_squared_sum = get_max_squared_sum(X)
step_size = get_auto_step_size(max_squared_sum, alpha, "log",
fit_intercept)
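# (get_auto_step_size derives this step size from the largest squared sample
# norm plus the regularization strength, an estimate of the Lipschitz constant
# of the per-sample log loss, so the lightning solvers below run with roughly
# the same step size as LR-SAG.)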

clfs.append(
("Lightning-SVRG",
lightning_clf.SVRGClassifier(alpha=alpha, eta=step_size,
tol=tol, loss="log"),
sag_iter_range, [], [], [], []))
clfs.append(
("Lightning-SAG",
lightning_clf.SAGClassifier(alpha=alpha, eta=step_size,
tol=tol, loss="log"),
sag_iter_range, [], [], [], []))

# We keep only 200 features, to have a dense dataset,
# and compare to lightning SAG, which seems incorrect in the sparse case.
X_csc = X.tocsc()
nnz_in_each_features = X_csc.indptr[1:] - X_csc.indptr[:-1]
X = X_csc[:, np.argsort(nnz_in_each_features)[-200:]]
X = X.toarray()
print("dataset: %.3f MB" % (X.nbytes / 1e6))


# Split training and testing. Switch train and test subset compared to
# LYRL2004 split, to have a larger training dataset.
n = 23149
X_test = X[:n, :]
y_test = y[:n]
X = X[n:, :]
y = y[n:]

clfs = bench(clfs)

plot_train_scores(clfs)
plot_test_scores(clfs)
plot_train_losses(clfs)
plot_dloss(clfs)
plt.show()
35 changes: 24 additions & 11 deletions doc/modules/linear_model.rst
@@ -104,7 +104,7 @@ its ``coef_`` member::
>>> clf = linear_model.Ridge (alpha = .5)
>>> clf.fit ([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +NORMALIZE_WHITESPACE
Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, solver='auto', tol=0.001)
normalize=False, random_state=None, solver='auto', tol=0.001)
>>> clf.coef_
array([ 0.34545455, 0.34545455])
>>> clf.intercept_ #doctest: +ELLIPSIS
@@ -670,17 +670,12 @@ Similarly, L1 regularized logistic regression solves the following optimization

The solvers implemented in the class :class:`LogisticRegression`
are "liblinear" (which is a wrapper around the C++ library,
LIBLINEAR), "newton-cg" and "lbfgs".
LIBLINEAR), "newton-cg", "lbfgs" and "sag".

The lbfgs and newton-cg solvers only support L2 penalization and are found
The "lbfgs" and "newton-cg" solvers only support L2 penalization and are found
to converge faster for some high dimensional data. L1 penalization yields
sparse predicting weights.

Several estimators are available for logistic regression.

:class:`LogisticRegression` has an option of using three solvers,
"liblinear", "lbfgs" and "newton-cg".

The solver "liblinear" uses a coordinate descent (CD) algorithm based on
Liblinear. For L1 penalization :func:`sklearn.svm.l1_min_c` allows to
calculate the lower bound for C in order to get a non "null" (all feature weights to
@@ -697,8 +692,23 @@ Setting `multi_class` to "multinomial" with the "lbfgs" or "newton-cg" solver
in :class:`LogisticRegression` learns a true multinomial logistic
regression model, which means that its probability estimates should
be better calibrated than the default "one-vs-rest" setting.
L-BFGS and newton-cg cannot optimize L1-penalized models, though,
so the "multinomial" setting does not learn sparse models.
"lbfgs", "newton-cg" and "sag" solvers cannot optimize L1-penalized models, though, so the "multinomial" setting does not learn sparse models.

The solver "sag" uses a Stochastic Average Gradient descent [3]_. It does not
handle "multinomial" case, and is limited to L2-penalized models, yet it is
often faster than other solvers for large datasets, when both the number of
samples and the number of features are large.
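
As an illustration only (a minimal sketch with arbitrary dataset sizes and
parameter values), fitting the "sag" solver could look like::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # many samples and many features: the regime where "sag" tends to pay off
    X, y = make_classification(n_samples=50000, n_features=200, random_state=0)

    # "sag" only supports the L2 penalty; max_iter bounds the number of passes
    clf = LogisticRegression(solver='sag', penalty='l2', max_iter=20, tol=1e-3)
    clf.fit(X, y)
    print(clf.score(X, y))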

In a nutshell, one may choose the solver with the following rules:

=========================== ======================
Case                        Solver
=========================== ======================
Small dataset or L1 penalty "liblinear"
Multinomial loss            "lbfgs" or "newton-cg"
Large dataset               "sag"
=========================== ======================

For large datasets, you may also consider using :class:`SGDClassifier` with
the 'log' loss.
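
As a hedged sketch (reusing ``X``, ``y`` from the previous snippet and the
mapping ``alpha = 1. / (C * n_samples)`` between the two parametrizations, as
in the RCV1 benchmark), the :class:`SGDClassifier` alternative could be::

    from sklearn.linear_model import SGDClassifier

    C = 1.
    n_samples = X.shape[0]
    # SGD's alpha plays the role of 1 / (C * n_samples) in LogisticRegression
    sgd = SGDClassifier(loss='log', penalty='l2', alpha=1. / (C * n_samples),
                        n_iter=20, random_state=0)
    sgd.fit(X, y)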

.. topic:: Examples:

@@ -729,13 +739,16 @@

:class:`LogisticRegressionCV` implements Logistic Regression with
builtin cross-validation to find out the optimal C parameter.
"newton-cg" and "lbfgs" solvers are found to be faster
"newton-cg", "sag" and "lbfgs" solvers are found to be faster
for high-dimensional dense data, due to warm-starting.
For the multiclass case, if `multi_class`
option is set to "ovr", an optimal C is obtained for each class and if
the `multi_class` option is set to "multinomial", an optimal C is
obtained that minimizes the cross-entropy loss.
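
A minimal sketch (parameter values chosen arbitrarily for illustration) of the
cross-validated estimator with the new solver::

    from sklearn.linear_model import LogisticRegressionCV

    # Cs=10 searches ten values of C on a logarithmic grid; warm-starting along
    # this regularization path is what makes the solvers above fast here
    clf_cv = LogisticRegressionCV(Cs=10, cv=3, solver='sag', tol=1e-3)
    clf_cv.fit(X, y)
    print(clf_cv.C_)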

.. topic:: References:

.. [3] Mark Schmidt, Nicolas Le Roux, and Francis Bach: `Minimizing Finite Sums with the Stochastic Average Gradient. <http://hal.inria.fr/hal-00860051/PDF/sag_journal.pdf>`_

Stochastic Gradient Descent - SGD
=================================
11 changes: 10 additions & 1 deletion doc/modules/sgd.rst
@@ -161,6 +161,7 @@ further information.
- :ref:`example_linear_model_plot_sgd_separating_hyperplane.py`,
- :ref:`example_linear_model_plot_sgd_iris.py`
- :ref:`example_linear_model_plot_sgd_weighted_samples.py`
- :ref:`example_linear_model_plot_sgd_comparison.py`
- :ref:`example_svm_plot_separating_hyperplane_unbalanced.py` (See the `Note`)

:class:`SGDClassifier` supports averaged SGD (ASGD). Averaging can be enabled
@@ -169,6 +170,10 @@ of the plain SGD over each iteration over a sample. When using ASGD
the learning rate can be larger and even constant leading on some
datasets to a speed up in training time.

For classification with a logistic loss, another variant of SGD with an
averaging strategy is available with the Stochastic Average Gradient (SAG)
algorithm, offered as a solver in :class:`LogisticRegression`.
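
To make the two averaging strategies concrete, a small sketch (hypothetical
parameter values, any classification arrays ``X``, ``y``) could compare them::

    from sklearn.linear_model import SGDClassifier, LogisticRegression

    # ASGD: plain SGD whose iterates are averaged
    asgd = SGDClassifier(loss='log', penalty='l2', average=True, n_iter=10)

    # SAG: keeps a memory of per-sample gradients and averages those instead
    sag = LogisticRegression(solver='sag', max_iter=10)

    for model in (asgd, sag):
        model.fit(X, y)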

Regression
==========

@@ -192,7 +197,11 @@ specified via the parameter ``epsilon``. This parameter depends on the
scale of the target variables.

:class:`SGDRegressor` supports averaged SGD as :class:`SGDClassifier`.
Averaging can be enabled by setting ```average=True```
Averaging can be enabled by setting ``average=True``.

For regression with a squared loss and an l2 penalty, another variant of
SGD with an averaging strategy is available with the Stochastic Average
Gradient (SAG) algorithm, offered as a solver in :class:`Ridge`.
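
A hedged sketch, assuming the ``solver='sag'`` option for :class:`Ridge`
described above (sizes and penalties are arbitrary)::

    import numpy as np
    from sklearn.linear_model import Ridge, SGDRegressor

    rng = np.random.RandomState(0)
    X_reg = rng.randn(10000, 50)
    y_reg = X_reg[:, 0] + 0.1 * rng.randn(10000)

    # averaged SGD on the squared loss with an l2 penalty ...
    asgd_reg = SGDRegressor(loss='squared_loss', penalty='l2', average=True,
                            n_iter=10)
    # ... and Ridge solved with SAG, minimizing the same kind of objective
    ridge_sag = Ridge(alpha=1.0, solver='sag')

    for model in (asgd_reg, ridge_sag):
        model.fit(X_reg, y_reg)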


Stochastic Gradient Descent for sparse data