# Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
#          Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause

import gc
import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn.externals.joblib import Memory
from sklearn.linear_model import (LogisticRegression, SGDClassifier)
from sklearn.datasets import fetch_rcv1
from sklearn.linear_model.sag import get_auto_step_size
from sklearn.linear_model.sag_fast import get_max_squared_sum

try:
    import lightning.classification as lightning_clf
except ImportError:
    lightning_clf = None

m = Memory(cachedir='.', verbose=0)


# compute logistic loss
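# (mean log-loss plus the L2 penalty w.w / (2 * C * n_samples), i.e. the
# LogisticRegression objective rescaled by 1 / (C * n_samples))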
def get_loss(w, intercept, myX, myy, C):
    n_samples = myX.shape[0]
    w = w.ravel()
    p = np.mean(np.log(1. + np.exp(-myy * (myX.dot(w) + intercept))))
    print("%f + %f" % (p, w.dot(w) / 2. / C / n_samples))
    p += w.dot(w) / 2. / C / n_samples
    return p


# We use joblib to cache individual fits. Note that we do not pass the dataset
# as an argument, as hashing it would be too slow; we therefore assume that
# the dataset never changes.
@m.cache()
def bench_one(name, clf_type, clf_params, n_iter):
    clf = clf_type(**clf_params)
    try:
        clf.set_params(max_iter=n_iter, random_state=42)
    except ValueError:
        # some estimators (e.g. SGDClassifier) use n_iter instead of max_iter
        clf.set_params(n_iter=n_iter, random_state=42)

    st = time.time()
    clf.fit(X, y)
    end = time.time()

    try:
        # SGDClassifier and the lightning classifiers expose alpha, not C
        C = 1.0 / clf.alpha / n_samples
    except AttributeError:
        C = clf.C

    try:
        intercept = clf.intercept_
    except AttributeError:
        intercept = 0.

    train_loss = get_loss(clf.coef_, intercept, X, y, C)
    train_score = clf.score(X, y)
    test_score = clf.score(X_test, y_test)
    duration = end - st

    return train_loss, train_score, test_score, duration


def bench(clfs):
    for (name, clf, iter_range, train_losses, train_scores,
         test_scores, durations) in clfs:
        print("training %s" % name)
        clf_type = type(clf)
        clf_params = clf.get_params()

        for n_iter in iter_range:
            gc.collect()

            train_loss, train_score, test_score, duration = bench_one(
                name, clf_type, clf_params, n_iter)

            train_losses.append(train_loss)
            train_scores.append(train_score)
            test_scores.append(test_score)
            durations.append(duration)
            print("classifier: %s" % name)
            print("train_loss: %.8f" % train_loss)
            print("train_score: %.8f" % train_score)
            print("test_score: %.8f" % test_score)
            print("time for fit: %.8f seconds" % duration)
            print("")

        print("")
    return clfs


def plot_train_losses(clfs):
    plt.figure()
    for (name, _, _, train_losses, _, _, durations) in clfs:
        plt.plot(durations, train_losses, '-o', label=name)
    plt.legend(loc=0)
    plt.xlabel("seconds")
    plt.ylabel("train loss")


def plot_train_scores(clfs):
    plt.figure()
    for (name, _, _, _, train_scores, _, durations) in clfs:
        plt.plot(durations, train_scores, '-o', label=name)
    plt.legend(loc=0)
    plt.xlabel("seconds")
    plt.ylabel("train score")
    plt.ylim((0.92, 0.96))


def plot_test_scores(clfs):
    plt.figure()
    for (name, _, _, _, _, test_scores, durations) in clfs:
        plt.plot(durations, test_scores, '-o', label=name)
    plt.legend(loc=0)
    plt.xlabel("seconds")
    plt.ylabel("test score")
    plt.ylim((0.92, 0.96))


def plot_dloss(clfs):
    plt.figure()
    pobj_final = []
    for (name, _, _, train_losses, _, _, durations) in clfs:
        pobj_final.append(train_losses[-1])

    indices = np.argsort(pobj_final)
    pobj_best = pobj_final[indices[0]]

    for (name, _, _, train_losses, _, _, durations) in clfs:
        log_pobj = np.log10(np.abs(np.array(train_losses) - pobj_best))
        plt.plot(durations, log_pobj, '-o', label=name)
    plt.legend(loc=0)
    plt.xlabel("seconds")
    plt.ylabel("log10(train_loss - best_train_loss)")


rcv1 = fetch_rcv1()
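# RCV1 (Reuters Corpus Volume I): about 804,414 samples and 47,236 TF-IDF
# features, stored as a sparse matrix.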
X = rcv1.data
n_samples, n_features = X.shape

# consider the binary classification problem 'CCAT' vs the rest
ccat_idx = rcv1.target_names.tolist().index('CCAT')
y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64)
y[y == 0] = -1

# parameters
C = 1.
fit_intercept = True
tol = 1.0e-14

# max_iter range
sgd_iter_range = list(range(1, 121, 10))
newton_iter_range = list(range(1, 25, 3))
lbfgs_iter_range = list(range(1, 242, 12))
liblinear_iter_range = list(range(1, 37, 3))
liblinear_dual_iter_range = list(range(1, 85, 6))
sag_iter_range = list(range(1, 37, 3))
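# Each solver gets its own grid of max_iter values, roughly chosen so that the
# benchmark spans from the very first iterations up to (near) convergence.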

clfs = [
    ("LR-liblinear",
     LogisticRegression(C=C, tol=tol,
                        solver="liblinear", fit_intercept=fit_intercept,
                        intercept_scaling=1),
     liblinear_iter_range, [], [], [], []),
    ("LR-liblinear-dual",
     LogisticRegression(C=C, tol=tol, dual=True,
                        solver="liblinear", fit_intercept=fit_intercept,
                        intercept_scaling=1),
     liblinear_dual_iter_range, [], [], [], []),
    ("LR-SAG",
     LogisticRegression(C=C, tol=tol,
                        solver="sag", fit_intercept=fit_intercept),
     sag_iter_range, [], [], [], []),
    ("LR-newton-cg",
     LogisticRegression(C=C, tol=tol, solver="newton-cg",
                        fit_intercept=fit_intercept),
     newton_iter_range, [], [], [], []),
    ("LR-lbfgs",
     LogisticRegression(C=C, tol=tol,
                        solver="lbfgs", fit_intercept=fit_intercept),
     lbfgs_iter_range, [], [], [], []),
    ("SGD",
     SGDClassifier(alpha=1.0 / C / n_samples, penalty='l2', loss='log',
                   fit_intercept=fit_intercept, verbose=0),
     sgd_iter_range, [], [], [], [])]
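# Note: SGDClassifier's alpha applies per sample, while LogisticRegression's C
# scales the summed loss, so alpha = 1.0 / (C * n_samples) makes the two L2
# penalties equivalent.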


if lightning_clf is not None and not fit_intercept:
    alpha = 1. / C / n_samples
    # compute the same step_size as in LR-SAG
    max_squared_sum = get_max_squared_sum(X)
    step_size = get_auto_step_size(max_squared_sum, alpha, "log",
                                   fit_intercept)
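    # get_auto_step_size derives the SAG step size from a Lipschitz bound on
    # the individual gradients, so the lightning solvers below run with the
    # same step size as scikit-learn's LR-SAG.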

    clfs.append(
        ("Lightning-SVRG",
         lightning_clf.SVRGClassifier(alpha=alpha, eta=step_size,
                                      tol=tol, loss="log"),
         sag_iter_range, [], [], [], []))
    clfs.append(
        ("Lightning-SAG",
         lightning_clf.SAGClassifier(alpha=alpha, eta=step_size,
                                     tol=tol, loss="log"),
         sag_iter_range, [], [], [], []))

    # We keep only the 200 features with the most non-zero values, to obtain a
    # dense dataset and compare with lightning SAG, which seems incorrect in
    # the sparse case.
    X_csc = X.tocsc()
    nnz_in_each_features = X_csc.indptr[1:] - X_csc.indptr[:-1]
    X = X_csc[:, np.argsort(nnz_in_each_features)[-200:]]
    X = X.toarray()
    print("dataset: %.3f MB" % (X.nbytes / 1e6))


# Split training and testing. Switch the train and test subsets compared to
# the LYRL2004 split, to obtain a larger training dataset.
n = 23149
X_test = X[:n, :]
y_test = y[:n]
X = X[n:, :]
y = y[n:]
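# The first 23,149 documents (the LYRL2004 training subset) are used here for
# testing, and the remaining 781,265 documents for training.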

clfs = bench(clfs)

plot_train_scores(clfs)
plot_test_scores(clfs)
plot_train_losses(clfs)
plot_dloss(clfs)
plt.show()