 """
 Benchmarks of Non-Negative Matrix Factorization
 """
+# Author : Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
+# License: BSD 3 clause

 from __future__ import print_function

-from collections import defaultdict
-import gc
 from time import time
+import sys

 import numpy as np
-from scipy.linalg import norm
+import matplotlib.pyplot as plt
+import pandas

-from sklearn.decomposition.nmf import NMF, _initialize_nmf
-from sklearn.datasets.samples_generator import make_low_rank_matrix
-from sklearn.externals.six.moves import xrange
+from sklearn.utils.extmath import safe_sparse_dot
+from sklearn.utils.testing import ignore_warnings
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition.nmf import NMF
+from sklearn.decomposition.nmf import _initialize_nmf
+from sklearn.decomposition.nmf import _safe_compute_error
+from sklearn.externals.joblib import Memory

+mem = Memory(cachedir='.', verbose=0)


-def alt_nnmf(V, r, max_iter=1000, tol=1e-3, init='random'):
-    '''
-    A, S = nnmf(X, r, tol=1e-3, R=None)
-
-    Implement Lee & Seung's algorithm
+def multiplicative_nmf(X, W, H, n_iter=100, alpha=0., l1_ratio=0.):
+    '''
+    Implement Lee & Seung's algorithm for NMF
+    (currently not in scikit-learn)

     Parameters
     ----------
-    V : 2-ndarray, [n_samples, n_features]
-        input matrix
-    r : integer
-        number of latent features
-    max_iter : integer, optional
-        maximum number of iterations (default: 1000)
-    tol : double
-        tolerance threshold for early exit (when the update factor is within
-        tol of 1., the function exits)
-    init : string
-        Method used to initialize the procedure.
+    X : array-like, shape (n_samples, n_features)
+        Input matrix
+    W : array-like, shape (n_samples, n_components)
+        Initial guess for the solution.
+    H : array-like, shape (n_components, n_features)
+        Initial guess for the solution.
+    n_iter : integer, default: 100
+        Number of iterations
+    alpha : float, default: 0.
+        Constant that multiplies the regularization terms.
+    l1_ratio : double, default: 0.
+        The regularization mixing parameter, with 0 <= l1_ratio <= 1.

     Returns
     -------
-    A : 2-ndarray, [n_samples, r]
-        Component part of the factorization
+    W : array-like, shape (n_samples, n_components)
+        Solution to the non-negative least squares problem.
+
+    H : array-like, shape (n_components, n_features)
+        Solution to the non-negative least squares problem.

-    S : 2-ndarray, [r, n_features]
-        Data part of the factorization
     Reference
     ---------
     "Algorithms for Non-negative Matrix Factorization"
     by Daniel D Lee, Sebastian H Seung
     (available at http://citeseer.ist.psu.edu/lee01algorithms.html)
     '''
-    # Nomenclature in the function follows Lee & Seung
-    eps = 1e-5
-    n, m = V.shape
-    W, H = _initialize_nmf(V, r, init, random_state=0)
-
-    for i in xrange(max_iter):
-        updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
-        H *= updateH
-        updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
-        W *= updateW
-        if i % 10 == 0:
-            max_update = max(updateW.max(), updateH.max())
-            if abs(1. - max_update) < tol:
-                break
+    eps = 1e-8
+    # compute the L1 and L2 regularization parameters
+    l1_reg = float(alpha) * l1_ratio
+    l2_reg = float(alpha) * (1 - l1_ratio)
+
+    for i in range(n_iter):
+        # update H: H *= (W.T X) / (W.T W H + regularization)
+        denominator = np.dot(np.dot(W.T, W), H)
+        if l1_reg > 0:
+            denominator += l1_reg
+        if l2_reg > 0:
+            denominator = denominator + l2_reg * H
+        denominator[denominator == 0] = eps
+
+        deltaH = safe_sparse_dot(W.T, X)
+        deltaH /= denominator
+        H *= deltaH
+
+        # update W: W *= (X H.T) / (W H H.T + regularization)
+        denominator = np.dot(W, np.dot(H, H.T))
+        if l1_reg > 0:
+            denominator += l1_reg
+        if l2_reg > 0:
+            denominator = denominator + l2_reg * W
+        denominator[denominator == 0] = eps
+
+        deltaW = safe_sparse_dot(X, H.T)
+        deltaW /= denominator
+        W *= deltaW
+
     return W, H
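
For context, each pass of the loop above applies Lee & Seung's multiplicative
updates: H <- H * (W.T X) / (W.T W H + l1_reg + l2_reg * H), and the symmetric
rule for W. Because every factor is multiplied by a non-negative ratio, the
iterates stay non-negative whenever the initial guesses are. A minimal sketch
of driving the helper directly (the small random matrices are illustrative
only; note that W and H are updated in place):

    import numpy as np

    rng = np.random.RandomState(0)
    X = np.abs(rng.randn(20, 30))   # non-negative input matrix
    W0 = np.abs(rng.randn(20, 5))   # initial guess for W
    H0 = np.abs(rng.randn(5, 30))   # initial guess for H
    W, H = multiplicative_nmf(X, W0, H0, n_iter=200)
    print(np.linalg.norm(X - np.dot(W, H)))  # Frobenius reconstruction error
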


-def report(error, time):
-    print("Frobenius loss: %.5f" % error)
-    print("Took: %.2fs" % time)
-    print()
-
-
-def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
-    timeset = defaultdict(lambda: [])
-    err = defaultdict(lambda: [])
-
-    for n_samples in samples_range:
-        for n_features in features_range:
-            print("%2d samples, %2d features" % (n_samples, n_features))
-            print('=======================')
-            X = np.abs(make_low_rank_matrix(n_samples, n_features,
-                       effective_rank=rank, tail_strength=0.2))
-
-            gc.collect()
-            print("benchmarking nndsvd-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
-            tend = time() - tstart
-            timeset['nndsvd-nmf'].append(tend)
-            err['nndsvd-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvda-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvda',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvda-nmf'].append(tend)
-            err['nndsvda-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvdar-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvdar',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvdar-nmf'].append(tend)
-            err['nndsvdar-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking random-nmf")
-            tstart = time()
-            m = NMF(n_components=30, init='random', max_iter=1000,
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['random-nmf'].append(tend)
-            err['random-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking alt-random-nmf")
-            tstart = time()
-            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
-            tend = time() - tstart
-            timeset['alt-random-nmf'].append(tend)
-            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
-            report(norm(X - np.dot(W, H)), tend)
-
-    return timeset, err
+@ignore_warnings
+def plot_results(results_df, plot_name):
+    if results_df is None:
+        return None
+
+    plt.figure(figsize=(16, 6))
+    colors = 'bgr'
+    markers = 'ovs'
+    ax = plt.subplot(1, 3, 1)
+    for i, init in enumerate(np.unique(results_df['init'])):
+        plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax)
+        for j, method in enumerate(np.unique(results_df['method'])):
+            selected_items = (results_df
+                              [results_df['init'] == init]
+                              [results_df['method'] == method])
+
+            plt.plot(selected_items['time'], selected_items['loss'],
+                     color=colors[j % len(colors)], ls='-',
+                     marker=markers[j % len(markers)],
+                     label=method)
+
+        plt.legend(loc=0, fontsize='x-small')
+        plt.xlabel("Time (s)")
+        plt.ylabel("loss")
+        plt.title("%s" % init)
+    plt.suptitle(plot_name, fontsize=16)
+
+
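
One panel is drawn per init scheme, and the panels share both axes so the
loss-versus-time curves remain directly comparable. To write the figure to
disk instead of relying on the interactive window, a line such as the
following at the end of the function would do (illustrative, not in the
script):

    plt.savefig(plot_name.replace(' ', '_') + '.png', dpi=100)
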
+# use joblib to cache the results.
+# X_shape is passed in the arguments to avoid hashing X itself
+@ignore_warnings
+@mem.cache(ignore=['X', 'W0', 'H0'])
+def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
+              n_components, random_state):
+    W = W0.copy()
+    H = H0.copy()
+
+    if 'Multiplicative' in name:
+        st = time()
+        W, H = clf_type(X, W, H, **clf_params)
+        end = time()
+    else:
+        clf = clf_type(**clf_params)
+        st = time()
+        W = clf.fit_transform(X, W=W, H=H)
+        end = time()
+        H = clf.components_
+
+    this_loss = _safe_compute_error(X, W, H)
+    duration = end - st
+    return this_loss, duration
+
+
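
The decorator pair above is the point of this helper: mem.cache memoizes each
(solver, init, iteration budget) run on disk, and ignore=['X', 'W0', 'H0']
keeps joblib from hashing the large arrays, with the cheap X_shape argument
standing in for the data in the cache key. The same pattern in isolation
(names are invented for illustration; with a recent joblib one would import
Memory from joblib and pass location='.' instead of cachedir='.'):

    from sklearn.externals.joblib import Memory
    demo_mem = Memory(cachedir='.', verbose=0)

    @demo_mem.cache(ignore=['big_array'])
    def slow_stat(big_array, shape, seed):
        # the cache key is (shape, seed); big_array itself is never hashed
        return big_array.sum()

The trade-off is that a cache hit returns a stale result if an ignored
argument changes while the key arguments do not; here the shape, init scheme,
random state and solver parameters fully determine each run, so that is safe.
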
+def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio):
+    start = time()
+    results = []
+    for name, clf_type, iter_range, clf_params in clfs:
+        print("Training %s:" % name)
+        for rs, init in enumerate(('nndsvd', 'nndsvdar', 'random')):
+            print("    %s %s: " % (init, " " * (8 - len(init))), end="")
+            W, H = _initialize_nmf(X, n_components, init, 1e-6, rs)
+
+            for itr in iter_range:
+                clf_params['alpha'] = alpha
+                clf_params['l1_ratio'] = l1_ratio
+
+                if 'Multiplicative' in name:
+                    clf_params['n_iter'] = itr
+                else:
+                    clf_params['max_iter'] = itr
+                    clf_params['tol'] = tol
+                    clf_params['random_state'] = rs
+                    clf_params['init'] = 'custom'
+                    clf_params['n_components'] = n_components
+
+                this_loss, duration = bench_one(name, X, W, H, X.shape,
+                                                clf_type, clf_params,
+                                                init, n_components, rs)
+
+                init_name = "init='%s'" % init
+                results.append((name, this_loss, duration, init_name))
+                # print("loss: %.6f, time: %.3f sec" % (this_loss, duration))
+                print(".", end="")
+                sys.stdout.flush()
+            print(" ")
+
+    # use a pandas DataFrame to organize the results
+    results_df = pandas.DataFrame(results,
+                                  columns="method loss time init".split())
+    print("Total time = %0.3f sec\n" % (time() - start))
+
+    # plot the results
+    plot_results(results_df, plot_name)
+    return results_df
+
+
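
Note the benchmarking strategy: rather than recording intermediate losses
during one long fit, run_bench re-fits each solver from the same custom
(W, H) initialization with a growing iteration budget, so every point on a
loss-versus-time curve is an independent run (and a cache hit on re-runs).
A quick post-hoc summary of the returned frame could look like this
(illustrative, not part of the script):

    # results_df as returned by run_bench
    print(results_df.groupby(['method', 'init'])['loss'].min())
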
+def load_20news():
+    print("Loading 20 newsgroups dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_20newsgroups
+    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
+                                 remove=('headers', 'footers', 'quotes'))
+    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
+    tfidf = vectorizer.fit_transform(dataset.data)
+    return tfidf
+
+
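
The TF-IDF matrix is returned as a scipy.sparse CSR matrix, which is what
makes this the sparse benchmark: multiplicative_nmf multiplies with X through
safe_sparse_dot, and scikit-learn's NMF accepts sparse input directly. A
quick sanity check of the scale (illustrative):

    X = load_20news()
    print(X.shape, X.nnz)  # only the non-zero entries are stored
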
+def load_faces():
+    print("Loading Olivetti face dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_olivetti_faces
+    faces = fetch_olivetti_faces(shuffle=True)
+    return faces.data
+
+
+def build_clfs(cd_iters, pg_iters, mu_iters):
+    clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}),
+            ("Projected Gradient", NMF, pg_iters, {'solver': 'pg',
+                                                   'nls_max_iter': 8}),
+            ("Multiplicative Update", multiplicative_nmf, mu_iters, {}),
+            ]
+    return clfs
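
These three configurations compare scikit-learn's coordinate-descent solver,
its projected-gradient solver (already deprecated at the time and since
removed), and the stand-alone multiplicative-update helper defined above;
only the first two are NMF estimators, which is why bench_one branches on the
name. For a quick smoke run, one could simply shrink the iteration grids
(values are arbitrary):

    quick_clfs = build_clfs(np.arange(1, 4), np.arange(1, 3), np.arange(1, 4))
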


 if __name__ == '__main__':
-    from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
-    axes3d
-    import matplotlib.pyplot as plt
-
-    samples_range = np.linspace(50, 500, 3).astype(np.int)
-    features_range = np.linspace(50, 500, 3).astype(np.int)
-    timeset, err = benchmark(samples_range, features_range)
-
-    for i, results in enumerate((timeset, err)):
-        fig = plt.figure('scikit-learn Non-Negative Matrix Factorization'
-                         'benchmark results')
-        ax = fig.gca(projection='3d')
-        for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())):
-            X, Y = np.meshgrid(samples_range, features_range)
-            Z = np.asarray(timings).reshape(samples_range.shape[0],
-                                            features_range.shape[0])
-            # plot the actual surface
-            ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3,
-                            color=c)
-            # dummy point plot to stick the legend to, since surface plots
-            # do not support legends (yet?)
-            ax.plot([1], [1], [1], color=c, label=label)
-
-        ax.set_xlabel('n_samples')
-        ax.set_ylabel('n_features')
-        zlabel = 'Time (s)' if i == 0 else 'reconstruction error'
-        ax.set_zlabel(zlabel)
-        ax.legend()
-    plt.show()
+    alpha = 0.
+    l1_ratio = 0.5
+    n_components = 10
+    tol = 1e-15
+
+    # first benchmark on the 20 newsgroups dataset: sparse, shape (11314, 39116)
+    plot_name = "20 Newsgroups sparse dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 6)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_20news = load_20news()
+    run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    # second benchmark on the Olivetti faces dataset: dense, shape (400, 4096)
+    plot_name = "Olivetti Faces dense dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 12)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_faces = load_faces()
+    run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    plt.show()