|
1 | 1 | """
|
2 | 2 | Benchmarks of Non-Negative Matrix Factorization
|
3 | 3 | """
|
| 4 | +# Author : Tom Dupre la Tour <tom.dupre-la-tour@m4x.org> |
| 5 | +# License: BSD 3 clause |
4 | 6 |
|
5 | 7 | from __future__ import print_function
|
6 |
| - |
7 |
| -from collections import defaultdict |
8 |
| -import gc |
9 | 8 | from time import time
|
| 9 | +import sys |
10 | 10 |
|
11 | 11 | import numpy as np
|
12 |
| -from scipy.linalg import norm |
13 |
| - |
14 |
| -from sklearn.decomposition.nmf import NMF, _initialize_nmf |
15 |
| -from sklearn.datasets.samples_generator import make_low_rank_matrix |
16 |
| -from sklearn.externals.six.moves import xrange |
17 |
| - |
18 |
| - |
19 |
| -def alt_nnmf(V, r, max_iter=1000, tol=1e-3, init='random'): |
20 |
| - ''' |
21 |
| - A, S = nnmf(X, r, tol=1e-3, R=None) |
22 |
| -
|
23 |
| - Implement Lee & Seung's algorithm |
24 |
| -
|
25 |
| - Parameters |
26 |
| - ---------- |
27 |
| - V : 2-ndarray, [n_samples, n_features] |
28 |
| - input matrix |
29 |
| - r : integer |
30 |
| - number of latent features |
31 |
| - max_iter : integer, optional |
32 |
| - maximum number of iterations (default: 1000) |
33 |
| - tol : double |
34 |
| - tolerance threshold for early exit (when the update factor is within |
35 |
| - tol of 1., the function exits) |
36 |
| - init : string |
37 |
| - Method used to initialize the procedure. |
38 |
| -
|
39 |
| - Returns |
40 |
| - ------- |
41 |
| - A : 2-ndarray, [n_samples, r] |
42 |
| - Component part of the factorization |
43 |
| -
|
44 |
| - S : 2-ndarray, [r, n_features] |
45 |
| - Data part of the factorization |
46 |
| - Reference |
47 |
| - --------- |
48 |
| - "Algorithms for Non-negative Matrix Factorization" |
49 |
| - by Daniel D Lee, Sebastian H Seung |
50 |
| - (available at http://citeseer.ist.psu.edu/lee01algorithms.html) |
51 |
| - ''' |
52 |
| - # Nomenclature in the function follows Lee & Seung |
53 |
| - eps = 1e-5 |
54 |
| - n, m = V.shape |
55 |
| - W, H = _initialize_nmf(V, r, init, random_state=0) |
56 |
| - |
57 |
| - for i in xrange(max_iter): |
58 |
| - updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps) |
59 |
| - H *= updateH |
60 |
| - updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps) |
61 |
| - W *= updateW |
62 |
| - if i % 10 == 0: |
63 |
| - max_update = max(updateW.max(), updateH.max()) |
64 |
| - if abs(1. - max_update) < tol: |
65 |
| - break |
66 |
| - return W, H |
67 |
| - |
68 |
| - |
69 |
| -def report(error, time): |
70 |
| - print("Frobenius loss: %.5f" % error) |
71 |
| - print("Took: %.2fs" % time) |
72 |
| - print() |
73 |
| - |
74 |
| - |
75 |
| -def benchmark(samples_range, features_range, rank=50, tolerance=1e-5): |
76 |
| - timeset = defaultdict(lambda: []) |
77 |
| - err = defaultdict(lambda: []) |
78 |
| - |
79 |
| - for n_samples in samples_range: |
80 |
| - for n_features in features_range: |
81 |
| - print("%2d samples, %2d features" % (n_samples, n_features)) |
82 |
| - print('=======================') |
83 |
| - X = np.abs(make_low_rank_matrix(n_samples, n_features, |
84 |
| - effective_rank=rank, tail_strength=0.2)) |
85 |
| - |
86 |
| - gc.collect() |
87 |
| - print("benchmarking nndsvd-nmf: ") |
88 |
| - tstart = time() |
89 |
| - m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X) |
90 |
| - tend = time() - tstart |
91 |
| - timeset['nndsvd-nmf'].append(tend) |
92 |
| - err['nndsvd-nmf'].append(m.reconstruction_err_) |
93 |
| - report(m.reconstruction_err_, tend) |
94 |
| - |
95 |
| - gc.collect() |
96 |
| - print("benchmarking nndsvda-nmf: ") |
97 |
| - tstart = time() |
98 |
| - m = NMF(n_components=30, init='nndsvda', |
99 |
| - tol=tolerance).fit(X) |
100 |
| - tend = time() - tstart |
101 |
| - timeset['nndsvda-nmf'].append(tend) |
102 |
| - err['nndsvda-nmf'].append(m.reconstruction_err_) |
103 |
| - report(m.reconstruction_err_, tend) |
104 |
| - |
105 |
| - gc.collect() |
106 |
| - print("benchmarking nndsvdar-nmf: ") |
107 |
| - tstart = time() |
108 |
| - m = NMF(n_components=30, init='nndsvdar', |
109 |
| - tol=tolerance).fit(X) |
110 |
| - tend = time() - tstart |
111 |
| - timeset['nndsvdar-nmf'].append(tend) |
112 |
| - err['nndsvdar-nmf'].append(m.reconstruction_err_) |
113 |
| - report(m.reconstruction_err_, tend) |
114 |
| - |
115 |
| - gc.collect() |
116 |
| - print("benchmarking random-nmf") |
117 |
| - tstart = time() |
118 |
| - m = NMF(n_components=30, init='random', max_iter=1000, |
119 |
| - tol=tolerance).fit(X) |
120 |
| - tend = time() - tstart |
121 |
| - timeset['random-nmf'].append(tend) |
122 |
| - err['random-nmf'].append(m.reconstruction_err_) |
123 |
| - report(m.reconstruction_err_, tend) |
124 |
| - |
125 |
| - gc.collect() |
126 |
| - print("benchmarking alt-random-nmf") |
127 |
| - tstart = time() |
128 |
| - W, H = alt_nnmf(X, r=30, init='random', tol=tolerance) |
129 |
| - tend = time() - tstart |
130 |
| - timeset['alt-random-nmf'].append(tend) |
131 |
| - err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H))) |
132 |
| - report(norm(X - np.dot(W, H)), tend) |
133 |
| - |
134 |
| - return timeset, err |
| 12 | +import matplotlib.pyplot as plt |
| 13 | +import pandas |
| 14 | + |
| 15 | +from sklearn.utils.testing import ignore_warnings |
| 16 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 17 | +from sklearn.decomposition.nmf import NMF |
| 18 | +from sklearn.decomposition.nmf import _initialize_nmf |
| 19 | +from sklearn.decomposition.nmf import _safe_compute_error |
| 20 | +from sklearn.externals.joblib import Memory |
| 21 | + |
| 22 | +mem = Memory(cachedir='.', verbose=0) |
| 23 | + |
| 24 | + |
| 25 | +@ignore_warnings |
| 26 | +def plot_results(results_df, plot_name): |
| 27 | + if results_df is None: |
| 28 | + return None |
| 29 | + |
| 30 | + plt.figure(figsize=(16, 6)) |
| 31 | + colors = 'bgr' |
| 32 | + markers = 'ovs' |
| 33 | + ax = plt.subplot(1, 3, 1) |
| 34 | + for i, init in enumerate(np.unique(results_df['init'])): |
| 35 | + plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax) |
| 36 | + for j, method in enumerate(np.unique(results_df['method'])): |
| 37 | + selected_items = (results_df |
| 38 | + [results_df['init'] == init] |
| 39 | + [results_df['method'] == method]) |
| 40 | + |
| 41 | + plt.plot(selected_items['time'], selected_items['loss'], |
| 42 | + color=colors[j % len(colors)], ls='-', |
| 43 | + marker=markers[j % len(markers)], |
| 44 | + label=method) |
| 45 | + |
| 46 | + plt.legend(loc=0, fontsize='x-small') |
| 47 | + plt.xlabel("Time (s)") |
| 48 | + plt.ylabel("loss") |
| 49 | + plt.title("%s" % init) |
| 50 | + plt.suptitle(plot_name, fontsize=16) |
| 51 | + |
| 52 | + |
| 53 | +# use joblib to cache results. |
| 54 | +# X_shape is specified in arguments for avoiding hashing X |
| 55 | +@ignore_warnings |
| 56 | +@mem.cache(ignore=['X', 'W0', 'H0']) |
| 57 | +def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init, |
| 58 | + n_components, random_state): |
| 59 | + clf = clf_type(**clf_params) |
| 60 | + |
| 61 | + st = time() |
| 62 | + W = clf.fit_transform(X, W=W0.copy(), H=H0.copy()) |
| 63 | + end = time() |
| 64 | + |
| 65 | + H = clf.components_ |
| 66 | + this_loss = _safe_compute_error(X, W, H) |
| 67 | + duration = end - st |
| 68 | + return this_loss, duration |
| 69 | + |
| 70 | + |
| 71 | +def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio): |
| 72 | + start = time() |
| 73 | + results = [] |
| 74 | + for name, clf_type, iter_range, clf_params in clfs: |
| 75 | + print("Training %s:" % name) |
| 76 | + for rs, init in enumerate(('nndsvd', 'nndsvdar', 'random')): |
| 77 | + print(" %s %s: " % (init, " " * (8 - len(init))), end="") |
| 78 | + W, H = _initialize_nmf(X, n_components, init, 1e-6, rs) |
| 79 | + |
| 80 | + for itr in iter_range: |
| 81 | + clf_params['alpha'] = alpha |
| 82 | + clf_params['l1_ratio'] = l1_ratio |
| 83 | + clf_params['max_iter'] = itr |
| 84 | + clf_params['tol'] = tol |
| 85 | + clf_params['random_state'] = rs |
| 86 | + clf_params['init'] = 'custom' |
| 87 | + clf_params['n_components'] = n_components |
| 88 | + |
| 89 | + this_loss, duration = bench_one(name, X, W, H, X.shape, |
| 90 | + clf_type, clf_params, |
| 91 | + init, n_components, rs) |
| 92 | + |
| 93 | + init_name = "init='%s'" % init |
| 94 | + results.append((name, this_loss, duration, init_name)) |
| 95 | + # print("loss: %.6f, time: %.3f sec" % (this_loss, duration)) |
| 96 | + print(".", end="") |
| 97 | + sys.stdout.flush() |
| 98 | + print(" ") |
| 99 | + |
| 100 | + # Use a panda dataframe to organize the results |
| 101 | + results_df = pandas.DataFrame(results, |
| 102 | + columns="method loss time init".split()) |
| 103 | + print("Total time = %0.3f sec\n" % (time() - start)) |
| 104 | + |
| 105 | + # plot the results |
| 106 | + plot_results(results_df, plot_name) |
| 107 | + return results_df |
| 108 | + |
| 109 | + |
| 110 | +def load_20news(): |
| 111 | + print("Loading 20 newsgroups dataset") |
| 112 | + print("-----------------------------") |
| 113 | + from sklearn.datasets import fetch_20newsgroups |
| 114 | + dataset = fetch_20newsgroups(shuffle=True, random_state=1, |
| 115 | + remove=('headers', 'footers', 'quotes')) |
| 116 | + vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english') |
| 117 | + tfidf = vectorizer.fit_transform(dataset.data) |
| 118 | + return tfidf |
| 119 | + |
| 120 | + |
| 121 | +def load_faces(): |
| 122 | + print("Loading Olivetti face dataset") |
| 123 | + print("-----------------------------") |
| 124 | + from sklearn.datasets import fetch_olivetti_faces |
| 125 | + faces = fetch_olivetti_faces(shuffle=True) |
| 126 | + return faces.data |
| 127 | + |
| 128 | + |
| 129 | +def build_clfs(cd_iters, pg_iters, mu_iters): |
| 130 | + clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}), |
| 131 | + ("Projected Gradient", NMF, pg_iters, {'solver': 'pg', |
| 132 | + 'nls_max_iter': 8}), |
| 133 | + ("Multiplicative Update", NMF, mu_iters, {'solver': 'mu'}), |
| 134 | + ] |
| 135 | + return clfs |
135 | 136 |
|
136 | 137 |
|
137 | 138 | if __name__ == '__main__':
|
138 |
| - from mpl_toolkits.mplot3d import axes3d # register the 3d projection |
139 |
| - axes3d |
140 |
| - import matplotlib.pyplot as plt |
141 |
| - |
142 |
| - samples_range = np.linspace(50, 500, 3).astype(np.int) |
143 |
| - features_range = np.linspace(50, 500, 3).astype(np.int) |
144 |
| - timeset, err = benchmark(samples_range, features_range) |
145 |
| - |
146 |
| - for i, results in enumerate((timeset, err)): |
147 |
| - fig = plt.figure('scikit-learn Non-Negative Matrix Factorization' |
148 |
| - 'benchmark results') |
149 |
| - ax = fig.gca(projection='3d') |
150 |
| - for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())): |
151 |
| - X, Y = np.meshgrid(samples_range, features_range) |
152 |
| - Z = np.asarray(timings).reshape(samples_range.shape[0], |
153 |
| - features_range.shape[0]) |
154 |
| - # plot the actual surface |
155 |
| - ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, |
156 |
| - color=c) |
157 |
| - # dummy point plot to stick the legend to since surface plot do not |
158 |
| - # support legends (yet?) |
159 |
| - ax.plot([1], [1], [1], color=c, label=label) |
160 |
| - |
161 |
| - ax.set_xlabel('n_samples') |
162 |
| - ax.set_ylabel('n_features') |
163 |
| - zlabel = 'Time (s)' if i == 0 else 'reconstruction error' |
164 |
| - ax.set_zlabel(zlabel) |
165 |
| - ax.legend() |
166 |
| - plt.show() |
| 139 | + alpha = 0. |
| 140 | + l1_ratio = 0.5 |
| 141 | + n_components = 10 |
| 142 | + tol = 1e-15 |
| 143 | + |
| 144 | + # first benchmark on 20 newsgroup dataset: sparse, shape(11314, 39116) |
| 145 | + plot_name = "20 Newsgroups sparse dataset" |
| 146 | + cd_iters = np.arange(1, 30) |
| 147 | + pg_iters = np.arange(1, 6) |
| 148 | + mu_iters = np.arange(1, 30) |
| 149 | + clfs = build_clfs(cd_iters, pg_iters, mu_iters) |
| 150 | + X_20news = load_20news() |
| 151 | + run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio) |
| 152 | + |
| 153 | + # second benchmark on Olivetti faces dataset: dense, shape(400, 4096) |
| 154 | + plot_name = "Olivetti Faces dense dataset" |
| 155 | + cd_iters = np.arange(1, 30) |
| 156 | + pg_iters = np.arange(1, 12) |
| 157 | + mu_iters = np.arange(1, 30) |
| 158 | + clfs = build_clfs(cd_iters, pg_iters, mu_iters) |
| 159 | + X_faces = load_faces() |
| 160 | + run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio,) |
| 161 | + |
| 162 | + plt.show() |
0 commit comments