ENH improve benchmark on nmf · scikit-learn/scikit-learn@26b0b59 · GitHub

Commit 26b0b59

ENH improve benchmark on nmf
1 parent bc90ba0 commit 26b0b59

1 file changed, 210 insertions(+), 135 deletions(-)


benchmarks/bench_plot_nmf.py

@@ -1,166 +1,241 @@
 """
 Benchmarks of Non-Negative Matrix Factorization
 """
+# Author : Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
+# License: BSD 3 clause
 
 from __future__ import print_function
-
-from collections import defaultdict
-import gc
 from time import time
+import sys
 
 import numpy as np
-from scipy.linalg import norm
+import matplotlib.pyplot as plt
+import pandas
 
-from sklearn.decomposition.nmf import NMF, _initialize_nmf
-from sklearn.datasets.samples_generator import make_low_rank_matrix
-from sklearn.externals.six.moves import xrange
+from sklearn.utils.extmath import safe_sparse_dot
+from sklearn.utils.testing import ignore_warnings
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition.nmf import NMF
+from sklearn.decomposition.nmf import _initialize_nmf
+from sklearn.decomposition.nmf import _safe_compute_error
+from sklearn.externals.joblib import Memory
 
+mem = Memory(cachedir='.', verbose=0)
 
-def alt_nnmf(V, r, max_iter=1000, tol=1e-3, init='random'):
-    '''
-    A, S = nnmf(X, r, tol=1e-3, R=None)
 
-    Implement Lee & Seung's algorithm
+def multiplicative_nmf(X, W, H, n_iter=100, alpha=0., l1_ratio=0.):
+    '''
+    Implement Lee & Seung's algorithm for NMF
+    (currently not in scikit-learn)
 
     Parameters
     ----------
-    V : 2-ndarray, [n_samples, n_features]
-        input matrix
-    r : integer
-        number of latent features
-    max_iter : integer, optional
-        maximum number of iterations (default: 1000)
-    tol : double
-        tolerance threshold for early exit (when the update factor is within
-        tol of 1., the function exits)
-    init : string
-        Method used to initialize the procedure.
+    X : array-like, shape (n_samples, n_features)
+        Input matrix
+    W : array-like, shape (n_samples, n_components)
+        Initial guess for the solution.
+    H : array-like, shape (n_components, n_features)
+        Initial guess for the solution.
+    n_iter : integer, default: 100
+        Number of iterations
+    alpha : float, default: 0.
+        Constant that multiplies the regularization terms.
+    l1_ratio : double, default: 0.
+        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
 
     Returns
     -------
-    A : 2-ndarray, [n_samples, r]
-        Component part of the factorization
+    W : array-like, shape (n_samples, n_components)
+        Solution to the non-negative least squares problem.
+
+    H : array-like, shape (n_components, n_features)
+        Solution to the non-negative least squares problem.
 
-    S : 2-ndarray, [r, n_features]
-        Data part of the factorization
     Reference
    ---------
     "Algorithms for Non-negative Matrix Factorization"
     by Daniel D Lee, Sebastian H Seung
     (available at http://citeseer.ist.psu.edu/lee01algorithms.html)
     '''
-    # Nomenclature in the function follows Lee & Seung
-    eps = 1e-5
-    n, m = V.shape
-    W, H = _initialize_nmf(V, r, init, random_state=0)
-
-    for i in xrange(max_iter):
-        updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
-        H *= updateH
-        updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
-        W *= updateW
-        if i % 10 == 0:
-            max_update = max(updateW.max(), updateH.max())
-            if abs(1. - max_update) < tol:
-                break
+    eps = 1e-8
+    # compute the L1 and L2 regularization parameters
+    l1_reg = float(alpha) * l1_ratio
+    l2_reg = float(alpha) * (1 - l1_ratio)
+
+    for i in range(n_iter):
+        # update H
+        denominator = np.dot(np.dot(W.T, W), H)
+        if l1_reg > 0:
+            denominator += l1_reg
+        if l2_reg > 0:
+            denominator = denominator + l2_reg * H
+        denominator[denominator == 0] = eps
+
+        deltaH = safe_sparse_dot(W.T, X)
+        deltaH /= denominator
+        H *= deltaH
+
+        # update W
+        denominator = np.dot(W, np.dot(H, H.T))
+        if l1_reg > 0:
+            denominator += l1_reg
+        if l2_reg > 0:
+            denominator = denominator + l2_reg * W
+        denominator[denominator == 0] = eps
+
+        deltaW = safe_sparse_dot(X, H.T)
+        deltaW /= denominator
+        W *= deltaW
+
     return W, H
 
 
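The loop above is Lee & Seung's multiplicative rule: H is rescaled by (WᵀX) / (WᵀWH + reg), and W symmetrically, so both factors stay non-negative by construction and the Frobenius loss is non-increasing in the unregularized case. A minimal standalone sketch of driving the new helper, reusing the _initialize_nmf import from this file; the data here is hypothetical, not part of the benchmark:

    import numpy as np
    from sklearn.decomposition.nmf import _initialize_nmf

    rng = np.random.RandomState(0)
    X = np.abs(rng.randn(100, 40))  # any non-negative matrix

    # 'random' init avoids the exact zeros produced by 'nndsvd';
    # multiplicative updates can never move an entry off zero
    W0, H0 = _initialize_nmf(X, 10, 'random', 1e-6, 0)
    W, H = multiplicative_nmf(X, W0, H0, n_iter=200)
    print(np.linalg.norm(X - np.dot(W, H)))  # Frobenius reconstruction error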
-def report(error, time):
-    print("Frobenius loss: %.5f" % error)
-    print("Took: %.2fs" % time)
-    print()
-
-
-def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
-    timeset = defaultdict(lambda: [])
-    err = defaultdict(lambda: [])
-
-    for n_samples in samples_range:
-        for n_features in features_range:
-            print("%2d samples, %2d features" % (n_samples, n_features))
-            print('=======================')
-            X = np.abs(make_low_rank_matrix(n_samples, n_features,
-                       effective_rank=rank, tail_strength=0.2))
-
-            gc.collect()
-            print("benchmarking nndsvd-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
-            tend = time() - tstart
-            timeset['nndsvd-nmf'].append(tend)
-            err['nndsvd-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvda-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvda',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvda-nmf'].append(tend)
-            err['nndsvda-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvdar-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvdar',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvdar-nmf'].append(tend)
-            err['nndsvdar-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking random-nmf")
-            tstart = time()
-            m = NMF(n_components=30, init='random', max_iter=1000,
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['random-nmf'].append(tend)
-            err['random-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking alt-random-nmf")
-            tstart = time()
-            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
-            tend = time() - tstart
-            timeset['alt-random-nmf'].append(tend)
-            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
-            report(norm(X - np.dot(W, H)), tend)
-
-    return timeset, err
+@ignore_warnings
+def plot_results(results_df, plot_name):
+    if results_df is None:
+        return None
+
+    plt.figure(figsize=(16, 6))
+    colors = 'bgr'
+    markers = 'ovs'
+    ax = plt.subplot(1, 3, 1)
+    for i, init in enumerate(np.unique(results_df['init'])):
+        plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax)
+        for j, method in enumerate(np.unique(results_df['method'])):
+            selected_items = (results_df
+                              [results_df['init'] == init]
+                              [results_df['method'] == method])
+
+            plt.plot(selected_items['time'], selected_items['loss'],
+                     color=colors[j % len(colors)], ls='-',
+                     marker=markers[j % len(markers)],
+                     label=method)
+
+        plt.legend(loc=0, fontsize='x-small')
+        plt.xlabel("Time (s)")
+        plt.ylabel("loss")
+        plt.title("%s" % init)
+    plt.suptitle(plot_name, fontsize=16)
+
+
+# use joblib to cache results.
+# X_shape is specified in arguments for avoiding hashing X
+@ignore_warnings
+@mem.cache(ignore=['X', 'W0', 'H0'])
+def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
+              n_components, random_state):
+    W = W0.copy()
+    H = H0.copy()
+
+    if 'Multiplicative' in name:
+        st = time()
+        W, H = clf_type(X, W, H, **clf_params)
+        end = time()
+    else:
+        clf = clf_type(**clf_params)
+        st = time()
+        W = clf.fit_transform(X, W=W, H=H)
+        end = time()
+        H = clf.components_
+
+    this_loss = _safe_compute_error(X, W, H)
+    duration = end - st
+    return this_loss, duration
+
+
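Note the caching pattern in bench_one: mem.cache(ignore=['X', 'W0', 'H0']) stops joblib from hashing the large arrays on every call, and the small arguments that remain, including the explicit X_shape, become the cache key. The same trick in isolation (the function and argument names here are hypothetical, not from this benchmark):

    import numpy as np
    from sklearn.externals.joblib import Memory

    mem = Memory(cachedir='.', verbose=0)

    @mem.cache(ignore=['big_array'])
    def column_means(dataset_name, big_array, shape):
        # only `dataset_name` and `shape` enter the cache key, so the
        # caller must ensure they identify `big_array` unambiguously
        return big_array.mean(axis=0)

    X = np.ones((1000, 3))
    column_means('ones', X, X.shape)  # computed and stored on disk
    column_means('ones', X, X.shape)  # replayed from the cache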
+def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio):
+    start = time()
+    results = []
+    for name, clf_type, iter_range, clf_params in clfs:
+        print("Training %s:" % name)
+        for rs, init in enumerate(('nndsvd', 'nndsvdar', 'random')):
+            print(" %s %s: " % (init, " " * (8 - len(init))), end="")
+            W, H = _initialize_nmf(X, n_components, init, 1e-6, rs)
+
+            for itr in iter_range:
+                clf_params['alpha'] = alpha
+                clf_params['l1_ratio'] = l1_ratio
+
+                if 'Multiplicative' in name:
+                    clf_params['n_iter'] = itr
+                else:
+                    clf_params['max_iter'] = itr
+                    clf_params['tol'] = tol
+                    clf_params['random_state'] = rs
+                    clf_params['init'] = 'custom'
+                    clf_params['n_components'] = n_components
+
+                this_loss, duration = bench_one(name, X, W, H, X.shape,
+                                                clf_type, clf_params,
+                                                init, n_components, rs)
+
+                init_name = "init='%s'" % init
+                results.append((name, this_loss, duration, init_name))
+                # print("loss: %.6f, time: %.3f sec" % (this_loss, duration))
+                print(".", end="")
+                sys.stdout.flush()
+            print(" ")
+
+    # Use a pandas dataframe to organize the results
+    results_df = pandas.DataFrame(results,
+                                  columns="method loss time init".split())
+    print("Total time = %0.3f sec\n" % (time() - start))
+
+    # plot the results
+    plot_results(results_df, plot_name)
+    return results_df
+
+
+def load_20news():
+    print("Loading 20 newsgroups dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_20newsgroups
+    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
+                                 remove=('headers', 'footers', 'quotes'))
+    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
+    tfidf = vectorizer.fit_transform(dataset.data)
+    return tfidf
+
+
+def load_faces():
+    print("Loading Olivetti face dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_olivetti_faces
+    faces = fetch_olivetti_faces(shuffle=True)
+    return faces.data
+
+
+def build_clfs(cd_iters, pg_iters, mu_iters):
+    clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}),
+            ("Projected Gradient", NMF, pg_iters, {'solver': 'pg',
+                                                   'nls_max_iter': 8}),
+            ("Multiplicative Update", multiplicative_nmf, mu_iters, {}),
+            ]
+    return clfs
 
 
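Each entry returned by build_clfs is a 4-tuple: a display name, an estimator class or bare update function, the range of iteration counts to sweep, and solver-specific parameters. bench_one dispatches on the name (calling fit_transform for the NMF estimators, and the function directly for "Multiplicative Update"), while run_bench injects the shared alpha, l1_ratio and iteration settings itself. Adding a variant is therefore one more tuple; a hypothetical example, assuming the cd solver's shuffle option:

    # hypothetical extra entry, not part of this commit
    extra_clf = ("Coordinate Descent (shuffled)", NMF, np.arange(1, 30),
                 {'solver': 'cd', 'shuffle': True})
    clfs = build_clfs(cd_iters, pg_iters, mu_iters) + [extra_clf]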
 if __name__ == '__main__':
-    from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
-    axes3d
-    import matplotlib.pyplot as plt
-
-    samples_range = np.linspace(50, 500, 3).astype(np.int)
-    features_range = np.linspace(50, 500, 3).astype(np.int)
-    timeset, err = benchmark(samples_range, features_range)
-
-    for i, results in enumerate((timeset, err)):
-        fig = plt.figure('scikit-learn Non-Negative Matrix Factorization'
-                         'benchmark results')
-        ax = fig.gca(projection='3d')
-        for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())):
-            X, Y = np.meshgrid(samples_range, features_range)
-            Z = np.asarray(timings).reshape(samples_range.shape[0],
-                                            features_range.shape[0])
-            # plot the actual surface
-            ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3,
-                            color=c)
-            # dummy point plot to stick the legend to since surface plot do not
-            # support legends (yet?)
-            ax.plot([1], [1], [1], color=c, label=label)
-
-        ax.set_xlabel('n_samples')
-        ax.set_ylabel('n_features')
-        zlabel = 'Time (s)' if i == 0 else 'reconstruction error'
-        ax.set_zlabel(zlabel)
-        ax.legend()
-    plt.show()
+    alpha = 0.
+    l1_ratio = 0.5
+    n_components = 10
+    tol = 1e-15
+
+    # first benchmark on the 20 newsgroups dataset: sparse, shape (11314, 39116)
+    plot_name = "20 Newsgroups sparse dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 6)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_20news = load_20news()
+    run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    # second benchmark on the Olivetti faces dataset: dense, shape (400, 4096)
+    plot_name = "Olivetti Faces dense dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 12)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_faces = load_faces()
+    run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    plt.show()
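Run as a script, the benchmark fetches both datasets, caches every (solver, init, iterations) measurement on disk through the Memory object, and draws one loss-versus-time figure per dataset, with one panel per initialization. Because run_bench also returns its pandas DataFrame, the sweep can be driven interactively as well; a sketch, assuming the file is importable as a module from the benchmarks directory:

    import numpy as np
    from bench_plot_nmf import build_clfs, load_faces, run_bench

    clfs = build_clfs(np.arange(1, 30), np.arange(1, 12), np.arange(1, 30))
    df = run_bench(load_faces(), clfs, "Olivetti Faces dense dataset",
                   n_components=10, tol=1e-15, alpha=0., l1_ratio=0.5)
    print(df.groupby('method')['loss'].min())  # best loss reached per solver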
