8000 ENH add multiplicative-update solver in NMF, with all beta-divergence · scikit-learn/scikit-learn@a44f2b7 · GitHub

Commit a44f2b7

ENH add multiplicative-update solver in NMF, with all beta-divergence
1 parent 3d0c8f8 commit a44f2b7
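
For context, the new 'mu' solver applies Lee & Seung-style multiplicative updates generalized to the whole beta-divergence family (beta=2: Frobenius, beta=1: generalized Kullback-Leibler, beta=0: Itakura-Saito). A minimal usage sketch of what this commit enables; `solver='mu'` appears in the diff below, while the `beta_loss` parameter name is an assumption taken from later released scikit-learn versions:

    import numpy as np
    from sklearn.decomposition import NMF

    rng = np.random.RandomState(0)
    X = np.abs(rng.randn(100, 20))  # NMF requires non-negative input

    # beta_loss selects the divergence minimized by the 'mu' solver
    # (assumed parameter name, per released scikit-learn):
    # 'frobenius' (beta=2), 'kullback-leibler' (beta=1),
    # 'itakura-saito' (beta=0, needs strictly positive X).
    model = NMF(n_components=5, solver='mu', beta_loss='kullback-leibler',
                max_iter=500, random_state=0)
    W = model.fit_transform(X)
    H = model.components_  # X is approximated by np.dot(W, H)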

File tree

5 files changed: +1079 −306 lines changed

benchmarks/bench_plot_nmf.py

Lines changed: 151 additions & 155 deletions
@@ -1,166 +1,162 @@
 """
 Benchmarks of Non-Negative Matrix Factorization
 """
+# Author : Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
+# License: BSD 3 clause
 
 from __future__ import print_function
-
-from collections import defaultdict
-import gc
 from time import time
+import sys
 
 import numpy as np
-from scipy.linalg import norm
-
-from sklearn.decomposition.nmf import NMF, _initialize_nmf
-from sklearn.datasets.samples_generator import make_low_rank_matrix
-from sklearn.externals.six.moves import xrange
-
-
-def alt_nnmf(V, r, max_iter=1000, tol=1e-3, init='random'):
-    '''
-    A, S = nnmf(X, r, tol=1e-3, R=None)
-
-    Implement Lee & Seung's algorithm
-
-    Parameters
-    ----------
-    V : 2-ndarray, [n_samples, n_features]
-        input matrix
-    r : integer
-        number of latent features
-    max_iter : integer, optional
-        maximum number of iterations (default: 1000)
-    tol : double
-        tolerance threshold for early exit (when the update factor is within
-        tol of 1., the function exits)
-    init : string
-        Method used to initialize the procedure.
-
-    Returns
-    -------
-    A : 2-ndarray, [n_samples, r]
-        Component part of the factorization
-
-    S : 2-ndarray, [r, n_features]
-        Data part of the factorization
-
-    Reference
-    ---------
-    "Algorithms for Non-negative Matrix Factorization"
-    by Daniel D Lee, Sebastian H Seung
-    (available at http://citeseer.ist.psu.edu/lee01algorithms.html)
-    '''
-    # Nomenclature in the function follows Lee & Seung
-    eps = 1e-5
-    n, m = V.shape
-    W, H = _initialize_nmf(V, r, init, random_state=0)
-
-    for i in xrange(max_iter):
-        updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
-        H *= updateH
-        updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
-        W *= updateW
-        if i % 10 == 0:
-            max_update = max(updateW.max(), updateH.max())
-            if abs(1. - max_update) < tol:
-                break
-    return W, H
-
-
-def report(error, time):
-    print("Frobenius loss: %.5f" % error)
-    print("Took: %.2fs" % time)
-    print()
-
-
-def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
-    timeset = defaultdict(lambda: [])
-    err = defaultdict(lambda: [])
-
-    for n_samples in samples_range:
-        for n_features in features_range:
-            print("%2d samples, %2d features" % (n_samples, n_features))
-            print('=======================')
-            X = np.abs(make_low_rank_matrix(n_samples, n_features,
-                       effective_rank=rank, tail_strength=0.2))
-
-            gc.collect()
-            print("benchmarking nndsvd-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
-            tend = time() - tstart
-            timeset['nndsvd-nmf'].append(tend)
-            err['nndsvd-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvda-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvda',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvda-nmf'].append(tend)
-            err['nndsvda-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvdar-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvdar',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvdar-nmf'].append(tend)
-            err['nndsvdar-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking random-nmf")
-            tstart = time()
-            m = NMF(n_components=30, init='random', max_iter=1000,
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['random-nmf'].append(tend)
-            err['random-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking alt-random-nmf")
-            tstart = time()
-            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
-            tend = time() - tstart
-            timeset['alt-random-nmf'].append(tend)
-            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
-            report(norm(X - np.dot(W, H)), tend)
-
-    return timeset, err
+import matplotlib.pyplot as plt
+import pandas
+
+from sklearn.utils.testing import ignore_warnings
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition.nmf import NMF
+from sklearn.decomposition.nmf import _initialize_nmf
+from sklearn.decomposition.nmf import _safe_compute_error
+from sklearn.externals.joblib import Memory
+
+mem = Memory(cachedir='.', verbose=0)
+
+
+@ignore_warnings
+def plot_results(results_df, plot_name):
+    if results_df is None:
+        return None
+
+    plt.figure(figsize=(16, 6))
+    colors = 'bgr'
+    markers = 'ovs'
+    ax = plt.subplot(1, 3, 1)
+    for i, init in enumerate(np.unique(results_df['init'])):
+        plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax)
+        for j, method in enumerate(np.unique(results_df['method'])):
+            selected_items = (results_df
+                              [results_df['init'] == init]
+                              [results_df['method'] == method])
+
+            plt.plot(selected_items['time'], selected_items['loss'],
+                     color=colors[j % len(colors)], ls='-',
+                     marker=markers[j % len(markers)],
+                     label=method)
+
+        plt.legend(loc=0, fontsize='x-small')
+        plt.xlabel("Time (s)")
+        plt.ylabel("loss")
+        plt.title("%s" % init)
+    plt.suptitle(plot_name, fontsize=16)
+
+
+# use joblib to cache results.
+# X_shape is specified in arguments for avoiding hashing X
+@ignore_warnings
+@mem.cache(ignore=['X', 'W0', 'H0'])
+def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
+              n_components, random_state):
+    clf = clf_type(**clf_params)
+
+    st = time()
+    W = clf.fit_transform(X, W=W0.copy(), H=H0.copy())
+    end = time()
+
+    H = clf.components_
+    this_loss = _safe_compute_error(X, W, H)
+    duration = end - st
+    return this_loss, duration
+
+
+def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio):
+    start = time()
+    results = []
+    for name, clf_type, iter_range, clf_params in clfs:
+        print("Training %s:" % name)
+        for rs, init in enumerate(('nndsvd', 'nndsvdar', 'random')):
+            print("    %s %s: " % (init, " " * (8 - len(init))), end="")
+            W, H = _initialize_nmf(X, n_components, init, 1e-6, rs)
+
+            for itr in iter_range:
+                clf_params['alpha'] = alpha
+                clf_params['l1_ratio'] = l1_ratio
+                clf_params['max_iter'] = itr
+                clf_params['tol'] = tol
+                clf_params['random_state'] = rs
+                clf_params['init'] = 'custom'
+                clf_params['n_components'] = n_components
+
+                this_loss, duration = bench_one(name, X, W, H, X.shape,
+                                                clf_type, clf_params,
+                                                init, n_components, rs)
+
+                init_name = "init='%s'" % init
+                results.append((name, this_loss, duration, init_name))
+                # print("loss: %.6f, time: %.3f sec" % (this_loss, duration))
+                print(".", end="")
+                sys.stdout.flush()
+            print(" ")
+
+    # Use a pandas dataframe to organize the results
+    results_df = pandas.DataFrame(results,
+                                  columns="method loss time init".split())
+    print("Total time = %0.3f sec\n" % (time() - start))
+
+    # plot the results
+    plot_results(results_df, plot_name)
+    return results_df
+
+
+def load_20news():
+    print("Loading 20 newsgroups dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_20newsgroups
+    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
+                                 remove=('headers', 'footers', 'quotes'))
+    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
+    tfidf = vectorizer.fit_transform(dataset.data)
+    return tfidf
+
+
+def load_faces():
+    print("Loading Olivetti face dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_olivetti_faces
+    faces = fetch_olivetti_faces(shuffle=True)
+    return faces.data
+
+
+def build_clfs(cd_iters, pg_iters, mu_iters):
+    clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}),
+            ("Projected Gradient", NMF, pg_iters, {'solver': 'pg',
+                                                   'nls_max_iter': 8}),
+            ("Multiplicative Update", NMF, mu_iters, {'solver': 'mu'}),
+            ]
+    return clfs
 
 
 if __name__ == '__main__':
-    from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
-    axes3d
-    import matplotlib.pyplot as plt
-
-    samples_range = np.linspace(50, 500, 3).astype(np.int)
-    features_range = np.linspace(50, 500, 3).astype(np.int)
-    timeset, err = benchmark(samples_range, features_range)
-
-    for i, results in enumerate((timeset, err)):
-        fig = plt.figure('scikit-learn Non-Negative Matrix Factorization'
-                         'benchmark results')
-        ax = fig.gca(projection='3d')
-        for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())):
-            X, Y = np.meshgrid(samples_range, features_range)
-            Z = np.asarray(timings).reshape(samples_range.shape[0],
-                                            features_range.shape[0])
-            # plot the actual surface
-            ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3,
-                            color=c)
-            # dummy point plot to stick the legend to since surface plot do not
-            # support legends (yet?)
-            ax.plot([1], [1], [1], color=c, label=label)
-
-        ax.set_xlabel('n_samples')
-        ax.set_ylabel('n_features')
-        zlabel = 'Time (s)' if i == 0 else 'reconstruction error'
-        ax.set_zlabel(zlabel)
-        ax.legend()
-    plt.show()
+    alpha = 0.
+    l1_ratio = 0.5
+    n_components = 10
+    tol = 1e-15
+
+    # first benchmark on 20 newsgroup dataset: sparse, shape(11314, 39116)
+    plot_name = "20 Newsgroups sparse dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 6)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_20news = load_20news()
+    run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    # second benchmark on Olivetti faces dataset: dense, shape(400, 4096)
+    plot_name = "Olivetti Faces dense dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 12)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_faces = load_faces()
+    run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    plt.show()
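
The rewritten benchmark compares three NMF solvers: coordinate descent ('cd'), projected gradient ('pg'), and the new multiplicative-update solver ('mu'). The deleted alt_nnmf helper above implemented Lee & Seung's multiplicative updates for the Frobenius norm only; the 'mu' solver generalizes that rule to any beta-divergence. As an illustration of the technique (not this commit's actual solver code), a minimal NumPy sketch of the generalized update for H, with the symmetric rule applying to W:

    import numpy as np

    def mu_update_H(X, W, H, beta, eps=1e-10):
        # Illustrative sketch, not the scikit-learn implementation.
        # One multiplicative update of H under the beta-divergence:
        #   H <- H * (W.T @ ((WH)^(beta-2) * X)) / (W.T @ (WH)^(beta-1))
        # beta=2 recovers the Frobenius rule used by alt_nnmf,
        # beta=1 gives Kullback-Leibler, beta=0 Itakura-Saito.
        WH = W.dot(H) + eps  # eps guards against division by zero
        numerator = W.T.dot((WH ** (beta - 2.)) * X)
        denominator = W.T.dot(WH ** (beta - 1.)) + eps
        return H * (numerator / denominator)

For beta=2 the numerator reduces to np.dot(W.T, X) and the denominator to np.dot(np.dot(W.T, W), H) (up to eps), matching the updateH step in the deleted alt_nnmf code.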

0 commit comments