add projected gradient solver inside the benchmark file · scikit-learn/scikit-learn@c543ac2 · GitHub

Commit c543ac2

add projected gradient solver inside the benchmark file
1 parent d9d65a6 commit c543ac2

File tree

1 file changed: +264 -9 lines changed

benchmarks/bench_plot_nmf.py

@@ -7,8 +7,8 @@
 from __future__ import print_function
 from time import time
 import sys
-
-import six
+import warnings
+import numbers
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -19,12 +19,267 @@
 from sklearn.decomposition.nmf import NMF
 from sklearn.decomposition.nmf import _initialize_nmf
 from sklearn.decomposition.nmf import _beta_divergence
+from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init
 from sklearn.externals.joblib import Memory
 from sklearn.exceptions import ConvergenceWarning
+from sklearn.utils.extmath import fast_dot, safe_sparse_dot, squared_norm
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted, check_non_negative
+
 
 mem = Memory(cachedir='.', verbose=0)
 
 
+###################
+# Start of _PGNMF #
+###################
+
+def _norm(x):
+    """Dot product-based Euclidean norm implementation
+    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
+    """
+    return np.sqrt(squared_norm(x))
+
+
+def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0.,
+                    sigma=0.01, beta=0.1):
+    """Non-negative least square solver
+    Solves a non-negative least squares subproblem using the projected
+    gradient descent algorithm.
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Constant matrix.
+    W : array-like, shape (n_samples, n_components)
+        Constant matrix.
+    H : array-like, shape (n_components, n_features)
+        Initial guess for the solution.
+    tol : float
+        Tolerance of the stopping condition.
+    max_iter : int
+        Maximum number of iterations before timing out.
+    alpha : double, default: 0.
+        Constant that multiplies the regularization terms. Set it to zero to
+        have no regularization.
+    l1_ratio : double, default: 0.
+        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
+        For l1_ratio = 0 the penalty is an L2 penalty.
+        For l1_ratio = 1 it is an L1 penalty.
+        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
+    sigma : float
+        Constant used in the sufficient decrease condition checked by the line
+        search. Smaller values lead to a looser sufficient decrease condition,
+        thus reducing the time taken by the line search, but potentially
+        increasing the number of iterations of the projected gradient
+        procedure. 0.01 is a commonly used value in the optimization
+        literature.
+    beta : float
+        Factor by which the step size is decreased (resp. increased) until
+        (resp. as long as) the sufficient decrease condition is satisfied.
+        Larger values allow to find a better step size but lead to longer line
+        search. 0.1 is a commonly used value in the optimization literature.
+    Returns
+    -------
+    H : array-like, shape (n_components, n_features)
+        Solution to the non-negative least squares problem.
+    grad : array-like, shape (n_components, n_features)
+        The gradient.
+    n_iter : int
+        The number of iterations done by the algorithm.
+    References
+    ----------
+    C.-J. Lin. Projected gradient methods for non-negative matrix
+    factorization. Neural Computation, 19(2007), 2756-2779.
+    http://www.csie.ntu.edu.tw/~cjlin/nmf/
+    """
+    WtX = safe_sparse_dot(W.T, X)
+    WtW = fast_dot(W.T, W)
+
+    # values justified in the paper (alpha is renamed gamma)
+    gamma = 1
+    for n_iter in range(1, max_iter + 1):
+        grad = np.dot(WtW, H) - WtX
+        if alpha > 0 and l1_ratio == 1.:
+            grad += alpha
+        elif alpha > 0:
+            grad += alpha * (l1_ratio + (1 - l1_ratio) * H)
+
+        # The following multiplication with a boolean array is more than twice
+        # as fast as indexing into grad.
+        if _norm(grad * np.logical_or(grad < 0, H > 0)) < tol:
+            break
+
+        Hp = H
+
+        for inner_iter in range(20):
+            # Gradient step.
+            Hn = H - gamma * grad
+            # Projection step.
+            Hn *= Hn > 0
+            d = Hn - H
+            gradd = np.dot(grad.ravel(), d.ravel())
+            dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel())
+            suff_decr = (1 - sigma) * gradd + 0.5 * dQd < 0
+            if inner_iter == 0:
+                decr_gamma = not suff_decr
+
+            if decr_gamma:
+                if suff_decr:
+                    H = Hn
+                    break
+                else:
+                    gamma *= beta
+            elif not suff_decr or (Hp == Hn).all():
+                H = Hp
+                break
+            else:
+                gamma /= beta
+                Hp = Hn
+
+    if n_iter == max_iter:
+        warnings.warn("Iteration limit reached in nls subproblem.",
+                      ConvergenceWarning)
+
+    return H, grad, n_iter
+
+
+def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha,
+                            l1_ratio):
+    gradW = (np.dot(W, np.dot(H, H.T)) -
+             safe_sparse_dot(X, H.T, dense_output=True))
+    gradH = (np.dot(np.dot(W.T, W), H) -
+             safe_sparse_dot(W.T, X, dense_output=True))
+
+    init_grad = squared_norm(gradW) + squared_norm(gradH.T)
+    # max(0.001, tol) to force alternating minimizations of W and H
+    tolW = max(0.001, tol) * np.sqrt(init_grad)
+    tolH = tolW
+
+    for n_iter in range(1, max_iter + 1):
+        # stopping condition as discussed in paper
+        proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0))
+        proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0))
+
+        if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2:
+            break
+
+        # update W
+        Wt, gradWt, iterW = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
+                                            alpha=alpha, l1_ratio=l1_ratio)
+        W, gradW = Wt.T, gradWt.T
+
+        if iterW == 1:
+            tolW = 0.1 * tolW
+
+        # update H
+        H, gradH, iterH = _nls_subproblem(X, W, H, tolH, nls_max_iter,
+                                          alpha=alpha, l1_ratio=l1_ratio)
+        if iterH == 1:
+            tolH = 0.1 * tolH
+
+    H[H == 0] = 0   # fix up negative zeros
+
+    if n_iter == max_iter:
+        Wt, _, _ = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
+                                   alpha=alpha, l1_ratio=l1_ratio)
+        W = Wt.T
+
+    return W, H, n_iter
+
+
+class _PGNMF(NMF):
+    """Non-Negative Matrix Factorization (NMF) with projected gradient solver.
+
+    This class is private and for comparison purpose only.
+    It may change or disappear without notice.
+
+    """
+    def __init__(self, n_components=None, solver='pg', init=None,
+                 tol=1e-4, max_iter=200, random_state=None,
+                 alpha=0., l1_ratio=0., nls_max_iter=10):
+        self.nls_max_iter = nls_max_iter
+        self.n_components = n_components
+        self.init = init
+        self.solver = solver
+        self.tol = tol
+        self.max_iter = max_iter
+        self.random_state = random_state
+        self.alpha = alpha
+        self.l1_ratio = l1_ratio
+
+    def fit(self, X, y=None, **params):
+        self.fit_transform(X, **params)
+        return self
+
+    def transform(self, X):
+        check_is_fitted(self, 'components_')
+        H = self.components_
+        W, _, self.n_iter_ = self._fit_transform(X, H=H, update_H=False)
+        return W
+
+    def inverse_transform(self, W):
+        check_is_fitted(self, 'components_')
+        return np.dot(W, self.components_)
+
+    def fit_transform(self, X, y=None, W=None, H=None):
+        W, H, self.n_iter = self._fit_transform(X, W=W, H=H, update_H=True)
+        self.components_ = H
+        return W
+
+    def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
+        X = check_array(X, accept_sparse=('csr', 'csc'))
+        check_non_negative(X, "NMF (input X)")
+
+        n_samples, n_features = X.shape
+        n_components = self.n_components
+        if n_components is None:
+            n_components = n_features
+
+        if (not isinstance(n_components, INTEGER_TYPES) or
+                n_components <= 0):
+            raise ValueError("Number of components must be a positive integer;"
+                             " got (n_components=%r)" % n_components)
+        if not isinstance(self.max_iter, INTEGER_TYPES) or self.max_iter < 0:
+            raise ValueError("Maximum number of iterations must be a positive "
+                             "integer; got (max_iter=%r)" % self.max_iter)
+        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
+            raise ValueError("Tolerance for stopping criteria must be "
+                             "positive; got (tol=%r)" % self.tol)
+
+        # check W and H, or initialize them
+        if self.init == 'custom' and update_H:
+            _check_init(H, (n_components, n_features), "NMF (input H)")
+            _check_init(W, (n_samples, n_components), "NMF (input W)")
+        elif not update_H:
+            _check_init(H, (n_components, n_features), "NMF (input H)")
+            W = np.zeros((n_samples, n_components))
+        else:
+            W, H = _initialize_nmf(X, n_components, init=self.init,
+                                   random_state=self.random_state)
+
+        if update_H:  # fit_transform
+            W, H, n_iter = _fit_projected_gradient(
+                X, W, H, self.tol, self.max_iter, self.nls_max_iter,
+                self.alpha, self.l1_ratio)
+        else:  # transform
+            Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
+                                            self.nls_max_iter,
+                                            alpha=self.alpha,
+                                            l1_ratio=self.l1_ratio)
+            W = Wt.T
+
+        if n_iter == self.max_iter and self.tol > 0:
+            warnings.warn("Maximum number of iteration %d reached. Increase it"
+                          " to improve convergence." % self.max_iter,
+                          ConvergenceWarning)
+
+        return W, H, n_iter
+
+#################
+# End of _PGNMF #
+#################
+
+
 def plot_results(results_df, plot_name):
     if results_df is None:
         return None
@@ -52,10 +307,7 @@ def plot_results(results_df, plot_name):
     plt.suptitle(plot_name, fontsize=16)
 
 
-# The deprecated projected-gradient solver raises a UserWarning as convergence
-# is not reached; the coordinate-descent solver raises a ConvergenceWarning.
-@ignore_warnings(category=(ConvergenceWarning, UserWarning,
-                           DeprecationWarning))
+@ignore_warnings(category=ConvergenceWarning)
 # use joblib to cache the results.
 # X_shape is specified in arguments for avoiding hashing X
 @mem.cache(ignore=['X', 'W0', 'H0'])
@@ -133,8 +385,9 @@ def load_faces():
     return faces.data
 
 
-def build_clfs(cd_iters, mu_iters):
+def build_clfs(cd_iters, pg_iters, mu_iters):
     clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}),
+            ("Projected Gradient", _PGNMF, pg_iters, {'solver': 'pg'}),
             ("Multiplicative Update", NMF, mu_iters, {'solver': 'mu'}),
             ]
     return clfs
@@ -149,16 +402,18 @@ def build_clfs(cd_iters, mu_iters):
     # first benchmark on 20 newsgroup dataset: sparse, shape(11314, 39116)
     plot_name = "20 Newsgroups sparse dataset"
     cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 6)
     mu_iters = np.arange(1, 30)
-    clfs = build_clfs(cd_iters, mu_iters)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
     X_20news = load_20news()
     run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio)
 
     # second benchmark on Olivetti faces dataset: dense, shape(400, 4096)
     plot_name = "Olivetti Faces dense dataset"
     cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 12)
     mu_iters = np.arange(1, 30)
-    clfs = build_clfs(cd_iters, mu_iters)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
     X_faces = load_faces()
     run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio,)

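Note (not part of this commit): the private _PGNMF estimator added above follows the usual scikit-learn estimator API (fit, fit_transform, transform, components_), so it can also be exercised on its own outside run_bench. A minimal sketch, assuming the benchmark file is importable from the benchmarks directory and a scikit-learn version from this era (where the sklearn.decomposition.nmf and fast_dot imports above still exist); the synthetic data below is purely illustrative:

import numpy as np
from bench_plot_nmf import _PGNMF  # private, benchmark-only class defined above

# Small synthetic non-negative matrix (illustrative only).
rng = np.random.RandomState(0)
X = np.abs(rng.randn(200, 50))

# Factorize X ~ W H with the projected gradient solver.
model = _PGNMF(n_components=10, tol=1e-4, max_iter=200, random_state=0)
W = model.fit_transform(X)
H = model.components_

# Number of outer iterations and Frobenius reconstruction error.
print(model.n_iter, np.linalg.norm(X - W.dot(H), 'fro'))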