 from __future__ import print_function
 from time import time
 import sys
-
-import six
+import warnings
+import numbers
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -19,12 +19,267 @@
 from sklearn.decomposition.nmf import NMF
 from sklearn.decomposition.nmf import _initialize_nmf
 from sklearn.decomposition.nmf import _beta_divergence
+from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init
 from sklearn.externals.joblib import Memory
 from sklearn.exceptions import ConvergenceWarning
+from sklearn.utils.extmath import fast_dot, safe_sparse_dot, squared_norm
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted, check_non_negative
+
 
 mem = Memory(cachedir='.', verbose=0)
 
 
+###################
+# Start of _PGNMF #
+###################
+
+def _norm(x):
+    """Dot product-based Euclidean norm implementation
+    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
+    """
+    return np.sqrt(squared_norm(x))
+
+
+def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0.,
+                    sigma=0.01, beta=0.1):
+    """Non-negative least squares solver
+    Solves a non-negative least squares subproblem using the projected
+    gradient descent algorithm.
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Constant matrix.
+    W : array-like, shape (n_samples, n_components)
+        Constant matrix.
+    H : array-like, shape (n_components, n_features)
+        Initial guess for the solution.
+    tol : float
+        Tolerance of the stopping condition.
+    max_iter : int
+        Maximum number of iterations before timing out.
+    alpha : double, default: 0.
+        Constant that multiplies the regularization terms. Set it to zero to
+        have no regularization.
+    l1_ratio : double, default: 0.
+        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
+        For l1_ratio = 0 the penalty is an L2 penalty.
+        For l1_ratio = 1 it is an L1 penalty.
+        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
+    sigma : float
+        Constant used in the sufficient decrease condition checked by the line
+        search. Smaller values lead to a looser sufficient decrease condition,
+        thus reducing the time taken by the line search, but potentially
+        increasing the number of iterations of the projected gradient
+        procedure. 0.01 is a commonly used value in the optimization
+        literature.
+    beta : float
+        Factor by which the step size is decreased (resp. increased) until
+        (resp. as long as) the sufficient decrease condition is satisfied.
+        Larger values allow finding a better step size, but lead to longer line
+        searches. 0.1 is a commonly used value in the optimization literature.
+    Returns
+    -------
+    H : array-like, shape (n_components, n_features)
+        Solution to the non-negative least squares problem.
+    grad : array-like, shape (n_components, n_features)
+        The gradient.
+    n_iter : int
+        The number of iterations done by the algorithm.
+    References
+    ----------
+    C.-J. Lin. Projected gradient methods for non-negative matrix
+    factorization. Neural Computation, 19(2007), 2756-2779.
+    http://www.csie.ntu.edu.tw/~cjlin/nmf/
+    """
+    WtX = safe_sparse_dot(W.T, X)
+    WtW = fast_dot(W.T, W)
+
+    # values justified in the paper (alpha is renamed gamma)
+    gamma = 1
+    for n_iter in range(1, max_iter + 1):
+        grad = np.dot(WtW, H) - WtX
+        if alpha > 0 and l1_ratio == 1.:
+            grad += alpha
+        elif alpha > 0:
+            grad += alpha * (l1_ratio + (1 - l1_ratio) * H)
+
+        # The following multiplication with a boolean array is more than twice
+        # as fast as indexing into grad.
+        if _norm(grad * np.logical_or(grad < 0, H > 0)) < tol:
+            break
+
+        Hp = H
+
+        for inner_iter in range(20):
+            # Gradient step.
+            Hn = H - gamma * grad
+            # Projection step.
+            Hn *= Hn > 0
+            d = Hn - H
+            gradd = np.dot(grad.ravel(), d.ravel())
+            dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel())
+            suff_decr = (1 - sigma) * gradd + 0.5 * dQd < 0
+            if inner_iter == 0:
+                decr_gamma = not suff_decr
+
+            if decr_gamma:
+                if suff_decr:
+                    H = Hn
+                    break
+                else:
+                    gamma *= beta
+            elif not suff_decr or (Hp == Hn).all():
+                H = Hp
+                break
+            else:
+                gamma /= beta
+                Hp = Hn
+
+    if n_iter == max_iter:
+        warnings.warn("Iteration limit reached in nls subproblem.",
+                      ConvergenceWarning)
+
+    return H, grad, n_iter
+
+
+def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha,
+                            l1_ratio):
+    gradW = (np.dot(W, np.dot(H, H.T)) -
+             safe_sparse_dot(X, H.T, dense_output=True))
+    gradH = (np.dot(np.dot(W.T, W), H) -
+             safe_sparse_dot(W.T, X, dense_output=True))
+
+    init_grad = squared_norm(gradW) + squared_norm(gradH.T)
+    # max(0.001, tol) to force alternating minimizations of W and H
+    tolW = max(0.001, tol) * np.sqrt(init_grad)
+    tolH = tolW
+
+    for n_iter in range(1, max_iter + 1):
+        # stopping condition as discussed in paper
+        proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0))
+        proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0))
+
+        if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2:
+            break
+
+        # update W
+        Wt, gradWt, iterW = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
+                                            alpha=alpha, l1_ratio=l1_ratio)
+        W, gradW = Wt.T, gradWt.T
+
+        if iterW == 1:
+            tolW = 0.1 * tolW
+
+        # update H
+        H, gradH, iterH = _nls_subproblem(X, W, H, tolH, nls_max_iter,
+                                          alpha=alpha, l1_ratio=l1_ratio)
+        if iterH == 1:
+            tolH = 0.1 * tolH
+
+        H[H == 0] = 0   # fix up negative zeros
+
+    if n_iter == max_iter:
+        Wt, _, _ = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
+                                   alpha=alpha, l1_ratio=l1_ratio)
+        W = Wt.T
+
+    return W, H, n_iter
+
+
+class _PGNMF(NMF):
+    """Non-Negative Matrix Factorization (NMF) with projected gradient solver.
+
+    This class is private and for comparison purposes only.
+    It may change or disappear without notice.
+
+    """
+    def __init__(self, n_components=None, solver='pg', init=None,
+                 tol=1e-4, max_iter=200, random_state=None,
+                 alpha=0., l1_ratio=0., nls_max_iter=10):
+        self.nls_max_iter = nls_max_iter
+        self.n_components = n_components
+        self.init = init
+        self.solver = solver
+        self.tol = tol
+        self.max_iter = max_iter
+        self.random_state = random_state
+        self.alpha = alpha
+        self.l1_ratio = l1_ratio
+
+    def fit(self, X, y=None, **params):
+        self.fit_transform(X, **params)
+        return self
+
+    def transform(self, X):
+        check_is_fitted(self, 'components_')
+        H = self.components_
+        W, _, self.n_iter_ = self._fit_transform(X, H=H, update_H=False)
+        return W
+
+    def inverse_transform(self, W):
+        check_is_fitted(self, 'components_')
+        return np.dot(W, self.components_)
+
+    def fit_transform(self, X, y=None, W=None, H=None):
+        W, H, self.n_iter = self._fit_transform(X, W=W, H=H, update_H=True)
+        self.components_ = H
+        return W
+
+    def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
+        X = check_array(X, accept_sparse=('csr', 'csc'))
+        check_non_negative(X, "NMF (input X)")
+
+        n_samples, n_features = X.shape
+        n_components = self.n_components
+        if n_components is None:
+            n_components = n_features
+
+        if (not isinstance(n_components, INTEGER_TYPES) or
+                n_components <= 0):
+            raise ValueError("Number of components must be a positive integer;"
+                             " got (n_components=%r)" % n_components)
+        if not isinstance(self.max_iter, INTEGER_TYPES) or self.max_iter < 0:
+            raise ValueError("Maximum number of iterations must be a positive "
+                             "integer; got (max_iter=%r)" % self.max_iter)
+        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
+            raise ValueError("Tolerance for stopping criteria must be "
+                             "positive; got (tol=%r)" % self.tol)
+
+        # check W and H, or initialize them
+        if self.init == 'custom' and update_H:
+            _check_init(H, (n_components, n_features), "NMF (input H)")
+            _check_init(W, (n_samples, n_components), "NMF (input W)")
+        elif not update_H:
+            _check_init(H, (n_components, n_features), "NMF (input H)")
+            W = np.zeros((n_samples, n_components))
+        else:
+            W, H = _initialize_nmf(X, n_components, init=self.init,
+                                   random_state=self.random_state)
+
+        if update_H:  # fit_transform
+            W, H, n_iter = _fit_projected_gradient(
+                X, W, H, self.tol, self.max_iter, self.nls_max_iter,
+                self.alpha, self.l1_ratio)
+        else:  # transform
+            Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
+                                            self.nls_max_iter,
+                                            alpha=self.alpha,
+                                            l1_ratio=self.l1_ratio)
+            W = Wt.T
+
+        if n_iter == self.max_iter and self.tol > 0:
+            warnings.warn("Maximum number of iteration %d reached. Increase it"
+                          " to improve convergence." % self.max_iter,
+                          ConvergenceWarning)
+
+        return W, H, n_iter
+
+#################
+# End of _PGNMF #
+#################
+
+
 def plot_results(results_df, plot_name):
     if results_df is None:
         return None
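
An aside on the inner line search of _nls_subproblem introduced above: the test suff_decr = (1 - sigma) * gradd + 0.5 * dQd < 0 works because, with the regularization terms set to zero, the objective f(H) = 0.5 * ||X - WH||_F^2 is exactly quadratic, so f(Hn) - f(H) = <grad, d> + 0.5 * <d, WtW d> with d = Hn - H, and asking for f(Hn) - f(H) <= sigma * <grad, d> reduces to the inequality in the code. Below is a quick numerical check of that identity; the matrices are made up purely for illustration:

    import numpy as np

    rng = np.random.RandomState(0)
    X = np.abs(rng.randn(6, 4))
    W = np.abs(rng.randn(6, 3))
    H = np.abs(rng.randn(3, 4))
    Hn = np.abs(rng.randn(3, 4))        # an arbitrary candidate iterate

    def f(M):                           # the quadratic NLS objective
        return 0.5 * np.sum((X - W.dot(M)) ** 2)

    WtW = W.T.dot(W)
    grad = WtW.dot(H) - W.T.dot(X)      # gradient of f at H
    d = Hn - H
    gradd = np.dot(grad.ravel(), d.ravel())
    dQd = np.dot(WtW.dot(d).ravel(), d.ravel())
    # Exact for a quadratic objective: no higher-order terms are neglected.
    print(np.allclose(f(Hn) - f(H), gradd + 0.5 * dQd))    # prints True
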
@@ -52,10 +307,7 @@ def plot_results(results_df, plot_name):
     plt.suptitle(plot_name, fontsize=16)
 
 
-# The deprecated projected-gradient solver raises a UserWarning as convergence
-# is not reached; the coordinate-descent solver raises a ConvergenceWarning.
-@ignore_warnings(category=(ConvergenceWarning, UserWarning,
-                           DeprecationWarning))
+@ignore_warnings(category=ConvergenceWarning)
 # use joblib to cache the results.
 # X_shape is specified in arguments for avoiding hashing X
 @mem.cache(ignore=['X', 'W0', 'H0'])
@@ -133,8 +385,9 @@ def load_faces():
     return faces.data
 
 
-def build_clfs(cd_iters, mu_iters):
+def build_clfs(cd_iters, pg_iters, mu_iters):
    clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}),
+            ("Projected Gradient", _PGNMF, pg_iters, {'solver': 'pg'}),
             ("Multiplicative Update", NMF, mu_iters, {'solver': 'mu'}),
             ]
     return clfs
@@ -149,16 +402,18 @@ def build_clfs(cd_iters, mu_iters):
     # first benchmark on 20 newsgroup dataset: sparse, shape(11314, 39116)
     plot_name = "20 Newsgroups sparse dataset"
     cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 6)
     mu_iters = np.arange(1, 30)
-    clfs = build_clfs(cd_iters, mu_iters)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
     X_20news = load_20news()
     run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio)
 
     # second benchmark on Olivetti faces dataset: dense, shape(400, 4096)
     plot_name = "Olivetti Faces dense dataset"
     cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 12)
     mu_iters = np.arange(1, 30)
-    clfs = build_clfs(cd_iters, mu_iters)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
     X_faces = load_faces()
     run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio,)
 
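
A note on the stopping tests in _nls_subproblem and _fit_projected_gradient above: only gradient entries that can still produce a feasible descent step are counted, namely those where the variable is strictly positive or where the gradient is negative and would push a zero entry into the interior of the feasible set. A small self-contained sketch of that projected gradient norm, with made-up numbers:

    import numpy as np

    H = np.array([[0.0, 0.5],
                  [2.0, 0.0]])
    grad = np.array([[0.3, -1.0],
                     [0.2, -0.4]])
    # Mask out entries where H sits at the zero bound and the gradient is
    # non-negative; those coordinates cannot move without violating H >= 0.
    mask = np.logical_or(grad < 0, H > 0)
    proj_grad_norm = np.sqrt(np.sum((grad * mask) ** 2))
    print(proj_grad_norm)   # grad[0, 0] is ignored even though it is nonzero
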
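Finally, a minimal sketch of how the private _PGNMF class added by this patch can be exercised on its own, outside of run_bench. It assumes a scikit-learn version that still exposes sklearn.decomposition.nmf with _initialize_nmf and _beta_divergence, as imported at the top of the file; the data and parameter values are illustrative only:

    import numpy as np

    rng = np.random.RandomState(0)
    X = np.abs(rng.randn(200, 30))     # NMF requires a non-negative input
    model = _PGNMF(n_components=10, tol=1e-4, max_iter=200, random_state=0)
    W = model.fit_transform(X)         # factor of shape (200, 10)
    H = model.components_              # factor of shape (10, 30)
    # Frobenius reconstruction error, the kind of loss the benchmark plots
    print(_beta_divergence(X, W, H, 2, square_root=True))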