 import matplotlib.pyplot as plt

 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from sklearn.decomposition import NMF, LatentDirichletAllocation
+from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
 from sklearn.datasets import fetch_20newsgroups

 n_samples = 2000
 n_features = 1000
 n_components = 10
 n_top_words = 20
+batch_size = 128
+init = "nndsvda"


 def plot_top_words(model, feature_names, n_top_words, title):
@@ -101,7 +103,15 @@ def plot_top_words(model, feature_names, n_top_words, title):
     "n_samples=%d and n_features=%d..." % (n_samples, n_features)
 )
 t0 = time()
-nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf)
+nmf = NMF(
+    n_components=n_components,
+    random_state=1,
+    init=init,
+    beta_loss="frobenius",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=1,
+).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))

@@ -121,10 +131,12 @@ def plot_top_words(model, feature_names, n_top_words, title):
 nmf = NMF(
     n_components=n_components,
     random_state=1,
+    init=init,
     beta_loss="kullback-leibler",
     solver="mu",
     max_iter=1000,
-    alpha=0.1,
+    alpha_W=0.00005,
+    alpha_H=0.00005,
     l1_ratio=0.5,
 ).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
@@ -137,6 +149,63 @@ def plot_top_words(model, feature_names, n_top_words, title):
     "Topics in NMF model (generalized Kullback-Leibler divergence)",
 )

+# Fit the MiniBatchNMF model
+print(
+    "\n" * 2,
+    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
+    "features, n_samples=%d and n_features=%d, batch_size=%d..."
+    % (n_samples, n_features, batch_size),
+)
+t0 = time()
+mbnmf = MiniBatchNMF(
+    n_components=n_components,
+    random_state=1,
+    batch_size=batch_size,
+    init=init,
+    beta_loss="frobenius",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=0.5,
+).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+
+tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+plot_top_words(
+    mbnmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in MiniBatchNMF model (Frobenius norm)",
+)
+
+# Fit the MiniBatchNMF model
+print(
+    "\n" * 2,
+    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
+    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
+    "batch_size=%d..." % (n_samples, n_features, batch_size),
+)
+t0 = time()
+mbnmf = MiniBatchNMF(
+    n_components=n_components,
+    random_state=1,
+    batch_size=batch_size,
+    init=init,
+    beta_loss="kullback-leibler",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=0.5,
+).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+plot_top_words(
+    mbnmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
+)
+
 print(
     "\n" * 2,
     "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
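Note: for readers who want to try the new code path outside the example script, here is a minimal standalone sketch of the MiniBatchNMF usage this diff introduces. It assumes scikit-learn >= 1.1 (where MiniBatchNMF and the alpha_W/alpha_H regularization parameters exist); the estimator hyperparameters mirror the diff above, while the TfidfVectorizer settings are illustrative assumptions, since the vectorizer definition falls outside these hunks.

# A minimal, self-contained sketch of the MiniBatchNMF path added above.
# Assumes scikit-learn >= 1.1; vectorizer settings are illustrative only.
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import MiniBatchNMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the corpus and trim it to n_samples=2000, as in the example script.
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
data_samples = data[:2000]

# tf-idf features (assumed settings; the real vectorizer is defined
# elsewhere in the script).
tfidf = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=1000, stop_words="english"
).fit_transform(data_samples)

# Online NMF: the data is consumed in mini-batches of `batch_size` rows,
# so fitting scales to corpora where full-batch NMF becomes slow.
mbnmf = MiniBatchNMF(
    n_components=10,
    batch_size=128,
    init="nndsvda",
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
    random_state=1,
).fit(tfidf)

# components_ holds the topic-term matrix, one row per topic.
print(mbnmf.components_.shape)  # -> (10, 1000)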