@@ -104,15 +104,24 @@ class GaussianNB(BaseNB):
     """
     Gaussian Naive Bayes (GaussianNB)
 
+    Can perform online updates to model parameters via `partial_fit` method.
+    For details on algorithm used to update feature means and variance online,
+    see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
+
+        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+
     Attributes
     ----------
-    `class_prior_` : array, shape = [n_classes]
+    `class_prior_` : array, shape (n_classes,)
         probability of each class.
 
-    `theta_` : array, shape = [n_classes, n_features]
+    `class_count_` : array, shape (n_classes,)
+        number of training samples observed in each class.
+
+    `theta_` : array, shape (n_classes, n_features)
         mean of each feature per class
 
-    `sigma_` : array, shape = [n_classes, n_features]
+    `sigma_` : array, shape (n_classes, n_features)
         variance of each feature per class
 
     Examples
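For context on the report the new docstring cites: the update it describes merges the (count, mean, variance) summaries of two batches without revisiting the raw samples. A minimal NumPy sketch of that merge rule, checked against a one-shot computation — `combine_mean_var` is an illustrative name, not part of this patch:

```python
import numpy as np

def combine_mean_var(n_a, mu_a, var_a, n_b, mu_b, var_b):
    # Pairwise update from Chan, Golub & LeVeque: the combined sum of
    # squared deviations (SSD) is the sum of the per-batch SSDs plus a
    # correction term driven by the gap between the two batch means.
    n = n_a + n_b
    delta = mu_b - mu_a
    mu = mu_a + delta * n_b / float(n)
    ssd = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / float(n)
    return n, mu, ssd / n

rng = np.random.RandomState(0)
A, B = rng.randn(100, 3), rng.randn(50, 3) + 2.0
n, mu, var = combine_mean_var(len(A), A.mean(0), A.var(0),
                              len(B), B.mean(0), B.var(0))
full = np.vstack([A, B])
assert np.allclose(mu, full.mean(0)) and np.allclose(var, full.var(0))
```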
@@ -126,18 +135,23 @@ class GaussianNB(BaseNB):
     GaussianNB()
     >>> print(clf.predict([[-0.8, -1]]))
     [1]
+    >>> clf_pf = GaussianNB()
+    >>> clf_pf.partial_fit(X, Y, np.unique(Y))
+    GaussianNB()
+    >>> print(clf_pf.predict([[-0.8, -1]]))
+    [1]
     """
 
     def fit(self, X, y):
         """Fit Gaussian Naive Bayes according to X, y
 
         Parameters
         ----------
-        X : array-like, shape = [n_samples, n_features]
+        X : array-like, shape (n_samples, n_features)
             Training vectors, where n_samples is the number of samples
             and n_features is the number of features.
 
-        y : array-like, shape = [n_samples]
+        y : array-like, shape (n_samples,)
             Target values.
 
         Returns
@@ -157,12 +171,138 @@ def fit(self, X, y):
         self.theta_ = np.zeros((n_classes, n_features))
         self.sigma_ = np.zeros((n_classes, n_features))
         self.class_prior_ = np.zeros(n_classes)
+        self.class_count_ = np.zeros(n_classes)
         epsilon = 1e-9
         for i, y_i in enumerate(unique_y):
             Xi = X[y == y_i, :]
             self.theta_[i, :] = np.mean(Xi, axis=0)
             self.sigma_[i, :] = np.var(Xi, axis=0) + epsilon
-            self.class_prior_[i] = np.float(Xi.shape[0]) / n_samples
+            self.class_count_[i] = Xi.shape[0]
+        self.class_prior_[:] = self.class_count_ / n_samples
+        return self
+
+    @staticmethod
+    def _update_mean_variance(n_past, mu, var, X):
+        """Compute online update of Gaussian mean and variance.
+
+        Given starting sample count, mean, and variance, and a new set of
+        points X, return the updated mean and variance. (NB - each dimension
+        (column) in X is treated as independent -- you get variance, not
+        covariance).
+
+        Can take scalar mean and variance, or vector mean and variance to
+        simultaneously update a number of independent Gaussians.
+
+        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
+
+            http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+
+        Parameters
+        ----------
+        n_past : int
+            Number of samples represented in old mean and variance.
+
+        mu : array-like, shape (number of Gaussians,)
+            Means for Gaussians in original set.
+
+        var : array-like, shape (number of Gaussians,)
+            Variances for Gaussians in original set.
+
+        Returns
+        -------
+        mu_new : array-like, shape (number of Gaussians,)
+            Updated mean for each Gaussian over the combined set.
+
+        var_new : array-like, shape (number of Gaussians,)
+            Updated variance for each Gaussian over the combined set.
+        """
+        if n_past == 0:
+            return np.mean(X, axis=0), np.var(X, axis=0)
+        elif X.shape[0] == 0:
+            return mu, var
+
+        old_ssd = var * n_past
+        n_new = X.shape[0]
+        new_ssd = n_new * np.var(X, axis=0)
+        new_sum = np.sum(X, axis=0)
+        n_total = float(n_past + n_new)
+
+        total_ssd = (old_ssd + new_ssd +
+                     (n_past / float(n_new * n_total)) *
+                     (n_new * mu - new_sum) ** 2)
+
+        total_sum = new_sum + (mu * n_past)
+
+        return total_sum / n_total, total_ssd / n_total
+
+    def partial_fit(self, X, y, classes=None):
+        """Incremental fit on a batch of samples.
+
+        This method is expected to be called several times consecutively
+        on different chunks of a dataset so as to implement out-of-core
+        or online learning.
+
+        This is especially useful when the whole dataset is too big to fit in
+        memory at once.
+
+        This method has some performance and numerical stability overhead,
+        hence it is better to call partial_fit on chunks of data that are
+        as large as possible (as long as fitting in the memory budget) to
+        hide the overhead.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        classes : array-like, shape (n_classes,)
+            List of all the classes that can possibly appear in the y vector.
+
+            Must be provided at the first call to partial_fit, can be omitted
+            in subsequent calls.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X, y = check_arrays(X, y, sparse_format='dense')
+        y = column_or_1d(y, warn=True)
+        epsilon = 1e-9
+
+        if _check_partial_fit_first_call(self, classes):
+            # This is the first call to partial_fit:
+            # initialize various cumulative counters
+            n_features = X.shape[1]
+            n_classes = len(self.classes_)
+            self.theta_ = np.zeros((n_classes, n_features))
+            self.sigma_ = np.zeros((n_classes, n_features))
+            self.class_prior_ = np.zeros(n_classes)
+            self.class_count_ = np.zeros(n_classes)
+        else:
+            # Put epsilon back in each time
+            self.sigma_[:, :] -= epsilon
+
+        class2idx = dict((cls, idx) for idx, cls in enumerate(self.classes_))
+        for y_i in np.unique(y):
+            i = class2idx[y_i]
+            X_i = X[y == y_i, :]
+            N_i = X_i.shape[0]
+
+            new_theta, new_sigma = self._update_mean_variance(
+                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
+                X_i)
+
+            self.theta_[i, :] = new_theta
+            self.sigma_[i, :] = new_sigma
+            self.class_count_[i] += N_i
+
+        self.sigma_[:, :] += epsilon
+        self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
         return self
 
     def _joint_log_likelihood(self, X):
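Assuming a build with this patch applied, the new static method can be sanity-checked directly: folding a dataset through `_update_mean_variance` chunk by chunk should reproduce the one-shot batch statistics up to float rounding. A minimal sketch; the data and chunk count are arbitrary:

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(1)
X = rng.randn(300, 4)

# Stream the data through the online update in three chunks.
n, mu, var = 0, 0.0, 0.0
for chunk in np.array_split(X, 3):
    mu, var = GaussianNB._update_mean_variance(n, mu, var, chunk)
    n += chunk.shape[0]

# The streamed statistics agree with the one-shot batch values.
assert np.allclose(mu, X.mean(axis=0))
assert np.allclose(var, X.var(axis=0))
```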
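Under the same assumption, a short out-of-core usage sketch: feeding chunks through `partial_fit` (with `classes` given on the first call) should land on the same per-class parameters as a single `fit` over the full arrays. The data and chunking below are illustrative only:

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(2)
X = rng.randn(200, 2)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf_batch = GaussianNB().fit(X, y)

# `classes` must cover every label that will ever appear; passing the
# same list again on later calls is harmless.
clf_online = GaussianNB()
for X_c, y_c in zip(np.array_split(X, 4), np.array_split(y, 4)):
    clf_online.partial_fit(X_c, y_c, classes=[0, 1])

assert np.allclose(clf_batch.theta_, clf_online.theta_)
assert np.allclose(clf_batch.sigma_, clf_online.sigma_)
```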