ENH Add partial_fit to GaussianNB · scikit-learn/scikit-learn@e7e49a6 · GitHub

Commit e7e49a6

ihaque authored and larsmans committed
ENH Add partial_fit to GaussianNB
Uses Chan/Golub/LeVeque update for model parameters. Does not implement sample weighting.
1 parent 4bd4e9a commit e7e49a6
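The update referenced in the commit message is the pairwise mean/variance combination from the Chan, Golub, and LeVeque report: keep a per-class sample count, mean, and variance, and fold each new batch in without revisiting earlier data. Below is a minimal standalone sketch of that rule (illustrative NumPy only, not the committed code; the helper name combine_mean_var is made up for this example), with a check that the streamed result matches the batch statistics.

import numpy as np

def combine_mean_var(n_a, mu_a, var_a, X_b):
    # Fold a new batch X_b into running per-column statistics computed
    # from n_a earlier samples (Chan/Golub/LeVeque pairwise update).
    n_b = X_b.shape[0]
    n_total = float(n_a + n_b)
    new_sum = X_b.sum(axis=0)
    old_ssd = var_a * n_a                 # sum of squared diffs, old data
    new_ssd = n_b * X_b.var(axis=0)       # sum of squared diffs, new batch
    # cross term correcting for the shift between the two batch means
    correction = (n_a / (n_b * n_total)) * (n_b * mu_a - new_sum) ** 2
    total_ssd = old_ssd + new_ssd + correction
    total_sum = new_sum + mu_a * n_a
    return total_sum / n_total, total_ssd / n_total

# sanity check: streaming result matches batch mean/variance on all the data
rng = np.random.RandomState(0)
A, B = rng.randn(50, 3), rng.randn(30, 3)
mu, var = combine_mean_var(A.shape[0], A.mean(axis=0), A.var(axis=0), B)
assert np.allclose(mu, np.vstack([A, B]).mean(axis=0))
assert np.allclose(var, np.vstack([A, B]).var(axis=0))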

File tree

2 files changed: +160 -6 lines


sklearn/naive_bayes.py

Lines changed: 146 additions & 6 deletions
@@ -104,15 +104,24 @@ class GaussianNB(BaseNB):
     """
     Gaussian Naive Bayes (GaussianNB)
 
+    Can perform online updates to model parameters via `partial_fit` method.
+    For details on algorithm used to update feature means and variance online,
+    see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
+
+        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+
     Attributes
     ----------
-    `class_prior_` : array, shape = [n_classes]
+    `class_prior_` : array, shape (n_classes,)
         probability of each class.
 
-    `theta_` : array, shape = [n_classes, n_features]
+    `class_count_` : array, shape (n_classes,)
+        number of training samples observed in each class.
+
+    `theta_` : array, shape (n_classes, n_features)
         mean of each feature per class
 
-    `sigma_` : array, shape = [n_classes, n_features]
+    `sigma_` : array, shape (n_classes, n_features)
         variance of each feature per class
 
     Examples
@@ -126,18 +135,23 @@ class GaussianNB(BaseNB):
     GaussianNB()
     >>> print(clf.predict([[-0.8, -1]]))
     [1]
+    >>> clf_pf = GaussianNB()
+    >>> clf_pf.partial_fit(X, Y, np.unique(Y))
+    GaussianNB()
+    >>> print(clf_pf.predict([[-0.8, -1]]))
+    [1]
     """
 
     def fit(self, X, y):
         """Fit Gaussian Naive Bayes according to X, y
 
         Parameters
         ----------
-        X : array-like, shape = [n_samples, n_features]
+        X : array-like, shape (n_samples, n_features)
             Training vectors, where n_samples is the number of samples
             and n_features is the number of features.
 
-        y : array-like, shape = [n_samples]
+        y : array-like, shape (n_samples,)
             Target values.
 
         Returns
@@ -157,12 +171,138 @@ def fit(self, X, y):
         self.theta_ = np.zeros((n_classes, n_features))
         self.sigma_ = np.zeros((n_classes, n_features))
         self.class_prior_ = np.zeros(n_classes)
+        self.class_count_ = np.zeros(n_classes)
         epsilon = 1e-9
         for i, y_i in enumerate(unique_y):
             Xi = X[y == y_i, :]
             self.theta_[i, :] = np.mean(Xi, axis=0)
             self.sigma_[i, :] = np.var(Xi, axis=0) + epsilon
-            self.class_prior_[i] = np.float(Xi.shape[0]) / n_samples
+            self.class_count_[i] = Xi.shape[0]
+        self.class_prior_[:] = self.class_count_ / n_samples
+        return self
+
+    @staticmethod
+    def _update_mean_variance(n_past, mu, var, X):
+        """Compute online update of Gaussian mean and variance.
+
+        Given starting sample count, mean, and variance, and a new set of
+        points X, return the updated mean and variance. (NB - each dimension
+        (column) in X is treated as independent -- you get variance, not
+        covariance).
+
+        Can take scalar mean and variance, or vector mean and variance to
+        simultaneously update a number of independent Gaussians.
+
+        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
+
+        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+
+        Parameters
+        ----------
+        n_past : int
+            Number of samples represented in old mean and variance.
+
+        mu: array-like, shape (number of Gaussians,)
+            Means for Gaussians in original set.
+
+        var: array-like, shape (number of Gaussians,)
+            Variances for Gaussians in original set.
+
+        Returns
+        -------
+        mu_new: array-like, shape (number of Gaussians,)
+            Updated mean for each Gaussian over the combined set.
+
+        var_new: array-like, shape (number of Gaussians,)
+            Updated variance for each Gaussian over the combined set.
+        """
+        if n_past == 0:
+            return np.mean(X, axis=0), np.var(X, axis=0)
+        elif X.shape[0] == 0:
+            return mu, var
+
+        old_ssd = var * n_past
+        n_new = X.shape[0]
+        new_ssd = n_new * np.var(X, axis=0)
+        new_sum = np.sum(X, axis=0)
+        n_total = float(n_past + n_new)
+
+        total_ssd = (old_ssd + new_ssd +
+                     (n_past / float(n_new * n_total)) *
+                     (n_new * mu - new_sum) ** 2)
+
+        total_sum = new_sum + (mu * n_past)
+
+        return total_sum / n_total, total_ssd / n_total
+
+    def partial_fit(self, X, y, classes=None):
+        """Incremental fit on a batch of samples.
+
+        This method is expected to be called several times consecutively
+        on different chunks of a dataset so as to implement out-of-core
+        or online learning.
+
+        This is especially useful when the whole dataset is too big to fit in
+        memory at once.
+
+        This method has some performance and numerical stability overhead,
+        hence it is better to call partial_fit on chunks of data that are
+        as large as possible (as long as fitting in the memory budget) to
+        hide the overhead.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        classes : array-like, shape (n_classes,)
+            List of all the classes that can possibly appear in the y vector.
+
+            Must be provided at the first call to partial_fit, can be omitted
+            in subsequent calls.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X, y = check_arrays(X, y, sparse_format='dense')
+        y = column_or_1d(y, warn=True)
+        epsilon = 1e-9
+
+        if _check_partial_fit_first_call(self, classes):
+            # This is the first call to partial_fit:
+            # initialize various cumulative counters
+            n_features = X.shape[1]
+            n_classes = len(self.classes_)
+            self.theta_ = np.zeros((n_classes, n_features))
+            self.sigma_ = np.zeros((n_classes, n_features))
+            self.class_prior_ = np.zeros(n_classes)
+            self.class_count_ = np.zeros(n_classes)
+        else:
+            # Put epsilon back in each time
+            self.sigma_[:, :] -= epsilon
+
+        class2idx = dict((cls, idx) for idx, cls in enumerate(self.classes_))
+        for y_i in np.unique(y):
+            i = class2idx[y_i]
+            X_i = X[y == y_i, :]
+            N_i = X_i.shape[0]
+
+            new_theta, new_sigma = self._update_mean_variance(
+                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
+                X_i)
+
+            self.theta_[i, :] = new_theta
+            self.sigma_[i, :] = new_sigma
+            self.class_count_[i] += N_i
+
+        self.sigma_[:, :] += epsilon
+        self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
         return self
 
     def _joint_log_likelihood(self, X):
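For context, a minimal usage sketch of the new API (not part of the commit; the digits dataset and the ten-chunk split are arbitrary choices for illustration). The full set of labels must be supplied on the first partial_fit call, after which further chunks can be folded in incrementally.

import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB

digits = load_digits()
X, y = digits.data, digits.target
classes = np.unique(y)              # every label that can ever appear

clf = GaussianNB()
for X_chunk, y_chunk in zip(np.array_split(X, 10), np.array_split(y, 10)):
    # classes is required on the first call and optional afterwards
    clf.partial_fit(X_chunk, y_chunk, classes=classes)

print(clf.class_count_)             # samples seen so far, per class
print(clf.score(X, y))              # training-set accuracy of the online fit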

sklearn/tests/test_naive_bayes.py

Lines changed: 14 additions & 0 deletions
@@ -136,6 +136,20 @@ def test_discretenb_partial_fit():
         yield check_partial_fit, cls
 
 
+def test_gnb_partial_fit():
+    clf = GaussianNB().fit(X, y)
+    clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
+    assert_array_almost_equal(clf.theta_, clf_pf.theta_)
+    assert_array_almost_equal(clf.sigma_, clf_pf.sigma_)
+    assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_)
+
+    clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y))
+    clf_pf2.partial_fit(X[1::2], y[1::2])
+    assert_array_almost_equal(clf.theta_, clf_pf2.theta_)
+    assert_array_almost_equal(clf.sigma_, clf_pf2.sigma_)
+    assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_)
+
+
 def test_discretenb_pickle():
     """Test picklability of discrete naive Bayes classifiers"""
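Since the new test only compares the fitted parameters, a natural follow-up (hypothetical, not part of the commit) would be to check that chunked and batch training also agree on predictions; a self-contained sketch on synthetic data:

import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal
from sklearn.naive_bayes import GaussianNB

def test_gnb_partial_fit_predictions():
    # hypothetical extra check: chunked updates should reproduce the
    # batch-fit model's predicted labels and probabilities
    rng = np.random.RandomState(1)
    X = rng.randn(60, 4)
    y = np.repeat([0, 1, 2], 20)
    X[y == 1] += 3.0   # shift classes apart so predictions are stable
    X[y == 2] -= 3.0

    clf = GaussianNB().fit(X, y)
    clf_pf = GaussianNB()
    for chunk in (slice(0, 20), slice(20, 40), slice(40, 60)):
        clf_pf.partial_fit(X[chunk], y[chunk], classes=np.unique(y))

    assert_array_equal(clf.predict(X), clf_pf.predict(X))
    assert_array_almost_equal(clf.predict_proba(X), clf_pf.predict_proba(X))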
