Merge pull request #3344 from ihaque/remove_dup_fit

agramfort · agramfort · commit 1775095e1814 · 2014-07-05T09:35:26.000+02:00
Remove duplicate GaussianNB.fit() code
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
@@ -159,27 +159,7 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
-
-        X, y = check_arrays(X, y, sparse_format='dense')
-        y = column_or_1d(y, warn=True)
-
-        n_samples, n_features = X.shape
-
-        self.classes_ = unique_y = np.unique(y)
-        n_classes = unique_y.shape[0]
-
-        self.theta_ = np.zeros((n_classes, n_features))
-        self.sigma_ = np.zeros((n_classes, n_features))
-        self.class_prior_ = np.zeros(n_classes)
-        self.class_count_ = np.zeros(n_classes)
-        epsilon = 1e-9
-        for i, y_i in enumerate(unique_y):
-            Xi = X[y == y_i, :]
-            self.theta_[i, :] = np.mean(Xi, axis=0)
-            self.sigma_[i, :] = np.var(Xi, axis=0) + epsilon
-            self.class_count_[i] = Xi.shape[0]
-        self.class_prior_[:] = self.class_count_ / n_samples
-        return self
+        return self._partial_fit(X, y, np.unique(y), _refit=True)
 
     @staticmethod
     def _update_mean_variance(n_past, mu, var, X):
@@ -270,10 +250,43 @@ def partial_fit(self, X, y, classes=None):
         self : object
             Returns self.
         """
+        return self._partial_fit(X, y, classes, _refit=False)
+
+    def _partial_fit(self, X, y, classes=None, _refit=False):
+        """Actual implementation of Gaussian NB fitting.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        classes : array-like, shape (n_classes,)
+            List of all the classes that can possibly appear in the y vector.
+
+            Must be provided at the first call to partial_fit, can be omitted
+            in subsequent calls.
+
+        _refit : bool
+            If True, act as though this were the first time we called
+            _partial_fit (i.e., throw away any past fitting and start over).
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+
         X, y = check_arrays(X, y, sparse_format='dense')
         y = column_or_1d(y, warn=True)
         epsilon = 1e-9
 
+        if _refit:
+            self.classes_ = None
+
         if _check_partial_fit_first_call(self, classes):
             # This is the first call to partial_fit:
             # initialize various cumulative counters
diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
@@ -310,8 +310,8 @@ def type_of_target(y):
 def _check_partial_fit_first_call(clf, classes=None):
     """Private helper function for factorizing common classes param logic
 
-    Estimator that implement the ``partial_fit`` API need to be provided with
-    the list of possible classes at the first call to partial fit.and
+    Estimators that implement the ``partial_fit`` API need to be provided with
+    the list of possible classes at the first call to partial_fit.
 
     Subsequent calls to partial_fit should check that ``classes`` is still
     consistent with a previous value of ``clf.classes_`` when provided.