[MRG] partial_fit implementation in TfidfTransformer by n0mad · Pull Request #9014 · scikit-learn/scikit-learn
Open · wants to merge 6 commits into main
17 changes: 17 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
@@ -324,6 +324,23 @@ def test_tf_idf_smoothing():
assert_true((tfidf >= 0).all())


def test_tfidf_partial_fit():
X = [[1, 1, 1],
[1, 1, 0],
[1, 0, 0]]

tr_full = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf_full = tr_full.fit_transform(X).toarray()

tr_partial = TfidfTransformer(smooth_idf=True, norm='l2')
tr_partial.fit([[1, 1, 1]])
Member:
No, we'd usually use partial_fit all the way through, not fit the first time.

Author (n0mad):
I was wondering what the semantics would be if partial_fit cannot change the number of features (see the issue discussion).
OK, will redo.

Member (@rth, Sep 1, 2017):
@n0mad When calling partial_fit all the way through, the use case would be to provide a fixed vocabulary at initialization (otherwise it wouldn't make sense, indeed).

Generally, to do out-of-core text vectorization one would do a first pass over the dataset to estimate the vocabulary (currently there is no way of doing that in scikit-learn without also computing the full Bag-of-Words matrix), then a second pass to actually compute the BoW matrix (though that's probably not compatible with the sklearn API). Gensim does something like that, I think, but I can't find the exact reference anymore...
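
A minimal sketch of the batch-wise pattern this PR would enable, assuming its semantics of fit on the first batch and partial_fit on later batches; the toy vocabulary and text_batches data below are hypothetical placeholders, and CountVectorizer with a fixed vocabulary is just one way to keep n_features constant across batches:

# Sketch only: assumes the partial_fit semantics of this PR (fit on the
# first batch, partial_fit on later batches); vocabulary and text_batches
# are hypothetical toy data.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vocabulary = ["apple", "banana", "cherry"]            # fixed, known in advance
text_batches = [["apple banana cherry"],              # stand-in for an out-of-core source
                ["apple banana"],
                ["apple"]]

counter = CountVectorizer(vocabulary=vocabulary)      # fixed vocab => constant n_features
tfidf = TfidfTransformer(smooth_idf=True, norm='l2')

for i, batch in enumerate(text_batches):
    X_counts = counter.transform(batch)               # BoW counts for this batch only
    if i == 0:
        tfidf.fit(X_counts)                           # initialises df / n_samples
    else:
        tfidf.partial_fit(X_counts)                   # accumulates df / n_samples

# idf statistics now reflect all batches seen so far
X_new = tfidf.transform(counter.transform(["apple banana"]))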

tr_partial.partial_fit([[1, 1, 0]])
tr_partial.partial_fit([[1, 0, 0]])
tfidf_partial = tr_partial.transform(X).toarray()

assert_array_almost_equal(tfidf_full, tfidf_partial)


def test_tfidf_no_smoothing():
X = [[1, 1, 1],
[1, 1, 0],
63 changes: 46 additions & 17 deletions sklearn/feature_extraction/text.py
@@ -11,7 +11,7 @@
The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to
build feature vectors from text documents.
"""
from __future__ import unicode_literals
from __future__ import unicode_literals, division

import array
from collections import Mapping, defaultdict
@@ -1015,7 +1015,8 @@ def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
self.sublinear_tf = sublinear_tf

def fit(self, X, y=None):
"""Learn the idf vector (global term weights)
"""Learn the df vector (global term weights).
It is used to calculate the idf scores for the terms.

Parameters
----------
Expand All @@ -1025,18 +1026,44 @@ def fit(self, X, y=None):
if not sp.issparse(X):
X = sp.csc_matrix(X)
if self.use_idf:
n_samples, n_features = X.shape
df = _document_frequency(X)
self._n_samples, n_features = X.shape
self._df = _document_frequency(X)

# perform idf smoothing if required
df += int(self.smooth_idf)
n_samples += int(self.smooth_idf)
self._df += int(self.smooth_idf)
self._n_samples += int(self.smooth_idf)

return self

def partial_fit(self, X, y=None):
"""Update the df vector (global term weights),
Member:
PEP257: one-line summary here

which is used to calculate the idf scores for the terms.
This method should only be called after `fit` since it
is supposed to not change the number of features.

Parameters
----------
X : sparse matrix, [n_samples, n_features]
a matrix of term/token counts
"""

# log+1 instead of log makes sure terms with zero idf don't get
# suppressed entirely.
idf = np.log(float(n_samples) / df) + 1.0
self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
n=n_features, format='csr')
if not hasattr(self, '_df'):
raise ValueError("fit should be called before partial_fit")

if not sp.issparse(X):
X = sp.csc_matrix(X)
if self.use_idf:
n_samples, n_features = X.shape

expected_n_features = self._df.shape[0]
if n_features != expected_n_features:
raise ValueError("The update input has n_features=%d while"
" the model has been trained with n_features="
"%d" % (n_features, expected_n_features))

df = _document_frequency(X)
self._df += df
self._n_samples += n_samples

return self

@@ -1070,15 +1097,19 @@ def transform(self, X, copy=True):
X.data += 1

if self.use_idf:
check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
check_is_fitted(self, '_df', 'df vector is not fitted')

expected_n_features = self._idf_diag.shape[0]
expected_n_features = self._df.shape[0]
if n_features != expected_n_features:
raise ValueError("Input has n_features=%d while the model"
" has been trained with n_features=%d" % (
n_features, expected_n_features))

idf_diag = sp.spdiags(self.idf_, diags=0, m=n_features,
Member:
Given the power-law distribution of vocabularies, this is potentially doing a lot more work at each call to transform relative to the vocabulary used in each transformed document. We need a benchmark of repeated calls to transform. I now realise that converting the diagonal to CSR in transform already requires O(vocab_size) time, so we're unlikely to see a big change (except that the previous O(vocab_size) operation should have been a little faster). :( Perhaps we should be storing idf_diag from call to call.

Author (n0mad):
The scenario with somewhat degraded performance would be a sequence of transform() calls after a fit()/partial_fit(), since the idf vector would be re-calculated each time.

If we think it's a problem, I see the following possibilities to avoid it:

  1. cache idf_diag (a), maybe under a flag, or simply update it after each partial_fit (b) (i.e. store idf_diag, n_samples, and df);
  2. store only idf_diag and n_samples, then use them to re-calculate df, update it, and store the updated idf_diag & n_samples again. Super ugly, and has performance costs for partial_fit().

I think that in NLP applications the vocabulary would be upper-bounded by millions, so always storing both idf and df at the same time shouldn't be much of a problem.

Which option do you like most?
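
Purely as an illustration (not code from the PR), option 1(b) could look roughly like the following method inside TfidfTransformer, reusing the _df and _n_samples attributes this PR introduces; the helper name _update_idf_diag is hypothetical:

# Hypothetical sketch of option 1(b): refresh a cached CSR diagonal after
# every fit/partial_fit so transform() can reuse it instead of rebuilding
# spdiags on each call.  Not part of the PR.
import numpy as np
import scipy.sparse as sp

def _update_idf_diag(self):
    n_features = self._df.shape[0]
    idf = np.log(float(self._n_samples) / self._df) + 1.0
    self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                n=n_features, format='csr')

# fit() and partial_fit() would call self._update_idf_diag() just before
# returning, and transform() would multiply by the cached self._idf_diag.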

Member:
So, if I understand the issue correctly:

  • idf_ needs to be exposed and valid after each partial_fit, since it's a public attribute
  • to calculate idf_ at each iteration you need the df from the previous iteration and the n_samples
  • in transform, to multiply the rows of X by idf_ we need to create a diagonal sparse matrix. We can do that either in partial_fit (and cache it) or directly in transform. The overhead would be the same, and in my opinion it's largely a matter of whether we would (a) do numerous partial_fit calls or (b) numerous transform calls. I think (a) is always true by design (and thus more likely), and I'm not sure the possible use case of calling transform multiple times is worth adding an additional flag for caching idf_diag.

In any case, it might be worth actually benchmarking the overhead of creating sp.spdiags(self.idf_, ...) to see if it's anything we need to worry about...
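
For illustration only (this is not the author's benchmark mentioned in the next comment), a micro-benchmark of the spdiags construction for a hypothetical vocabulary of one million terms might look like:

# Illustrative micro-benchmark, assuming a vocabulary of 1e6 terms.
# Times only the construction of the sparse diagonal idf matrix that an
# uncached transform() would rebuild on every call.
import timeit
import numpy as np
import scipy.sparse as sp

n_features = 1000000
idf = np.random.uniform(1.0, 10.0, size=n_features)

n_calls = 20
t = timeit.timeit(
    lambda: sp.spdiags(idf, diags=0, m=n_features, n=n_features,
                       format='csr'),
    number=n_calls)
print("spdiags -> csr for %d features: %.1f ms per call"
      % (n_features, 1000.0 * t / n_calls))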

Member:
@n0mad Aww, so you actually did benchmarks above. Could you also test e.g. running a transform on a single document?

n=n_features, format='csr')

# *= doesn't work
X = X * self._idf_diag
X = X * idf_diag

if self.norm:
X = normalize(X, norm=self.norm, copy=False)
Expand All @@ -1087,9 +1118,7 @@ def transform(self, X, copy=True):

@property
def idf_(self):
# if _idf_diag is not set, this will raise an attribute error,
# which means hasattr(self, "idf_") is False
return np.ravel(self._idf_diag.sum(axis=0))
return np.log(self._n_samples / self._df) + 1.0


class TfidfVectorizer(CountVectorizer):
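
For reference, the reason the accumulated _df/_n_samples statistics reproduce the batch result (the property test_tfidf_partial_fit checks) can be verified with a few lines of NumPy; the arrays below mirror the toy data from the test and are not code from the PR:

# Illustrative check: with smooth_idf=True, fit adds the smoothing term once
# (df += 1, n_samples += 1) and partial_fit then adds raw counts, so the
# accumulated statistics equal those of a single fit on the stacked data,
# and idf_ = log(n_samples / df) + 1 is identical in both cases.
import numpy as np

X1 = np.array([[1, 1, 1]])                     # first batch (fit)
X2 = np.array([[1, 1, 0], [1, 0, 0]])          # second batch (partial_fit)

df1, n1 = (X1 > 0).sum(axis=0), X1.shape[0]
df2, n2 = (X2 > 0).sum(axis=0), X2.shape[0]

# incremental: smoothing applied once in fit, raw counts added afterwards
idf_incremental = np.log(float(n1 + 1 + n2) / (df1 + 1 + df2)) + 1.0

# batch: a single fit on the stacked matrix
X = np.vstack([X1, X2])
idf_batch = np.log(float(X.shape[0] + 1) / ((X > 0).sum(axis=0) + 1)) + 1.0

assert np.allclose(idf_incremental, idf_batch)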