From 1878f40b8318ab9f17fe44f33cae7c21127668a0 Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 16:01:39 +0200 Subject: [PATCH 1/5] [WIP] TfIdfTransformer persists df and n_samples This commit simplifies building the partial_fit interface, that would update the document frequency vector and the number of observed documents. --- sklearn/feature_extraction/text.py | 35 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2484be7166cfa..907601bc19715 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1016,7 +1016,8 @@ def __init__(self, norm='l2', use_idf=True, smooth_idf=True, self.sublinear_tf = sublinear_tf def fit(self, X, y=None): - """Learn the idf vector (global term weights) + """Learn the df vector (global term weights). + It is used to calculate the idf scores for the terms. Parameters ---------- @@ -1026,18 +1027,12 @@ def fit(self, X, y=None): if not sp.issparse(X): X = sp.csc_matrix(X) if self.use_idf: - n_samples, n_features = X.shape - df = _document_frequency(X) + self._n_samples, n_features = X.shape + self._df = _document_frequency(X) # perform idf smoothing if required - df += int(self.smooth_idf) - n_samples += int(self.smooth_idf) - - # log+1 instead of log makes sure terms with zero idf don't get - # suppressed entirely. 
- idf = np.log(float(n_samples) / df) + 1.0 - self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, - n=n_features, format='csr') + self._df += int(self.smooth_idf) + self._n_samples += int(self.smooth_idf) return self @@ -1071,15 +1066,22 @@ def transform(self, X, copy=True): X.data += 1 if self.use_idf: - check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') + check_is_fitted(self, '_df', 'df vector is not fitted') - expected_n_features = self._idf_diag.shape[0] + expected_n_features = self._df.shape[0] if n_features != expected_n_features: raise ValueError("Input has n_features=%d while the model" " has been trained with n_features=%d" % ( n_features, expected_n_features)) + + # log+1 instead of log makes sure terms with zero idf don't get + # suppressed entirely. + idf = np.log(float(self._n_samples) / self._df) + 1.0 + idf_diag = sp.spdiags(idf, diags=0, m=n_features, + n=n_features, format='csr') + # *= doesn't work - X = X * self._idf_diag + X = X * idf_diag if self.norm: X = normalize(X, norm=self.norm, copy=False) @@ -1088,8 +1090,9 @@ def transform(self, X, copy=True): @property def idf_(self): - if hasattr(self, "_idf_diag"): - return np.ravel(self._idf_diag.sum(axis=0)) + if hasattr(self, "_df"): + idf = np.log(float(self._n_samples) / self._df) + 1.0 + return idf else: return None From a286fdd57e7fdc1f78c6090a9773fb1b0394b57c Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 16:21:10 +0200 Subject: [PATCH 2/5] [WIP] partial_fit in TfIdfTransformer partial_fit updates the document frequencies stored by TfIdfTransformer. 
Fixes #7549 --- sklearn/feature_extraction/text.py | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 907601bc19715..17fc2b9dfdde0 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1036,6 +1036,38 @@ def fit(self, X, y=None): return self + def partial_fit(self, X, y=None): + """Update the df vector (global term weights), + which is used to calculate the idf scores for the terms. + This method should only be called after `fit` since it + is supposed to not change the number of features. + + Parameters + ---------- + X : sparse matrix, [n_samples, n_features] + a matrix of term/token counts + """ + + if not hasattr(self, '_df'): + raise ValueError("fit should be called before partial_fit") + + if not sp.issparse(X): + X = sp.csc_matrix(X) + if self.use_idf: + n_samples, n_features = X.shape + + expected_n_features = self._df.shape[0] + if n_features != expected_n_features: + raise ValueError("The update input has n_features=%d while" + " the model has been trained with n_features=" + "%d" % (n_features, expected_n_features)) + + df = _document_frequency(X) + self._df += df + self._n_samples += n_samples + + return self + def transform(self, X, copy=True): """Transform a count matrix to a tf or tf-idf representation From b9ddb7a84b8fac47c635cf0eff5f0ff8da157ebb Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 16:52:20 +0200 Subject: [PATCH 3/5] UT checks that TfIdfTransformer.partial_fit works Issue #7549 --- sklearn/feature_extraction/tests/test_text.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 88382f7d13c0b..e238d0681c64f 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -324,6 +324,23 @@ def 
test_tf_idf_smoothing(): assert_true((tfidf >= 0).all()) +def test_tfidf_partial_fit(): + X = [[1, 1, 1], + [1, 1, 0], + [1, 0, 0]] + + tr_full = TfidfTransformer(smooth_idf=True, norm='l2') + tfidf_full = tr_full.fit_transform(X).toarray() + + tr_partial = TfidfTransformer(smooth_idf=True, norm='l2') + tr_partial.fit([[1, 1, 1]]) + tr_partial.partial_fit([[1, 1, 0]]) + tr_partial.partial_fit([[1, 0, 0]]) + tfidf_partial = tr_partial.transform(X).toarray() + + assert_array_almost_equal(tfidf_full, tfidf_partial) + + def test_tfidf_no_smoothing(): X = [[1, 1, 1], [1, 1, 0], From c5e3dbefcc0746e48946097832d27c2cfe7fdad0 Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 17:16:36 +0200 Subject: [PATCH 4/5] idf_ is used to calculate idf in TfidfTransformer A small refactoring so that all calculations are in the same place --- sklearn/feature_extraction/text.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 17fc2b9dfdde0..f78742f358cd2 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1106,10 +1106,7 @@ def transform(self, X, copy=True): " has been trained with n_features=%d" % ( n_features, expected_n_features)) - # log+1 instead of log makes sure terms with zero idf don't get - # suppressed entirely. - idf = np.log(float(self._n_samples) / self._df) + 1.0 - idf_diag = sp.spdiags(idf, diags=0, m=n_features, + idf_diag = sp.spdiags(self.idf_, diags=0, m=n_features, n=n_features, format='csr') # *= doesn't work @@ -1123,6 +1120,8 @@ def transform(self, X, copy=True): @property def idf_(self): if hasattr(self, "_df"): + # log+1 instead of log makes sure terms with zero idf don't get + # suppressed entirely. 
idf = np.log(float(self._n_samples) / self._df) + 1.0 return idf else: From 257988de802c22f137979a4dfb23f8d0e1cf267e Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Wed, 7 Jun 2017 13:22:32 +0200 Subject: [PATCH 5/5] Replacing float() call by importing __future__.division --- sklearn/feature_extraction/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index d4fcf109e9917..868ba65952a2b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -11,7 +11,7 @@ The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to build feature vectors from text documents. """ -from __future__ import unicode_literals +from __future__ import unicode_literals, division import array from collections import Mapping, defaultdict @@ -1118,7 +1118,7 @@ def transform(self, X, copy=True): @property def idf_(self): - return np.log(float(self._n_samples) / self._df) + 1.0 + return np.log(self._n_samples / self._df) + 1.0 class TfidfVectorizer(CountVectorizer):