From 1878f40b8318ab9f17fe44f33cae7c21127668a0 Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 16:01:39 +0200 Subject: [PATCH 1/5] [WIP] TfIdfTransformer persists df and n_samples This commit simplifies building the partial_fit interface, that would update the document frequency vector and the number of observed documents. --- sklearn/feature_extraction/text.py | 35 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2484be7166cfa..907601bc19715 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1016,7 +1016,8 @@ def __init__(self, norm='l2', use_idf=True, smooth_idf=True, self.sublinear_tf = sublinear_tf def fit(self, X, y=None): - """Learn the idf vector (global term weights) + """Learn the df vector (global term weights). + It is used to calculate the idf scores for the terms. Parameters ---------- @@ -1026,18 +1027,12 @@ def fit(self, X, y=None): if not sp.issparse(X): X = sp.csc_matrix(X) if self.use_idf: - n_samples, n_features = X.shape - df = _document_frequency(X) + self._n_samples, n_features = X.shape + self._df = _document_frequency(X) # perform idf smoothing if required - df += int(self.smooth_idf) - n_samples += int(self.smooth_idf) - - # log+1 instead of log makes sure terms with zero idf don't get - # suppressed entirely. 
- idf = np.log(float(n_samples) / df) + 1.0 - self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, - n=n_features, format='csr') + self._df += int(self.smooth_idf) + self._n_samples += int(self.smooth_idf) return self @@ -1071,15 +1066,22 @@ def transform(self, X, copy=True): X.data += 1 if self.use_idf: - check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') + check_is_fitted(self, '_df', 'df vector is not fitted') - expected_n_features = self._idf_diag.shape[0] + expected_n_features = self._df.shape[0] if n_features != expected_n_features: raise ValueError("Input has n_features=%d while the model" " has been trained with n_features=%d" % ( n_features, expected_n_features)) + + # log+1 instead of log makes sure terms with zero idf don't get + # suppressed entirely. + idf = np.log(float(self._n_samples) / self._df) + 1.0 + idf_diag = sp.spdiags(idf, diags=0, m=n_features, + n=n_features, format='csr') + # *= doesn't work - X = X * self._idf_diag + X = X * idf_diag if self.norm: X = normalize(X, norm=self.norm, copy=False) @@ -1088,8 +1090,9 @@ def transform(self, X, copy=True): @property def idf_(self): - if hasattr(self, "_idf_diag"): - return np.ravel(self._idf_diag.sum(axis=0)) + if hasattr(self, "_df"): + idf = np.log(float(self._n_samples) / self._df) + 1.0 + return idf else: return None From a286fdd57e7fdc1f78c6090a9773fb1b0394b57c Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 16:21:10 +0200 Subject: [PATCH 2/5] [WIP] partial_fit in TfIdfTransformer partial_fit updates the document frequencies stored by TfIdfTransformer. 
Fixes #7549 --- sklearn/feature_extraction/text.py | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 907601bc19715..17fc2b9dfdde0 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1036,6 +1036,38 @@ def fit(self, X, y=None): return self + def partial_fit(self, X, y=None): + """Update the df vector (global term weights), + which is used to calculate the idf scores for the terms. + This method should only be called after `fit` since it + is supposed to not change the number of features. + + Parameters + ---------- + X : sparse matrix, [n_samples, n_features] + a matrix of term/token counts + """ + + if not hasattr(self, '_df'): + raise ValueError("fit should be called before partial_fit") + + if not sp.issparse(X): + X = sp.csc_matrix(X) + if self.use_idf: + n_samples, n_features = X.shape + + expected_n_features = self._df.shape[0] + if n_features != expected_n_features: + raise ValueError("The update input has n_features=%d while" + " the model has been trained with n_features=" + "%d" % (n_features, expected_n_features)) + + df = _document_frequency(X) + self._df += df + self._n_samples += n_samples + + return self + def transform(self, X, copy=True): """Transform a count matrix to a tf or tf-idf representation From b9ddb7a84b8fac47c635cf0eff5f0ff8da157ebb Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 16:52:20 +0200 Subject: [PATCH 3/5] UT checks that TfIdfTransformer.partial_fit works Issue #7549 --- sklearn/feature_extraction/tests/test_text.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 88382f7d13c0b..e238d0681c64f 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -324,6 +324,23 @@ def 
test_tf_idf_smoothing(): assert_true((tfidf >= 0).all()) +def test_tfidf_partial_fit(): + X = [[1, 1, 1], + [1, 1, 0], + [1, 0, 0]] + + tr_full = TfidfTransformer(smooth_idf=True, norm='l2') + tfidf_full = tr_full.fit_transform(X).toarray() + + tr_partial = TfidfTransformer(smooth_idf=True, norm='l2') + tr_partial.fit([[1, 1, 1]]) + tr_partial.partial_fit([[1, 1, 0]]) + tr_partial.partial_fit([[1, 0, 0]]) + tfidf_partial = tr_partial.transform(X).toarray() + + assert_array_almost_equal(tfidf_full, tfidf_partial) + + def test_tfidf_no_smoothing(): X = [[1, 1, 1], [1, 1, 0], From c5e3dbefcc0746e48946097832d27c2cfe7fdad0 Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Tue, 6 Jun 2017 17:16:36 +0200 Subject: [PATCH 4/5] idf_ is used to calculate idf in TfidfTransformer A small refactoring so that all calculations are in the same place --- sklearn/feature_extraction/text.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 17fc2b9dfdde0..f78742f358cd2 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1106,10 +1106,7 @@ def transform(self, X, copy=True): " has been trained with n_features=%d" % ( n_features, expected_n_features)) - # log+1 instead of log makes sure terms with zero idf don't get - # suppressed entirely. - idf = np.log(float(self._n_samples) / self._df) + 1.0 - idf_diag = sp.spdiags(idf, diags=0, m=n_features, + idf_diag = sp.spdiags(self.idf_, diags=0, m=n_features, n=n_features, format='csr') # *= doesn't work @@ -1123,6 +1120,8 @@ def transform(self, X, copy=True): @property def idf_(self): if hasattr(self, "_df"): + # log+1 instead of log makes sure terms with zero idf don't get + # suppressed entirely. 
idf = np.log(float(self._n_samples) / self._df) + 1.0 return idf else: From 257988de802c22f137979a4dfb23f8d0e1cf267e Mon Sep 17 00:00:00 2001 From: Eugene Kharitonov Date: Wed, 7 Jun 2017 13:22:32 +0200 Subject: [PATCH 5/5] Replacing float() call by importing __future__.division --- sklearn/feature_extraction/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index d4fcf109e9917..868ba65952a2b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -11,7 +11,7 @@ The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to build feature vectors from text documents. """ -from __future__ import unicode_literals +from __future__ import unicode_literals, division import array from collections import Mapping, defaultdict @@ -1118,7 +1118,7 @@ def transform(self, X, copy=True): @property def idf_(self): - return np.log(float(self._n_samples) / self._df) + 1.0 + return np.log(self._n_samples / self._df) + 1.0 class TfidfVectorizer(CountVectorizer):