Issue w/ tf-idf computation · scikit-learn/scikit-learn@8957461 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8957461

Browse files
committed
Issue w/ tf-idf computation
1 parent 381315d commit 8957461

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

doc/modules/feature_extraction.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -409,8 +409,8 @@ class::
409409
>>> from sklearn.feature_extraction.text import TfidfTransformer
410410
>>> transformer = TfidfTransformer()
411411
>>> transformer # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
412-
TfidfTransformer(norm=...'l2', smooth_idf=True, sublinear_tf=False,
413-
use_idf=True)
412+
TfidfTransformer(additional_idf=1, norm=...'l2', smooth_idf=True,
413+
sublinear_tf=False, use_idf=True)
414414

415415
Again please see the :ref:`reference documentation
416416
<text_feature_extraction_ref>` for the details on all the parameters.

sklearn/feature_extraction/text.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,10 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
952952
sublinear_tf : boolean, default=False
953953
Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
954954
955+
additional_idf : int, default=1
956+
Constant added to the log idf term. Set to 0 for the canonical formula
957+
tf-idf = tf * idf; the default of 1 gives tf-idf = tf * (idf + 1).
958+
955959
References
956960
----------
957961
@@ -964,11 +968,12 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
964968
"""
965969

966970
def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
967-
sublinear_tf=False):
971+
sublinear_tf=False, additional_idf=1):
968972
self.norm = norm
969973
self.use_idf = use_idf
970974
self.smooth_idf = smooth_idf
971975
self.sublinear_tf = sublinear_tf
976+
self.additional_idf = additional_idf
972977

973978
def fit(self, X, y=None):
974979
"""Learn the idf vector (global term weights)
@@ -990,7 +995,7 @@ def fit(self, X, y=None):
990995

991996
# log+1 instead of log makes sure terms with zero idf don't get
992997
# suppressed entirely.
993-
idf = np.log(float(n_samples) / df) + 1.0
998+
idf = np.log(float(n_samples) / df) + self.additional_idf
994999
self._idf_diag = sp.spdiags(idf,
9951000
diags=0, m=n_features, n=n_features)
9961001

@@ -1177,6 +1182,10 @@ class TfidfVectorizer(CountVectorizer):
11771182
sublinear_tf : boolean, default=False
11781183
Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
11791184
1185+
additional_idf : int, default=1
1186+
Constant added to the log idf term. Set to 0 for the canonical formula
1187+
tf-idf = tf * idf; the default of 1 gives tf-idf = tf * (idf + 1).
1188+
11801189
Attributes
11811190
----------
11821191
idf_ : array, shape = [n_features], or None
@@ -1216,7 +1225,7 @@ def __init__(self, input='content', encoding='utf-8',
12161225
ngram_range=(1, 1), max_df=1.0, min_df=1,
12171226
max_features=None, vocabulary=None, binary=False,
12181227
dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
1219-
sublinear_tf=False):
1228+
sublinear_tf=False, additional_idf=1):
12201229

12211230
super(TfidfVectorizer, self).__init__(
12221231
input=input, encoding=encoding, decode_error=decode_error,
@@ -1229,7 +1238,8 @@ def __init__(self, input='content', encoding='utf-8',
12291238

12301239
self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
12311240
smooth_idf=smooth_idf,
1232-
sublinear_tf=sublinear_tf)
1241+
sublinear_tf=sublinear_tf,
1242+
additional_idf=additional_idf)
12331243

12341244
# Broadcast the TF-IDF parameters to the underlying transformer instance
12351245
# for easy grid search and repr
@@ -1266,6 +1276,14 @@ def sublinear_tf(self):
12661276
def sublinear_tf(self, value):
12671277
self._tfidf.sublinear_tf = value
12681278

1279+
@property
1280+
def additional_idf(self):
1281+
return self._tfidf.additional_idf
1282+
1283+
@additional_idf.setter
1284+
def additional_idf(self, value):
1285+
self._tfidf.additional_idf = value
1286+
12691287
@property
12701288
def idf_(self):
12711289
return self._tfidf.idf_

0 commit comments

Comments
 (0)
0