@@ -952,6 +952,10 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
952
952
sublinear_tf : boolean, default=False
953
953
Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
954
954
955
+ additional_idf : int, default=1
956
+ Constant added to the idf weight. Set it to 0 for the canonical formula
957
+ tf-idf = tf * idf; with the default of 1, tf-idf is tf * (idf + 1).
958
+
955
959
References
956
960
----------
957
961
@@ -964,11 +968,12 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
964
968
"""
965
969
966
970
def __init__ (self , norm = 'l2' , use_idf = True , smooth_idf = True ,
967
- sublinear_tf = False ):
971
+ sublinear_tf = False , additional_idf = 1 ):
968
972
self .norm = norm
969
973
self .use_idf = use_idf
970
974
self .smooth_idf = smooth_idf
971
975
self .sublinear_tf = sublinear_tf
976
+ self .additional_idf = additional_idf
972
977
973
978
def fit (self , X , y = None ):
974
979
"""Learn the idf vector (global term weights)
@@ -990,7 +995,7 @@ def fit(self, X, y=None):
990
995
991
996
# adding additional_idf (1 by default) to the log makes sure terms with
992
997
# zero idf don't get suppressed entirely.
993
- idf = np .log (float (n_samples ) / df ) + 1.0
998
+ idf = np .log (float (n_samples ) / df ) + self . additional_idf
994
999
self ._idf_diag = sp .spdiags (idf ,
995
1000
diags = 0 , m = n_features , n = n_features )
996
1001
@@ -1177,6 +1182,10 @@ class TfidfVectorizer(CountVectorizer):
1177
1182
sublinear_tf : boolean, default=False
1178
1183
Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
1179
1184
1185
+ additional_idf : int, default=1
1186
+ Constant added to the idf weight. Set it to 0 for the canonical formula
1187
+ tf-idf = tf * idf; with the default of 1, tf-idf is tf * (idf + 1).
1188
+
1180
1189
Attributes
1181
1190
----------
1182
1191
idf_ : array, shape = [n_features], or None
@@ -1216,7 +1225,7 @@ def __init__(self, input='content', encoding='utf-8',
1216
1225
ngram_range = (1 , 1 ), max_df = 1.0 , min_df = 1 ,
1217
1226
max_features = None , vocabulary = None , binary = False ,
1218
1227
dtype = np .int64 , norm = 'l2' , use_idf = True , smooth_idf = True ,
1219
- sublinear_tf = False ):
1228
+ sublinear_tf = False , additional_idf = 1 ):
1220
1229
1221
1230
super (TfidfVectorizer , self ).__init__ (
1222
1231
input = input , encoding = encoding , decode_error = decode_error ,
@@ -1229,7 +1238,8 @@ def __init__(self, input='content', encoding='utf-8',
1229
1238
1230
1239
self ._tfidf = TfidfTransformer (norm = norm , use_idf = use_idf ,
1231
1240
smooth_idf = smooth_idf ,
1232
- sublinear_tf = sublinear_tf )
1241
+ sublinear_tf = sublinear_tf ,
1242
+ additional_idf = additional_idf )
1233
1243
1234
1244
# Broadcast the TF-IDF parameters to the underlying transformer instance
1235
1245
# for easy grid search and repr
@@ -1266,6 +1276,14 @@ def sublinear_tf(self):
1266
1276
def sublinear_tf (self , value ):
1267
1277
self ._tfidf .sublinear_tf = value
1268
1278
1279
@property
def additional_idf(self):
    """Return the ``additional_idf`` value held by the wrapped ``_tfidf``."""
    return self._tfidf.additional_idf

# The setter has to be registered on the ``additional_idf`` property itself.
# Deriving it from ``sublinear_tf.setter`` would bind ``additional_idf`` to a
# property that still uses the ``sublinear_tf`` getter.
@additional_idf.setter
def additional_idf(self, value):
    """Forward ``value`` to the wrapped ``_tfidf`` transformer."""
    self._tfidf.additional_idf = value
1286
+
1269
1287
@property
1270
1288
def idf_ (self ):
1271
1289
return self ._tfidf .idf_
0 commit comments