@@ -169,7 +169,7 @@ def _word_ngrams(self, tokens, stop_words=None):
             space_join = " ".join

             for n in range(min_n,
-                           min(max_n + 1, n_original_tokens + 1)):
+                           min(max_n + 1, n_original_tokens + 1)):
                 for i in range(n_original_tokens - n + 1):
                     tokens_append(space_join(original_tokens[i: i + n]))

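To see what the loop touched by this hunk computes, here is a minimal standalone sketch (the variable names mirror the diff, but this is an illustration, not the library's exact code path): it slides a window of each size n over the token list and space-joins the words.

>>> tokens = ['the', 'quick', 'brown', 'fox']
>>> min_n, max_n = 1, 2
>>> n_original_tokens = len(tokens)
>>> ngrams = []
>>> for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
...     for i in range(n_original_tokens - n + 1):
...         ngrams.append(" ".join(tokens[i: i + n]))
>>> ngrams
['the', 'quick', 'brown', 'fox', 'the quick', 'quick brown', 'brown fox']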
@@ -1177,18 +1177,23 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):

     Parameters
     ----------
-    norm : 'l1', 'l2' or None, optional
-        Norm used to normalize term vectors. None for no normalization.
-
-    use_idf : boolean, default=True
+    norm : 'l1', 'l2' or None, optional (default='l2')
+        Each output row will have unit norm, either:
+        * 'l2': Sum of squares of vector elements is 1. The cosine
+        similarity between two vectors is their dot product when l2 norm has
+        been applied.
+        * 'l1': Sum of absolute values of vector elements is 1.
+        See :func:`preprocessing.normalize`
+
+    use_idf : boolean (default=True)
         Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean, default=True
+    smooth_idf : boolean (default=True)
         Smooth idf weights by adding one to document frequencies, as if an
         extra document was seen containing every term in the collection
         exactly once. Prevents zero divisions.

-    sublinear_tf : boolean, default=False
+    sublinear_tf : boolean (default=False)
         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

     Attributes
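The new ``norm`` wording is easy to sanity-check: with norm='l2' every non-empty output row has unit Euclidean norm. A doctest-style sketch using only the public TfidfTransformer API (the counts matrix is made up for illustration):

>>> import numpy as np
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> counts = np.array([[3, 0, 1], [2, 0, 0], [3, 0, 0]])
>>> tfidf = TfidfTransformer(norm='l2').fit_transform(counts)
>>> np.allclose(tfidf.multiply(tfidf).sum(axis=1), 1.0)
True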
@@ -1305,7 +1310,8 @@ def idf_(self, value):
 class TfidfVectorizer(CountVectorizer):
     """Convert a collection of raw documents to a matrix of TF-IDF features.

-    Equivalent to CountVectorizer followed by TfidfTransformer.
+    Equivalent to :class:`CountVectorizer` followed by
+    :class:`TfidfTransformer`.

     Read more in the :ref:`User Guide <text_feature_extraction>`.

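The equivalence this docstring asserts is directly testable; a short sketch under default parameters (the corpus is hypothetical):

>>> import numpy as np
>>> from sklearn.feature_extraction.text import (CountVectorizer,
...                                              TfidfTransformer,
...                                              TfidfVectorizer)
>>> docs = ['the cat sat', 'the cat sat on the mat']
>>> two_step = TfidfTransformer().fit_transform(
...     CountVectorizer().fit_transform(docs))
>>> one_step = TfidfVectorizer().fit_transform(docs)
>>> np.allclose(two_step.toarray(), one_step.toarray())
True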
@@ -1326,13 +1332,13 @@ class TfidfVectorizer(CountVectorizer):
         If bytes or files are given to analyze, this encoding is used to
         decode.

-    decode_error : {'strict', 'ignore', 'replace'}
+    decode_error : {'strict', 'ignore', 'replace'} (default='strict')
         Instruction on what to do if a byte sequence is given to analyze that
         contains characters not of the given `encoding`. By default, it is
         'strict', meaning that a UnicodeDecodeError will be raised. Other
         values are 'ignore' and 'replace'.

-    strip_accents : {'ascii', 'unicode', None}
+    strip_accents : {'ascii', 'unicode', None} (default=None)
         Remove accents and perform other character normalization
         during the preprocessing step.
         'ascii' is a fast method that only works on characters that have
@@ -1343,14 +1349,14 @@ class TfidfVectorizer(CountVectorizer):
         Both 'ascii' and 'unicode' use NFKD normalization from
         :func:`unicodedata.normalize`.

-    lowercase : boolean, default True
+    lowercase : boolean (default=True)
         Convert all characters to lowercase before tokenizing.

-    preprocessor : callable or None (default)
+    preprocessor : callable or None (default=None)
         Override the preprocessing (string transformation) stage while
         preserving the tokenizing and n-grams generation steps.

-    tokenizer : callable or None (default)
+    tokenizer : callable or None (default=None)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
         Only applies if ``analyzer == 'word'``.
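As an illustration of the ``tokenizer`` override documented in this hunk, a hedged sketch (the lambda and corpus are hypothetical, not from the PR): the callable replaces token_pattern-based splitting while preprocessing still applies.

>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> vec = TfidfVectorizer(tokenizer=lambda doc: doc.split('|'))
>>> sorted(vec.fit(['red|green', 'green|blue']).vocabulary_)
['blue', 'green', 'red']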
@@ -1363,7 +1369,7 @@ class TfidfVectorizer(CountVectorizer):
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.

-    stop_words : string {'english'}, list, or None (default)
+    stop_words : string {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop
         list is returned. 'english' is currently the only supported string
         value.
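A small illustration of the ``stop_words='english'`` behavior described here (the corpus is made up; 'the' and 'and' are in the built-in English stop list):

>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> vec = TfidfVectorizer(stop_words='english')
>>> sorted(vec.fit(['the cat and the hat']).vocabulary_)
['cat', 'hat']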
@@ -1384,58 +1390,63 @@ class TfidfVectorizer(CountVectorizer):
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).

-    ngram_range : tuple (min_n, max_n)
+    ngram_range : tuple (min_n, max_n) (default=(1, 1))
         The lower and upper boundary of the range of n-values for different
         n-grams to be extracted. All values of n such that min_n <= n <= max_n
         will be used.

-    max_df : float in range [0.0, 1.0] or int, default=1.0
+    max_df : float in range [0.0, 1.0] or int (default=1.0)
         When building the vocabulary ignore terms that have a document
         frequency strictly higher than the given threshold (corpus-specific
         stop words).
         If float, the parameter represents a proportion of documents, integer
         absolute counts.
         This parameter is ignored if vocabulary is not None.

-    min_df : float in range [0.0, 1.0] or int, default=1
+    min_df : float in range [0.0, 1.0] or int (default=1)
         When building the vocabulary ignore terms that have a document
         frequency strictly lower than the given threshold. This value is also
         called cut-off in the literature.
         If float, the parameter represents a proportion of documents, integer
         absolute counts.
         This parameter is ignored if vocabulary is not None.

-    max_features : int or None, default=None
+    max_features : int or None (default=None)
         If not None, build a vocabulary that only consider the top
         max_features ordered by term frequency across the corpus.

         This parameter is ignored if vocabulary is not None.

-    vocabulary : Mapping or iterable, optional
+    vocabulary : Mapping or iterable, optional (default=None)
         Either a Mapping (e.g., a dict) where keys are terms and values are
         indices in the feature matrix, or an iterable over terms. If not
         given, a vocabulary is determined from the input documents.

-    binary : boolean, default=False
+    binary : boolean (default=False)
         If True, all non-zero term counts are set to 1. This does not mean
         outputs will have only 0/1 values, only that the tf term in tf-idf
         is binary. (Set idf and normalization to False to get 0/1 outputs.)

-    dtype : type, optional
+    dtype : type, optional (default=float64)
         Type of the matrix returned by fit_transform() or transform().

-    norm : 'l1', 'l2' or None, optional
-        Norm used to normalize term vectors. None for no normalization.
+    norm : 'l1', 'l2' or None, optional (default='l2')
+        Each output row will have unit norm, either:
+        * 'l2': Sum of squares of vector elements is 1. The cosine
+        similarity between two vectors is their dot product when l2 norm has
+        been applied.
+        * 'l1': Sum of absolute values of vector elements is 1.
+        See :func:`preprocessing.normalize`

-    use_idf : boolean, default=True
+    use_idf : boolean (default=True)
         Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean, default=True
+    smooth_idf : boolean (default=True)
         Smooth idf weights by adding one to document frequencies, as if an
         extra document was seen containing every term in the collection
         exactly once. Prevents zero divisions.

-    sublinear_tf : boolean, default=False
+    sublinear_tf : boolean (default=False)
         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

     Attributes
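The ``smooth_idf`` description above corresponds to the weighting idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) the document frequency of term t. A worked sketch (hypothetical corpus; norm disabled so the raw tf * idf value is visible):

>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> docs = ['apple banana', 'apple cherry', 'apple date']
>>> vec = TfidfVectorizer(smooth_idf=True, norm=None)
>>> X = vec.fit_transform(docs)
>>> # 'apple' occurs in all 3 docs: idf = ln((1+3)/(1+3)) + 1 = 1.0; tf = 1
>>> float(X[0, vec.vocabulary_['apple']])
1.0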
@@ -1474,13 +1485,10 @@ class TfidfVectorizer(CountVectorizer):

     See also
     --------
-    CountVectorizer
-        Tokenize the documents and count the occurrences of token and return
-        them as a sparse matrix
+    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.

-    TfidfTransformer
-        Apply Term Frequency Inverse Document Frequency normalization to a
-        sparse matrix of occurrence counts.
+    TfidfTransformer : Performs the TF-IDF transformation from a provided
+        matrix of counts.

     Notes
     -----