DOC Ensures that TfidfTransformer passes numpydoc validation (#20379) · scikit-learn/scikit-learn@9682692 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9682692

Browse files
DOC Ensures that TfidfTransformer passes numpydoc validation (#20379)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 2645eb3 commit 9682692

File tree

2 files changed

+41
-15
lines changed

2 files changed

+41
-15
lines changed

maint_tools/test_docstrings.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,6 @@
199199
"StackingClassifier",
200200
"StackingRegressor",
201201
"TSNE",
202-
"TfidfTransformer",
203202
"TfidfVectorizer",
204203
"TheilSenRegressor",
205204
"TransformedTargetRegressor",

sklearn/feature_extraction/text.py

Lines changed: 41 additions & 14 deletions
-------
Original file line numberDiff line numberDiff line change
@@ -1395,7 +1395,7 @@ def _make_int_array():
13951395

13961396

13971397
class TfidfTransformer(TransformerMixin, BaseEstimator):
1398-
"""Transform a count matrix to a normalized tf or tf-idf representation
1398+
"""Transform a count matrix to a normalized tf or tf-idf representation.
13991399
14001400
Tf means term-frequency while tf-idf means term-frequency times inverse
14011401
document-frequency. This is a common term weighting scheme in information
@@ -1445,7 +1445,7 @@ class TfidfTransformer(TransformerMixin, BaseEstimator):
14451445
similarity between two vectors is their dot product when l2 norm has
14461446
been applied.
14471447
* 'l1': Sum of absolute values of vector elements is 1.
1448-
See :func:`preprocessing.normalize`
1448+
See :func:`preprocessing.normalize`.
14491449
14501450
use_idf : bool, default=True
14511451
Enable inverse-document-frequency reweighting.
@@ -1471,6 +1471,26 @@ class TfidfTransformer(TransformerMixin, BaseEstimator):
14711471
14721472
.. versionadded:: 1.0
14731473
1474+
See Also
1475+
--------
1476+
CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
1477+
1478+
TfidfVectorizer : Convert a collection of raw documents to a matrix of
1479+
TF-IDF features.
1480+
1481+
HashingVectorizer : Convert a collection of text documents to a matrix
1482+
of token occurrences.
1483+
1484+
References
1485+
----------
1486+
1487+
.. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
1488+
Information Retrieval. Addison Wesley, pp. 68-74.
1489+
1490+
.. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008).
1491+
Introduction to Information Retrieval. Cambridge University
1492+
Press, pp. 118-120.
1493+
14741494
Examples
14751495
--------
14761496
>>> from sklearn.feature_extraction.text import TfidfTransformer
@@ -1495,16 +1515,6 @@ class TfidfTransformer(TransformerMixin, BaseEstimator):
14951515
1. , 1.91629073, 1.91629073])
14961516
>>> pipe.transform(corpus).shape
14971517
(4, 8)
1498-
1499-
References
1500-
----------
1501-
1502-
.. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
1503-
Information Retrieval. Addison Wesley, pp. 68-74.
1504-
1505-
.. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008).
1506-
Introduction to Information Retrieval. Cambridge University
1507-
Press, pp. 118-120.
15081518
"""
15091519

15101520
def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False):
@@ -1520,6 +1530,14 @@ def fit(self, X, y=None):
15201530
----------
15211531
X : sparse matrix of shape (n_samples, n_features)
15221532
A matrix of term/token counts.
1533+
1534+
y : None
1535+
This parameter is not needed to compute tf-idf.
1536+
1537+
Returns
1538+
-------
1539+
self : object
1540+
Fitted transformer.
15231541
"""
15241542
X = self._validate_data(X, accept_sparse=("csr", "csc"))
15251543
if not sp.issparse(X):
@@ -1549,12 +1567,12 @@ def fit(self, X, y=None):
15491567
return self
15501568

15511569
def transform(self, X, copy=True):
1552-
"""Transform a count matrix to a tf or tf-idf representation
1570+
"""Transform a count matrix to a tf or tf-idf representation.
15531571
15541572
Parameters
15551573
----------
15561574
X : sparse matrix of (n_samples, n_features)
1557-
a matrix of term/token counts
1575+
A matrix of term/token counts.
15581576
15591577
copy : bool, default=True
15601578
Whether to copy X and operate on the copy or perform in-place
@@ -1563,6 +1581,7 @@ def transform(self, X, copy=True):
15631581
Returns
15641582
-------
15651583
vectors : sparse matrix of shape (n_samples, n_features)
1584+
Tf-idf-weighted document-term matrix.
15661585
"""
15671586
X = self._validate_data(
15681587
X, accept_sparse="csr", dtype=FLOAT_DTYPES, copy=copy, reset=False
@@ -1590,6 +1609,14 @@ def transform(self, X, copy=True):
15901609

15911610
@property
15921611
def idf_(self):
1612+
"""Return the inverse document frequency (IDF) vector.
1613+
1614+
Returns
1615+
-------
1616+
idf_ : ndarray of shape (n_features,)
1617+
The inverse document frequency (IDF) vector; only defined
1618+
if `use_idf` is True.
1619+
"""
15931620
# if _idf_diag is not set, this will raise an attribute error,
15941621
# which means hasattr(self, "idf_") is False
15951622
return np.ravel(self._idf_diag.sum(axis=0))

0 commit comments

Comments
 (0)
0