Improved Document class documentation · kingking888/python-readability@b20d5c1 · GitHub
[go: up one dir, main page]

Skip to content

Commit b20d5c1

Browse files
authored
Improved Document class documentation
1 parent b6e5921 commit b20d5c1

File tree

1 file changed

+29
-5
lines changed

1 file changed

+29
-5
lines changed

readability/readability.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,24 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
8686
:param input: string of the html content.
8787
:param positive_keywords: regex or list of patterns in classes and ids
8888
:param negative_keywords: regex or list of patterns in classes and ids
89-
:param min_text_length:
90-
:param retry_length:
89+
:param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
90+
:param retry_length: Tunable. Set to a lower value for better detection of very small texts.
91+
:param xpath: If set to True, adds an x="..." attribute to each HTML node,
92+
containing the XPath pointing to that node's location in the original
93+
document (allows the selected summary to be reconstructed in the original document).
9194
9295
Example:
9396
positive_keywords=["news-item", "block"]
9497
negative_keywords=["mysidebar", "related", "ads"]
98+
99+
A Document instance cannot be reused.
100+
You need to create a new Document() for each HTML file to process.
101+
102+
Provides four API methods:
103+
.title()
104+
.short_title()
105+
.content()
106+
.summary()
95107
"""
96108
self.input = input
97109
self.html = None
@@ -131,23 +143,33 @@ def _parse(self, input):
131143
return doc
132144

133145
def content(self):
146+
"""Returns full document body"""
134147
return get_body(self._html(True))
135148

136149
def title(self):
150+
"""Returns document title"""
137151
return get_title(self._html(True))
138152

139153
def short_title(self):
154+
"""Returns cleaned up document title"""
140155
return shorten_title(self._html(True))
141156

142157
def get_clean_html(self):
143-
return clean_attributes(tounicode(self.html))
158+
"""
159+
An internal method, which can be overridden in subclasses, for example,
160+
to disable or to improve DOM-to-text conversion in .summary() method
161+
"""
162+
return clean_attributes(tounicode(self.html))
144163

145164
def summary(self, html_partial=False):
146-
"""Generate the summary of the html docuemnt
165+
"""
166+
Given an HTML file, extracts the text of the article.
147167
148168
:param html_partial: return only the div of the document, don't wrap
149169
in html and body tags.
150170
171+
Warning: It mangles the internal DOM representation of the HTML document,
172+
so always call the other API methods before this one.
151173
"""
152174
try:
153175
ruthless = True
@@ -278,7 +300,7 @@ def get_link_density(self, elem):
278300
total_length = text_length(elem)
279301
return float(link_length) / max(total_length, 1)
280302

281-
def score_paragraphs(self, ):
303+
def score_paragraphs(self):
282304
MIN_LEN = self.min_text_length
283305
candidates = {}
284306
ordered = []
@@ -373,6 +395,7 @@ def score_node(self, elem):
373395
}
374396

375397
def remove_unlikely_candidates(self):
398+
"""Utility method"""
376399
for elem in self.html.iter():
377400
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
378401
if len(s) < 2:
@@ -382,6 +405,7 @@ def remove_unlikely_candidates(self):
382405
elem.drop_tree()
383406

384407
def transform_misused_divs_into_paragraphs(self):
408+
"""Utility method"""
385409
for elem in self.tags(self.html, 'div'):
386410
# transform <div>s that do not contain other block elements into
387411
# <p>s

0 commit comments

Comments
 (0)
0