@@ -86,12 +86,24 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
86
86
:param input: string of the html content.
87
87
:param positive_keywords: regex or list of patterns in classes and ids
88
88
:param negative_keywords: regex or list of patterns in classes and ids
89
- :param min_text_length:
90
- :param retry_length:
89
+ :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
90
+ :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
91
+ :param xpath: If set to True, adds x="..." attribute to each HTML node,
92
+ containing the XPath of that node in the original document (makes it
93
+ possible to locate the selected summary in the original document).
91
94
92
95
Example:
93
96
positive_keywords=["news-item", "block"]
94
97
negative_keywords=["mysidebar", "related", "ads"]
98
+
99
+ The Document class is not reentrant.
100
+ You need to create a new Document() for each HTML file to process.
101
+
102
+ Provides four API methods:
103
+ .title()
104
+ .short_title()
105
+ .content()
106
+ .summary()
95
107
"""
96
108
self .input = input
97
109
self .html = None
@@ -131,23 +143,33 @@ def _parse(self, input):
131
143
return doc
132
144
133
145
def content(self):
    """Return the full document body.

    Runs ``get_body`` on the value returned by ``self._html(True)``.
    """
    parsed = self._html(True)
    return get_body(parsed)
135
148
136
149
def title(self):
    """Return the document title.

    Runs ``get_title`` on the value returned by ``self._html(True)``.
    """
    parsed = self._html(True)
    return get_title(parsed)
138
152
139
153
def short_title(self):
    """Return the cleaned-up document title.

    Runs ``shorten_title`` on the value returned by ``self._html(True)``.
    """
    parsed = self._html(True)
    return shorten_title(parsed)
141
156
142
157
def get_clean_html(self):
    """
    Return ``self.html`` converted with ``tounicode`` and then passed
    through ``clean_attributes``.

    An internal method that subclasses may override, for example to
    disable or to improve the DOM-to-text conversion in the .summary()
    method.
    """
    converted = tounicode(self.html)
    return clean_attributes(converted)
144
163
145
164
def summary (self , html_partial = False ):
146
- """Generate the summary of the html docuemnt
165
+ """
166
+ Given an HTML file, extracts the text of the article.
147
167
148
168
:param html_partial: return only the div of the document, don't wrap
149
169
in html and body tags.
150
170
171
+ Warning: this method mangles the internal DOM representation of the
172
+ HTML document, so always call the other API methods before this one.
151
173
"""
152
174
try :
153
175
ruthless = True
@@ -278,7 +300,7 @@ def get_link_density(self, elem):
278
300
total_length = text_length (elem )
279
301
return float (link_length ) / max (total_length , 1 )
280
302
281
- def score_paragraphs (self , ):
303
+ def score_paragraphs (self ):
282
304
MIN_LEN = self .min_text_length
283
305
candidates = {}
284
306
ordered = []
@@ -373,6 +395,7 @@ def score_node(self, elem):
373
395
}
374
396
375
397
def remove_unlikely_candidates (self ):
398
+ """Utility method"""
376
399
for elem in self .html .iter ():
377
400
s = "%s %s" % (elem .get ('class' , '' ), elem .get ('id' , '' ))
378
401
if len (s ) < 2 :
@@ -382,6 +405,7 @@ def remove_unlikely_candidates(self):
382
405
elem .drop_tree ()
383
406
384
407
def transform_misused_divs_into_paragraphs (self ):
408
+ """Utility method"""
385
409
for elem in self .tags (self .html , 'div' ):
386
410
# transform <div>s that do not contain other block elements into
387
411
# <p>s
0 commit comments