@@ -86,7 +86,8 @@ class Document:
86
86
"""Class to build a etree document out of html."""
87
87
88
88
def __init__ (self , input , positive_keywords = None , negative_keywords = None ,
89
- url = None , min_text_length = 25 , retry_length = 250 , xpath = False ):
89
+ url = None , min_text_length = 25 , retry_length = 250 , xpath = False ,
90
+ handle_failures = 'discard' ):
90
91
"""Generate the document
91
92
92
93
:param input: string of the html content.
@@ -97,6 +98,8 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
97
98
:param xpath: If set to True, adds x="..." attribute to each HTML node,
98
99
containing xpath path pointing to original document path (allows to
99
100
reconstruct selected summary in original document).
101
+ :param handle_failures: Passed through to lxml's `make_links_absolute` / `resolve_base_href`; controls what happens when a link cannot be resolved.
102
+ Supported options: "discard" (drop links that fail), "ignore" (leave them unchanged), or None (let the lxml error propagate).
100
103
101
104
Examples:
102
105
positive_keywords=["news-item", "block"]
@@ -122,6 +125,7 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
122
125
self .min_text_length = min_text_length
123
126
self .retry_length = retry_length
124
127
self .xpath = xpath
128
+ self .handle_failures = handle_failures
125
129
126
130
def _html (self , force = False ):
127
131
if force or self .html is None :
@@ -141,13 +145,13 @@ def _parse(self, input):
141
145
# trying to guard against bad links like <a href="http://[http://...">
142
146
try :
143
147
# such support is added in lxml 3.3.0
144
- doc .make_links_absolute (base_href , resolve_base_href = True , handle_failures = 'discard' )
148
+ doc .make_links_absolute (base_href , resolve_base_href = True , handle_failures = self . handle_failures )
145
149
except TypeError : #make_links_absolute() got an unexpected keyword argument 'handle_failures'
146
150
# then we have lxml < 3.3.0
147
151
# please upgrade to lxml >= 3.3.0 if you're failing here!
148
- doc .make_links_absolute (base_href , resolve_base_href = True )
152
+ doc .make_links_absolute (base_href , resolve_base_href = True , handle_failures = self . handle_failures )
149
153
else :
150
- doc .resolve_base_href ()
154
+ doc .resolve_base_href (handle_failures = self . handle_failures )
151
155
return doc
152
156
153
157
def content (self ):
0 commit comments