8000 Fix #99 · kingking888/python-readability@bac691a · GitHub
[go: up one dir, main page]

Skip to content

Commit bac691a

Browse files
author
jkclee
committed
Fix buriy#99
1 parent 3cbede6 commit bac691a

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

readability/readability.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ class Document:
8686
"""Class to build a etree document out of html."""
8787

8888
def __init__(self, input, positive_keywords=None, negative_keywords=None,
89-
url=None, min_text_length=25, retry_length=250, xpath=False):
89+
url=None, min_text_length=25, retry_length=250, xpath=False,
90+
handle_failures='discard'):
9091
"""Generate the document
9192
9293
:param input: string of the html content.
@@ -97,6 +98,8 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
9798
:param xpath: If set to True, adds x="..." attribute to each HTML node,
9899
containing xpath path pointing to original document path (allows to
99100
reconstruct selected summary in original document).
101+
:param handle_failures: Passed through to `lxml` to control how failures while resolving links are handled.
102+
Supported options: ["discard", "ignore", None]
100103
101104
Examples:
102105
positive_keywords=["news-item", "block"]
@@ -122,6 +125,7 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
122125
self.min_text_length = min_text_length
123126
self.retry_length = retry_length
124127
self.xpath = xpath
128+
self.handle_failures = handle_failures
125129

126130
def _html(self, force=False):
127131
if force or self.html is None:
@@ -141,13 +145,13 @@ def _parse(self, input):
141145
# trying to guard against bad links like <a href="http://[http://...">
142146
try:
143147
# such support is added in lxml 3.3.0
144-
doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures='discard')
148+
doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures=self.handle_failures)
145149
except TypeError: #make_links_absolute() got an unexpected keyword argument 'handle_failures'
146150
# then we have lxml < 3.3.0
147151
# please upgrade to lxml >= 3.3.0 if you're failing here!
148-
doc.make_links_absolute(base_href, resolve_base_href=True)
152+
doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures=self.handle_failures)
149153
else:
150-
doc.resolve_base_href()
154+
doc.resolve_base_href(handle_failures=self.handle_failures)
151155
return doc
152156

153157
def content(self):

0 commit comments

Comments
 (0)
0