Minor fix in encoding guessing. Claiming it v0.3.0.1 · littletao08/python-readability@318f25c · GitHub
[go: up one dir, main page]

Skip to content

Commit 318f25c

Browse files
committed
Minor fix in encoding guessing. Claiming it v0.3.0.1
1 parent 08658d1 commit 318f25c

File tree

2 files changed, with 4 additions and 14 deletions.

2 files changed

+4
-14
lines changed

readability/htmls.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,10 @@ def build_doc(page):
1111
if isinstance(page, unicode):
1212
page_unicode = page
1313
else:
14-
enc = get_encoding(page)
15-
if enc:
16-
page_unicode = page.decode(enc, 'replace')
17-
encoding = enc
18-
else:
19-
try:
20-
#try utf-8
21-
page_unicode = page.decode('utf-8', 'strict')
22-
encoding = 'utf-8'
23-
except UnicodeDecodeError:
24-
page_unicode = page.decode('utf-8', 'replace')
25-
encoding = 'utf-8'
14+
enc = get_encoding(page) or 'utf-8'
15+
page_unicode = page.decode(enc, 'replace')
2616
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
27-
return doc, encoding
17+
return doc, enc
2818

2919
def js_re(src, pattern, flags, repl):
3020
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
setup(
1111
name="readability-lxml",
12-
version="0.3",
12+
version="0.3.0.1",
1313
author="Yuri Baburov",
1414
author_email="burchik@gmail.com",
1515
description="fast python port of arc90's readability tool",

0 commit comments

Comments (0)