Fix UnicodeDecodeError on python2 · stevejaker/python-readability@6671144 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6671144

Browse files
committed
Fix UnicodeDecodeError on python2
1 parent e4a699b commit 6671144

File tree

2 files changed

+7
-3
lines changed

2 files changed

+7
-3
lines changed

readability/compat/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,16 @@
55
syntax that can only be solved by conditionally importing different functions.
66
"""
77
import sys
8+
from lxml.etree import tostring
89

910
if sys.version_info[0] == 2:
1011
bytes_ = str
1112
str_ = unicode
13+
def tostring_(s):
14+
return tostring(s, encoding='utf-8').decode('utf-8')
1215

1316
elif sys.version_info[0] == 3:
1417
bytes_ = bytes
1518
str_ = str
19+
def tostring_(s):
20+
return tostring(s, encoding='utf-8')

readability/readability.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
import re
55
import sys
66

7-
from lxml.etree import tostring
87
from lxml.etree import tounicode
98
from lxml.html import document_fromstring
109
from lxml.html import fragment_fromstring
@@ -15,7 +14,7 @@
1514
from .htmls import get_body
1615
from .htmls import get_title
1716
from .htmls import shorten_title
18-
from .compat import str_, bytes_
17+
from .compat import str_, bytes_, tostring_
1918
from .debug import describe, text_content
2019

2120

@@ -464,7 +463,7 @@ def transform_misused_divs_into_paragraphs(self):
464463
# This results in incorrect results in case there is an <img>
465464
# buried within an <a> for example
466465
if not REGEXES["divToPElementsRe"].search(
467-
str_(b"".join(map(lambda it: tostring(it, encoding="utf-8"), list(elem))))
466+
str_(b"".join(map(tostring_, list(elem))))
468467
):
469468
# log.debug("Altering %s to p" % (describe(elem)))
470469
elem.tag = "p"

0 commit comments

Comments
 (0)
0