Fix UnicodeDecodeError on Python 2

anekos · anekos · commit 667114463dd0 · 2020-05-11T12:09:24.000+09:00
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
@@ -5,11 +5,16 @@
 syntax that can only be solved by conditionally importing different functions.
 """
 import sys
+from lxml.etree import tostring
 
 if sys.version_info[0] == 2:
     bytes_ = str
     str_ = unicode
+    def tostring_(s):
+        return tostring(s, encoding='utf-8').decode('utf-8')
 
 elif sys.version_info[0] == 3:
     bytes_ = bytes
     str_ = str
+    def tostring_(s):
+        return tostring(s, encoding='utf-8')
diff --git a/readability/readability.py b/readability/readability.py
@@ -4,7 +4,6 @@
 import re
 import sys
 
-from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
@@ -15,7 +14,7 @@
 from .htmls import get_body
 from .htmls import get_title
 from .htmls import shorten_title
-from .compat import str_, bytes_
+from .compat import str_, bytes_, tostring_
 from .debug import describe, text_content
 
 
@@ -464,7 +463,7 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES["divToPElementsRe"].search(
-                str_(b"".join(map(lambda it: tostring(it, encoding="utf-8"), list(elem))))
+                str_(b"".join(map(tostring_, list(elem))))
             ):
                 # log.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"