html5lib · jimbaker · May 3, 2014 · May 3, 2014 · Jun 16, 2014 · Jun 16, 2014
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 # Because we never want compiled Python
 __pycache__/
 *.pyc
+*.py$class
 
 # Ignore stuff produced by distutils
 /build/

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type
+from six import text_type, unichr
 from six.moves import http_client
 
 import codecs
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
+
+if utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # unichr. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
+    invalid_unicode_re = re.compile(invalid_unicode_template % (
+        "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_template % "")
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,23 @@ def __init__(self, source):
 
         """
 
-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if not utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+            self.replaceCharactersRegexp = None
+        elif len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
+            self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
+                unichr(0xD800), unichr(0xDFFF)))
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+            self.replaceCharactersRegexp = re.compile(
+                "([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
+                    unichr(0xD800), unichr(0xDBFF),
+                    unichr(0xDC00), unichr(0xDFFF),
+                    unichr(0xD800), unichr(0xDBFF),
+                    unichr(0xDC00), unichr(0xDFFF)))
 
         # List of where new lines occur
         self.newLines = [0]
@@ -265,11 +286,12 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        self.reportCharacterErrors(data)
+        if utils.supports_lone_surrogates:
+            self.reportCharacterErrors(data)
 
-        # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
+            # Replace invalid characters
+            # Note U+0000 is dealt with in the tokenizer
+            data = self.replaceCharactersRegexp.sub("\ufffd", data)
 
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
@@ -7,7 +7,7 @@
 from .support import get_data_files
 
 from html5lib.tokenizer import HTMLTokenizer
-from html5lib import constants
+from html5lib import constants, utils
 
 
 class TokenizerTestParser(object):
@@ -122,9 +122,28 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
         return tokens["expected"] == tokens["received"]
 
 
+_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")
+
+
 def unescape(test):
     def decode(inp):
-        return inp.encode("utf-8").decode("unicode-escape")
+        try:
+            return inp.encode("utf-8").decode("unicode-escape")
+        except UnicodeDecodeError:
+            possible_surrogate_match = _surrogateRe.search(inp)
+            if possible_surrogate_match and not utils.supports_lone_surrogates:
+                possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
+                if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
+                    # Not valid unicode input for platforms that do
+                    # not have support for lone surrogates.
+                    #
+                        # NOTE it's not even possible to have such
+                    # isolated surrogates in unicode input streams in
+                    # such platforms (like Jython) - the decoding to
+                    # unicode would have raised a similar
+                    # UnicodeDecodeError.
+                    return None
+            raise
 
     test["input"] = decode(test["input"])
     for token in test["output"]:
@@ -183,6 +202,8 @@ def testTokenizer():
                         test["initialStates"] = ["Data state"]
                     if 'doubleEscaped' in test:
                         test = unescape(test)
+                        if test["input"] is None:
+                            continue  # Not valid input for this platform
                     for initialState in test["initialStates"]:
                         test["initialState"] = capitalize(initialState)
                         yield runTokenizerTest, test
diff --git a/html5lib/utils.py b/html5lib/utils.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import platform
 from types import ModuleType
 
 try:
@@ -9,7 +10,18 @@
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# added to the below test. In general this would be any platform using
+# UTF-16 as its encoding of unicode strings, such as Jython. This is
+# because UTF-16 itself is based on the use of such surrogates, and
+# there is no mechanism to further escape such escapes.
+# 
+# Otherwise we assume such support.
+supports_lone_surrogates = platform.python_implementation() != "Jython"
 
 
 class MethodDispatcher(dict):