From 8aab9d8493710a194756bcb5b5cf5a51b46e6f21 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Fri, 2 May 2014 20:59:47 -0600
Subject: [PATCH 1/5] Do not directly use isolated surrogates in unicode
 literals for platforms besides Jython

---
 html5lib/inputstream.py | 44 ++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 9e03b931..d6ca39a3 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -3,6 +3,7 @@
 from six.moves import http_client
 
 import codecs
+import platform
 import re
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -28,7 +29,19 @@ class BufferedIOBase(object):
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
+
+if platform.python_implementation() == "Jython":
+    # Jython does not allow the use of solitary surrogate escapes
+    # (\uD800-\uDFFF) in literals or other usage. This is because it
+    # uses UTF-16, which is based on the use of such surrogates.
+    invalid_unicode_re = re.compile(invalid_unicode_template % "")
+else:
+    # Instead use one extra step of indirection and create surrogates with
+    # unichr
+    invalid_unicode_re = re.compile(invalid_unicode_template % (
+        "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +177,23 @@ def __init__(self, source):
 
         """
 
-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if platform.python_implementation() == "Jython":
+            # By its nature Jython's UTF-16 support does not allow
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+            self.replaceCharactersRegexp = None
+        elif len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
+            self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
+                unichr(0xD800), unichr(0xDFFF)))
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+            self.replaceCharactersRegexp = re.compile(
+                "([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
+                    unichr(0xD800), unichr(0xDBFF),
+                    unichr(0xDC00), unichr(0xDFFF),
+                    unichr(0xD800), unichr(0xDBFF),
+                    unichr(0xDC00), unichr(0xDFFF)))
 
         # List of where new lines occur
         self.newLines = [0]
@@ -265,11 +288,14 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        self.reportCharacterErrors(data)
+        if platform.python_implementation() != "Jython":
+            # data is already Unicode, so Jython already has dealt
+            # with any surrogate character errors, no need to go here
+            self.reportCharacterErrors(data)
 
-        # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
+            # Replace invalid characters
+            # Note U+0000 is dealt with in the tokenizer
+            data = self.replaceCharactersRegexp.sub("\ufffd", data)
 
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")

From 0c5916e700e0e1610bca6ca891a6cf2a73f74434 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Fri, 2 May 2014 21:37:02 -0600
Subject: [PATCH 2/5] Use six.unichr for Python 3.x

---
 html5lib/inputstream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index d6ca39a3..ab47c710 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type
+from six import text_type, unichr
 from six.moves import http_client
 
 import codecs

From a6c4b41731550a493f285f983eb63f7466a861a5 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Mon, 16 Jun 2014 14:33:33 -0600
Subject: [PATCH 3/5] Ignore compiled Python classes for Jython

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 73d97fec..52622ced 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Because we never want compiled Python
 __pycache__/
 *.pyc
+*.py$class
 
 # Ignore stuff produced by distutils
 /build/

From 7f189f8937d9b669121f48ad1070fa3822fe176f Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Mon, 16 Jun 2014 14:35:10 -0600
Subject: [PATCH 4/5] Pass on constructed tests in test_tokenizer that attempt
 to build HTMLUnicodeInputStream objects from unicode strings that contain
 isolated surrogates. Such tests are not meaningful on Jython which does not
 allow for invalid unicode strings to be decoded in the first place.

---
 html5lib/tests/test_tokenizer.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index 90315ab3..d33cc79d 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import json
+import platform
 import warnings
 import re
 
@@ -122,9 +123,26 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
         return tokens["expected"] == tokens["received"]
 
 
+_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")
+
+
 def unescape(test):
     def decode(inp):
-        return inp.encode("utf-8").decode("unicode-escape")
+        try:
+            return inp.encode("utf-8").decode("unicode-escape")
+        except UnicodeDecodeError:
+            possible_surrogate_match = _surrogateRe.search(inp)
+            if possible_surrogate_match and platform.python_implementation() == "Jython":
+                possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
+                if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
+                    # Not valid unicode input for Jython.
+                    #
+                    # NOTE it's not even possible to have such
+                    # isolated surrogates in unicode input streams in
+                    # Jython - the decoding to unicode would have
+                    # raised a similar UnicodeDecodeError.
+                    return None
+            raise
 
     test["input"] = decode(test["input"])
     for token in test["output"]:
@@ -183,6 +201,8 @@ def testTokenizer():
                         test["initialStates"] = ["Data state"]
                     if 'doubleEscaped' in test:
                         test = unescape(test)
+                        if test["input"] is None:
+                            continue  # Not valid input for this platform
                     for initialState in test["initialStates"]:
                         test["initialState"] = capitalize(initialState)
                         yield runTokenizerTest, test

From dc52b8ec1469f05116528772c73e9279cf2a508f Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Tue, 12 Aug 2014 20:33:34 +0200
Subject: [PATCH 5/5] Use utils.supports_lone_surrogates in place of
 Jython-specific logic

---
 html5lib/inputstream.py          | 24 ++++++++++--------------
 html5lib/tests/test_tokenizer.py | 13 +++++++------
 html5lib/utils.py                | 14 +++++++++++++-
 3 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index ab47c710..fb5ea759 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -3,7 +3,6 @@
 from six.moves import http_client
 
 import codecs
-import platform
 import re
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -32,16 +31,15 @@ class BufferedIOBase(object):
 
 invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
 
-if platform.python_implementation() == "Jython":
-    # Jython does not allow the use of solitary surrogate escapes
-    # (\uD800-\uDFFF) in literals or other usage. This is because it
-    # uses UTF-16, which is based on the use of such surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_template % "")
-else:
-    # Instead use one extra step of indirection and create surrogates with
-    # unichr
+if utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # unichr. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
     invalid_unicode_re = re.compile(invalid_unicode_template % (
         "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_template % "")
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -177,8 +175,8 @@ def __init__(self, source):
 
         """
 
-        if platform.python_implementation() == "Jython":
-            # By its nature Jython's UTF-16 support does not allow
+        if not utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
             # surrogate errors, so no need to do this checking.
             self.reportCharacterErrors = None
             self.replaceCharactersRegexp = None
@@ -288,9 +286,7 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        if platform.python_implementation() != "Jython":
-            # data is already Unicode, so Jython already has dealt
-            # with any surrogate character errors, no need to go here
+        if utils.supports_lone_surrogates:
             self.reportCharacterErrors(data)
 
             # Replace invalid characters
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index d33cc79d..7f4c02ba 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -1,14 +1,13 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import json
-import platform
 import warnings
 import re
 
 from .support import get_data_files
 
 from html5lib.tokenizer import HTMLTokenizer
-from html5lib import constants
+from html5lib import constants, utils
 
 
 class TokenizerTestParser(object):
@@ -132,15 +131,17 @@ def decode(inp):
             return inp.encode("utf-8").decode("unicode-escape")
         except UnicodeDecodeError:
             possible_surrogate_match = _surrogateRe.search(inp)
-            if possible_surrogate_match and platform.python_implementation() == "Jython":
+            if possible_surrogate_match and not utils.supports_lone_surrogates:
                 possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
                 if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
-                    # Not valid unicode input for Jython.
+                    # Not valid unicode input for platforms that do
+                    # not have support for lone surrogates.
                     #
                     # NOTE it's not even possible to have such
                     # isolated surrogates in unicode input streams in
-                    # Jython - the decoding to unicode would have
-                    # raised a similar UnicodeDecodeError.
+                    # such platforms (like Jython) - the decoding to
+                    # unicode would have raised a similar
+                    # UnicodeDecodeError.
                     return None
             raise
 
diff --git a/html5lib/utils.py b/html5lib/utils.py
index 2f41f4df..62cd80ce 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import platform
 from types import ModuleType
 
 try:
@@ -9,7 +10,18 @@
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# added to the below test. In general this would be any platform using
+# UTF-16 as its encoding of unicode strings, such as Jython. This is
+# because UTF-16 itself is based on the use of such surrogates, and
+# there is no mechanism to further escape such escapes.
+#
+# Otherwise we assume such support.
+supports_lone_surrogates = platform.python_implementation() != "Jython"
 
 
 class MethodDispatcher(dict):