html5lib
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
Lines changed: 10 additions & 14 deletions
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
Lines changed: 7 additions & 6 deletions
diff --git a/html5lib/utils.py b/html5lib/utils.py
Lines changed: 13 additions & 1 deletion
@@ -3,7 +3,6 @@
 from six.moves import http_client
 
 import codecs
-import platform
 import re
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -32,16 +31,15 @@ class BufferedIOBase(object):
 
 invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
 
-if platform.python_implementation() == "Jython":
-    # Jython does not allow the use of solitary surrogate escapes
-    # (\uD800-\uDFFF) in literals or other usage. This is because it
-    # uses UTF-16, which is based on the use of such surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_template % "")
-else:
-    # Instead use one extra step of indirection and create surrogates with
-    # unichr
+if utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # unichr. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
     invalid_unicode_re = re.compile(invalid_unicode_template % (
         "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_template % "")
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -177,8 +175,8 @@ def __init__(self, source):
 
         """
 
-        if platform.python_implementation() == "Jython":
-            # By its nature Jython's UTF-16 support does not allow
+        if not utils.supports_lone_surrogates:
+            # Platforms lacking lone surrogate support already check for
             # surrogate errors, so no need to do this checking.
             self.reportCharacterErrors = None
             self.replaceCharactersRegexp = None
@@ -288,9 +286,7 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        if platform.python_implementation() != "Jython":
-            # data is already Unicode, so Jython already has dealt
-            # with any surrogate character errors, no need to go here
+        if utils.supports_lone_surrogates:
             self.reportCharacterErrors(data)
 
             # Replace invalid characters
 
@@ -1,14 +1,13 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import json
-import platform
 import warnings
 import re
 
 from .support import get_data_files
 
 from html5lib.tokenizer import HTMLTokenizer
-from html5lib import constants
+from html5lib import constants, utils
 
 
 class TokenizerTestParser(object):
@@ -132,15 +131,17 @@ def decode(inp):
             return inp.encode("utf-8").decode("unicode-escape")
         except UnicodeDecodeError:
             possible_surrogate_match = _surrogateRe.search(inp)
-            if possible_surrogate_match and platform.python_implementation() == "Jython":
+            if possible_surrogate_match and not utils.supports_lone_surrogates:
                 possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
                 if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
-                    # Not valid unicode input for Jython.
+                    # Not valid unicode input for platforms that do
+                    # not have support for lone surrogates.
                     #
                     # NOTE it's not even possible to have such
                     # isolated surrogates in unicode input streams in
-                    # Jython - the decoding to unicode would have
-                    # raised a similar UnicodeDecodeError.
+                    # such platforms (like Jython) - the decoding to
+                    # unicode would have raised a similar
+                    # UnicodeDecodeError.
                     return None
             raise
 
 
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import platform
 from types import ModuleType
 
 try:
@@ -9,7 +10,18 @@
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# added to the test below. In general this would be any platform using
+# UTF-16 as its encoding of unicode strings, such as Jython. This is
+# because UTF-16 itself is based on the use of such surrogates, and
+# there is no mechanism to further escape such escapes.
+#
+# Otherwise we assume such support.
+supports_lone_surrogates = platform.python_implementation() != "Jython"
 
 
 class MethodDispatcher(dict):