diff --git a/.gitignore b/.gitignore index 73d97fec..52622ced 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Because we never want compiled Python __pycache__/ *.pyc +*.py$class # Ignore stuff produced by distutils /build/ diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..fb5ea759 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from six import text_type, unichr from six.moves import http_client import codecs @@ -28,7 +28,18 @@ class BufferedIOBase(object): asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" + +if utils.supports_lone_surrogates: + # Use one extra step of indirection and create surrogates with + # unichr. Not using this indirection would introduce an illegal + # unicode literal on platforms not supporting such lone + # surrogates. + invalid_unicode_re = re.compile(invalid_unicode_template % ( + "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) +else: + invalid_unicode_re = re.compile(invalid_unicode_template % "") non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -164,13 +175,23 @@ def __init__(self, source): """ - # Craziness - if len("\U0010FFFF") == 1: + if not utils.supports_lone_surrogates: + # Such platforms will have already checked for such + # surrogate errors, so no need to do this checking. + self.reportCharacterErrors = None + self.replaceCharactersRegexp = None + elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") + self.replaceCharactersRegexp = re.compile("[%s-%s]" % ( + unichr(0xD800), unichr(0xDFFF))) else: self.reportCharacterErrors = self.characterErrorsUCS2 - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?[0-9A-Fa-f]{4})") + + def unescape(test): def decode(inp): - return inp.encode("utf-8").decode("unicode-escape") + try: + return inp.encode("utf-8").decode("unicode-escape") + except UnicodeDecodeError: + possible_surrogate_match = _surrogateRe.search(inp) + if possible_surrogate_match and not utils.supports_lone_surrogates: + possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) + if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: + # Not valid unicode input for platforms that do + # not have support for lone surrogates. + # + # NOTE it's not even possible to have such + # isolated surrogates in unicode input streams in + # such platforms (like Jython) - the decoding to + # unicode would have raised a similar + # UnicodeDecodeError. + return None + raise test["input"] = decode(test["input"]) for token in test["output"]: @@ -183,6 +202,8 @@ def testTokenizer(): test["initialStates"] = ["Data state"] if 'doubleEscaped' in test: test = unescape(test) + if test["input"] is None: + continue # Not valid input for this platform for initialState in test["initialStates"]: test["initialState"] = capitalize(initialState) yield runTokenizerTest, test diff --git a/html5lib/utils.py b/html5lib/utils.py index 2f41f4df..62cd80ce 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +import platform from types import ModuleType try: @@ -9,7 +10,18 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", - "surrogatePairToCodepoint", "moduleFactoryFactory"] + "surrogatePairToCodepoint", "moduleFactoryFactory", + "supports_lone_surrogates"] + + +# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be +# added to the below test. In general this would be any platform using +# UTF-16 as its encoding of unicode strings, such as Jython. This is +# because UTF-16 itself is based on the use of such surrogates, and +# there is no mechanism to further escape such escapes. +# +# Otherwise we assume such support. +supports_lone_surrogates = platform.python_implementation() != "Jython" class MethodDispatcher(dict):