From 8aab9d8493710a194756bcb5b5cf5a51b46e6f21 Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Fri, 2 May 2014 20:59:47 -0600 Subject: [PATCH 1/5] Do not directly use isolated surrogates in unicode literals for platforms besides Jython --- html5lib/inputstream.py | 44 ++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..d6ca39a3 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -3,6 +3,7 @@ from six.moves import http_client import codecs +import platform import re from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase @@ -28,7 +29,19 @@ class BufferedIOBase(object): asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" + +if platform.python_implementation() == "Jython": + # Jython does not allow the use of solitary surrogate escapes + # (\uD800-\uDFFF) in literals or other usage. This is because it + # uses UTF-16, which is based on the use of such surrogates. 
+ invalid_unicode_re = re.compile(invalid_unicode_template % "") +else: + # Instead use one extra step of indirection and create surrogates with + # unichr + invalid_unicode_re = re.compile(invalid_unicode_template % ( + "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -164,13 +177,23 @@ def __init__(self, source): """ - # Craziness - if len("\U0010FFFF") == 1: + if platform.python_implementation() == "Jython": + # By its nature Jython's UTF-16 support does not allow + # surrogate errors, so no need to do this checking. + self.reportCharacterErrors = None + self.replaceCharactersRegexp = None + elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") + self.replaceCharactersRegexp = re.compile("[%s-%s]" % ( + unichr(0xD800), unichr(0xDFFF))) else: self.reportCharacterErrors = self.characterErrorsUCS2 - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(? 
Date: Fri, 2 May 2014 21:37:02 -0600 Subject: [PATCH 2/5] Use six.unichr for Python 3.x --- html5lib/inputstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index d6ca39a3..ab47c710 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from six import text_type, unichr from six.moves import http_client import codecs From a6c4b41731550a493f285f983eb63f7466a861a5 Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Mon, 16 Jun 2014 14:33:33 -0600 Subject: [PATCH 3/5] Ignore compiled Python classes for Jython --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 73d97fec..52622ced 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Because we never want compiled Python __pycache__/ *.pyc +*.py$class # Ignore stuff produced by distutils /build/ From 7f189f8937d9b669121f48ad1070fa3822fe176f Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Mon, 16 Jun 2014 14:35:10 -0600 Subject: [PATCH 4/5] Pass on constructed tests in test_tokenizer that attempt to build HTMLUnicodeInputStream objects from unicode strings that contain isolated surrogates. Such tests are not meaningful on Jython which does not allow for invalid unicode strings to be decoded in the first place. 
--- html5lib/tests/test_tokenizer.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 90315ab3..d33cc79d 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, unicode_literals import json +import platform import warnings import re @@ -122,9 +123,26 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, return tokens["expected"] == tokens["received"] +_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})") + + def unescape(test): def decode(inp): - return inp.encode("utf-8").decode("unicode-escape") + try: + return inp.encode("utf-8").decode("unicode-escape") + except UnicodeDecodeError: + possible_surrogate_match = _surrogateRe.search(inp) + if possible_surrogate_match and platform.python_implementation() == "Jython": + possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) + if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: + # Not valid unicode input for Jython. + # + # NOTE it's not even possible to have such + # isolated surrogates in unicode input streams in + # Jython - the decoding to unicode would have + # raised a similar UnicodeDecodeError. 
+ return None + raise test["input"] = decode(test["input"]) for token in test["output"]: @@ -183,6 +201,8 @@ def testTokenizer(): test["initialStates"] = ["Data state"] if 'doubleEscaped' in test: test = unescape(test) + if test["input"] is None: + continue # Not valid input for this platform for initialState in test["initialStates"]: test["initialState"] = capitalize(initialState) yield runTokenizerTest, test From dc52b8ec1469f05116528772c73e9279cf2a508f Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Tue, 12 Aug 2014 20:33:34 +0200 Subject: [PATCH 5/5] Use utils.supports_lone_surrogates in place of Jython-specific logic --- html5lib/inputstream.py | 24 ++++++++++-------------- html5lib/tests/test_tokenizer.py | 13 +++++++------ html5lib/utils.py | 14 +++++++++++++- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index ab47c710..fb5ea759 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -3,7 +3,6 @@ from six.moves import http_client import codecs -import platform import re from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase @@ -32,16 +31,15 @@ class BufferedIOBase(object): invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" -if platform.python_implementation() == "Jython": - # Jython does not allow the use of solitary surrogate escapes - # (\uD800-\uDFFF) in literals or other usage. This is because it - # uses UTF-16, which is based on the use of such surrogates. 
- invalid_unicode_re = re.compile(invalid_unicode_template % "") -else: - # Instead use one extra step of indirection and create surrogates with - # unichr +if utils.supports_lone_surrogates: + # Use one extra step of indirection and create surrogates with + # unichr. Not using this indirection would introduce an illegal + # unicode literal on platforms not supporting such lone + # surrogates. invalid_unicode_re = re.compile(invalid_unicode_template % ( "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) +else: + invalid_unicode_re = re.compile(invalid_unicode_template % "") non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -177,8 +175,8 @@ def __init__(self, source): """ - if platform.python_implementation() == "Jython": - # By its nature Jython's UTF-16 support does not allow + if not utils.supports_lone_surrogates: + # Such platforms will have already checked for such # surrogate errors, so no need to do this checking. 
self.reportCharacterErrors = None self.replaceCharactersRegexp = None @@ -288,9 +286,7 @@ def readChunk(self, chunkSize=None): self._bufferedCharacter = data[-1] data = data[:-1] - if platform.python_implementation() != "Jython": - # data is already Unicode, so Jython already has dealt - # with any surrogate character errors, no need to go here + if utils.supports_lone_surrogates: self.reportCharacterErrors(data) # Replace invalid characters diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index d33cc79d..7f4c02ba 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -1,14 +1,13 @@ from __future__ import absolute_import, division, unicode_literals import json -import platform import warnings import re from .support import get_data_files from html5lib.tokenizer import HTMLTokenizer -from html5lib import constants +from html5lib import constants, utils class TokenizerTestParser(object): @@ -132,15 +131,17 @@ def decode(inp): return inp.encode("utf-8").decode("unicode-escape") except UnicodeDecodeError: possible_surrogate_match = _surrogateRe.search(inp) - if possible_surrogate_match and platform.python_implementation() == "Jython": + if possible_surrogate_match and not utils.supports_lone_surrogates: possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: - # Not valid unicode input for Jython. + # Not valid unicode input for platforms that do + # not have support for lone surrogates. # # NOTE it's not even possible to have such # isolated surrogates in unicode input streams in - # Jython - the decoding to unicode would have - # raised a similar UnicodeDecodeError. + # such platforms (like Jython) - the decoding to + # unicode would have raised a similar + # UnicodeDecodeError. 
return None raise diff --git a/html5lib/utils.py b/html5lib/utils.py index 2f41f4df..62cd80ce 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +import platform from types import ModuleType try: @@ -9,7 +10,18 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", - "surrogatePairToCodepoint", "moduleFactoryFactory"] + "surrogatePairToCodepoint", "moduleFactoryFactory", + "supports_lone_surrogates"] + + +# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be +# added to the below test. In general this would be any platform using +# UTF-16 as its encoding of unicode strings, such as Jython. This is +# because UTF-16 itself is based on the use of such surrogates, and +# there is no mechanism to further escape such escapes. +# +# Otherwise we assume such support. +supports_lone_surrogates = platform.python_implementation() != "Jython" class MethodDispatcher(dict):