Do not directly use isolated surrogates in unicode literals by jimbaker · Pull Request #150 · html5lib/html5lib-python · GitHub
[go: up one dir, main page]

Skip to content

Do not directly use isolated surrogates in unicode literals #150

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Because we never want compiled Python
__pycache__/
*.pyc
*.py$class

# Ignore stuff produced by distutils
/build/
Expand Down
42 changes: 32 additions & 10 deletions html5lib/inputstream.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six import text_type, unichr
from six.moves import http_client

import codecs
Expand Down Expand Up @@ -28,7 +28,18 @@ class BufferedIOBase(object):
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"

if utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
invalid_unicode_re = re.compile(invalid_unicode_template % (
"%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
else:
invalid_unicode_re = re.compile(invalid_unicode_template % "")

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
Expand Down Expand Up @@ -164,13 +175,23 @@ def __init__(self, source):

"""

# Craziness
if len("\U0010FFFF") == 1:
if not utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
unichr(0xD800), unichr(0xDFFF)))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
self.replaceCharactersRegexp = re.compile(
"([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
unichr(0xD800), unichr(0xDBFF),
unichr(0xDC00), unichr(0xDFFF),
unichr(0xD800), unichr(0xDBFF),
unichr(0xDC00), unichr(0xDFFF)))

# List of where new lines occur
self.newLines = [0]
Expand Down Expand Up @@ -265,11 +286,12 @@ def readChunk(self, chunkSize=None):
self._bufferedCharacter = data[-1]
data = data[:-1]

self.reportCharacterErrors(data)
if utils.supports_lone_surrogates:
self.reportCharacterErrors(data)

# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)

data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
Expand Down
25 changes: 23 additions & 2 deletions html5lib/tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .support import get_data_files

from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants
from html5lib import constants, utils


class TokenizerTestParser(object):
Expand Down Expand Up @@ -122,9 +122,28 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
return tokens["expected"] == tokens["received"]


_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")


def unescape(test):
def decode(inp):
return inp.encode("utf-8").decode("unicode-escape")
try:
return inp.encode("utf-8").decode("unicode-escape")
except UnicodeDecodeError:
possible_surrogate_match = _surrogateRe.search(inp)
if possible_surrogate_match and not utils.supports_lone_surrogates:
possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
# Not valid unicode input for platforms that do
# not have support for lone surrogates.
#
# NOTE: it's not even possible to have such
# isolated surrogates in unicode input streams on
# such platforms (like Jython) - the decoding to
# unicode would have raised a similar
# UnicodeDecodeError.
return None
raise

test["input"] = decode(test["input"])
for token in test["output"]:
Expand Down Expand Up @@ -183,6 +202,8 @@ def testTokenizer():
test["initialStates"] = ["Data state"]
if 'doubleEscaped' in test:
test = unescape(test)
if test["input"] is None:
continue # Not valid input for this platform
for initialState in test["initialStates"]:
test["initialState"] = capitalize(initialState)
yield runTokenizerTest, test
14 changes: 13 additions & 1 deletion html5lib/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import, division, unicode_literals

import platform
from types import ModuleType

try:
Expand All @@ -9,7 +10,18 @@


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"]
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]


# Platforms that do not support lone surrogates (\uD800-\uDFFF) should be
# added to the test below. In general this would be any platform using
# UTF-16 as its encoding of unicode strings, such as Jython. This is
# because UTF-16 itself is based on the use of such surrogates, and
# there is no mechanism to escape a surrogate that appears on its own.
#
# On all other platforms we assume such support.
supports_lone_surrogates = platform.python_implementation() != "Jython"


class MethodDispatcher(dict):
Expand Down
0