Use utils.supports_lone_surrogates in place of Jython-specific logic · html5lib/html5lib-python@dc52b8e · GitHub
[go: up one dir, main page]

Skip to content

Commit dc52b8e

Browse files
committed
Use utils.supports_lone_surrogates in place of Jython-specific logic
1 parent 08e7eb5 commit dc52b8e

File tree

3 files changed

+30
-21
lines changed

3 files changed

+30
-21
lines changed

html5lib/inputstream.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from six.moves import http_client
44

55
import codecs
6-
import platform
76
import re
87

98
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -32,16 +31,15 @@ class BufferedIOBase(object):
3231

3332
invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
3433

35-
if platform.python_implementation() == "Jython":
36-
# Jython does not allow the use of solitary surrogate escapes
37-
# (\uD800-\uDFFF) in literals or other usage. This is because it
38-
# uses UTF-16, which is based on the use of such surrogates.
39-
invalid_unicode_re = re.compile(invalid_unicode_template % "")
40-
else:
41-
# Instead use one extra step of indirection and create surrogates with
42-
# unichr
34+
if utils.supports_lone_surrogates:
35+
# Use one extra step of indirection and create surrogates with
36+
# unichr. Not using this indirection would introduce an illegal
37+
# unicode literal on platforms not supporting such lone
38+
# surrogates.
4339
invalid_unicode_re = re.compile(invalid_unicode_template % (
4440
"%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
41+
else:
42+
invalid_unicode_re = re.compile(invalid_unicode_template % "")
4543

4644
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
4745
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -177,8 +175,8 @@ def __init__(self, source):
177175
178176
"""
179177

180-
if platform.python_implementation() == "Jython":
181-
# By its nature Jython's UTF-16 support does not allow
178+
if not utils.supports_lone_surrogates:
179+
# Such platforms will have already checked for such
182180
# surrogate errors, so no need to do this checking.
183181
self.reportCharacterErrors = None
184182
self.replaceCharactersRegexp = None
@@ -288,9 +286,7 @@ def readChunk(self, chunkSize=None):
288286
self._bufferedCharacter = data[-1]
289287
data = data[:-1]
290288

291-
if platform.python_implementation() != "Jython":
292-
# data is already Unicode, so Jython already has dealt
293-
# with any surrogate character errors, no need to go here
289+
if utils.supports_lone_surrogates:
294290
self.reportCharacterErrors(data)
295291

296292
# Replace invalid characters

html5lib/tests/test_tokenizer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
from __future__ import absolute_import, division, unicode_literals
22

33
import json
4-
import platform
54
import warnings
65
import re
76

87
from .support import get_data_files
98

109
from html5lib.tokenizer import HTMLTokenizer
11-
from html5lib import constants
10+
from html5lib import constants, utils
1211

1312

1413
class TokenizerTestParser(object):
@@ -132,15 +131,17 @@ def decode(inp):
132131
return inp.encode("utf-8").decode("unicode-escape")
133132
except UnicodeDecodeError:
134133
possible_surrogate_match = _surrogateRe.search(inp)
135-
if possible_surrogate_match and platform.python_implementation() == "Jython":
134+
if possible_surrogate_match and not utils.supports_lone_surrogates:
136135
possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
137136
if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
138-
# Not valid unicode input for Jython.
137+
# Not valid unicode input for platforms that do
138+
# not have support for lone surrogates.
139139
#
140140
# NOTE it's not even possible to have such
141141
# isolated surrogates in unicode input streams in
142-
# Jython - the decoding to unicode would have
143-
# raised a similar UnicodeDecodeError.
142+
# such platforms (like Jython) - the decoding to
143+
# unicode would have raised a similar
144+
# UnicodeDecodeError.
144145
return None
145146
raise
146147

html5lib/utils.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
import platform
34
from types import ModuleType
45

56
try:
@@ -9,7 +10,18 @@
910

1011

1112
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
12-
"surrogatePairToCodepoint", "moduleFactoryFactory"]
13+
"surrogatePairToCodepoint", "moduleFactoryFactory",
14+
"supports_lone_surrogates"]
15+
16+
17+
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
18+
# added to the below test. In general this would be any platform using
19+
# UTF-16 as its encoding of unicode strings, such as Jython. This is
20+
# because UTF-16 itself is based on the use of such surrogates, and
21+
# there is no mechanism to further escape such escapes.
22+
#
23+
# Otherwise we assume such support.
24+
supports_lone_surrogates = platform.python_implementation() != "Jython"
1325

1426

1527
class MethodDispatcher(dict):

0 commit comments

Comments
 (0)
0