Work-around for unichr limitation to UCS-2 using eval(). · awesome-python/html5lib-python@1651f9b · GitHub
[go: up one dir, main page]

Skip to content

Commit 1651f9b

Browse files
committed
Work-around for unichr limitation to UCS-2 using eval().
Also reintroduced ParseError on Windows 1252 numeric entity codepoints and removed ParseError on numeric entity codepoints greater than 1114111 (U+10FFFF) (per the current spec) --HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40791
1 parent 8cdb012 commit 1651f9b

File tree

2 files changed

+18
-19
lines changed

2 files changed

+18
-19
lines changed

src/tokenizer.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def consumeNumberEntity(self, isHex):
129129
if isHex:
130130
allowed = hexDigits
131131
radix = 16
132-
132+
133133
char = u"\uFFFD"
134134
charStack = []
135135

@@ -146,25 +146,24 @@ def consumeNumberEntity(self, isHex):
146146
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
147147
# smaller) we need to do the "windows trick".
148148
if 127 < charAsInt < 160:
149-
#XXX - removed parse error from windows 1252 entity for now
150-
#we may want to reenable this later
151-
#self.tokenQueue.append({"type": "ParseError", "data":
152-
# _("Entity used with illegal number (windows-1252 reference).")})
149+
self.tokenQueue.append({"type": "ParseError", "data":
150+
_("Entity used with illegal number (windows-1252 reference).")})
153151

154152
charAsInt = entitiesWindows1252[charAsInt - 128]
155153

156-
# 0 is not a good number.
157-
if charAsInt == 0:
158-
charAsInt = 65533
159-
160-
try:
161-
# XXX We should have a separate function that does "int" to
162-
# "unicodestring" conversion since this doesn't always work
163-
# according to hsivonen. Also, unichr has a limitation of 65535
164-
char = unichr(charAsInt)
165-
except:
166-
self.tokenQueue.append({"type": "ParseError", "data":
167-
_("Numeric entity couldn't be converted to character.")})
154+
# 0 is not a good number, neither are illegal Unicode code points.
155+
if charAsInt > 0 and charAsInt <= 1114111:
156+
try:
157+
# XXX We should have a separate function that does "int" to
158+
# "unicodestring" conversion since this doesn't always work
159+
# according to hsivonen. Also, unichr has a limitation of 65535
160+
char = unichr(charAsInt)
161+
except:
162+
try:
163+
char = eval("u'\\U%08x'" % charAsInt)
164+
except:
165+
self.tokenQueue.append({"type": "ParseError", "data":
166+
_("Numeric entity couldn't be converted to character.")})
168167

169168
# Discard the ; if present. Otherwise, put it back on the queue and
170169
# invoke parseError on parser.

tests/test_sanitizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class SanitizeTest(unittest.TestCase):
1313
def addTest(cls, name, expected, input):
1414
def test(self, expected=expected, input=input):
1515
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
16-
parseFragment(expected.encode('utf-8')).childNodes])
16+
parseFragment(expected).childNodes])
1717
self.assertEqual(expected, self.sanitize_html(input))
1818
setattr(cls, name, test)
1919
addTest = classmethod(addTest)
@@ -23,7 +23,7 @@ def sanitize_html(self,stream):
2323
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
2424
parseFragment(stream).childNodes])
2525

26-
def test_should_handle_astral_plane_characters(self):
26+
def test_should_handle_astral_plane_characters(self):
2727
self.assertEqual(u"<p>\U0001d4b5 \U0001d538</p>",
2828
self.sanitize_html("<p>&#x1d4b5; &#x1d538;</p>"))
2929

0 commit comments

Comments
 (0)
0