Fuck compile-time options · awesome-python/html5lib-python@9411041 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9411041

Browse files
committed
Fuck compile-time options
--HG-- branch : svgmathml extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401267
1 parent b3a5385 commit 9411041

File tree

1 file changed

+57
-5
lines changed

1 file changed

+57
-5
lines changed

src/html5lib/inputstream.py

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@
1212

1313
# Matches a single code point that is reported as an "invalid-codepoint" parse
# error: control characters U+0001-U+0008, U+000B, U+000E-U+001F and
# U+007F-U+009F, surrogates U+D800-U+DFFF, the noncharacter block
# U+FDD0-U+FDEF, and the last two code points (xFFFE, xFFFF) of every plane.
# U+0000 is deliberately absent; it is counted separately as "null-character".
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
1414

15+
# Noncharacters outside the Basic Multilingual Plane: the last two code points
# (xFFFE and xFFFF) of each supplementary plane 1 through 16, i.e.
# 0x1FFFE, 0x1FFFF, 0x2FFFE, ..., 0x10FFFE, 0x10FFFF (32 values in total).
# Used to classify code points reassembled from surrogate pairs.
non_bmp_invalid_codepoints = set(
    (plane << 16) | low_word
    for plane in range(0x01, 0x11)
    for low_word in (0xFFFE, 0xFFFF)
)
22+
1523
# Matches a single character in one of the ranges U+0009-U+000D,
# U+0020-U+002F, U+003A-U+0040, U+005B-U+0060 or U+007B-U+007E.
# Note that the first range covers TAB through CR, so the class includes
# ASCII whitespace as well as ASCII punctuation.
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
1624

1725
# Cache for charsUntil()
@@ -119,6 +127,13 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
119127
parseMeta - Look for a <meta> element containing encoding information
120128
121129
"""
130+
131+
#Craziness
132+
if len(u"\U0010FFFF") == 1:
133+
self.reportCharacterErrors = self.characterErrorsUCS4
134+
else:
135+
self.reportCharacterErrors = self.characterErrorsUCS2
136+
122137
# List of where new lines occur
123138
self.newLines = [0]
124139

@@ -141,6 +156,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
141156
if (self.charEncoding[0] is None):
142157
self.charEncoding = self.detectEncoding(parseMeta, chardet)
143158

159+
144160
self.reset()
145161

146162
def reset(self):
@@ -343,11 +359,8 @@ def readChunk(self, chunkSize=_defaultChunkSize):
343359

344360
if not data:
345361
return False
346-
#Replace null characters
347-
for i in xrange(data.count(u"\u0000")):
348-
self.errors.append("null-character")
349-
for i in xrange(len(invalid_unicode_re.findall(data))):
350-
self.errors.append("invalid-codepoint")
362+
363+
self.reportCharacterErrors(data)
351364

352365
data = data.replace(u"\u0000", u"\ufffd")
353366
#Check for CR LF broken across chunks
@@ -365,6 +378,45 @@ def readChunk(self, chunkSize=_defaultChunkSize):
365378

366379
return True
367380

381+
def characterErrorsUCS4(self, data):
    """Record parse errors for bad characters in one decoded chunk.

    This variant is installed as self.reportCharacterErrors when
    len(u"\\U0010FFFF") == 1, i.e. when a non-BMP character is a single
    string element, so every regex match is a whole code point.

    data -- the unicode text of the chunk; it is only inspected, not changed.

    Appends one "null-character" entry to self.errors for every U+0000 in
    data and one "invalid-codepoint" entry for every match of
    invalid_unicode_re. Returns None.
    """
    self.errors.extend(["null-character"] * data.count(u"\u0000"))
    self.errors.extend(["invalid-codepoint"] *
                       len(invalid_unicode_re.findall(data)))
386+
387+
def characterErrorsUCS2(self, data):
    """Record parse errors for bad characters in one decoded chunk.

    This variant is installed as self.reportCharacterErrors when
    len(u"\\U0010FFFF") != 1, i.e. when non-BMP characters are stored as
    surrogate pairs and invalid_unicode_re therefore matches each surrogate
    half on its own.

    data -- the unicode text of the chunk; it is only inspected, not changed.

    Appends one "null-character" entry to self.errors for every U+0000 in
    data. For every match of invalid_unicode_re:

    * a high surrogate immediately followed by a low surrogate is combined
      into one code point, which is reported as "invalid-codepoint" only if
      it is in non_bmp_invalid_codepoints; the low half is then skipped;
    * anything else (a lone surrogate or any other matched character) is
      reported as "invalid-codepoint".

    Returns None.

    NOTE: a surrogate pair split across two chunks is still seen as two
    lone surrogates here, because only the current chunk is examined.
    """
    self.errors.extend(["null-character"] * data.count(u"\u0000"))

    last_index = len(data) - 1
    # Index of a low surrogate that was already consumed as the second half
    # of a pair. Tracking the position (rather than a sticky boolean) makes
    # sure only that one match is skipped and later matches are still seen.
    paired_low_pos = -1

    for match in invalid_unicode_re.finditer(data):
        pos = match.start()
        if pos == paired_low_pos:
            continue

        codepoint = ord(match.group())
        if 0xD800 <= codepoint <= 0xDBFF and pos < last_index:
            low = ord(data[pos + 1])
            if 0xDC00 <= low <= 0xDFFF:
                # Standard UTF-16 decoding of a high/low surrogate pair.
                char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
                            (low - 0xDC00))
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                paired_low_pos = pos + 1
                continue

        # Lone surrogate or any other character matched by the regex.
        self.errors.append("invalid-codepoint")
419+
368420
def charsUntil(self, characters, opposite = False):
369421
""" Returns a string of characters from the stream up to but not
370422
including any character in 'characters' or EOF. 'characters' must be

0 commit comments

Comments
 (0)
0