1212
# Code points reported as "invalid-codepoint" errors.  The class covers:
#   * C0 controls other than NUL, tab, LF, FF and CR (NUL is counted
#     separately as "null-character"),
#   * DEL and the C1 controls,
#   * every surrogate code unit,
#   * the BMP noncharacters U+FDD0..U+FDEF, U+FFFE and U+FFFF,
#   * the last two code points of each supplementary plane.
# The noncharacter block ends at U+FDEF (32 code points), not U+FDDF.
invalid_unicode_re = re.compile(
    u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF"
    u"\uFDD0-\uFDEF\uFFFE\uFFFF"
    u"\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF"
    u"\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF"
    u"\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF"
    u"\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF"
    u"\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF"
    u"\U0010FFFE\U0010FFFF]")

# The supplementary-plane entries of the class above as plain integers, so
# that a code point decoded by hand from a surrogate pair can be checked.
non_bmp_invalid_codepoints = set([
    0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF,
    0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
    0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF,
    0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF,
])

# Matches one character from U+0009..U+000D, U+0020..U+002F, U+003A..U+0040,
# U+005B..U+0060 or U+007B..U+007E.  The pattern contains only \u escapes,
# so a plain unicode literal yields the same pattern as a raw one.
ascii_punctuation_re = re.compile(
    u"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
1624
1725# Cache for charsUntil()
@@ -119,6 +127,13 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
119127 parseMeta - Look for a <meta> element containing encoding information
120128
121129 """
130+
131+ #Craziness
132+ if len (u"\U0010FFFF " ) == 1 :
133+ self .reportCharacterErrors = self .characterErrorsUCS4
134+ else :
135+ self .reportCharacterErrors = self .characterErrorsUCS2
136+
122137 # List of where new lines occur
123138 self .newLines = [0 ]
124139
@@ -141,6 +156,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
141156 if (self .charEncoding [0 ] is None ):
142157 self .charEncoding = self .detectEncoding (parseMeta , chardet )
143158
159+
144160 self .reset ()
145161
146162 def reset (self ):
@@ -343,11 +359,8 @@ def readChunk(self, chunkSize=_defaultChunkSize):
343359
344360 if not data :
345361 return False
346- #Replace null characters
347- for i in xrange (data .count (u"\u0000 " )):
348- self .errors .append ("null-character" )
349- for i in xrange (len (invalid_unicode_re .findall (data ))):
350- self .errors .append ("invalid-codepoint" )
362+
363+ self .reportCharacterErrors (data )
351364
352365 data = data .replace (u"\u0000 " , u"\ufffd " )
353366 #Check for CR LF broken across chunks
@@ -365,6 +378,45 @@ def readChunk(self, chunkSize=_defaultChunkSize):
365378
366379 return True
367380
def characterErrorsUCS4(self, data):
    """Record character-level errors for one chunk on a wide Python build.

    This variant is installed as self.reportCharacterErrors when
    len(u"\\U0010FFFF") == 1, i.e. when every code point is one string
    element, so the regular expression can be applied directly.

    data -- the unicode chunk to inspect; it is not modified.

    Appends one "null-character" entry to self.errors per U+0000 in data
    and one "invalid-codepoint" entry per match of invalid_unicode_re.
    Returns None.
    """
    errors = self.errors
    errors.extend(["null-character"] * data.count(u"\u0000"))
    # Only the number of matches matters, not the matches themselves.
    errors.extend("invalid-codepoint"
                  for _ in invalid_unicode_re.finditer(data))
386+
def characterErrorsUCS2(self, data):
    """Record character-level errors for one chunk on a narrow Python build.

    This variant is installed as self.reportCharacterErrors when
    len(u"\\U0010FFFF") != 1, i.e. when code points above U+FFFF are stored
    as surrogate pairs.  invalid_unicode_re then matches each surrogate
    code unit separately, so pairs are decoded by hand here.

    data -- the unicode chunk to inspect; it is not modified.

    Appends to self.errors:
      * "null-character" once per U+0000 in data;
      * "invalid-codepoint" once per non-surrogate match of the regex;
      * "invalid-codepoint" once per unpaired surrogate;
      * "invalid-codepoint" once per surrogate pair that decodes to a
        member of non_bmp_invalid_codepoints.
    A well-formed pair that decodes to any other code point is not an
    error.  Returns None.
    """
    # Someone picked the wrong compile option.  You lose.
    errors = self.errors
    errors.extend(["null-character"] * data.count(u"\u0000"))

    last_index = len(data) - 1
    # Index of the first code unit not yet consumed as half of a pair.
    # Tracking a position (rather than a sticky boolean) guarantees that
    # only the low surrogate of a decoded pair is skipped and that later
    # matches in the chunk are still examined.
    resume_at = 0
    for match in invalid_unicode_re.finditer(data):
        pos = match.start()
        if pos < resume_at:
            continue
        unit = ord(match.group())
        if 0xD800 <= unit <= 0xDBFF and pos < last_index:
            low = ord(data[pos + 1])
            if 0xDC00 <= low <= 0xDFFF:
                # High surrogate followed by a low surrogate: combine them
                # into the code point they encode.
                char_val = (0x10000 + (unit - 0xD800) * 0x400 +
                            (low - 0xDC00))
                if char_val in non_bmp_invalid_codepoints:
                    errors.append("invalid-codepoint")
                resume_at = pos + 2
                continue
        # Either a non-surrogate invalid code point or an unpaired
        # surrogate (including one that is the last unit of the chunk).
        errors.append("invalid-codepoint")
    # NOTE: this is still wrong if a surrogate pair can be split across a
    # chunk boundary; the trailing high surrogate is reported as unpaired.
419+
368420 def charsUntil (self , characters , opposite = False ):
369421 """ Returns a string of characters from the stream up to but not
370422 including any character in 'characters' or EOF. 'characters' must be
0 commit comments