7
7
from constants import encodings , ReparseException
8
8
9
9
#Non-unicode versions of constants for use in the pre-parser
10
- spaceCharactersBytes = [str (item ) for item in spaceCharacters ]
11
- asciiLettersBytes = [str (item ) for item in asciiLetters ]
12
- asciiUppercaseBytes = [str (item ) for item in asciiUppercase ]
10
+ spaceCharactersBytes = frozenset ([str (item ) for item in spaceCharacters ])
11
+ asciiLettersBytes = frozenset ([str (item ) for item in asciiLetters ])
12
+ asciiUppercaseBytes = frozenset ([str (item ) for item in asciiUppercase ])
13
+ spacesAngleBrackets = spaceCharactersBytes | frozenset ([">" , "<" ])
13
14
14
15
invalid_unicode_re = re .compile (u"[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uD800 -\uDFFF \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF ]" )
15
16
@@ -169,13 +170,10 @@ def reset(self):
169
170
self .chunkOffset = 0
170
171
self .errors = []
171
172
172
- # Remember the current position in the document
173
- self .positionLine = 1
174
- self .positionCol = 0
175
- # Remember the length of the last line, so unget("\n") can restore
176
- # positionCol. (Only one character can be ungot at once, so we only
177
- # need to remember the single last line.)
178
- self .lastLineLength = None
173
+ # number of (complete) lines in previous chunks
174
+ self .prevNumLines = 0
175
+ # number of columns in the last line of the previous chunk
176
+ self .prevNumCols = 0
179
177
180
178
#Flag to indicate we may have a CR LF broken across a data chunk
181
179
self ._lastChunkEndsWithCR = False
@@ -252,11 +250,11 @@ def changeEncoding(self, newEncoding):
252
250
if newEncoding is None :
253
251
return
254
252
elif newEncoding == self .charEncoding [0 ]:
255
- self .charEncoding = (self .charEncoding [0 ], "certian " )
253
+ self .charEncoding = (self .charEncoding [0 ], "certain " )
256
254
else :
257
255
self .rawStream .seek (0 )
258
256
self .reset ()
259
- self .charEncoding = (newEncoding , "certian " )
257
+ self .charEncoding = (newEncoding , "certain " )
260
258
raise ReparseException , "Encoding changed from %s to %s" % (self .charEncoding [0 ], newEncoding )
261
259
262
260
def detectBOM (self ):
@@ -302,33 +300,21 @@ def detectEncodingMeta(self):
302
300
303
301
return encoding
304
302
305
- def updatePosition (self , chars ):
306
- # Update the position attributes to correspond to some sequence of
307
- # read characters
308
-
309
- # Find the last newline character
310
- idx = chars .rfind (u"\n " )
311
- if idx == - 1 :
312
- # No newlines in chars
313
- self .positionCol += len (chars )
303
+ def _position (self , offset ):
304
+ chunk = self .chunk
305
+ nLines = chunk .count (u'\n ' , 0 , offset )
306
+ positionLine = self .prevNumLines + nLines
307
+ lastLinePos = chunk .rfind (u'\n ' , 0 , offset )
308
+ if lastLinePos == - 1 :
309
+ positionColumn = self .prevNumCols + offset
314
310
else :
315
- # Find the last-but-one newline character
316
- idx2 = chars .rfind (u"\n " , 0 , idx )
317
- if idx2 == - 1 :
318
- # Only one newline in chars
319
- self .positionLine += 1
320
- self .lastLineLength = self .positionCol + idx
321
- self .positionCol = len (chars ) - (idx + 1 )
322
- else :
323
- # At least two newlines in chars
324
- newlines = chars .count (u"\n " )
325
- self .positionLine += newlines
326
- self .lastLineLength = idx - (idx2 + 1 )
327
- self .positionCol = len (chars ) - (idx + 1 )
311
+ positionColumn = offset - (lastLinePos + 1 )
312
+ return (positionLine , positionColumn )
328
313
329
314
def position (self ):
330
315
"""Returns (line, col) of the current position in the stream."""
331
- return (self .positionLine , self .positionCol )
316
+ line , col = self ._position (self .chunkOffset )
317
+ return (line + 1 , col )
332
318
333
319
def char (self ):
334
320
""" Read one character from the stream or queue if available. Return
@@ -339,20 +325,18 @@ def char(self):
339
325
if not self .readChunk ():
340
326
return EOF
341
327
342
- char = self .chunk [self .chunkOffset ]
343
- self .chunkOffset += 1
344
-
345
- # Update the position attributes
346
- if char == u"\n " :
347
- self .lastLineLength = self .positionCol
348
- self .positionCol = 0
349
- self .positionLine += 1
350
- elif char is not EOF :
351
- self .positionCol += 1
328
+ chunkOffset = self .chunkOffset
329
+ char = self .chunk [chunkOffset ]
330
+ self .chunkOffset = chunkOffset + 1
352
331
353
332
return char
354
333
355
- def readChunk (self , chunkSize = _defaultChunkSize ):
334
+ def readChunk (self , chunkSize = None ):
335
+ if chunkSize is None :
336
+ chunkSize = self ._defaultChunkSize
337
+
338
+ self .prevNumLines , self .prevNumCols = self ._position (self .chunkSize )
339
+
356
340
self .chunk = u""
357
341
self .chunkSize = 0
358
342
self .chunkOffset = 0
@@ -430,7 +414,9 @@ def charsUntil(self, characters, opposite = False):
430
414
try :
431
415
chars = charsUntilRegEx [(characters , opposite )]
432
416
except KeyError :
433
- for c in characters : assert (ord (c ) < 128 )
417
+ if __debug__ :
418
+ for c in characters :
419
+ assert (ord (c ) < 128 )
434
420
regex = u"" .join ([u"\\ x%02x" % ord (c ) for c in characters ])
435
421
if not opposite :
436
422
regex = u"^%s" % regex
@@ -462,7 +448,6 @@ def charsUntil(self, characters, opposite = False):
462
448
break
463
449
464
450
r = u"" .join (rv )
465
- self .updatePosition (r )
466
451
return r
467
452
468
453 def unget (self , char ):
@@ -482,18 +467,8 @@ def unget(self, char):
482
467
self .chunkOffset -= 1
483
468
assert self .chunk [self .chunkOffset ] == char
484
469
485
- # Update the position attributes
486
- if char == u"\n " :
487
- assert self .positionLine >= 1
488
- assert self .lastLineLength is not None
489
- self .positionLine -= 1
490
- self .positionCol = self .lastLineLength
491
- self .lastLineLength = None
492
- else :
493
- self .positionCol -= 1
494
-
495
470
class EncodingBytes (str ):
496
- """String-like object with an assosiated position and various extra methods
471
+ """String-like object with an associated position and various extra methods
497
472
If the position is ever greater than the string length then an exception is
498
473
raised"""
499
474
def __new__ (self , value ):
@@ -506,9 +481,21 @@ def __iter__(self):
506
481
return self
507
482
508
483
def next (self ):
509
- self ._position += 1
510
- rv = self [self .position ]
511
- return rv
484
+ p = self ._position = self ._position + 1
485
+ if p >= len (self ):
486
+ raise StopIteration
487
+ elif p < 0 :
488
+ raise TypeError
489
+ return self [p ]
490
+
491
+ def previous (self ):
492
+ p = self ._position
493
+ if p >= len (self ):
494
+ raise StopIteration
495
+ elif p < 0 :
496
+ raise TypeError
497
+ self ._position = p = p - 1
498
+ return self [p ]
512
499
513
500
def setPosition (self , position ):
514
501
if self ._position >= len (self ):
@@ -532,18 +519,37 @@ def getCurrentByte(self):
532
519
533
520
def skip (self , chars = spaceCharactersBytes ):
534
521
"""Skip past a list of characters"""
535
- while self .currentByte in chars :
536
- self .position += 1
522
+ p = self .position # use property for the error-checking
523
+ while p < len (self ):
524
+ c = self [p ]
525
+ if c not in chars :
526
+ self ._position = p
527
+ return c
528
+ p += 1
529
+ self ._position = p
530
+ return None
531
+
532
+ def skipUntil (self , chars ):
533
+ p = self .position
534
+ while p < len (self ):
535
+ c = self [p ]
536
+ if c in chars :
537
+ self ._position = p
538
+ return c
539
+ p += 1
540
+ self ._position = p
541
+ return None
537
542
538
543
def matchBytes (self , bytes , lower = False ):
539
544
"""Look for a sequence of bytes at the start of a string. If the bytes
540
545
are found return True and advance the position to the byte after the
541
546
match. Otherwise return False and leave the position alone"""
542
- data = self [self .position :self .position + len (bytes )]
547
+ p = self .position
548
+ data = self [p :p + len (bytes )]
543
549
if lower :
544
550
data = data .lower ()
545
551
rv = data .startswith (bytes )
546
- if rv == True :
552
+ if rv :
547
553
self .position += len (bytes )
548
554
return rv
549
555
@@ -556,12 +562,6 @@ def jumpTo(self, bytes):
556
562
return True
557
563
else :
558
564
raise StopIteration
559
-
560
- def findNext (self , byteList ):
561
- """Move the pointer so it points to the next byte in a set of possible
562
- bytes"""
563
- while (self .currentByte not in byteList ):
564
- self .position += 1
565
565
566
566
class EncodingParser (object ):
567
567
"""Mini parser for detecting character encoding from meta elements"""
@@ -627,24 +627,25 @@ def handlePossibleStartTag(self):
627
627
return self .handlePossibleTag (False )
628
628
629
629
def handlePossibleEndTag (self ):
630
- self .data .position += 1
630
+ self .data .next ()
631
631
return self .handlePossibleTag (True )
632
632
633
633
def handlePossibleTag (self , endTag ):
634
- if self .data .currentByte not in asciiLettersBytes :
634
+ data = self .data
635
+ if data .currentByte not in asciiLettersBytes :
635
636
#If the next byte is not an ascii letter either ignore this
636
637
#fragment (possible start tag case) or treat it according to
637
638
#handleOther
638
639
if endTag :
639
- self . data .position -= 1
640
+ data .previous ()
640
641
self .handleOther ()
641
642
return True
642
643
643
- self . data .findNext ( list ( spaceCharactersBytes ) + [ "<" , ">" ] )
644
- if self . data . currentByte == "<" :
644
+ c = data .skipUntil ( spacesAngleBrackets )
645
+ if c == "<" :
645
646
#return to the first step in the overall "two step" algorithm
646
647
#reprocessing the < byte
647
- self . data .position -= 1
648
+ data .previous ()
648
649
else :
649
650
#Read all attributes
650
651
attr = self .getAttribute ()
@@ -658,73 +659,75 @@ def handleOther(self):
658
659
def getAttribute (self ):
659
660
"""Return a name,value pair for the next attribute in the stream,
660
661
if one is found, or None"""
661
- self .data .skip (list (spaceCharactersBytes )+ ["/" ])
662
- if self .data .currentByte == "<" :
663
- self .data .position -= 1
662
+ data = self .data
663
+ c = data .skip (spaceCharactersBytes | frozenset ("/" ))
664
+ if c == "<" :
665
+ data .previous ()
664
666
return None
665
- elif self . data . currentByte == ">" :
667
+ elif c == ">" or c is None :
666
668
return None
667
669
attrName = []
668
670
attrValue = []
669
671
spaceFound = False
670
672
#Step 5 attribute name
671
673
while True :
672
- if self . data . currentByte == "=" and attrName :
674
+ if c == "=" and attrName :
673
675
break
674
- elif self . data . currentByte in spaceCharactersBytes :
676
+ elif c in spaceCharactersBytes :
675
677
spaceFound = True
676
678
break
677
- elif self . data . currentByte in ("/" , "<" , ">" ):
679
+ elif c in ("/" , "<" , ">" ):
678
680
return "" .join (attrName ), ""
679
- elif self . data . currentByte in asciiUppercaseBytes :
680
- attrName .extend ( self . data . currentByte .lower ())
681
+ elif c in asciiUppercaseBytes :
682
+ attrName .append ( c .lower ())
681
683
else :
682
- attrName .extend ( self . data . currentByte )
684
+ attrName .append ( c )
683
685
#Step 6
684
- self . data . position += 1
686
+ c = data . next ()
685
687
#Step 7
686
688
if spaceFound :
687
- self . data .skip ()
689
+ c = data .skip ()
688
690
#Step 8
689
- if self . data . currentByte != "=" :
690
- self . data .position -= 1
691
+ if c != "=" :
692
+ data .previous ()
691
693
return "" .join (attrName ), ""
692
694
#XXX need to advance position in both spaces and value case
693
695
#Step 9
694
- self . data .position += 1
696
+ data .next ()
695
697
#Step 10
696
- self . data .skip ()
698
+ c = data .skip ()
697
699
#Step 11
698
- if self . data . currentByte in ("'" , '"' ):
700
+ if c in ("'" , '"' ):
699
701
#11.1
700
- quoteChar = self . data . currentByte
702
+ quoteChar = c
701
703
while True :
702
- self .data .position += 1
703
704
#11.3
704
- if self .data .currentByte == quoteChar :
705
- self .data .position += 1
705
+ c = data .next ()
706
+ if c == quoteChar :
707
+ data .next ()
706
708
return "" .join (attrName ), "" .join (attrValue )
707
709
#11.4
708
- elif self . data . currentByte in asciiUppercaseBytes :
709
- attrValue .extend ( self . data . currentByte .lower ())
710
+ elif c in asciiUppercaseBytes :
711
+ attrValue .append ( c .lower ())
710
712
#11.5
711
713
else :
712
- attrValue .extend (self .data .currentByte )
713
- elif self .data .currentByte in (">" , "<" ):
714
- return "" .join (attrName ), ""
715
- elif self .data .currentByte in asciiUppercaseBytes :
716
- attrValue .extend (self .data .currentByte .lower ())
714
+ attrValue .append (c )
715
+ elif c in (">" , "<" ):
716
+ return "" .join (attrName ), ""
717
+ elif c in asciiUppercaseBytes :
718
+ attrValue .append (c .lower ())
719
+ elif c is None :
720
+ return None
717
721
else :
718
- attrValue .extend ( self . data . currentByte )
722
+ attrValue .append ( c )
719
723
while True :
720
- self .data .position += 1
721
- if self .data .currentByte in (
722
- list (spaceCharactersBytes ) + [">" , "<" ]):
724
+ c = data .next ()
725
+ if c in spacesAngleBrackets :
723
726
return "" .join (attrName ), "" .join (attrValue )
724
- elif self . data . currentByte in asciiUppercaseBytes :
725
- attrValue .extend ( self . data . currentByte .lower ())
727
+ elif c in asciiUppercaseBytes :
728
+ attrValue .append ( c .lower ())
726
729
else :
727
- attrValue .extend ( self . data . currentByte )
730
+ attrValue .append ( c )
728
731
729
732
730
733
class ContentAttrParser (object ):
@@ -757,7 +760,7 @@ def parse(self):
757
760
#Unquoted value
758
761
oldPosition = self .data .position
759
762
try :
760
- self .data .findNext (spaceCharactersBytes )
763
+ self .data .skipUntil (spaceCharactersBytes )
761
764
return self .data [oldPosition :self .data .position ]
762
765
except StopIteration :
763
766
#Return the whole remaining value
0 commit comments