Optimisations in inputstream. Credit david.m.cooke · awesome-python/html5lib-python@e9279e8 · GitHub
[go: up one dir, main page]

Skip to content

Commit e9279e8

Browse files
committed
Optimisations in inputstream. Credit david.m.cooke
1 parent 786225f commit e9279e8

File tree

2 files changed

+123
-117
lines changed

2 files changed

+123
-117
lines changed

src/html5lib/inputstream.py

Lines changed: 117 additions & 114 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,10 @@
77
from constants import encodings, ReparseException
88

99
#Non-unicode versions of constants for use in the pre-parser
10-
spaceCharactersBytes = [str(item) for item in spaceCharacters]
11-
asciiLettersBytes = [str(item) for item in asciiLetters]
12-
asciiUppercaseBytes = [str(item) for item in asciiUppercase]
10+
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
11+
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
12+
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
13+
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
1314

1415
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
1516

@@ -169,13 +170,10 @@ def reset(self):
169170
self.chunkOffset = 0
170171
self.errors = []
171172

172-
# Remember the current position in the document
173-
self.positionLine = 1
174-
self.positionCol = 0
175-
# Remember the length of the last line, so unget("\n") can restore
176-
# positionCol. (Only one character can be ungot at once, so we only
177-
# need to remember the single last line.)
178-
self.lastLineLength = None
173+
# number of (complete) lines in previous chunks
174+
self.prevNumLines = 0
175+
# number of columns in the last line of the previous chunk
176+
self.prevNumCols = 0
179177

180178
#Flag to indicate we may have a CR LF broken across a data chunk
181179
self._lastChunkEndsWithCR = False
@@ -252,11 +250,11 @@ def changeEncoding(self, newEncoding):
252250
if newEncoding is None:
253251
return
254252
elif newEncoding == self.charEncoding[0]:
255-
self.charEncoding = (self.charEncoding[0], "certian")
253+
self.charEncoding = (self.charEncoding[0], "certain")
256254
else:
257255
self.rawStream.seek(0)
258256
self.reset()
259-
self.charEncoding = (newEncoding, "certian")
257+
self.charEncoding = (newEncoding, "certain")
260258
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
261259

262260
def detectBOM(self):
@@ -302,33 +300,21 @@ def detectEncodingMeta(self):
302300

303301
return encoding
304302

305-
def updatePosition(self, chars):
306-
# Update the position attributes to correspond to some sequence of
307-
# read characters
308-
309-
# Find the last newline character
310-
idx = chars.rfind(u"\n")
311-
if idx == -1:
312-
# No newlines in chars
313-
self.positionCol += len(chars)
303+
def _position(self, offset):
304+
chunk = self.chunk
305+
nLines = chunk.count(u'\n', 0, offset)
306+
positionLine = self.prevNumLines + nLines
307+
lastLinePos = chunk.rfind(u'\n', 0, offset)
308+
if lastLinePos == -1:
309+
positionColumn = self.prevNumCols + offset
314310
else:
315-
# Find the last-but-one newline character
316-
idx2 = chars.rfind(u"\n", 0, idx)
317-
if idx2 == -1:
318-
# Only one newline in chars
319-
self.positionLine += 1
320-
self.lastLineLength = self.positionCol + idx
321-
self.positionCol = len(chars) - (idx + 1)
322-
else:
323-
# At least two newlines in chars
324-
newlines = chars.count(u"\n")
325-
self.positionLine += newlines
326-
self.lastLineLength = idx - (idx2 + 1)
327-
self.positionCol = len(chars) - (idx + 1)
311+
positionColumn = offset - (lastLinePos + 1)
312+
return (positionLine, positionColumn)
328313

329314
def position(self):
330315
"""Returns (line, col) of the current position in the stream."""
331-
return (self.positionLine, self.positionCol)
316+
line, col = self._position(self.chunkOffset)
317+
return (line+1, col)
332318

333319
def char(self):
334320
""" Read one character from the stream or queue if available. Return
@@ -339,20 +325,18 @@ def char(self):
339325
if not self.readChunk():
340326
return EOF
341327

342-
char = self.chunk[self.chunkOffset]
343-
self.chunkOffset += 1
344-
345-
# Update the position attributes
346-
if char == u"\n":
347-
self.lastLineLength = self.positionCol
348-
self.positionCol = 0
349-
self.positionLine += 1
350-
elif char is not EOF:
351-
self.positionCol += 1
328+
chunkOffset = self.chunkOffset
329+
char = self.chunk[chunkOffset]
330+
self.chunkOffset = chunkOffset + 1
352331

353332
return char
354333

355-
def readChunk(self, chunkSize=_defaultChunkSize):
334+
def readChunk(self, chunkSize=None):
335+
if chunkSize is None:
336+
chunkSize = self._defaultChunkSize
337+
338+
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
339+
356340
self.chunk = u""
357341
self.chunkSize = 0
358342
self.chunkOffset = 0
@@ -430,7 +414,9 @@ def charsUntil(self, characters, opposite = False):
430414
try:
431415
chars = charsUntilRegEx[(characters, opposite)]
432416
except KeyError:
433-
for c in characters: assert(ord(c) < 128)
417+
if __debug__:
418+
for c in characters:
419+
assert(ord(c) < 128)
434420
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
435421
if not opposite:
436422
regex = u"^%s" % regex
@@ -462,7 +448,6 @@ def charsUntil(self, characters, opposite = False):
462448
break
463449

464450
r = u"".join(rv)
465-
self.updatePosition(r)
466451
return r
467452

468453
def unget(self, char):
@@ -482,18 +467,8 @@ def unget(self, char):
482467
self.chunkOffset -= 1
483468
assert self.chunk[self.chunkOffset] == char
484469

485-
# Update the position attributes
486-
if char == u"\n":
487-
assert self.positionLine >= 1
488-
assert self.lastLineLength is not None
489-
self.positionLine -= 1
490-
self.positionCol = self.lastLineLength
491-
self.lastLineLength = None
492-
else:
493-
self.positionCol -= 1
494-
495470
class EncodingBytes(str):
496-
"""String-like object with an assosiated position and various extra methods
471+
"""String-like object with an associated position and various extra methods
497472
If the position is ever greater than the string length then an exception is
498473
raised"""
499474
def __new__(self, value):
@@ -506,9 +481,21 @@ def __iter__(self):
506481
return self
507482

508483
def next(self):
509-
self._position += 1
510-
rv = self[self.position]
511-
return rv
484+
p = self._position = self._position + 1
485+
if p >= len(self):
486+
raise StopIteration
487+
elif p < 0:
488+
raise TypeError
489+
return self[p]
490+
491+
def previous(self):
492+
p = self._position
493+
if p >= len(self):
494+
raise StopIteration
495+
elif p < 0:
496+
raise TypeError
497+
self._position = p = p - 1
498+
return self[p]
512499

513500
def setPosition(self, position):
514501
if self._position >= len(self):
@@ -532,18 +519,37 @@ def getCurrentByte(self):
532519

533520
def skip(self, chars=spaceCharactersBytes):
534521
"""Skip past a list of characters"""
535-
while self.currentByte in chars:
536-
self.position += 1
522+
p = self.position # use property for the error-checking
523+
while p < len(self):
524+
c = self[p]
525+
if c not in chars:
526+
self._position = p
527+
return c
528+
p += 1
529+
self._position = p
530+
return None
531+
532+
def skipUntil(self, chars):
533+
p = self.position
534+
while p < len(self):
535+
c = self[p]
536+
if c in chars:
537+
self._position = p
538+
return c
539+
p += 1
540+
self._position = p
541+
return None
537542

538543
def matchBytes(self, bytes, lower=False):
539544
"""Look for a sequence of bytes at the start of a string. If the bytes
540545
are found return True and advance the position to the byte after the
541546
match. Otherwise return False and leave the position alone"""
542-
data = self[self.position:self.position+len(bytes)]
547+
p = self.position
548+
data = self[p:p+len(bytes)]
543549
if lower:
544550
data = data.lower()
545551
rv = data.startswith(bytes)
546-
if rv == True:
552+
if rv:
547553
self.position += len(bytes)
548554
return rv
549555

@@ -556,12 +562,6 @@ def jumpTo(self, bytes):
556562
return True
557563
else:
558564
raise StopIteration
559-
560-
def findNext(self, byteList):
561-
"""Move the pointer so it points to the next byte in a set of possible
562-
bytes"""
563-
while (self.currentByte not in byteList):
564-
self.position += 1
565565

566566
class EncodingParser(object):
567567
"""Mini parser for detecting character encoding from meta elements"""
@@ -627,24 +627,25 @@ def handlePossibleStartTag(self):
627627
return self.handlePossibleTag(False)
628628

629629
def handlePossibleEndTag(self):
630-
self.data.position+=1
630+
self.data.next()
631631
return self.handlePossibleTag(True)
632632

633633
def handlePossibleTag(self, endTag):
634-
if self.data.currentByte not in asciiLettersBytes:
634+
data = self.data
635+
if data.currentByte not in asciiLettersBytes:
635636
#If the next byte is not an ascii letter either ignore this
636637
#fragment (possible start tag case) or treat it according to
637638
#handleOther
638639
if endTag:
639-
self.data.position -= 1
640+
data.previous()
640641
self.handleOther()
641642
return True
642643

643-
self.data.findNext(list(spaceCharactersBytes) + ["<", ">"])
644-
if self.data.currentByte == "<":
644+
c = data.skipUntil(spacesAngleBrackets)
645+
if c == "<":
645646
#return to the first step in the overall "two step" algorithm
646647
#reprocessing the < byte
647-
self.data.position -= 1
648+
data.previous()
648649
else:
649650
#Read all attributes
650651
attr = self.getAttribute()
@@ -658,73 +659,75 @@ def handleOther(self):
658659
def getAttribute(self):
659660
"""Return a name,value pair for the next attribute in the stream,
660661
if one is found, or None"""
661-
self.data.skip(list(spaceCharactersBytes)+["/"])
662-
if self.data.currentByte == "<":
663-
self.data.position -= 1
662+
data = self.data
663+
c = data.skip(spaceCharactersBytes | frozenset("/"))
664+
if c == "<":
665+
data.previous()
664666
return None
665-
elif self.data.currentByte == ">":
667+
elif c == ">" or c is None:
666668
return None
667669
attrName = []
668670
attrValue = []
669671
spaceFound = False
670672
#Step 5 attribute name
671673
while True:
672-
if self.data.currentByte == "=" and attrName:
674+
if c == "=" and attrName:
673675
break
674-
elif self.data.currentByte in spaceCharactersBytes:
676+
elif c in spaceCharactersBytes:
675677
spaceFound=True
676678
break
677-
elif self.data.currentByte in ("/", "<", ">"):
679+
elif c in ("/", "<", ">"):
678680
return "".join(attrName), ""
679-
elif self.data.currentByte in asciiUppercaseBytes:
680-
attrName.extend(self.data.currentByte.lower())
681+
elif c in asciiUppercaseBytes:
682+
attrName.append(c.lower())
681683
else:
682-
attrName.extend(self.data.currentByte)
684+
attrName.append(c)
683685
#Step 6
684-
self.data.position += 1
686+
c = data.next()
685687
#Step 7
686688
if spaceFound:
687-
self.data.skip()
689+
c = data.skip()
688690
#Step 8
689-
if self.data.currentByte != "=":
690-
self.data.position -= 1
691+
if c != "=":
692+
data.previous()
691693
return "".join(attrName), ""
692694
#XXX need to advance position in both spaces and value case
693695
#Step 9
694-
self.data.position += 1
696+
data.next()
695697
#Step 10
696-
self.data.skip()
698+
c = data.skip()
697699
#Step 11
698-
if self.data.currentByte in ("'", '"'):
700+
if c in ("'", '"'):
699701
#11.1
700-
quoteChar = self.data.currentByte
702+
quoteChar = c
701703
while True:
702-
self.data.position+=1
703704
#11.3
704-
if self.data.currentByte == quoteChar:
705-
self.data.position += 1
705+
c = data.next()
706+
if c == quoteChar:
707+
data.next()
706708
return "".join(attrName), "".join(attrValue)
707709
#11.4
708-
elif self.data.currentByte in asciiUppercaseBytes:
709-
attrValue.extend(self.data.currentByte.lower())
710+
elif c in asciiUppercaseBytes:
711+
attrValue.append(c.lower())
710712
#11.5
711713
else:
712-
attrValue.extend(self.data.currentByte)
713-
elif self.data.currentByte in (">", "<"):
714-
return "".join(attrName), ""
715-
elif self.data.currentByte in asciiUppercaseBytes:
716-
attrValue.extend(self.data.currentByte.lower())
714+
attrValue.append(c)
715+
elif c in (">", "<"):
716+
return "".join(attrName), ""
717+
elif c in asciiUppercaseBytes:
718+
attrValue.append(c.lower())
719+
elif c is None:
720+
return None
717721
else:
718-
attrValue.extend(self.data.currentByte)
722+
attrValue.append(c)
719723
while True:
720-
self.data.position +=1
721-
if self.data.currentByte in (
722-
list(spaceCharactersBytes) + [">", "<"]):
724+
c = data.next()
725+
if c in spacesAngleBrackets:
723726
return "".join(attrName), "".join(attrValue)
724-
elif self.data.currentByte in asciiUppercaseBytes:
725-
attrValue.extend(self.data.currentByte.lower())
727+
elif c in asciiUppercaseBytes:
728+
attrValue.append(c.lower())
726729
else:
727-
attrValue.extend(self.data.currentByte)
730+
attrValue.append(c)
728731

729732

730733
class ContentAttrParser(object):
@@ -757,7 +760,7 @@ def parse(self):
757760
#Unquoted value
758761
oldPosition = self.data.position
759762
try:
760-
self.data.findNext(spaceCharactersBytes)
763+
self.data.skipUntil(spaceCharactersBytes)
761764
return self.data[oldPosition:self.data.position]
762765
except StopIteration:
763766
#Return the whole remaining value

0 commit comments

Comments
 (0)
0