First steps toward making charsUntil not suck · awesome-python/html5lib-python@edaaab3 · GitHub
[go: up one dir, main page]

Skip to content

Commit edaaab3

Browse files
committed
First steps toward making charsUntil not suck
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40911
1 parent 1478d54 commit edaaab3

File tree

1 file changed

+33
-11
lines changed

1 file changed

+33
-11
lines changed

src/html5lib/inputstream.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,8 @@ def char(self):
226226
self.col += 1
227227
return char
228228

229-
def readChunk(self, chunkSize=1024):
230-
data = self.dataStream.read(1024)
229+
def readChunk(self, chunkSize=10240):
230+
data = self.dataStream.read(chunkSize)
231231
if not data:
232232
return
233233
#Replace null characters
@@ -250,18 +250,40 @@ def charsUntil(self, characters, opposite = False):
250250
including any character in characters or EOF. characters can be
251251
any container that supports the in method being called on it.
252252
"""
253-
charStack = [self.char()]
254253

255-
while charStack[-1] and (charStack[-1] in characters) == opposite:
256-
charStack.append(self.char())
254+
#This method is currently 40-50% of our total runtime and badly needs
255+
#optimizing
256+
#Possible improvements:
257+
# - use regexp to find characters that match the required character set
258+
# - compute line positions in a single pass at the end
259+
# - improve EOF handling for fewer if statements
257260

258-
# Put the character stopped on back to the front of the queue
259-
# from where it came.
260-
c = charStack.pop()
261-
if c != EOF:
262-
self.unget(c)
261+
if not self.queue:
262+
self.readChunk()
263+
#Break if we have reached EOF
264+
if not self.queue or self.queue[0] == None:
265+
return u""
263266

264-
return u"".join(charStack)
267+
i = 0
268+
while (self.queue[i] in characters) == opposite:
269+
#Working out positions like this really sucks
270+
if self.queue[i] == '\n':
271+
self.lineLengths.append(self.col)
272+
self.line += 1
273+
self.col = 0
274+
else:
275+
self.col += 1
276+
i += 1
277+
if i == len(self.queue):
278+
self.readChunk()
279+
#If the queue doesn't grow we have reached EOF
280+
if i == len(self.queue) or self.queue[i] is EOF:
281+
break
282+
283+
rv = u"".join(self.queue[:i])
284+
self.queue = self.queue[i:]
285+
286+
return rv
265287

266288
def unget(self, chars):
267289
if chars:

0 commit comments

Comments
 (0)
0