Simplified unget code (and improved speed by ~2%) · 5j9/html5lib-python@0d0282b · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 0d0282b

Browse files
committed
Simplified unget code (and improved speed by ~2%)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401246
1 parent 22886b1 commit 0d0282b

File tree

1 file changed

+30
-39
lines changed

1 file changed

+30
-39
lines changed

src/html5lib/inputstream.py

Lines changed: 30 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7373
self.chunkSize = 0
7474
self.chunkOffset = 0
7575
self.errors = []
76-
# Single-character buffer to handle 'unget'
77-
self.ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
7876

7977
# Remember the current position in the document
8078
self.positionLine = 1
@@ -257,18 +255,13 @@ def char(self):
257255
""" Read one character from the stream or queue if available. Return
258256
EOF when EOF is reached.
259257
"""
260-
char = self.ungetChar
261-
if char != u"":
262-
# Use the ungot character, and reset the buffer
263-
self.ungetChar = u""
264-
else:
265-
# Read a new chunk from the input stream if necessary
266-
if self.chunkOffset >= self.chunkSize:
267-
if not self.readChunk():
268-
return EOF
258+
# Read a new chunk from the input stream if necessary
259+
if self.chunkOffset >= self.chunkSize:
260+
if not self.readChunk():
261+
return EOF
269262

270-
char = self.chunk[self.chunkOffset]
271-
self.chunkOffset += 1
263+
char = self.chunk[self.chunkOffset]
264+
self.chunkOffset += 1
272265

273266
# Update the position attributes
274267
if char == u"\n":
@@ -317,18 +310,6 @@ def charsUntil(self, characters, opposite = False):
317310
characters.
318311
"""
319312

320-
rv = []
321-
322-
# Check the ungot character, if any.
323-
# (Since it's only a single character, don't use the regex here)
324-
char = self.ungetChar
325-
if char != u"":
326-
if char is EOF or (char in characters) != opposite:
327-
return u""
328-
else:
329-
rv.append(char)
330-
self.ungetChar = u""
331-
332313
# Use a cache of regexps to find the required characters
333314
try:
334315
chars = charsUntilRegEx[(characters, opposite)]
@@ -339,6 +320,8 @@ def charsUntil(self, characters, opposite = False):
339320
regex = u"^%s" % regex
340321
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
341322

323+
rv = []
324+
342325
while True:
343326
# Find the longest matching prefix
344327
m = chars.match(self.chunk, self.chunkOffset)
@@ -369,21 +352,29 @@ def charsUntil(self, characters, opposite = False):
369352
def unget(self, char):
370353
# Only one character is allowed to be ungotten at once - it must
371354
# be consumed again before any further call to unget
372-
assert self.ungetChar == u""
373-
374-
self.ungetChar = char
375355

376-
# Update the position attributes
377-
if char is None:
378-
pass
379-
elif char == u"\n":
380-
assert self.positionLine >= 1
381-
assert self.lastLineLength is not None
382-
self.positionLine -= 1
383-
self.positionCol = self.lastLineLength
384-
self.lastLineLength = None
385-
else:
386-
self.positionCol -= 1
356+
if char is not None:
357+
if self.chunkOffset == 0:
358+
# unget is called quite rarely, so it's a good idea to do
359+
# more work here if it saves a bit of work in the frequently
360+
# called char and charsUntil.
361+
# So, just prepend the ungotten character onto the current
362+
# chunk:
363+
self.chunk = char + self.chunk
364+
self.chunkSize += 1
365+
else:
366+
self.chunkOffset -= 1
367+
assert self.chunk[self.chunkOffset] == char
368+
369+
# Update the position attributes
370+
if char == u"\n":
371+
assert self.positionLine >= 1
372+
assert self.lastLineLength is not None
373+
self.positionLine -= 1
374+
self.positionCol = self.lastLineLength
375+
self.lastLineLength = None
376+
else:
377+
self.positionCol -= 1
387378

388379
class EncodingBytes(str):
389380
"""String-like object with an associated position and various extra methods

0 commit comments

Comments
 (0)
0