Fix one more testcase - this probably hurts performance a bit so a bet… · awesome-python/html5lib-python@1edb2e8 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1edb2e8

Browse files
committed
Fix one more testcase - this probably hurts performance a bit so a better fix is welcome
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40890
1 parent ae0e9ea commit 1edb2e8

File tree

1 file changed

+29
-13
lines changed

1 file changed

+29
-13
lines changed

src/html5lib/inputstream.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -207,24 +207,25 @@ def char(self):
207207
EOF when EOF is reached.
208208
"""
209209
if self.queue:
210-
return self.queue.pop(0)
210+
char = self.queue.pop(0)
211+
if char == "\n":
212+
self.lineLengths.append(self.col)
213+
self.line += 1
214+
self.col = 0
215+
return char
211216
else:
212-
c = self.dataStream.read(1, 1)
213-
if not c:
214-
self.col += 1
215-
return EOF
216-
217-
# Normalize newlines and null characters
218-
if c == '\x00':
219-
self.errors.append('null character found in input stream, '
220-
'replaced with U+FFFD')
221-
c = u'\uFFFD'
217+
c = self.readChar()
218+
if c is EOF:
219+
return c
220+
222221
if c == '\r':
223222
#XXX This isn't right in the case with multiple CR in a row
224223
#also recursing here isn't ideal + not sure what happens to input position
225-
c = self.char()
226-
if c and c != '\n':
224+
c = self.readChar()
225+
if c is not EOF and c not in ('\n', '\r'):
227226
self.queue.insert(0, unicode(c))
227+
elif c == '\r':
228+
self.queue.insert(0, u'\n')
228229
c = '\n'
229230

230231
# update position in stream
@@ -236,6 +237,21 @@ def char(self):
236237
self.col += 1
237238
return unicode(c)
238239

240+
def readChar(self):
241+
"""Read the next character from the datastream and normalize for null
242+
but not for CR"""
243+
c = self.dataStream.read(1, 1)
244+
if not c:
245+
self.col += 1
246+
return EOF
247+
248+
# Normalize newlines and null characters
249+
if c == '\x00':
250+
self.errors.append('null character found in input stream, '
251+
'replaced with U+FFFD')
252+
c = u'\uFFFD'
253+
return c
254+
239255
def charsUntil(self, characters, opposite = False):
240256
""" Returns a string of characters from the stream up to but not
241257
including any character in characters or EOF. characters can be

0 commit comments

Comments
 (0)
0