Read files in chunks rather than one character at a time · html5lib/html5lib-python@1478d54 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1478d54

Browse files
committed
Read files in chunks rather than one character at a time
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40910
1 parent e97b068 commit 1478d54

File tree

1 file changed

+45
-51
lines changed

1 file changed

+45
-51
lines changed

src/html5lib/inputstream.py

Lines changed: 45 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -55,13 +55,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5555
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
5656
self.charEncoding = self.detectEncoding(parseMeta, chardet)
5757

58-
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
58+
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
59+
'replace')
5960

6061
self.queue = []
6162
self.errors = []
6263

6364
self.line = self.col = 0
6465
self.lineLengths = []
66+
67+
#Flag to indicate we may have a CR LF broken across a data chunk
68+
self._lastChunkEndsWithCR = False
6569

6670
def openStream(self, source):
6771
"""Produces a file object from source.
@@ -199,64 +203,47 @@ def detectEncodingMeta(self):
199203
def position(self):
200204
"""Returns (line, col) of the current position in the stream."""
201205
line, col = self.line, self.col
202-
for c in self.queue[::-1]:
203-
if c == '\n':
204-
line -= 1
205-
assert col == 0
206-
col = self.lineLengths[line]
207-
else:
208-
col -= 1
209206
return (line + 1, col)
210207

211208
def char(self):
212209
""" Read one character from the stream or queue if available. Return
213210
EOF when EOF is reached.
214211
"""
215-
if self.queue:
216-
char = self.queue.pop(0)
217-
if char == "\n":
218-
self.lineLengths.append(self.col)
219-
self.line += 1
220-
self.col = 0
221-
return char
212+
if not self.queue:
213+
self.readChunk()
214+
#If we still don't have a character we have reached EOF
215+
if not self.queue:
216+
return EOF
217+
218+
char = self.queue.pop(0)
219+
220+
# update position in stream
221+
if char == '\n':
222+
self.lineLengths.append(self.col)
223+
self.line += 1
224+
self.col = 0
222225
else:
223-
c = self.readChar()
224-
if c is EOF:
225-
return c
226-
227-
if c == '\r':
228-
#XXX This isn't right in the case with multiple CR in a row
229-
#also recursing here isn't ideal + not sure what happens to input position
230-
c = self.readChar()
231-
if c is not EOF and c not in ('\n', '\r'):
232-
self.queue.insert(0, unicode(c))
233-
elif c == '\r':
234-
self.queue.insert(0, u'\n')
235-
c = '\n'
236-
237-
# update position in stream
238-
if c == '\n':
239-
self.lineLengths.append(self.col)
240-
self.line += 1
241-
self.col = 0
242-
else:
243-
self.col += 1
244-
return unicode(c)
245-
246-
def readChar(self):
247-
"""Read the next character from the datastream and normalize for null
248-
but not for CR"""
249-
c = self.dataStream.read(1, 1)
250-
if not c:
251226
self.col += 1
252-
return EOF
253-
254-
# Normalize newlines and null characters
255-
if c == '\x00':
227+
return char
228+
229+
def readChunk(self, chunkSize=1024):
230+
data = self.dataStream.read(1024)
231+
if not data:
232+
return
233+
#Replace null characters
234+
for i in xrange(data.count(u"\u0000")):
256235
self.errors.append(_('null character found in input stream, '
257-
'replaced with U+FFFD'))
258-
c = u'\uFFFD'
259-
return c
236+
'replaced with U+FFFD'))
237+
data = data.replace(u"\u0000", u"\ufffd")
238+
#Check for CR LF broken across chunks
239+
if (self._lastChunkEndsWithCR and data[0] == "\n"):
240+
data = data[1:]
241+
self._lastChunkEndsWithCR = data[-1] == "\r"
242+
data = data.replace("\r\n", "\n")
243+
data = data.replace("\r", "\n")
244+
245+
data = unicode(data)
246+
self.queue.extend([char for char in data])
260247

261248
def charsUntil(self, characters, opposite = False):
262249
""" Returns a string of characters from the stream up to but not
@@ -272,13 +259,20 @@ def charsUntil(self, characters, opposite = False):
272259
# from where it came.
273260
c = charStack.pop()
274261
if c != EOF:
275-
self.queue.insert(0, c)
262+
self.unget(c)
276263

277264
return u"".join(charStack)
278265

279266
def unget(self, chars):
280267
if chars:
281268
self.queue = list(chars) + self.queue
269+
#Alter the current line, col position
270+
for c in chars[::-1]:
271+
if c == '\n':
272+
self.line -= 1
273+
self.col = self.lineLengths[self.line]
274+
else:
275+
self.col -= 1
282276

283277
class EncodingBytes(str):
284278
"""String-like object with an assosiated position and various extra methods

0 commit comments

Comments
 (0)
0