@@ -55,13 +55,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
55
55
if self .charEncoding is None or not isValidEncoding (self .charEncoding ):
56
56
self .charEncoding = self .detectEncoding (parseMeta , chardet )
57
57
58
- self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream , 'replace' )
58
+ self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream ,
59
+ 'replace' )
59
60
60
61
self .queue = []
61
62
self .errors = []
62
63
63
64
self .line = self .col = 0
64
65
self .lineLengths = []
66
+
67
+ #Flag to indicate we may have a CR LF broken across a data chunk
68
+ self ._lastChunkEndsWithCR = False
65
69
66
70
def openStream (self , source ):
67
71
"""Produces a file object from source.
@@ -199,64 +203,47 @@ def detectEncodingMeta(self):
199
203
def position(self):
    """Report the current stream location as a (line, col) tuple.

    The stored line counter is shifted up by one for reporting; the
    column counter is returned exactly as stored.
    """
    return (self.line + 1, self.col)
210
207
211
208
def char(self):
    """ Return the next character of input, or EOF once the input is
        exhausted.

        Characters are taken from the front of the queue; when the queue
        is empty readChunk() is asked to refill it first. The line/col
        counters are advanced for the character that is handed out.
    """
    if not self.queue:
        self.readChunk()
    if not self.queue:
        # Refilling produced nothing, so there is no more input
        return EOF

    nextChar = self.queue.pop(0)

    if nextChar != '\n':
        self.col += 1
        return nextChar

    # Newline: record the length of the line just finished, then move to
    # the start of the following line
    self.lineLengths.append(self.col)
    self.line += 1
    self.col = 0
    return nextChar
228
+
229
def readChunk(self, chunkSize=1024):
    """Read the next chunk from the data stream and append it to the queue.

    chunkSize is passed as the argument to self.dataStream.read(). Null
    characters are replaced with U+FFFD, with one entry added to
    self.errors per occurrence. CR LF pairs and lone CRs are normalised
    to LF, including a CR LF pair that is split across two chunks.
    Nothing is queued when the stream returns no more data.
    """
    data = self.dataStream.read(chunkSize)

    #Check for CR LF broken across chunks
    if self._lastChunkEndsWithCR and data and data[0] == "\n":
        # The previous chunk's trailing CR was already queued as "\n";
        # this LF is the second half of that same pair, so drop it.
        self._lastChunkEndsWithCR = False
        data = data[1:]
        if not data:
            # The chunk held nothing but that LF. Read again so that an
            # empty queue after this call still means end of input.
            data = self.dataStream.read(chunkSize)

    if not data:
        return

    #Replace null characters
    nullCount = data.count(u"\u0000")
    if nullCount:
        message = _('null character found in input stream, '
                    'replaced with U+FFFD')
        self.errors.extend([message] * nullCount)
        data = data.replace(u"\u0000", u"\ufffd")

    #Remember a trailing CR in case its LF arrives with the next chunk
    self._lastChunkEndsWithCR = data[-1] == "\r"
    data = data.replace("\r\n", "\n")
    data = data.replace("\r", "\n")

    data = unicode(data)
    self.queue.extend(data)
260
247
261
248
def charsUntil (self , characters , opposite = False ):
262
249
""" Returns a string of characters from the stream up to but not
@@ -272,13 +259,20 @@ def charsUntil(self, characters, opposite = False):
272
259
# from where it came.
273
260
c = charStack .pop ()
274
261
if c != EOF :
275
- self .queue . insert ( 0 , c )
262
+ self .unget ( c )
276
263
277
264
return u"" .join (charStack )
278
265
279
266
def unget(self, chars):
    """Push chars back onto the front of the queue and rewind the position.

    chars is a sequence of characters in stream order; they will be the
    next characters taken from the queue. The line/col counters are moved
    back as if those characters had not yet been read. Does nothing when
    chars is empty.
    """
    if not chars:
        return

    self.queue = list(chars) + self.queue

    #Alter the current line, col position
    for c in chars[::-1]:
        if c == '\n':
            self.line -= 1
            self.col = self.lineLengths[self.line]
            # char() appends a new length each time a newline is read, so
            # discard the entry for the line we have stepped back onto.
            # Otherwise a re-read leaves duplicates behind and later
            # lookups by line index return the wrong length.
            del self.lineLengths[self.line:]
        else:
            self.col -= 1
282
276
283
277
class EncodingBytes (str ):
284
278
"""String-like object with an assosiated position and various extra methods
0 commit comments