@@ -47,24 +47,13 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
47
47
if encoding is None or not isValidEncoding (encoding ):
48
48
encoding = self .detectEncoding (parseMeta , chardet )
49
49
self .charEncoding = encoding
50
- self .win1252 = False
51
50
52
- # Read bytes from stream decoding them into Unicode
53
- uString = self .rawStream .read ()
54
-
55
- # Convert the unicode string into a list to be used as the data stream
56
- if self .charEncoding == 'windows-1252' :
57
- self .win1252 = True
58
- else :
59
- self .win1252 = False
60
- uString = uString .decode (self .charEncoding , 'replace' )
61
-
62
- self .dataStream = uString
51
+ self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream , 'replace' )
63
52
64
53
self .queue = []
65
54
66
- # Reset position in the list to read from
67
- self .tell = 0
55
+ self . line = self . col = 0
56
+ self .lineLengths = []
68
57
69
58
def openStream (self , source ):
70
59
"""Produces a file object from source.
@@ -76,8 +65,8 @@ def openStream(self, source):
76
65
if hasattr (source , 'read' ):
77
66
stream = source
78
67
else :
79
- # Otherwise treat source as a string and convert to a file object
80
- if isinstance (source , unicode ):
68
+ # Otherwise treat source as a string and convert to a file object
69
+ if isinstance (source , unicode ):
81
70
source = source .encode ('utf-8' )
82
71
import cStringIO
83
72
stream = cStringIO .StringIO (str (source ))
@@ -154,15 +143,15 @@ def detectEncodingMeta(self):
154
143
155
144
def position (self ):
156
145
"""Returns (line, col) of the current position in the stream."""
157
- line = 0
158
- tell = self .tell
159
- for pos in self .newLines :
160
- if pos < tell :
161
- line += 1
146
+ line , col = self .line , self .col
147
+ for c in self .queue [::- 1 ]:
148
+ if c == '\n ' :
149
+ line -= 1
150
+ assert col == 0
151
+ col = self .lineLengths [line ]
162
152
else :
163
- break
164
- col = tell - self .newLines [line - 1 ] - 1
165
- return (line , col )
153
+ col -= 1
154
+ return (line + 1 , col )
166
155
167
156
def char (self ):
168
157
""" Read one character from the stream or queue if available. Return
@@ -171,26 +160,28 @@ def char(self):
171
160
if self .queue :
172
161
return self .queue .pop (0 )
173
162
else :
174
- try :
175
- c = self .dataStream [self .tell ]
176
- self .tell += 1
177
- if self .win1252 and c >= '\x80 ' : c = c .decode ('windows-1252' )
178
-
179
- # Normalize newlines and null characters
180
- if c == '\x00 ' : c = u'\uFFFD '
181
- if c == '\r ' :
182
- if self .tell < len (self .dataStream ) and \
183
- self .dataStream [self .tell ] == '\n ' :
184
- self .tell += 1
185
- c = '\n '
186
-
187
- # record where newlines occur so that the position method
188
- # can tell where it is
189
- if c == '\n ' : self .newLines .append (self .tell - 1 )
190
- return unicode (c )
191
- except :
163
+ c = self .dataStream .read (1 , 1 )
164
+ if not c :
165
+ self .col += 1
192
166
return EOF
193
167
168
+ # Normalize newlines and null characters
169
+ if c == '\x00 ' : c = u'\uFFFD '
170
+ if c == '\r ' :
171
+ c = self .dataStream .read (1 , 1 )
172
+ if c != '\n ' :
173
+ self .queue .insert (0 , unicode (c ))
174
+ c = '\n '
175
+
176
+ # update position in stream
177
+ if c == '\n ' :
178
+ self .lineLengths .append (self .col )
179
+ self .line += 1
180
+ self .col = 0
181
+ else :
182
+ self .col += 1
183
+ return unicode (c )
184
+
194
185
def charsUntil (self , characters , opposite = False ):
195
186
""" Returns a string of characters from the stream up to but not
196
187
including any character in characters or EOF. characters can be
@@ -204,12 +195,19 @@ def charsUntil(self, characters, opposite = False):
204
195
# Put the character stopped on back to the front of the queue
205
196
# from where it came.
206
197
c = charStack .pop ()
207
- if c != EOF and self .tell > 0 and not self .queue and \
208
- self .dataStream [self .tell - 1 ] == c [0 ]:
209
- self .tell -= 1
210
- else :
211
- self .queue .insert (0 , c )
212
- return "" .join (charStack )
198
+ if c != EOF :
199
+ self .queue .insert (0 , c )
200
+
201
+ # XXX the following is needed for correct line number reporting apparently
202
+ # but it causes other tests to break with the fixes in tokenizer. I have
203
+ # no idea why...
204
+ #
205
+ #if c != EOF and self.tell <= len(self.dataStream) and \
206
+ # self.dataStream[self.tell - 1] == c[0]:
207
+ # self.tell -= 1
208
+ #else:
209
+ # self.queue.insert(0, c)
210
+ return u"" .join (charStack )
213
211
214
212
class EncodingBytes (str ):
215
213
"""String-like object with an associated position and various extra methods
0 commit comments