awesome-python
diff --git a/src/inputstream.py b/src/inputstream.py
Lines changed: 46 additions & 48 deletions
diff --git a/tests/test_stream.py b/tests/test_stream.py
Lines changed: 2 additions & 4 deletions
@@ -47,24 +47,13 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         if encoding is None or not isValidEncoding(encoding):
             encoding = self.detectEncoding(parseMeta, chardet)
         self.charEncoding = encoding
-        self.win1252 = False
 
-        # Read bytes from stream decoding them into Unicode
-        uString = self.rawStream.read()
-
-        # Convert the unicode string into a list to be used as the data stream
-        if self.charEncoding == 'windows-1252':
-            self.win1252 = True
-        else:
-            self.win1252 = False
-            uString = uString.decode(self.charEncoding, 'replace')
-
-        self.dataStream = uString
+        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
 
         self.queue = []
 
-        # Reset position in the list to read from
-        self.tell = 0
+        self.line = self.col = 0
+        self.lineLengths = []
 
     def openStream(self, source):
         """Produces a file object from source.
@@ -76,8 +65,8 @@ def openStream(self, source):
         if hasattr(source, 'read'):
             stream = source
         else:
-            # Otherwise treat source as a string and convert to a file object
-            if isinstance(source, unicode):
+            # Otherwise treat source as a string and convert to a file object
+            if isinstance(source, unicode):
                 source = source.encode('utf-8')
             import cStringIO
             stream = cStringIO.StringIO(str(source))
@@ -154,15 +143,15 @@ def detectEncodingMeta(self):
 
     def position(self):
         """Returns (line, col) of the current position in the stream."""
-        line = 0
-        tell = self.tell
-        for pos in self.newLines:
-            if pos < tell:
-                line += 1
+        line, col = self.line, self.col
+        for c in self.queue[::-1]:
+            if c == '\n':
+                line -= 1
+                assert col == 0
+                col = self.lineLengths[line]
             else:
-                break
-        col = tell - self.newLines[line-1] - 1
-        return (line, col)
+                col -= 1
+        return (line + 1, col)
 
     def char(self):
         """ Read one character from the stream or queue if available. Return
@@ -171,26 +160,28 @@ def char(self):
         if self.queue:
             return self.queue.pop(0)
         else:
-            try:
-                c = self.dataStream[self.tell]
-                self.tell += 1
-                if self.win1252 and c >= '\x80': c=c.decode('windows-1252')
-
-                # Normalize newlines and null characters
-                if c == '\x00': c = u'\uFFFD'
-                if c == '\r':
-                    if self.tell < len(self.dataStream) and \
-                      self.dataStream[self.tell] == '\n':
-                        self.tell += 1
-                    c = '\n'
-
-                # record where newlines occur so that the position method
-                # can tell where it is
-                if c == '\n': self.newLines.append(self.tell - 1)
-                return unicode(c)
-            except:
+            c = self.dataStream.read(1, 1)
+            if not c:
+                self.col += 1
                 return EOF
 
+            # Normalize newlines and null characters
+            if c == '\x00': c = u'\uFFFD'
+            if c == '\r':
+                c = self.dataStream.read(1, 1)
+                if c != '\n':
+                    self.queue.insert(0, unicode(c))
+                c = '\n'
+
+            # update position in stream
+            if c == '\n':
+                self.lineLengths.append(self.col)
+                self.line += 1
+                self.col = 0
+            else:
+                self.col += 1
+            return unicode(c)
+
     def charsUntil(self, characters, opposite = False):
         """ Returns a string of characters from the stream up to but not
         including any character in characters or EOF. characters can be
@@ -204,12 +195,19 @@ def charsUntil(self, characters, opposite = False):
         # Put the character stopped on back to the front of the queue
         # from where it came.
         c = charStack.pop()
-        if c != EOF and self.tell > 0 and not self.queue and \
-          self.dataStream[self.tell - 1] == c[0]:
-            self.tell -= 1
-        else:
-            self.queue.insert(0, c)
-        return "".join(charStack)
+        if c != EOF:
+            self.queue.insert(0, c)
+        
+        # XXX the following is needed for correct line number reporting, apparently,
+        # but it causes other tests to break with the fixes in tokenizer. I have
+        # no idea why...
+        #
+        #if c != EOF and self.tell <= len(self.dataStream) and \
+        #  self.dataStream[self.tell - 1] == c[0]:
+        #    self.tell -= 1
+        #else:
+        #    self.queue.insert(0, c)
+        return u"".join(charStack)
 
 class EncodingBytes(str):
     """String-like object with an assosiated position and various extra methods
 
@@ -49,14 +49,12 @@ def test_utf_16(self):
 
     def test_newlines(self):
         stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\r\nccc\rdddd")
-        self.assertEquals(stream.tell, 0)
+        self.assertEquals(stream.position(), (1, 0))
         self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
-        self.assertEquals(stream.tell, 6)
         self.assertEquals(stream.position(), (3,0))
         self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
-        self.assertEquals(stream.tell, 14)
         self.assertEquals(stream.position(), (4,4))
-        self.assertEquals(stream.newLines, [0,1,5,9])
+        self.assertEquals(stream.lineLengths, [1,2,3])
 
 def buildTestSuite():
     return unittest.defaultTestLoader.loadTestsFromName(__name__)