Applied patch from Issue #42. · awesome-python/html5lib-python@e822736 · GitHub
[go: up one dir, main page]

Skip to content

Commit e822736

Browse files
committed
Applied patch from Issue html5lib#42.
http://code.google.com/p/html5lib/issues/detail?id=42 --HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40770
1 parent c69c726 commit e822736

File tree

2 files changed

+48
-52
lines changed

2 files changed

+48
-52
lines changed

src/inputstream.py

Lines changed: 46 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -47,24 +47,13 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
4747
if encoding is None or not isValidEncoding(encoding):
4848
encoding = self.detectEncoding(parseMeta, chardet)
4949
self.charEncoding = encoding
50-
self.win1252 = False
5150

52-
# Read bytes from stream decoding them into Unicode
53-
uString = self.rawStream.read()
54-
55-
# Convert the unicode string into a list to be used as the data stream
56-
if self.charEncoding == 'windows-1252':
57-
self.win1252 = True
58-
else:
59-
self.win1252 = False
60-
uString = uString.decode(self.charEncoding, 'replace')
61-
62-
self.dataStream = uString
51+
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
6352

6453
self.queue = []
6554

66-
# Reset position in the list to read from
67-
self.tell = 0
55+
self.line = self.col = 0
56+
self.lineLengths = []
6857

6958
def openStream(self, source):
7059
"""Produces a file object from source.
@@ -76,8 +65,8 @@ def openStream(self, source):
7665
if hasattr(source, 'read'):
7766
stream = source
7867
else:
79-
# Otherwise treat source as a string and convert to a file object
80-
if isinstance(source, unicode):
68+
# Otherwise treat source as a string and convert to a file object
69+
if isinstance(source, unicode):
8170
source = source.encode('utf-8')
8271
import cStringIO
8372
stream = cStringIO.StringIO(str(source))
@@ -154,15 +143,15 @@ def detectEncodingMeta(self):
154143

155144
def position(self):
156145
"""Returns (line, col) of the current position in the stream."""
157-
line = 0
158-
tell = self.tell
159-
for pos in self.newLines:
160-
if pos < tell:
161-
line += 1
146+
line, col = self.line, self.col
147+
for c in self.queue[::-1]:
148+
if c == '\n':
149+
line -= 1
150+
assert col == 0
151+
col = self.lineLengths[line]
162152
else:
163-
break
164-
col = tell - self.newLines[line-1] - 1
165-
return (line, col)
153+
col -= 1
154+
return (line + 1, col)
166155

167156
def char(self):
168157
""" Read one character from the stream or queue if available. Return
@@ -171,26 +160,28 @@ def char(self):
171160
if self.queue:
172161
return self.queue.pop(0)
173162
else:
174-
try:
175-
c = self.dataStream[self.tell]
176-
self.tell += 1
177-
if self.win1252 and c >= '\x80': c=c.decode('windows-1252')
178-
179-
# Normalize newlines and null characters
180-
if c == '\x00': c = u'\uFFFD'
181-
if c == '\r':
182-
if self.tell < len(self.dataStream) and \
183-
self.dataStream[self.tell] == '\n':
184-
self.tell += 1
185-
c = '\n'
186-
187-
# record where newlines occur so that the position method
188-
# can tell where it is
189-
if c == '\n': self.newLines.append(self.tell - 1)
190-
return unicode(c)
191-
except:
163+
c = self.dataStream.read(1, 1)
164+
if not c:
165+
self.col += 1
192166
return EOF
193167

168+
# Normalize newlines and null characters
169+
if c == '\x00': c = u'\uFFFD'
170+
if c == '\r':
171+
c = self.dataStream.read(1, 1)
172+
if c != '\n':
173+
self.queue.insert(0, unicode(c))
174+
c = '\n'
175+
176+
# update position in stream
177+
if c == '\n':
178+
self.lineLengths.append(self.col)
179+
self.line += 1
180+
self.col = 0
181+
else:
182+
self.col += 1
183+
return unicode(c)
184+
194185
def charsUntil(self, characters, opposite = False):
195186
""" Returns a string of characters from the stream up to but not
196187
including any character in characters or EOF. characters can be
@@ -204,12 +195,19 @@ def charsUntil(self, characters, opposite = False):
204195
# Put the character stopped on back to the front of the queue
205196
# from where it came.
206197
c = charStack.pop()
207-
if c != EOF and self.tell > 0 and not self.queue and \
208-
self.dataStream[self.tell - 1] == c[0]:
209-
self.tell -= 1
210-
else:
211-
self.queue.insert(0, c)
212-
return "".join(charStack)
198+
if c != EOF:
199+
self.queue.insert(0, c)
200+
201+
# XXX the following is needed for correct line number reporting apparently
202+
# but it causes other tests to break with the fixes in tokenizer. I have
203+
# no idea why...
204+
#
205+
#if c != EOF and self.tell <= len(self.dataStream) and \
206+
# self.dataStream[self.tell - 1] == c[0]:
207+
# self.tell -= 1
208+
#else:
209+
# self.queue.insert(0, c)
210+
return u"".join(charStack)
213211

214212
class EncodingBytes(str):
215213
"""String-like object with an associated position and various extra methods

tests/test_stream.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,12 @@ def test_utf_16(self):
4949

5050
def test_newlines(self):
5151
stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\r\nccc\rdddd")
52-
self.assertEquals(stream.tell, 0)
52+
self.assertEquals(stream.position(), (1, 0))
5353
self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
54-
self.assertEquals(stream.tell, 6)
5554
self.assertEquals(stream.position(), (3,0))
5655
self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
57-
self.assertEquals(stream.tell, 14)
5856
self.assertEquals(stream.position(), (4,4))
59-
self.assertEquals(stream.newLines, [0,1,5,9])
57+
self.assertEquals(stream.lineLengths, [1,2,3])
6058

6159
def buildTestSuite():
6260
return unittest.defaultTestLoader.loadTestsFromName(__name__)

0 commit comments

Comments
 (0)
0