Python streaming, and Ruby progress towards that goal · awesome-python/html5lib-python@e706c6e · GitHub
[go: up one dir, main page]

Skip to content

Commit e706c6e

Browse files
committed
Python streaming, and Ruby progress towards that goal
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40774
1 parent 2213b78 commit e706c6e

File tree

3 files changed

+44
-19
lines changed

3 files changed

+44
-19
lines changed

parse.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ def parse():
3030
import urllib
3131
f = urllib.urlopen(f).read()
3232
except: pass
33+
elif f == '-':
34+
f = sys.stdin
3335
else:
3436
try:
3537
# Try opening from file system

src/html5parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def _parse(self, stream, innerHTML=False, container="div",
8282
self.errors = []
8383

8484
self.tokenizer = self.tokenizer_class(stream, encoding,
85-
parseMeta=innerHTML)
85+
parseMeta=not innerHTML)
8686

8787
if innerHTML:
8888
self.innerHTML = container.lower()

src/inputstream.py

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
8787
import chardet
8888
buffer = self.rawStream.read()
8989
encoding = chardet.detect(buffer)['encoding']
90-
self.rawStream = self.openStream(buffer)
90+
self.seek(buffer, 0)
9191
except ImportError:
9292
pass
9393
# If all else fails use the default encoding
@@ -127,18 +127,50 @@ def detectBOM(self):
127127
seek = 2
128128

129129

130-
#AT - move this to the caller?
131-
# Set the read position past the BOM if one was found, otherwise
132-
# set it to the start of the stream
133-
self.rawStream.seek(encoding and seek or 0)
130+
self.seek(string, encoding and seek or 0)
134131

135132
return encoding
136133

134+
def seek(self, buffer, n):
135+
"""Unget buffer[n:]"""
136+
if hasattr(self.rawStream, 'unget'):
137+
self.rawStream.unget(buffer[n:])
138+
return
139+
140+
try:
141+
self.rawStream.seek(n)
142+
except IOError:
143+
class BufferedStream:
144+
def __init__(self, data, stream):
145+
self.data = data
146+
self.stream = stream
147+
def read(self, chars=-1):
148+
if chars == -1 or chars > len(self.data):
149+
result = self.data
150+
self.data = ''
151+
if chars == -1:
152+
return result + self.stream.read()
153+
else:
154+
return result + self.stream.read(chars-len(result))
155+
elif not self.data:
156+
return self.stream.read(chars)
157+
else:
158+
result = self.data[:chars]
159+
self.data = self.data[chars:]
160+
return result
161+
def unget(self, data):
162+
if self.data:
163+
self.data += data
164+
else:
165+
self.data = data
166+
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
167+
137168
def detectEncodingMeta(self):
138169
"""Report the encoding declared by the meta element
139170
"""
140-
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
141-
self.rawStream.seek(0)
171+
buffer = self.rawStream.read(self.numBytesMeta)
172+
parser = EncodingParser(buffer)
173+
self.seek(buffer, 0)
142174
return parser.getEncoding()
143175

144176
def position(self):
@@ -195,18 +227,9 @@ def charsUntil(self, characters, opposite = False):
195227
# Put the character stopped on back to the front of the queue
196228
# from where it came.
197229
c = charStack.pop()
198-
if c != EOF:
199-
self.queue.insert(0, c)
230+
if c != EOF:
231+
self.queue.insert(0, c)
200232

201-
# XXX the following is need for correct line number reporting apparently
202-
# but it causes to break other tests with the fixes in tokenizer. I have
203-
# no idea why...
204-
#
205-
#if c != EOF and self.tell <= len(self.dataStream) and \
206-
# self.dataStream[self.tell - 1] == c[0]:
207-
# self.tell -= 1
208-
#else:
209-
# self.queue.insert(0, c)
210233
return u"".join(charStack)
211234

212235
class EncodingBytes(str):

0 commit comments

Comments
 (0)
0