Allow changing character encoding · awesome-python/html5lib-python@e4021af · GitHub
[go: up one dir, main page]

Skip to content

Commit e4021af

Browse files
committed
Allow changing character encoding
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401257
1 parent f2bf97f commit e4021af

File tree

7 files changed

+152
-105
lines changed

7 files changed

+152
-105
lines changed

parse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
#RELEASE remove
1313
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1414
#END RELEASE
15-
from html5lib import html5parser, liberalxmlparser, sanitizer, tokenizer
15+
from html5lib import html5parser, liberalxmlparser, sanitizer
16+
from html5lib.tokenizer import HTMLTokenizer
1617
from html5lib import treebuilders, serializer, treewalkers
1718
from html5lib import constants
1819

src/html5lib/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1073,7 +1073,6 @@
10731073
'utf16': 'utf-16',
10741074
'utf16be': 'utf-16-be',
10751075
'utf16le': 'utf-16-le',
1076-
'utf7': 'utf-7',
10771076
'utf8': 'utf-8',
10781077
'windows1250': 'cp1250',
10791078
'windows1251': 'cp1251',
@@ -1100,3 +1099,6 @@
11001099

11011100
class DataLossWarning(UserWarning):
11021101
pass
1102+
1103+
class ReparseException(Exception):
1104+
pass

src/html5lib/html5parser.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
from constants import scopingElements, formattingElements, specialElements
1919
from constants import headingElements, tableInsertModeElements
2020
from constants import cdataElements, rcdataElements, voidElements
21-
from constants import tokenTypes
21+
from constants import tokenTypes, ReparseException
2222

23-
def parse(doc, treebuilderName="simpletree", encoding=None):
24-
tb = treebuilders.getTreeBuilder(treebuilderName)
23+
def parse(doc, treebuilder="simpletree", encoding=None):
24+
tb = treebuilders.getTreeBuilder(treebuilder)
2525
p = HTMLParser(tb)
2626
return p.parse(doc, encoding=encoding)
2727

@@ -80,18 +80,29 @@ def __init__(self, tree = simpletree.TreeBuilder,
8080

8181
def _parse(self, stream, innerHTML=False, container="div",
8282
encoding=None, parseMeta=True, useChardet=True, **kwargs):
83-
84-
self.tree.reset()
85-
self.firstStartTag = False
86-
self.errors = []
87-
self.compatMode = "no quirks"
8883

84+
self.innerHTMLMode = innerHTML
85+
self.container = container
8986
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
9087
parseMeta=parseMeta,
9188
useChardet=useChardet, **kwargs)
89+
self.reset()
9290

93-
if innerHTML:
94-
self.innerHTML = container.lower()
91+
while True:
92+
try:
93+
self.mainLoop()
94+
break
95+
except ReparseException, e:
96+
self.reset()
97+
98+
def reset(self):
99+
self.tree.reset()
100+
self.firstStartTag = False
101+
self.errors = []
102+
self.compatMode = "no quirks"
103+
104+
if self.innerHTMLMode:
105+
self.innerHTML = self.container.lower()
95106

96107
if self.innerHTML in cdataElements:
97108
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
@@ -113,9 +124,9 @@ def _parse(self, stream, innerHTML=False, container="div",
113124
# We only seem to have InBodyPhase testcases where the following is
114125
# relevant ... need others too
115126
self.lastPhase = None
116-
117127
self.beforeRCDataPhase = None
118-
128+
129+
def mainLoop(self):
119130
(CharactersToken,
120131
SpaceCharactersToken,
121132
StartTagToken,
@@ -287,18 +298,6 @@ def __init__(self, parser, tree):
287298

288299
def processEOF(self):
289300
raise NotImplementedError
290-
self.tree.generateImpliedEndTags()
291-
if len(self.tree.openElements) > 2:
292-
self.parser.parseError("expected-closing-tag-but-got-eof")
293-
elif len(self.tree.openElements) == 2 and\
294-
self.tree.openElements[1].name != "body":
295-
# This happens for framesets or something?
296-
self.parser.parseError("expected-closing-tag-but-got-eof")
297-
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
298-
# XXX This is not what the specification says. Not sure what to do
299-
# here.
300-
self.parser.parseError("eof-in-innerhtml")
301-
# Betting ends.
302301

303302
def processComment(self, data):
304303
# For most phases the following is correct. Where it's not it will be
@@ -601,8 +600,7 @@ def startTagMeta(self, name, attributes):
601600

602601
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
603602
if "charset" in attributes:
604-
codec = inputstream.codecName(attributes["charset"])
605-
self.parser.tokenizer.stream.changeEncoding(codec)
603+
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
606604
elif "content" in attributes:
607605
data = inputstream.EncodingBytes(attributes["content"])
608606
parser = inputstream.ContentAttrParser(data)

src/html5lib/inputstream.py

Lines changed: 102 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import types
44

55
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
6-
from constants import encodings
6+
from constants import encodings, ReparseException
77

88
#Non-unicode versions of constants for use in the pre-parser
99
spaceCharactersBytes = [str(item) for item in spaceCharacters]
@@ -16,6 +16,82 @@
1616

1717
# Cache for charsUntil()
1818
charsUntilRegEx = {}
19+
20+
class BufferedStream:
21+
"""Buffering for streams that do not have buffering of their own
22+
23+
The buffer is implemented as a list of chunks on the assumption that
24+
joining many strings will be slow since it is O(n**2)
25+
"""
26+
27+
def __init__(self, stream):
28+
self.stream = stream
29+
self.buffer = []
30+
self.position = [-1,0] #chunk number, offset
31+
32+
def tell(self):
33+
pos = 0
34+
for chunk in self.buffer[:self.position[0]]:
35+
pos += len(chunk)
36+
pos += self.position[1]
37+
return pos
38+
39+
def seek(self, pos):
40+
assert pos < self._bufferedBytes()
41+
offset = pos
42+
i = 0
43+
while len(self.buffer[i]) < offset:
44+
offset -= pos
45+
i += 1
46+
self.position = [i, offset]
47+
48+
def read(self, bytes):
49+
if not self.buffer:
50+
return self._readStream(bytes)
51+
elif (self.position[0] == len(self.buffer) and
52+
self.position[1] == len(self.buffer[-1])):
53+
return self._readStream(bytes)
54+
else:
55+
return self._readFromBuffer(bytes)
56+
57+
def _bufferedBytes(self):
58+
return sum([len(item) for item in self.buffer])
59+
60+
def _readStream(self, bytes):
61+
data = self.stream.read(bytes)
62+
self.buffer.append(data)
63+
self.position[0] += 1
64+
self.position[1] = len(data)
65+
return data
66+
67+
def _readFromBuffer(self, bytes):
68+
remainingBytes = bytes
69+
rv = []
70+
bufferIndex = self.position[0]
71+
bufferOffset = self.position[1]
72+
while bufferIndex < len(self.buffer) and remainingBytes != 0:
73+
assert remainingBytes > 0
74+
bufferedData = self.buffer[bufferIndex]
75+
76+
if remainingBytes <= len(bufferedData) - bufferOffset:
77+
bytesToRead = remainingBytes
78+
self.position = [bufferIndex, bufferOffset + bytesToRead]
79+
else:
80+
bytesToRead = len(bufferedData) - bufferOffset
81+
self.position = [bufferIndex, len(bufferedData)]
82+
bufferIndex += 1
83+
data = rv.append(bufferedData[bufferOffset:
84+
bufferOffset + bytesToRead])
85+
remainingBytes -= bytesToRead
86+
87+
bufferOffset = 0
88+
89+
if remainingBytes:
90+
rv.append(self._readStream(remainingBytes))
91+
92+
return "".join(rv)
93+
94+
1995

2096
class HTMLInputStream:
2197
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
65141
if (self.charEncoding[0] is None):
66142
self.charEncoding = self.detectEncoding(parseMeta, chardet)
67143

144+
self.reset()
145+
146+
def reset(self):
68147
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
69148
'replace')
70149

@@ -100,6 +179,10 @@ def openStream(self, source):
100179
self.charEncoding = ("utf-8", "certain")
101180
import cStringIO
102181
stream = cStringIO.StringIO(str(source))
182+
183+
if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
184+
stream = BufferedStream(stream)
185+
103186
return stream
104187

105188
def detectEncoding(self, parseMeta=True, chardet=True):
@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
128211
detector.feed(buffer)
129212
detector.close()
130213
encoding = detector.result['encoding']
131-
self.seek("".join(buffers), 0)
214+
self.rawStream.seek(0)
132215
except ImportError:
133216
pass
134217
# If all else fails use the default encoding
@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
146229

147230
def changeEncoding(self, newEncoding):
148231
newEncoding = codecName(newEncoding)
149-
if newEncoding == "utf16":
150-
newEncoding = "utf8"
151-
232+
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
233+
newEncoding = "utf-8"
152234
if newEncoding is None:
153235
return
154236
elif newEncoding == self.charEncoding[0]:
155-
self.charEncoding = (self.charEncoding[0] and "certian")
237+
self.charEncoding = (self.charEncoding[0], "certian")
156238
else:
157-
raise NotImplementedError, "Cannot change character encoding mid stream"
158-
239+
self.rawStream.seek(0)
240+
self.reset()
241+
self.charEncoding = (newEncoding, "certian")
242+
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
243+
159244
def detectBOM(self):
160245
"""Attempts to detect at BOM at the start of the stream. If
161246
an encoding can be determined from the BOM return the name of the
@@ -182,56 +267,21 @@ def detectBOM(self):
182267

183268
# Set the read position past the BOM if one was found, otherwise
184269
# set it to the start of the stream
185-
self.seek(string, encoding and seek or 0)
270+
self.rawStream.seek(encoding and seek or 0)
186271

187272
return encoding
188273

189-
def seek(self, buffer, n):
190-
"""Unget buffer[n:]"""
191-
if hasattr(self.rawStream, 'unget'):
192-
self.rawStream.unget(buffer[n:])
193-
return
194-
195-
if hasattr(self.rawStream, 'seek'):
196-
try:
197-
self.rawStream.seek(n)
198-
return
199-
except IOError:
200-
pass
201-
202-
class BufferedStream:
203-
def __init__(self, data, stream):
204-
self.data = data
205-
self.stream = stream
206-
def read(self, chars=-1):
207-
if chars == -1 or chars > len(self.data):
208-
result = self.data
209-
self.data = ''
210-
if chars == -1:
211-
return result + self.stream.read()
212-
else:
213-
return result + self.stream.read(chars-len(result))
214-
elif not self.data:
215-
return self.stream.read(chars)
216-
else:
217-
result = self.data[:chars]
218-
self.data = self.data[chars:]
219-
return result
220-
def unget(self, data):
221-
if self.data:
222-
self.data += data
223-
else:
224-
self.data = data
225-
226-
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
227-
228274
def detectEncodingMeta(self):
229275
"""Report the encoding declared by the meta element
230276
"""
231277
buffer = self.rawStream.read(self.numBytesMeta)
232278
parser = EncodingParser(buffer)
233-
self.seek(buffer, 0)
279+
self.rawStream.seek(0)
234280
encoding = parser.getEncoding()
281+
282+
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
283+
encoding = "utf-8"
284+
235285
return encoding
236286

237287
def updatePosition(self, chars):
@@ -485,13 +535,6 @@ def getEncoding(self):
485535
break
486536
if not keepParsing:
487537
break
488-
if self.encoding is not None:
489-
self.encoding = self.encoding.strip()
490-
#Spec violation that complies with hsivonen + mjs
491-
if (ascii_punctuation_re.sub("", self.encoding) in
492-
("utf16", "utf16be", "utf16le",
493-
"utf32", "utf32be", "utf32le")):
494-
self.encoding = "utf-8"
495538

496539
return self.encoding
497540

@@ -666,11 +709,12 @@ def parse(self):
666709
except StopIteration:
667710
return None
668711

712+
669713
def codecName(encoding):
670714
"""Return the python codec name corresponding to an encoding or None if the
671715
string doesn't correspond to a valid encoding."""
672-
if (encoding is not None and type(encoding) == types.StringType):
716+
if (encoding is not None and type(encoding) in types.StringTypes):
673717
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
674-
return encodings.get(canonicalName, None)
718+
return encodings.get(canonicalName, None)
675719
else:
676720
return None

src/html5lib/treebuilders/etree_lxml.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def _getChildNodes(self):
4747
def testSerializer(element):
4848
rv = []
4949
finalText = None
50+
filter = ihatexml.InfosetFilter()
5051
def serializeElement(element, indent=0):
5152
if not hasattr(element, "tag"):
5253
if hasattr(element, "getroot"):
@@ -79,10 +80,11 @@ def serializeElement(element, indent=0):
7980
elif type(element.tag) == type(etree.Comment):
8081
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
8182
else:
82-
rv.append("|%s<%s>"%(' '*indent, element.tag))
83+
rv.append("|%s<%s>"%(' '*indent, filter.fromXmlName(element.tag)))
8384
if hasattr(element, "attrib"):
8485
for name, value in element.attrib.iteritems():
85-
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
86+
rv.append('|%s%s="%s"' % (' '*(indent+2),
87+
filter.fromXmlName(name), value))
8688
if element.text:
8789
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
8890
indent += 2
@@ -239,8 +241,8 @@ def getFragment(self):
239241
return fragment
240242

241243
def insertDoctype(self, name, publicId, systemId):
242-
if not name:
243-
warnings.warn("lxml cannot represent null doctype", DataLossWarning)
244+
if not name or ihatexml.nonXmlBMPRegexp.search(name):
245+
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
244246
doctype = self.doctypeClass(name, publicId, systemId)
245247
self.doctype = doctype
246248

0 commit comments

Comments
 (0)
0