Fix issue #156. · laukik/html5lib-python@bbbb03f · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit bbbb03f

Browse files
committed
Fix issue html5lib#156.
Fix some Unicode mix-up in the serializer, too, making sure Unicode strings are unicode strings.
1 parent c2eecb5 commit bbbb03f

File tree

2 files changed

+57
-67
lines changed

2 files changed

+57
-67
lines changed

html5lib/filters/inject_meta_charset.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,44 +13,44 @@ def __iter__(self):
1313
for token in _base.Filter.__iter__(self):
1414
type = token["type"]
1515
if type == "StartTag":
16-
if token["name"].lower() == "head":
16+
if token["name"].lower() == u"head":
1717
state = "in_head"
1818

1919
elif type == "EmptyTag":
20-
if token["name"].lower() == "meta":
20+
if token["name"].lower() == u"meta":
2121
# replace charset with actual encoding
2222
has_http_equiv_content_type = False
2323
for (namespace,name),value in token["data"].iteritems():
2424
if namespace != None:
2525
continue
26-
elif name.lower() == 'charset':
26+
elif name.lower() == u'charset':
2727
token["data"][(namespace,name)] = self.encoding
2828
meta_found = True
2929
break
30-
elif name == 'http-equiv' and value.lower() == 'content-type':
30+
elif name == u'http-equiv' and value.lower() == u'content-type':
3131
has_http_equiv_content_type = True
3232
else:
33-
if has_http_equiv_content_type and (None, "content") in token["data"]:
34-
token["data"][(None, "content")] = u'text/html; charset=%s' % self.encoding
33+
if has_http_equiv_content_type and (None, u"content") in token["data"]:
34+
token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
3535
meta_found = True
3636

37-
elif token["name"].lower() == "head" and not meta_found:
37+
elif token["name"].lower() == u"head" and not meta_found:
3838
# insert meta into empty head
39-
yield {"type": "StartTag", "name": "head",
39+
yield {"type": "StartTag", "name": u"head",
4040
"data": token["data"]}
41-
yield {"type": "EmptyTag", "name": "meta",
42-
"data": {(None, "charset"): self.encoding}}
43-
yield {"type": "EndTag", "name": "head"}
41+
yield {"type": "EmptyTag", "name": u"meta",
42+
"data": {(None, u"charset"): self.encoding}}
43+
yield {"type": "EndTag", "name": u"head"}
4444
meta_found = True
4545
continue
4646

4747
elif type == "EndTag":
48-
if token["name"].lower() == "head" and pending:
48+
if token["name"].lower() == u"head" and pending:
4949
# insert meta into head (if necessary) and flush pending queue
5050
yield pending.pop(0)
5151
if not meta_found:
52-
yield {"type": "EmptyTag", "name": "meta",
53-
"data": {(None, "charset"): self.encoding}}
52+
yield {"type": "EmptyTag", "name": u"meta",
53+
"data": {(None, u"charset"): self.encoding}}
5454
while pending:
5555
yield pending.pop(0)
5656
meta_found = True

html5lib/serializer/htmlserializer.py

Lines changed: 43 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,12 @@ def htmlentityreplace_errors(exc):
7676

7777
del register_error
7878

79-
def encode(text, encoding):
80-
return text.encode(encoding, unicode_encode_errors)
8179

8280
class HTMLSerializer(object):
8381

8482
# attribute quoting options
8583
quote_attr_values = False
86-
quote_char = '"'
84+
quote_char = u'"'
8785
use_best_quote_char = True
8886

8987
# tag syntax options
@@ -159,7 +157,22 @@ def __init__(self, **kwargs):
159157
self.errors = []
160158
self.strict = False
161159

160+
def encode(self, string):
161+
assert(isinstance(string, unicode))
162+
if self.encoding:
163+
return string.encode(self.encoding, unicode_encode_errors)
164+
else:
165+
return string
166+
167+
def encodeStrict(self, string):
168+
assert(isinstance(string, unicode))
169+
if self.encoding:
170+
return string.encode(self.encoding, "strict")
171+
else:
172+
return string
173+
162174
def serialize(self, treewalker, encoding=None):
175+
self.encoding = encoding
163176
in_cdata = False
164177
self.errors = []
165178
if encoding and self.inject_meta_charset:
@@ -195,27 +208,19 @@ def serialize(self, treewalker, encoding=None):
195208
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
196209

197210
doctype += u">"
198-
199-
if encoding:
200-
yield doctype.encode(encoding)
201-
else:
202-
yield doctype
211+
yield self.encodeStrict(doctype)
203212

204213
elif type in ("Characters", "SpaceCharacters"):
205214
if type == "SpaceCharacters" or in_cdata:
206215
if in_cdata and token["data"].find("</") >= 0:
207216
self.serializeError(_("Unexpected </ in CDATA"))
208-
if encoding:
209-
yield token["data"].encode(encoding, "strict")
210-
else:
211-
yield token["data"]
212-
elif encoding:
213-
yield encode(escape(token["data"]), encoding)
217+
yield self.encode(token["data"])
214218
else:
215-
yield escape(token["data"])
219+
yield self.encode(escape(token["data"]))
216220

217221
elif type in ("StartTag", "EmptyTag"):
218222
name = token["name"]
223+
yield self.encodeStrict(u"<%s" % name)
219224
if name in rcdataElements and not self.escape_rcdata:
220225
in_cdata = True
221226
elif in_cdata:
@@ -225,69 +230,56 @@ def serialize(self, treewalker, encoding=None):
225230
#TODO: Add namespace support here
226231
k = attr_name
227232
v = attr_value
228-
if encoding:
229-
k = k.encode(encoding, "strict")
230-
attributes.append(' ')
233+
yield self.encodeStrict(u' ')
231234

232-
attributes.append(k)
235+
yield self.encodeStrict(k)
233236
if not self.minimize_boolean_attributes or \
234237
(k not in booleanAttributes.get(name, tuple()) \
235238
and k not in booleanAttributes.get("", tuple())):
236-
attributes.append("=")
239+
yield self.encodeStrict(u"=")
237240
if self.quote_attr_values or not v:
238241
quote_attr = True
239242
else:
240243
quote_attr = reduce(lambda x,y: x or (y in v),
241-
spaceCharacters + ">\"'=", False)
242-
v = v.replace("&", "&amp;")
243-
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
244-
if encoding:
245-
v = encode(v, encoding)
244+
spaceCharacters + u">\"'=", False)
245+
v = v.replace(u"&", u"&amp;")
246+
if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")
246247
if quote_attr:
247248
quote_char = self.quote_char
248249
if self.use_best_quote_char:
249-
if "'" in v and '"' not in v:
250-
quote_char = '"'
251-
elif '"' in v and "'" not in v:
252-
quote_char = "'"
253-
if quote_char == "'":
254-
v = v.replace("'", "&#39;")
250+
if u"'" in v and u'"' not in v:
251+
quote_char = u'"'
252+
elif u'"' in v and u"'" not in v:
253+
quote_char = u"'"
254+
if quote_char == u"'":
255+
v = v.replace(u"'", u"&#39;")
255256
else:
256-
v = v.replace('"', "&quot;")
257-
attributes.append(quote_char)
258-
attributes.append(v)
259-
attributes.append(quote_char)
257+
v = v.replace(u'"', u"&quot;")
258+
yield self.encodeStrict(quote_char)
259+
yield self.encode(v)
260+
yield self.encodeStrict(quote_char)
260261
else:
261-
attributes.append(v)
262+
yield self.encode(v)
262263
if name in voidElements and self.use_trailing_solidus:
263264
if self.space_before_trailing_solidus:
264-
attributes.append(" /")
265+
yield self.encodeStrict(u" /")
265266
else:
266-
attributes.append("/")
267-
if encoding:
268-
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
269-
else:
270-
yield u"<%s%s>" % (name, u"".join(attributes))
267+
yield self.encodeStrict(u"/")
268+
yield self.encode(u">")
271269

272270
elif type == "EndTag":
273271
name = token["name"]
274272
if name in rcdataElements:
275273
in_cdata = False
276274
elif in_cdata:
277275
self.serializeError(_("Unexpected child element of a CDATA element"))
278-
end_tag = u"</%s>" % name
279-
if encoding:
280-
end_tag = end_tag.encode(encoding, "strict")
281-
yield end_tag
276+
yield self.encodeStrict(u"</%s>" % name)
282277

283278
elif type == "Comment":
284279
data = token["data"]
285280
if data.find("--") >= 0:
286281
self.serializeError(_("Comment contains --"))
287-
comment = u"<!--%s-->" % token["data"]
288-
if encoding:
289-
comment = comment.encode(encoding, unicode_encode_errors)
290-
yield comment
282+
yield self.encodeStrict(u"<!--%s-->" % token["data"])
291283

292284
elif type == "Entity":
293285
name = token["name"]
@@ -298,9 +290,7 @@ def serialize(self, treewalker, encoding=None):
298290
data = entities[key]
299291
else:
300292
data = u"&%s;" % name
301-
if encoding:
302-
data = data.encode(encoding, unicode_encode_errors)
303-
yield data
293+
yield self.encodeStrict(data)
304294

305295
else:
306296
self.serializeError(token["data"])

0 commit comments

Comments
 (0)
0