Added support for encoding the output (serialize generates string() i… · awesome-python/html5lib-python@a521291 · GitHub
[go: up one dir, main page]

Skip to content

Commit a521291

Browse files
committed
Added support for encoding the output (serialize generates string() instead of unicode()) and refactored options
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40632
1 parent d25f3c6 commit a521291

File tree

2 files changed

+115
-29
lines changed

2 files changed

+115
-29
lines changed

src/serializer.py

Lines changed: 115 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,40 @@
77
import gettext
88
_ = gettext.gettext
99

10-
from constants import voidElements, booleanAttributes, spaceCharacters
10+
from constants import voidElements, booleanAttributes, spaceCharacters, entities
11+
1112
spaceCharacters = u"".join(spaceCharacters)
1213

14+
default_entity_map = {}
15+
for k, v in entities.items():
16+
if v != "&" and default_entity_map.get(v) != k.lower():
17+
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
18+
default_entity_map[v] = k
19+
20+
try:
21+
from codecs import register_error, xmlcharrefreplace_errors
22+
except ImportError:
23+
unicode_encode_errors = "strict"
24+
else:
25+
unicode_encode_errors = "htmlentityreplace"
26+
27+
def htmlentityreplace_errors(ex):
28+
if isinstance(ex, UnicodeEncodeError):
29+
res = []
30+
for c in ex.object[ex.start:ex.end]:
31+
c = default_entity_map.get(c)
32+
if c:
33+
res.append(c)
34+
else:
35+
res.append(c.encode(ex.encoding, "xmlcharrefreplace"))
36+
return (u"".join(res), ex.end)
37+
else:
38+
return xmlcharrefreplace_errors(ex)
39+
40+
register_error(unicode_encode_errors, htmlentityreplace_errors)
41+
42+
del register_error, xmlcharrefreplace_errors
43+
1344
def _slide(iterator):
1445
previous = None
1546
for token in iterator:
@@ -23,40 +54,57 @@ class HTMLSerializer(object):
2354

2455
quote_attr_values = False
2556
quote_char = '"'
57+
use_best_quote_char = True
2658
minimize_boolean_attributes = True
2759

2860
use_trailing_solidus = False
29-
trailing_solidus = " /"
61+
space_before_trailing_solidus = True
3062

3163
omit_optional_tags = True
3264

65+
strip_whitespace = False
66+
3367
def __init__(self, **kwargs):
34-
for attr in ("quote_attr_values", "quote_char", "minimize_boolean_attributes",
35-
"trailing_solidus", "use_trailing_solidus", "omit_optional_tags"):
68+
for attr in ("quote_attr_values", "quote_char", "use_best_quote_char",
69+
"minimize_boolean_attributes", "use_trailing_solidus",
70+
"space_before_trailing_solidus", "omit_optional_tags",
71+
"strip_whitespace"):
3672
if attr in kwargs:
3773
setattr(self, attr, kwargs[attr])
3874
self.errors = []
3975

40-
def serialize(self, treewalker):
76+
def serialize(self, treewalker, encoding=None):
4177
in_cdata = False
4278
self.errors = []
79+
if self.strip_whitespace:
80+
treewalker = self.filter_whitespace(treewalker)
4381
if self.omit_optional_tags:
44-
treewalker = self.filter(treewalker)
82+
treewalker = self.filter_optional_tags(treewalker)
4583
for token in treewalker:
4684
type = token["type"]
4785
if type == "Doctype":
48-
yield u"<!DOCTYPE %s>" % token["name"]
86+
doctype = u"<!DOCTYPE %s>" % token["name"]
87+
if encoding:
88+
yield doctype.encode(encoding)
89+
else:
90+
yield doctype
4991

5092
elif type in ("Characters", "SpaceCharacters"):
5193
if type == "SpaceCharacters" or in_cdata:
5294
if in_cdata and token["data"].find("</") >= 0:
5395
self.serializeError(_("Unexpected </ in CDATA"))
54-
yield token["data"]
96+
if encoding:
97+
yield token["data"].encode(encoding, errors or "strict")
98+
else:
99+
yield token["data"]
100+
elif encoding:
101+
yield token["data"].replace("&", "&amp;") \
102+
.encode(encoding, unicode_encode_errors)
55103
else:
56104
yield token["data"] \
57105
.replace("&", "&amp;") \
58106
.replace("<", "&lt;") \
59-
.replace(">", "&gt;") \
107+
.replace(">", "&gt;")
60108

61109
elif type in ("StartTag", "EmptyTag"):
62110
name = token["name"]
@@ -70,53 +118,98 @@ def serialize(self, treewalker):
70118
attrs.sort()
71119
attributes = []
72120
for k,v in attrs:
121+
if encoding:
122+
k = k.encode(encoding)
73123
attributes.append(' ')
124+
74125
attributes.append(k)
75126
if not self.minimize_boolean_attributes or \
76127
(k not in booleanAttributes.get(name, tuple()) \
77128
and k not in booleanAttributes.get("", tuple())):
78129
attributes.append("=")
79-
v = v.replace("&", "&amp;")
80130
if self.quote_attr_values or not v:
81131
quote_attr = True
82132
else:
83133
quote_attr = reduce(lambda x,y: x or y in v,
84134
spaceCharacters + "<>\"'", False)
135+
v = v.replace("&", "&amp;")
136+
if encoding:
137+
v = v.encode(encoding, unicode_encode_errors)
85138
if quote_attr:
86-
if self.quote_char == '"':
87-
v = v.replace('"', "&quot;")
139+
quote_char = self.quote_char
140+
if self.use_best_quote_char:
141+
if "'" in v and '"' not in v:
142+
quote_char = "'"
143+
elif '"' in v and "'" not in v:
144+
quote_char = '"'
145+
if quote_char == "'":
146+
v = v.replace("'", "&#39;")
88147
else:
89-
v = v.replace(self.quote_char, "&#%u;" % ord(self.quote_char))
90-
attributes.append(self.quote_char)
148+
v = v.replace('"', "&quot;")
149+
attributes.append(quote_char)
91150
attributes.append(v)
92-
attributes.append(self.quote_char)
151+
attributes.append(quote_char)
93152
else:
94153
attributes.append(v)
95154
if name in voidElements and self.use_trailing_solidus:
96-
attributes.append(self.trailing_solidus)
97-
yield u"<%s%s>" % (name, u"".join(attributes))
155+
if self.space_before_trailing_solidus:
156+
attributes.append(" /")
157+
else:
158+
attributes.append("/")
159+
if encoding:
160+
yield "<%s%s>" % (name.encode(encoding), "".join(attributes))
161+
else:
162+
yield u"<%s%s>" % (name, u"".join(attributes))
98163

99164
elif type == "EndTag":
100165
name = token["name"]
101166
if name in self.cdata_elements:
102167
in_cdata = False
103168
elif in_cdata:
104169
self.serializeError(_("Unexpected child element of a CDATA element"))
105-
yield u"</%s>" % name
170+
end_tag = u"</%s>" % name
171+
if encoding:
172+
end_tag = end_tag.encode(encoding)
173+
yield end_tag
106174

107175
elif type == "Comment":
108176
data = token["data"]
109177
if data.find("--") >= 0:
110178
self.serializeError(_("Comment contains --"))
111-
yield u"<!--%s-->" % token["data"]
179+
comment = u"<!--%s-->" % token["data"]
180+
if encoding:
181+
comment = comment.encode(encoding, unicode_encode_errors)
182+
yield comment
112183

113184
else:
114185
self.serializeError(token["data"])
115186

116-
def render(self, treewalker, encoding='UTF-8', errors="strict"):
117-
u''.join(list(self.serialize(treewalker))).encode(encoding, errors)
187+
def render(self, treewalker, encoding=None):
188+
if encoding:
189+
return "".join(list(self.serialize(treewalker, encoding)))
190+
else:
191+
return u"".join(list(self.serialize(treewalker)))
118192

119-
def filter(self, treewalker):
193+
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
194+
# XXX The idea is to make data mandatory.
195+
self.errors.append(data)
196+
if self.strict:
197+
raise SerializeError
198+
199+
def filter_inject_meta_charset(self, treewalker):
200+
done = False
201+
for token in treewalker:
202+
if not done and token["type"] == "StartTag" \
203+
and token["name"].lower() == "head":
204+
yield {"type": "EmptyTag", "name": "meta", \
205+
"data": {"charset": encoding}}
206+
yield token
207+
208+
def filter_whitespace(self, treewalker):
209+
# TODO
210+
return treewalker
211+
212+
def filter_optional_tags(self, treewalker):
120213
for token, next in _slide(treewalker):
121214
type = token["type"]
122215
if type == "StartTag":
@@ -128,12 +221,6 @@ def filter(self, treewalker):
128221
else:
129222
yield token
130223

131-
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
132-
# XXX The idea is to make data mandatory.
133-
self.errors.append(data)
134-
if self.strict:
135-
raise SerializeError
136-
137224
def is_optional_start(self, tagname, next):
138225
type = next and next["type"] or None
139226
if tagname in 'html':

tests/test_treewalkers.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ def load(f):
2525
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
2626

2727
import html5parser
28-
import serializer
2928
#Run tests over all treewalkers/treebuilders pairs
3029
#XXX - it would be nice to automate finding all treewalkers or to allow running just one
3130

0 commit comments

Comments
 (0)
0