7
7
import gettext
8
8
_ = gettext .gettext
9
9
10
- from constants import voidElements , booleanAttributes , spaceCharacters
10
+ from constants import voidElements , booleanAttributes , spaceCharacters , entities
11
+
11
12
# Collapse the whitespace characters imported from ``constants`` into one
# unicode string.
spaceCharacters = u"".join(spaceCharacters)

# Reverse lookup table: character -> entity name, built from ``entities``.
default_entity_map = {}
for entity_name, character in entities.items():
    # The ampersand itself is deliberately left out of the table.
    if character == "&":
        continue
    # Prefer &lt; over &LT; (and similarly for &gt; etc.): when the name
    # already stored is the lower-cased form of this candidate, keep it.
    if default_entity_map.get(character) == entity_name.lower():
        continue
    default_entity_map[character] = entity_name
19
+
20
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    # Pluggable codec error handlers are unavailable: use the built-in
    # "strict" policy when encoding.
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    def htmlentityreplace_errors(ex, _fallback=xmlcharrefreplace_errors):
        """Codec error handler that substitutes HTML entity references.

        For a ``UnicodeEncodeError`` every unencodable character is replaced
        by ``&name;`` when ``default_entity_map`` knows a name for it, and by
        a decimal character reference (``&#NNN;``) otherwise.  Any other
        exception type is delegated to ``codecs.xmlcharrefreplace_errors``.

        ``_fallback`` is bound at definition time because the module-level
        name ``xmlcharrefreplace_errors`` is deleted below; it is not meant
        to be supplied by callers.

        Returns the ``(replacement, resume_position)`` tuple required by
        ``codecs.register_error``.
        """
        if not isinstance(ex, UnicodeEncodeError):
            return _fallback(ex)
        res = []
        for c in ex.object[ex.start:ex.end]:
            # Keep the character and its entity name in separate variables:
            # the fallback branch still needs the original character.
            name = default_entity_map.get(c)
            if name:
                # NOTE(review): assumes the map holds bare entity names
                # (e.g. "eacute"); the delimiters are only added when
                # they are not already present.
                if not name.startswith("&"):
                    res.append(u"&")
                res.append(name)
                if not name.endswith(";"):
                    res.append(u";")
            else:
                res.append(u"&#%d;" % ord(c))
        return (u"".join(res), ex.end)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    # Keep the helper names out of the module namespace.
    del register_error, xmlcharrefreplace_errors
43
+
13
44
def _slide (iterator ):
14
45
previous = None
15
46
for token in iterator :
@@ -23,40 +54,57 @@ class HTMLSerializer(object):
23
54
24
55
quote_attr_values = False
25
56
quote_char = '"'
57
+ use_best_quote_char = True
26
58
minimize_boolean_attributes = True
27
59
28
60
use_trailing_solidus = False
29
- trailing_solidus = " /"
61
+ space_before_trailing_solidus = True
30
62
31
63
omit_optional_tags = True
32
64
65
+ strip_whitespace = False
66
+
33
67
def __init__(self, **kwargs):
    """Apply recognised serializer options and reset the error list.

    Only the option names listed below are copied from ``kwargs`` onto the
    instance; any other keyword argument is ignored.
    """
    recognised_options = (
        "quote_attr_values",
        "quote_char",
        "use_best_quote_char",
        "minimize_boolean_attributes",
        "use_trailing_solidus",
        "space_before_trailing_solidus",
        "omit_optional_tags",
        "strip_whitespace",
    )
    for option in recognised_options:
        try:
            value = kwargs[option]
        except KeyError:
            # Option not supplied: the existing attribute value stays.
            continue
        setattr(self, option, value)
    self.errors = []
39
75
40
- def serialize (self , treewalker ):
76
+ def serialize (self , treewalker , encoding = None ):
41
77
in_cdata = False
42
78
self .errors = []
79
+ if self .strip_whitespace :
80
+ treewalker = self .filter_whitespace (treewalker )
43
81
if self .omit_optional_tags :
44
- treewalker = self .filter (treewalker )
82
+ treewalker = self .filter_optional_tags (treewalker )
45
83
for token in treewalker :
46
84
type = token ["type" ]
47
85
if type == "Doctype" :
48
- yield u"<!DOCTYPE %s>" % token ["name" ]
86
+ doctype = u"<!DOCTYPE %s>" % token ["name" ]
87
+ if encoding :
88
+ yield doctype .encode (encoding )
89
+ else :
90
+ yield doctype
49
91
50
92
elif type in ("Characters" , "SpaceCharacters" ):
51
93
if type == "SpaceCharacters" or in_cdata :
52
94
if in_cdata and token ["data" ].find ("</" ) >= 0 :
53
95
self .serializeError (_ ("Unexpected </ in CDATA" ))
54
- yield token ["data" ]
96
+ if encoding :
97
+ yield token ["data" ].encode (encoding , errors or "strict" )
98
+ else :
99
+ yield token ["data" ]
100
+ elif encoding :
101
+ yield token ["data" ].replace ("&" , "&" ) \
102
+ .encode (encoding , unicode_encode_errors )
55
103
else :
56
104
yield token ["data" ] \
57
105
.replace ("&" , "&" ) \
58
106
.replace ("<" , "<" ) \
59
- .replace (">" , ">" ) \
107
+ .replace (">" , ">" )
60
108
61
109
elif type in ("StartTag" , "EmptyTag" ):
62
110
name = token ["name" ]
@@ -70,53 +118,98 @@ def serialize(self, treewalker):
70
118
attrs .sort ()
71
119
attributes = []
72
120
for k ,v in attrs :
121
+ if encoding :
122
+ k = k .encode (encoding )
73
123
attributes .append (' ' )
124
+
74
125
attributes .append (k )
75
126
if not self .minimize_boolean_attributes or \
76
127
(k not in booleanAttributes .get (name , tuple ()) \
77
128
and k not in booleanAttributes .get ("" , tuple ())):
78
129
attributes .append ("=" )
79
- v = v .replace ("&" , "&" )
80
130
if self .quote_attr_values or not v :
81
131
quote_attr = True
82
132
else :
83
133
quote_attr = reduce (lambda x ,y : x or y in v ,
84
134
spaceCharacters + "<>\" '" , False )
135
+ v = v .replace ("&" , "&" )
136
+ if encoding :
137
+ v = v .encode (encoding , unicode_encode_errors )
85
138
if quote_attr :
86
- if self .quote_char == '"' :
87
- v = v .replace ('"' , """ )
139
+ quote_char = self .quote_char
140
+ if self .use_best_quote_char :
141
+ if "'" in v and '"' not in v :
142
+ quote_char = "'"
143
+ elif '"' in v and "'" not in v :
144
+ quote_char = '"'
145
+ if quote_char == "'" :
146
+ v = v .replace ("'" , "'" )
88
147
else :
89
- v = v .replace (self . quote_char , "&#%u;" % ord ( self . quote_char ) )
90
- attributes .append (self . quote_char )
148
+ v = v .replace ('"' , """ )
149
+ attributes .append (quote_char )
91
150
attributes .append (v )
92
- attributes .append (self . quote_char )
151
+ attributes .append (quote_char )
93
152
else :
94
153
attributes .append (v )
95
154
if name in voidElements and self .use_trailing_solidus :
96
- attributes .append (self .trailing_solidus )
97
- yield u"<%s%s>" % (name , u"" .join (attributes ))
155
+ if self .space_before_trailing_solidus :
156
+ attributes .append (" /" )
157
+ else :
158
+ attributes .append ("/" )
159
+ if encoding :
160
+ yield "<%s%s>" % (name .encode (encoding ), "" .join (attributes ))
161
+ else :
162
+ yield u"<%s%s>" % (name , u"" .join (attributes ))
98
163
99
164
elif type == "EndTag" :
100
165
name = token ["name" ]
101
166
if name in self .cdata_elements :
102
167
in_cdata = False
103
168
elif in_cdata :
104
169
self .serializeError (_ ("Unexpected child element of a CDATA element" ))
105
- yield u"</%s>" % name
170
+ end_tag = u"</%s>" % name
171
+ if encoding :
172
+ end_tag = end_tag .encode (encoding )
173
+ yield end_tag
106
174
107
175
elif type == "Comment" :
108
176
data = token ["data" ]
10BC0
109
177
if data .find ("--" ) >= 0 :
110
178
self .serializeError (_ ("Comment contains --" ))
111
- yield u"<!--%s-->" % token ["data" ]
179
+ comment = u"<!--%s-->" % token ["data" ]
180
+ if encoding :
181
+ comment = comment .encode (encoding , unicode_encode_errors )
182
+ yield comment
112
183
113
184
else :
114
185
self .serializeError (token ["data" ])
115
186
116
def render(self, treewalker, encoding=None):
    """Serialize ``treewalker`` and return the output as a single string.

    When ``encoding`` is truthy it is forwarded to ``serialize`` and the
    produced chunks are joined with a plain ``""``; otherwise ``serialize``
    is called without an encoding and the chunks are joined with ``u""``.
    """
    if encoding:
        glue, chunks = "", self.serialize(treewalker, encoding)
    else:
        glue, chunks = u"", self.serialize(treewalker)
    return glue.join(chunks)
118
192
119
- def filter (self , treewalker ):
193
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
    """Record ``data`` in ``self.errors``; raise when ``self.strict`` is set.

    The message is always appended first, so it is recorded even when
    ``SerializeError`` is raised afterwards.
    """
    # XXX The idea is to make data mandatory.
    self.errors.append(data)
    if not self.strict:
        return
    raise SerializeError
198
+
199
def filter_inject_meta_charset(self, treewalker, encoding=None):
    """Yield the tokens of ``treewalker``, adding one ``<meta charset>``.

    An ``EmptyTag`` token for ``meta`` whose ``data`` is
    ``{"charset": encoding}`` is emitted immediately after the first
    ``StartTag`` token named ``head`` (compared case-insensitively), so that
    it ends up inside the head element.  At most one token is injected.

    ``encoding`` is the charset value to announce.  When it is ``None`` or
    empty the token stream is passed through unchanged.
    """
    # Nothing to inject without an encoding, so start out as "already done".
    injected = not encoding
    for token in treewalker:
        yield token
        if (not injected and token["type"] == "StartTag"
                and token["name"].lower() == "head"):
            yield {"type": "EmptyTag", "name": "meta",
                   "data": {"charset": encoding}}
            injected = True
207
+
208
def filter_whitespace(self, treewalker):
    """Placeholder whitespace filter: returns ``treewalker`` unchanged."""
    # TODO
    unfiltered = treewalker
    return unfiltered
211
+
212
+ def filter_optional_tags (self , treewalker ):
120
213
for token , next in _slide (treewalker ):
121
214
type = token ["type" ]
122
215
if type == "StartTag" :
@@ -128,12 +221,6 @@ def filter(self, treewalker):
128
221
else :
129
222
yield token
130
223
131
- def serializeError (self , data = "XXX ERROR MESSAGE NEEDED" ):
132
- # XXX The idea is to make data mandatory.
133
- self .errors .append (data )
134
- if self .strict :
135
- raise SerializeError
136
-
137
224
def is_optional_start (self , tagname , next ):
138
225
type = next and next ["type" ] or None
139
226
if tagname in 'html' :
0 commit comments