|
import gettext

from constants import voidElements, booleanAttributes, spaceCharacters

try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset

_ = gettext.gettext
| 11 | + |
| 12 | +def _slide(iterator): |
| 13 | + previous = None |
| 14 | + for token in iterator: |
| 15 | + if previous is not None: |
| 16 | + yield previous, token |
| 17 | + previous = token |
| 18 | + yield previous, None |
| 19 | + |
class HTMLSerializer(object):
    """Serialize a stream of tree-walker tokens into HTML text.

    Tokens are dictionaries with a ``"type"`` key (``"Doctype"``,
    ``"Characters"``, ``"SpaceCharacters"``, ``"StartTag"``, ``"EmptyTag"``,
    ``"EndTag"`` or ``"Comment"``) and, depending on the type, ``"name"``
    and/or ``"data"`` keys.  For start/empty tags ``"data"`` is iterated as
    ``(attribute_name, value)`` pairs.

    The class attributes below are the default options; each can be
    overridden per instance by passing a keyword argument of the same name
    to the constructor.
    """

    # Elements whose character data is written out verbatim (no escaping).
    cdata_elements = frozenset(("style", "script", "xmp", "iframe",
                                "noembed", "noframes", "noscript"))

    # Always quote attribute values, even when quoting is not required.
    quote_attr_values = False
    # Quote character used around attribute values that get quoted.
    quote_char = '"'
    # Emit boolean attributes as a bare name (no "=value" part).
    minimize_boolean_attributes = True

    # Text appended inside the tag of void elements; a false value (e.g. "")
    # disables it.
    trailing_solidus = " /"

    # Drop start/end tags that may be omitted (see is_optional_start/end).
    omit_optional_tags = True

    # When true, serializeError() raises instead of only recording the error.
    strict = False

    def __init__(self, **kwargs):
        """Override default options from keyword arguments.

        Recognised names are ``quote_attr_values``, ``quote_char``,
        ``minimize_boolean_attributes``, ``trailing_solidus``,
        ``omit_optional_tags`` and ``strict``; any other keyword is ignored.
        """
        for attr in ("quote_attr_values", "quote_char",
                     "minimize_boolean_attributes", "trailing_solidus",
                     "omit_optional_tags", "strict"):
            if attr in kwargs:
                setattr(self, attr, kwargs[attr])
        self.errors = []

    def serialize(self, treewalker):
        """Generate the serialized output for *treewalker*, piece by piece.

        This is a generator yielding one text fragment per emitted token.
        ``self.errors`` is reset when iteration starts and collects the
        messages reported through :meth:`serializeError`.
        """
        in_cdata = False
        self.errors = []
        if self.omit_optional_tags:
            treewalker = self.filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                yield u"<!DOCTYPE %s>" % token["name"]

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and CDATA content are written verbatim.
                    if in_cdata and "</" in token["data"]:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield token["data"]
                else:
                    # "&" must be escaped first so that the entities
                    # introduced below are not escaped a second time.
                    yield token["data"] \
                        .replace("&", "&amp;") \
                        .replace("<", "&lt;") \
                        .replace(">", "&gt;")

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in self.cdata_elements:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                attributes = self._serialize_attributes(name, token["data"])
                if name in voidElements and self.trailing_solidus:
                    attributes += self.trailing_solidus
                yield u"<%s%s>" % (name, attributes)

            elif token_type == "EndTag":
                name = token["name"]
                if name in self.cdata_elements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield u"</%s>" % name

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError(_("Comment contains --"))
                yield u"<!--%s-->" % data

            else:
                # Unknown token type: report its payload as the error.
                self.serializeError(token["data"])

    def _serialize_attributes(self, name, attrs):
        """Return the attribute part of the tag for element *name*.

        *attrs* is iterated as ``(attribute_name, value)`` pairs.  The pairs
        are emitted in sorted order; sorting is done on a copy so the
        caller's token is not modified.
        """
        pairs = list(attrs)
        pairs.sort()
        parts = []
        for key, value in pairs:
            parts.append(' ')
            parts.append(key)
            is_boolean = (key in booleanAttributes.get(name, tuple())
                          or key in booleanAttributes.get("", tuple()))
            if self.minimize_boolean_attributes and is_boolean:
                # Minimized boolean attribute: the bare name is enough.
                continue
            parts.append("=")
            value = value.replace("&", "&amp;")
            if self._needs_quotes(value):
                if self.quote_char == '"':
                    value = value.replace('"', "&quot;")
                else:
                    value = value.replace(self.quote_char,
                                          "&#%u;" % ord(self.quote_char))
                parts.append(self.quote_char)
                parts.append(value)
                parts.append(self.quote_char)
            else:
                parts.append(value)
        return u"".join(parts)

    def _needs_quotes(self, value):
        """Return True when attribute *value* must be written in quotes."""
        if self.quote_attr_values or not value:
            # An empty unquoted value would produce an invalid ``name=``.
            return True
        # Membership is tested per character so this works whether
        # spaceCharacters is a string or a set of characters.
        for char in value:
            if char in spaceCharacters or char in "<>\"'=`":
                return True
        return False

    def render(self, treewalker, encoding='UTF-8', errors="strict"):
        """Serialize *treewalker* and return the result encoded as bytes.

        *encoding* and *errors* are passed to the string ``encode`` method.
        """
        return u''.join(self.serialize(treewalker)).encode(encoding, errors)

    def filter(self, treewalker):
        """Yield the tokens of *treewalker* minus the omissible tags.

        A start tag is dropped only when it has no attributes and
        :meth:`is_optional_start` allows it; an end tag is dropped when
        :meth:`is_optional_end` allows it.  All other tokens pass through.
        """
        for token, following in _slide(treewalker):
            if token is None:
                # Defensive: skip a placeholder pair that has no current
                # token (nothing to serialize for it).
                continue
            token_type = token["type"]
            if token_type == "StartTag":
                if token["data"] or not self.is_optional_start(token["name"], following):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record error message *data*; raise SerializeError in strict mode."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError

    def is_optional_start(self, tagname, next):
        """Return True if the start tag of *tagname* may be omitted.

        *next* is the token that follows the start tag, or ``None`` when the
        start tag is the last token of the stream.
        """
        if next:
            token_type = next["type"]
        else:
            token_type = None
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return token_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            return token_type == "StartTag"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if token_type in ("Comment", "SpaceCharacters"):
                return False
            elif token_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if token_type == "StartTag":
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if token_type == "StartTag":
                # XXX: we do not look at the preceding event, so instead we never
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                return next["name"] == 'tr'
            else:
                return False
        # TODO
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of *tagname* may be omitted.

        *next* is the token that follows the end tag, or ``None`` when the
        end tag is the last token of the stream.
        """
        if next:
            token_type = next["type"]
        else:
            token_type = None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return token_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'option', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if token_type == "StartTag":
                return next["name"] == tagname
            else:
                return token_type == "EndTag" or token_type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if token_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return token_type == "EndTag" or token_type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, blockquote, dl, fieldset,
            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
            # or ul element, or if there is no more content in the parent
            # element.
            if token_type == "StartTag":
                return next["name"] in ('address', 'blockquote',
                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5',
                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
            else:
                return token_type == "EndTag" or token_type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if token_type in ("Comment", "SpaceCharacters"):
                return False
            elif token_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if token_type == "StartTag":
                return next["name"] == 'tfoot'
            elif tagname == 'tbody':
                return token_type == "EndTag" or token_type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            return token_type == "EndTag" or token_type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if token_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return token_type == "EndTag" or token_type is None
        # TODO
        return False

    def _is_optional_end(self, tagname, next_event):
        """Former name of :meth:`is_optional_end`; kept for compatibility."""
        return self.is_optional_end(tagname, next_event)
| 282 | + |
class SerializeError(Exception):
    """Error in serialized tree"""
    pass
0 commit comments