Preliminary work on HTML serializer (with support for HTML5-specific … · awesome-python/html5lib-python@94025dc · GitHub
[go: up one dir, main page]

Skip to content

Commit 94025dc

Browse files
committed
Preliminary work on HTML serializer (with support for HTML5-specific syntax: optional tags, boolean attributes, etc.)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40611
1 parent 1baaa99 commit 94025dc

File tree

8 files changed

+641
-1
lines changed

8 files changed

+641
-1
lines changed

src/constants.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,27 @@
156156
"area",
157157
"col",
158158
"input"
159-
))
159+
))
160+
161+
# Boolean attributes, keyed by element name.  Each value is the set of
# attribute names that may be written without a value on that element.
# The "" entry is consulted for every element, whatever its name.
booleanAttributes = {
    "": frozenset(["irrelevant"]),
    # Embedded content and scripting
    "style": frozenset(["scoped"]),
    "img": frozenset(["ismap"]),
    "audio": frozenset(["autoplay", "controls"]),
    "video": frozenset(["autoplay", "controls"]),
    "script": frozenset(["defer", "async"]),
    # Interactive elements
    "details": frozenset(["open"]),
    "datagrid": frozenset(["multiple", "disabled"]),
    "command": frozenset(["hidden", "disabled", "checked", "default"]),
    "menu": frozenset(["autosubmit"]),
    # Form controls
    "fieldset": frozenset(["disabled", "readonly"]),
    "option": frozenset(["disabled", "readonly", "selected"]),
    "optgroup": frozenset(["disabled", "readonly"]),
    "button": frozenset(["disabled", "autofocus"]),
    "input": frozenset(["disabled", "readonly", "required", "autofocus",
                        "checked", "ismap"]),
    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
    "output": frozenset(["disabled", "readonly"]),
}
160180

161181
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
162182
# therefore can't be a frozenset.

src/serializer.py

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
from constants import voidElements, booleanAttributes
2+
3+
try:
4+
frozenset
5+
except NameError:
6+
# Import from the sets module for python 2.3
7+
from sets import ImmutableSet as frozenset
8+
9+
import gettext
10+
_ = gettext.gettext
11+
12+
def _slide(iterator):
13+
previous = None
14+
for token in iterator:
15+
if previous is not None:
16+
yield previous, token
17+
previous = token
18+
yield previous, None
19+
20+
class HTMLSerializer(object):
    """Serialize a stream of tree-walker tokens as HTML text.

    Tokens are dictionaries carrying a "type" key ("Doctype", "Characters",
    "SpaceCharacters", "StartTag", "EmptyTag", "EndTag" or "Comment") plus
    "name" and/or "data", depending on the type.

    Options are class attributes that can be overridden per instance by
    passing keyword arguments to the constructor:

    quote_attr_values - always quote attribute values, even when the value
        could be written unquoted.
    quote_char - the quote character used around attribute values.
    minimize_boolean_attributes - write boolean attributes as a bare name
        (``disabled`` instead of ``disabled=disabled``).
    trailing_solidus - text appended inside the start tag of void elements
        (``<br />``); use an empty string to emit nothing.
    omit_optional_tags - drop start/end tags that HTML allows to be omitted.
    strict - raise SerializeError on the first error instead of only
        recording it in ``self.errors``.
    """

    # Elements whose text content is written out verbatim, without escaping.
    cdata_elements = frozenset(("style", "script", "xmp", "iframe", "noembed", "noframes", "noscript"))

    quote_attr_values = False
    quote_char = '"'
    minimize_boolean_attributes = True

    trailing_solidus = " /"

    omit_optional_tags = True

    strict = False

    # An attribute value containing any of these characters (white space,
    # quotes, "<", ">", "=" or a backtick) cannot be written unquoted.
    _unquoted_unsafe = u"\t\n\x0b\x0c\r <>\"'=`"

    def __init__(self, **kwargs):
        """Create a serializer, overriding any option given as a keyword."""
        for attr in ("quote_attr_values", "quote_char",
                     "minimize_boolean_attributes", "trailing_solidus",
                     "omit_optional_tags", "strict"):
            if attr in kwargs:
                setattr(self, attr, kwargs[attr])
        self.errors = []

    def serialize(self, treewalker):
        """Generate the HTML text fragments for the tokens of *treewalker*.

        Problems found on the way are reported through serializeError(), so
        they end up in ``self.errors`` (which is reset when iteration
        starts).
        """
        in_cdata = False
        self.errors = []
        if self.omit_optional_tags:
            treewalker = self.filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                yield u"<!DOCTYPE %s>" % token["name"]

            elif token_type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                if token_type == "SpaceCharacters" or in_cdata:
                    # White space and CDATA content are emitted untouched.
                    if in_cdata and data.find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield data
                else:
                    # "&" must be escaped first so that the entities added
                    # for "<" and ">" are not escaped a second time.
                    data = data.replace("&", "&amp;")
                    data = data.replace("<", "&lt;")
                    data = data.replace(">", "&gt;")
                    yield data

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in self.cdata_elements:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield u"<%s%s>" % (name, self._serialize_attributes(name, token["data"]))

            elif token_type == "EndTag":
                name = token["name"]
                if name in self.cdata_elements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield u"</%s>" % name

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                yield u"<!--%s-->" % data

            else:
                self.serializeError(token["data"])

    def _serialize_attributes(self, name, attrs):
        """Return the text following the tag name inside a start tag.

        *attrs* is a sequence of (name, value) pairs or a mapping.  The
        attributes are written in sorted order; the caller's object is
        left unmodified.
        """
        if hasattr(attrs, "items"):
            attrs = attrs.items()
        # Sort a copy: the token belongs to the tree walker.
        attrs = list(attrs)
        attrs.sort()
        parts = []
        for key, value in attrs:
            parts.append(u" ")
            parts.append(key)
            if self.minimize_boolean_attributes and \
               (key in booleanAttributes.get(name, ()) or
                    key in booleanAttributes.get("", ())):
                # A minimized boolean attribute is just its name.
                continue
            parts.append(u"=")
            parts.append(self._serialize_attr_value(value))
        if name in voidElements and self.trailing_solidus:
            parts.append(self.trailing_solidus)
        return u"".join(parts)

    def _serialize_attr_value(self, value):
        """Return *value* escaped and, where required, quoted."""
        value = value.replace("&", "&amp;")
        # An empty value must be quoted too, otherwise "a=" would be emitted.
        needs_quotes = self.quote_attr_values or not value
        if not needs_quotes:
            for char in self._unquoted_unsafe:
                if char in value:
                    needs_quotes = True
                    break
        if not needs_quotes:
            return value
        quote = self.quote_char
        if quote == '"':
            value = value.replace('"', "&quot;")
        else:
            value = value.replace(quote, "&#%d;" % ord(quote))
        return quote + value + quote

    def render(self, treewalker, encoding='UTF-8', errors="strict"):
        """Return the whole serialization of *treewalker*, encoded.

        *encoding* and *errors* are passed to the string's encode() method.
        """
        return u''.join(self.serialize(treewalker)).encode(encoding, errors)

    def filter(self, treewalker):
        """Generate the tokens of *treewalker* minus the omissible tags.

        A start tag is only ever dropped when it has no attributes.
        """
        for token, following in _slide(treewalker):
            token_type = token["type"]
            if token_type == "StartTag":
                if token["data"] or not self.is_optional_start(token["name"], following):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record the error message *data*; raise SerializeError if strict."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError

    def is_optional_start(self, tagname, next):
        """Tell whether the start tag of *tagname* may be omitted.

        *next* is the token following the start tag, or None when the start
        tag is the last token.
        """
        next_type = None
        if next:
            next_type = next["type"]
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            return next_type == "StartTag"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if next_type == "StartTag":
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if next_type == "StartTag":
                # XXX: we do not look at the preceding event, so instead we never
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                return next["name"] == 'tr'
            else:
                return False
        # TODO
        return False

    def is_optional_end(self, tagname, next):
        """Tell whether the end tag of *tagname* may be omitted.

        *next* is the token following the end tag, or None when the end tag
        is the last token.
        """
        next_type = None
        if next:
            next_type = next["type"]
        # "No more content in the parent element" shows up as the parent's
        # end tag, or as the end of the token stream.
        parent_ends = next_type == "EndTag" or next_type is None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'option', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return parent_ends
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return parent_ends
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, blockquote, dl, fieldset,
            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
            # or ul element, or if there is no more content in the parent
            # element.
            if next_type == "StartTag":
                return next["name"] in ('address', 'blockquote',
                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5',
                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
            else:
                return parent_ends
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if next_type == "StartTag":
                return next["name"] == 'tfoot'
            elif tagname == 'tbody':
                return parent_ends
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            return parent_ends
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return parent_ends
        # TODO
        return False

    def _is_optional_end(self, tagname, next_event):
        """Former name of is_optional_end(), kept for existing callers."""
        return self.is_optional_end(tagname, next_event)
282+
283+
class SerializeError(Exception):
    """Error in serialized tree"""
    pass

src/treewalkers/__init__.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""A collection of modules for iterating through different kinds of
2+
tree, generating tokens identical to those produced by the tokenizer
3+
module.
4+
5+
To create a tree walker for a new type of tree, you need to
6+
implement a tree walker object (called TreeWalker by convention) that
7+
implements a 'serialize' method taking a tree as sole argument and
8+
returning an iterator generating tokens.
9+
"""
10+
11+
import os.path
12+
__path__.append(os.path.dirname(__path__[0]))
13+
14+
import dom
15+
import simpletree
16+
import etree
17+
try:
    import soup as beautifulsoup
except ImportError:
    # Beautiful Soup support is optional: carry on without the
    # "beautifulsoup" tree walker when it cannot be imported.  Any other
    # error raised while importing it is a real problem and is not hidden.
    pass
21+
22+
23+
def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree" and "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - The xml.dom.minidom DOM implementation
               "etree" - A generic builder for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "beautifulsoup" - Beautiful soup (if installed)

    implementation - (Currently applies to the "etree" tree type only). A module
                     implementing the tree type e.g. xml.etree.ElementTree or
                     lxml.etree.

    Any further keyword arguments are passed on to etree.getETreeModule().

    Returns None for an unrecognised treeType.  Raises KeyError when the
    module of a recognised tree type is not available in this package's
    namespace (for instance "beautifulsoup" without Beautiful Soup)."""

    treeType = treeType.lower()
    if treeType in ("dom", "simpletree", "beautifulsoup"):
        # The walker modules are looked up by name among this module's
        # globals; an optional one may be absent.
        try:
            module = globals()[treeType]
        except KeyError:
            raise KeyError('tree walker "%s" is not available '
                           '(its module could not be imported)' % treeType)
        return module.TreeWalker
    elif treeType == "etree":
        return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return None

0 commit comments

Comments
 (0)
0