8000 Externalized OptionalTagFilter into html5lib.filters.optionaltags. · awesome-python/html5lib-python@2a7e2e0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2a7e2e0

Browse files
committed
Externalized OptionalTagFilter into html5lib.filters.optionaltags.
HTMLSerializer now uses html5lib.filters.whitespace to strip unnecessary whitespace (still disabled by default because it changes the document and could break layout and/or some CSS) --HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40691
1 parent 576fa7e commit 2a7e2e0

File tree

2 files changed

+181
-181
lines changed

2 files changed

+181
-181
lines changed

src/filters/optionaltags.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
import _base
2+
3+
class Filter(_base.Filter):
    """Token-stream filter that drops tags HTML allows to be omitted.

    Iterating the filter walks ``self.source`` and yields every token except
    the start and end tags whose omission is permitted by the rules encoded
    in ``is_optional_start`` and ``is_optional_end``.  Tokens are mappings
    with at least a ``"type"`` key; tag tokens also carry ``"name"`` and
    start tags carry ``"data"``.

    NOTE(review): ``self.source`` is presumably set by ``_base.Filter`` from
    the constructor argument -- confirm against ``_base``.
    """

    def slider(self):
        """Yield ``(previous, current, next)`` triples over ``self.source``.

        ``previous`` is ``None`` for the first token and ``next`` is ``None``
        for the last one.  Nothing is yielded when the source is empty.
        """
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        # Flush the last token.  Skip this when the source produced nothing:
        # a (None, None, None) triple would make __iter__ subscript None.
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        """Yield the source tokens, leaving out omissible start/end tags."""
        for previous, token, following in self.slider():
            token_type = token["type"]
            if token_type == "StartTag":
                # A start tag with non-empty data (presumably its
                # attributes) is always kept.
                if token["data"] or not self.is_optional_start(
                        token["name"], previous, following):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of ``tagname`` may be omitted.

        ``previous`` and ``next`` are the tokens surrounding the start tag in
        the source stream, or ``None`` at either end of the stream.
        """
        next_type = next["type"] if next else None
        # Equality, not ``in``: ``tagname in 'html'`` is a substring test and
        # would also match names such as 'h', 'm' or 'tm'.
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            return next_type == "StartTag"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never
                # omit the body element's start tag if it's followed by a
                # script or a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first
            # thing inside the colgroup element is a col element, and if the
            # element is not immediately preceded by another colgroup
            # element whose end tag has been omitted.
            if next_type == "StartTag":
                # XXX: we do not look at the preceding event; instead
                # is_optional_end never omits a colgroup end tag that is
                # immediately followed by another colgroup element.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element
            # is not immediately preceded by a tbody, thead, or tfoot
            # element whose end tag has been omitted.
            if next_type == "StartTag":
                # is_optional_end may drop the end tag of a preceding
                # tbody/thead/tfoot, so keep this start tag whenever such an
                # end tag comes right before it in the source stream.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of ``tagname`` may be omitted.

        ``next`` is the token following the end tag in the source stream, or
        ``None`` when the end tag is the last token.
        """
        next_type = next["type"] if next else None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            # The same test is applied to head and body.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'option', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return next_type == "EndTag" or next_type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return next_type == "EndTag" or next_type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, blockquote, dl, fieldset,
            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
            # or ul element, or if there is no more content in the parent
            # element.
            if next_type == "StartTag":
                return next["name"] in (
                    'address', 'blockquote', 'dl', 'fieldset', 'form',
                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'menu',
                    'ol', 'p', 'pre', 'table', 'ul')
            else:
                return next_type == "EndTag" or next_type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: the end tag is also kept before an immediately
                # following colgroup element, because is_optional_start does
                # not look at the preceding event for colgroup.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the follower is a tbody, is_optional_start keeps that
            # tbody's start tag, so omitting the end tag here is safe.
            if next_type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return next_type == "EndTag" or next_type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # is_optional_start keeps the following tbody's start tag.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return next_type == "EndTag" or next_type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return next_type == "EndTag" or next_type is None
        return False

src/serializer.py

Lines changed: 6 additions & 181 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
import gettext
88
_ = gettext.gettext
99

10+
from filters.whitespace import Filter as WhitespaceFilter
11+
from filters.optionaltags import Filter as OptionalTagFilter
12+
1013
from constants import voidElements, booleanAttributes, spaceCharacters
1114

1215
spaceCharacters = u"".join(spaceCharacters)
@@ -48,183 +51,6 @@ def htmlentityreplace_errors(exc):
4851
def escape_text(text, encoding):
    """Escape ``&`` in *text* and return the result encoded with *encoding*.

    The module-level ``unicode_encode_errors`` is passed as the ``errors``
    argument of ``encode``.
    """
    # The ampersand must become the ``&amp;`` entity; replacing "&" with
    # itself left the text unescaped.
    return text.replace("&", "&amp;").encode(encoding, unicode_encode_errors)
5053

51-
class OptionalTagFilter:
52-
def __init__(self, source):
53-
self.source = source
54-
55-
def slider(self):
56-
previous1 = previous2 = None
57-
for token in self.source:
58-
if previous1 is not None:
59-
yield previous2, previous1, token
60-
previous2 = previous1
61-
previous1 = token
62-
yield previous2, previous1, None
63-
64-
def __iter__(self):
65-
for previous, token, next in self.slider():
66-
type = token["type"]
67-
if type == "StartTag":
68-
if token["data"] or not self.is_optional_start(token["name"], previous, next):
69-
yield token
70-
elif type == "EndTag":
71-
if not self.is_optional_end(token["name"], next):
72-
yield token
73-
else:
74-
yield token
75-
76-
def is_optional_start(self, tagname, previous, next):
77-
type = next and next["type"] or None
78-
if tagname in 'html':
79-
# An html element's start tag may be omitted if the first thing
80-
# inside the html element is not a space character or a comment.
81-
return type not in ("Comment", "SpaceCharacters")
82-
elif tagname == 'head':
83-
# A head element's start tag may be omitted if the first thing
84-
# inside the head element is an element.
85-
return type == "StartTag"
86-
elif tagname == 'body':
87-
# A body element's start tag may be omitted if the first thing
88-
# inside the body element is not a space character or a comment,
89-
# except if the first thing inside the body element is a script
90-
# or style element and the node immediately preceding the body
91-
# element is a head element whose end tag has been omitted.
92-
if type in ("Comment", "SpaceCharacters"):
93-
return False
94-
elif type == "StartTag":
95-
# XXX: we do not look at the preceding event, so we never omit
96-
# the body element's start tag if it's followed by a script or
97-
# a style element.
98-
return next["name"] not in ('script', 'style')
99-
else:
100-
return True
101-
elif tagname == 'colgroup':
102-
# A colgroup element's start tag may be omitted if the first thing
103-
# inside the colgroup element is a col element, and if the element
104-
# is not immediately preceeded by another colgroup element whose
105-
# end tag has been omitted.
106-
if type == "StartTag":
107-
# XXX: we do not look at the preceding event, so instead we never
108-
# omit the colgroup element's end tag when it is immediately
109-
# followed by another colgroup element. See is_optional_end.
110-
return next["name"] == "col"
111-
else:
112-
return False
113-
elif tagname == 'tbody':
114-
# A tbody element's start tag may be omitted if the first thing
115-
# inside the tbody element is a tr element, and if the element is
116-
# not immediately preceeded by a tbody, thead, or tfoot element
117-
# whose end tag has been omitted.
118-
if type == "StartTag":
119-
# omit the thead and tfoot elements' end tag when they are
120-
# immediately followed by a tbody element. See is_optional_end.
121-
if previous and previous['type'] == 'EndTag' and \
122-
previous['name'] in ('tbody','thead','tfoot'):
123-
return False
124-
return next["name"] == 'tr'
125-
else:
126-
return False
127-
return False
128-
129-
def is_optional_end(self, tagname, next):
130-
type = next and next["type"] or None
131-
if tagname in ('html', 'head', 'body'):
132-
# An html element's end tag may be omitted if the html element
133-
# is not immediately followed by a space character or a comment.
134-
return type not in ("Comment", "SpaceCharacters")
135-
elif tagname in ('li', 'optgroup', 'option', 'tr'):
136-
# A li element's end tag may be omitted if the li element is
137-
# immediately followed by another li element or if there is
138-
# no more content in the parent element.
139-
# An optgroup element's end tag may be omitted if the optgroup
140-
# element is immediately followed by another optgroup element,
141-
# or if there is no more content in the parent element.
142-
# An option element's end tag may be omitted if the option
143-
# element is immediately followed by another option element,
144-
# or if there is no more content in the parent element.
145-
# A tr element's end tag may be omitted if the tr element is
146-
# immediately followed by another tr element, or if there is
147-
# no more content in the parent element.
148-
if type == "StartTag":
149-
return next["name"] == tagname
150-
else:
151-
return type == "EndTag" or type is None
152-
elif tagname in ('dt', 'dd'):
153-
# A dt element's end tag may be omitted if the dt element is
154-
# immediately followed by another dt element or a dd element.
155-
# A dd element's end tag may be omitted if the dd element is
156-
# immediately followed by another dd element or a dt element,
157-
# or if there is no more content in the parent element.
158-
if type == "StartTag":
159-
return next["name"] in ('dt', 'dd')
160-
elif tagname == 'dd':
161-
return type == "EndTag" or type is None
162-
else:
163-
return False
164-
elif tagname == 'p':
165-
# A p element's end tag may be omitted if the p element is
166-
# immediately followed by an address, blockquote, dl, fieldset,
167-
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
168-
# or ul element, or if there is no more content in the parent
169-
# element.
170-
if type == "StartTag":
171-
return next["name"] in ('address', 'blockquote', \
172-
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
173-
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
174-
else:
175-
return type == "EndTag" or type is None
176-
elif tagname == 'colgroup':
177-
# A colgroup element's end tag may be omitted if the colgroup
178-
# element is not immediately followed by a space character or
179-
# a comment.
180-
if type in ("Comment", "SpaceCharacters"):
181-
return False
182-
elif type == "StartTag":
183-
# XXX: we also look for an immediately following colgroup
184-
# element. See is_optional_start.
185-
return next["name"] != 'colgroup'
186-
else:
187-
return True
188-
elif tagname in ('thead', 'tbody'):
189-
# A thead element's end tag may be omitted if the thead element
190-
# is immediately followed by a tbody or tfoot element.
191-
# A tbody element's end tag may be omitted if the tbody element
192-
# is immediately followed by a tbody or tfoot element, or if
193-
# there is no more content in the parent element.
194-
# A tfoot element's end tag may be omitted if the tfoot element
195-
# is immediately followed by a tbody element, or if there is no
196-
# more content in the parent element.
197-
# XXX: we never omit the end tag when the following element is
198-
# a tbody. See is_optional_start.
199-
if type == "StartTag":
200-
return next["name"] in ['tbody', 'tfoot']
201-
elif tagname == 'tbody':
202-
return type == "EndTag" or type is None
203-
else:
204-
return False
205-
elif tagname == 'tfoot':
206-
# A tfoot element's end tag may be omitted if the tfoot element
207-
# is immediately followed by a tbody element, or if there is no
208-
# more content in the parent element.
209-
# XXX: we never omit the end tag when the following element is
210-
# a tbody. See is_optional_start.
211-
if type == "StartTag":
212-
return next["name"] == 'tbody'
213-
else:
214-
return type == "EndTag" or type is None
215-
elif tagname in ('td', 'th'):
216-
# A td element's end tag may be omitted if the td element is
217-
# immediately followed by a td or th element, or if there is
218-
# no more content in the parent element.
219-
# A th element's end tag may be omitted if the th element is
220-
# immediately followed by a td or th element, or if there is
221-
# no more content in the parent element.
222-
if type == "StartTag":
223-
return next["name"] in ('td', 'th')
224-
else:
225-
return type == "EndTag" or type is None
226-
return False
227-
22854
class HTMLSerializer(object):
22955
cdata_elements = frozenset(("style", "script", "xmp", "iframe", "noembed", "noframes", "noscript"))
23056

@@ -258,8 +84,10 @@ def serialize(self, treewalker, encoding=None):
25884
self.errors = []
25985
if encoding and self.inject_meta_charset:
26086
treewalker = self.filter_inject_meta_charset(treewalker, encoding)
87+
# XXX: WhitespaceFilter should be used before OptionalTagFilter
88+
# for maximum efficiency of the latter filter
26189
if self.strip_whitespace:
262-
treewalker = self.filter_whitespace(treewalker)
90+
treewalker = WhitespaceFilter(treewalker)
26391
if self.omit_optional_tags:
26492
treewalker = OptionalTagFilter(treewalker)
26593
for token in treewalker:
@@ -387,9 +215,6 @@ def filter_inject_meta_charset(self, treewalker, encoding):
387215
"data": {"charset": encoding}}
388216
yield token
389217

390-
def filter_whitespace(self, treewalker):
391-
raise NotImplementedError
392-
393218
class SerializeError(Exception):
    """Error in serialized tree"""
    # Declared with ``class`` rather than ``def`` so that it is a real
    # exception type that can be raised and caught.
    pass

0 commit comments

Comments
 (0)
0