|
7 | 7 | import gettext
|
8 | 8 | _ = gettext.gettext
|
9 | 9 |
|
| 10 | +from filters.whitespace import Filter as WhitespaceFilter |
| 11 | +from filters.optionaltags import Filter as OptionalTagFilter |
| 12 | + |
10 | 13 | from constants import voidElements, booleanAttributes, spaceCharacters
|
11 | 14 |
|
12 | 15 | spaceCharacters = u"".join(spaceCharacters)
|
@@ -48,183 +51,6 @@ def htmlentityreplace_errors(exc):
|
def escape_text(text, encoding):
    """Escape ampersands in *text* and encode the result.

    Every "&" is replaced by the character reference "&amp;" so the text
    cannot be misread as the start of an entity, then the string is
    encoded to *encoding*.

    unicode_encode_errors is a module-level name defined outside this
    block; presumably it names the codec error handler used for characters
    the target encoding cannot represent -- confirm against its definition.
    """
    # The replacement must be the entity "&amp;"; replacing "&" with "&"
    # would be a no-op and leave the output unescaped.
    return text.replace("&", "&amp;").encode(encoding, unicode_encode_errors)
|
50 | 53 |
|
51 |
class OptionalTagFilter(object):
    """Token-stream filter that drops tags HTML allows to be omitted.

    Wraps an iterable of token dicts.  Every token has a "type" key; tag
    tokens ("StartTag" / "EndTag") also have a "name", and start tags have
    "data" (presumably their attributes).  Iterating the filter yields the
    source tokens minus the start/end tags that is_optional_start() and
    is_optional_end() judge to be omissible.
    """

    def __init__(self, source):
        """Remember *source*, the iterable of token dicts to filter."""
        self.source = source

    def slider(self):
        """Yield (previous, current, following) triples over the source.

        *previous* is None for the first token and *following* is None for
        the last one.  An empty source yields nothing.
        """
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        # Flush the last token.  Without this guard an empty source would
        # produce a (None, None, None) triple and crash __iter__.
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        """Yield every source token except omissible start/end tags."""
        for previous, token, following in self.slider():
            kind = token["type"]
            if kind == "StartTag":
                # A start tag with non-empty "data" is always kept.
                if token["data"] or not self.is_optional_start(
                        token["name"], previous, following):
                    yield token
            elif kind == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of *tagname* may be omitted.

        *previous* and *next* are the neighbouring tokens in the source
        stream (or None at either end of the stream).
        """
        # None when there is no following token.
        next_type = None
        if next:
            next_type = next["type"] or None

        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            return next_type == "StartTag"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never
                # omit the body element's start tag if it's followed by a
                # script or a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first
            # thing inside the colgroup element is a col element, and if the
            # element is not immediately preceded by another colgroup
            # element whose end tag has been omitted.
            if next_type == "StartTag":
                # XXX: we do not look at the preceding event; instead
                # is_optional_end never omits a colgroup end tag that is
                # immediately followed by another colgroup element.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element
            # is not immediately preceded by a tbody, thead, or tfoot
            # element whose end tag has been omitted.
            if next_type == "StartTag":
                # is_optional_end may drop the end tag of a preceding
                # tbody/thead/tfoot, so the start tag is always kept when
                # the previous source token is such an end tag.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of *tagname* may be omitted.

        *next* is the token following the end tag in the source stream,
        or None when the end tag is the last token.
        """
        # None when there is no following token.
        next_type = None
        if next:
            next_type = next["type"] or None
        # True when nothing but the parent's end tag (or the end of the
        # stream) follows, i.e. "no more content in the parent element".
        parent_is_done = next_type == "EndTag" or next_type is None

        if tagname in ('html', 'head', 'body'):
            # The end tag of an html, head or body element may be omitted
            # if the element is not immediately followed by a space
            # character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'option', 'tr'):
            # The end tag of a li, optgroup, option or tr element may be
            # omitted if the element is immediately followed by another
            # element of the same kind, or if there is no more content in
            # the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return parent_is_done
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return parent_is_done
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, blockquote, dl, fieldset,
            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
            # or ul element, or if there is no more content in the parent
            # element.
            if next_type == "StartTag":
                return next["name"] in (
                    'address', 'blockquote', 'dl', 'fieldset', 'form',
                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'menu',
                    'ol', 'p', 'pre', 'table', 'ul')
            else:
                return parent_is_done
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: the end tag is also kept before an immediately
                # following colgroup element, because is_optional_start
                # does not look at the preceding token for colgroup.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the follower is a tbody, is_optional_start keeps that
            # tbody's start tag so the two elements stay separated.
            if next_type == "StartTag":
                return next["name"] in ('tbody', 'tfoot')
            elif tagname == 'tbody':
                return parent_is_done
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return parent_is_done
        elif tagname in ('td', 'th'):
            # The end tag of a td or th element may be omitted if the
            # element is immediately followed by a td or th element, or if
            # there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return parent_is_done
        return False
227 |
| - |
228 | 54 | class HTMLSerializer(object):
|
229 | 55 | cdata_elements = frozenset(("style", "script", "xmp", "iframe", "noembed", "noframes", "noscript"))
|
230 | 56 |
|
@@ -258,8 +84,10 @@ def serialize(self, treewalker, encoding=None):
|
258 | 84 | self.errors = []
|
259 | 85 | if encoding and self.inject_meta_charset:
|
260 | 86 | treewalker = self.filter_inject_meta_charset(treewalker, encoding)
|
| 87 | + # XXX: WhitespaceFilter should be used before OptionalTagFilter |
| 88 | + # for maximum efficiency of this latter filter |
261 | 89 | if self.strip_whitespace:
|
262 |
| - treewalker = self.filter_whitespace(treewalker) |
| 90 | + treewalker = WhitespaceFilter(treewalker) |
263 | 91 | if self.omit_optional_tags:
|
264 | 92 | treewalker = OptionalTagFilter(treewalker)
|
265 | 93 | for token in treewalker:
|
@@ -387,9 +215,6 @@ def filter_inject_meta_charset(self, treewalker, encoding):
|
387 | 215 | "data": {"charset": encoding}}
|
388 | 216 | yield token
|
389 | 217 |
|
390 |
def filter_whitespace(self, treewalker):
    """Unimplemented whitespace-stripping hook; always raises.

    The *treewalker* argument is accepted but never used.
    """
    raise NotImplementedError()
392 |
| - |
class SerializeError(Exception):
    """Error in serialized tree"""
    # Must be a class deriving from Exception: declared with ``def`` it is
    # a plain function, and ``raise SerializeError`` fails with TypeError.
    pass
|
0 commit comments