awesome-python
diff --git a/‎src/filters/optionaltags.py
Lines changed: 175 additions & 0 deletions b/‎src/filters/optionaltags.py
Lines changed: 175 additions & 0 deletions
diff --git a/‎src/serializer.py
Lines changed: 6 additions & 181 deletions b/‎src/serializer.py
Lines changed: 6 additions & 181 deletions
@@ -0,0 +1,175 @@
+import _base
+
+class Filter(_base.Filter):
+    def slider(self):
+        previous1 = previous2 = None
+        for token in self.source:
+            if previous1 is not None:
+                yield previous2, previous1, token
+            previous2 = previous1
+            previous1 = token
+        yield previous2, previous1, None
+
+    def __iter__(self):
+        for previous, token, next in self.slider():
+            type = token["type"]
+            if type == "StartTag":
+                if token["data"] or not self.is_optional_start(token["name"], previous, next):
+                    yield token
+            elif type == "EndTag":
+                if not self.is_optional_end(token["name"], next):
+                    yield token
+            else:
+                yield token
+
+    def is_optional_start(self, tagname, previous, next):
+        type = next and next["type"] or None
+        if tagname in 'html':
+            # An html element's start tag may be omitted if the first thing
+            # inside the html element is not a space character or a comment.
+            return type not in ("Comment", "SpaceCharacters")
+        elif tagname == 'head':
+            # A head element's start tag may be omitted if the first thing
+            # inside the head element is an element.
+            return type == "StartTag"
+        elif tagname == 'body':
+            # A body element's start tag may be omitted if the first thing
+            # inside the body element is not a space character or a comment,
+            # except if the first thing inside the body element is a script
+            # or style element and the node immediately preceding the body
+            # element is a head element whose end tag has been omitted.
+            if type in ("Comment", "SpaceCharacters"):
+                return False
+            elif type == "StartTag":
+                # XXX: we do not look at the preceding event, so we never omit
+                # the body element's start tag if it's followed by a script or
+                # a style element.
+                return next["name"] not in ('script', 'style')
+            else:
+                return True
+        elif tagname == 'colgroup':
+            # A colgroup element's start tag may be omitted if the first thing
+            # inside the colgroup element is a col element, and if the element
+            # is not immediately preceeded by another colgroup element whose
+            # end tag has been omitted.
+            if type == "StartTag":
+                # XXX: we do not look at the preceding event, so instead we never
+                # omit the colgroup element's end tag when it is immediately
+                # followed by another colgroup element. See is_optional_end.
+                return next["name"] == "col"
+            else:
+                return False
+        elif tagname == 'tbody':
+            # A tbody element's start tag may be omitted if the first thing
+            # inside the tbody element is a tr element, and if the element is
+            # not immediately preceeded by a tbody, thead, or tfoot element
+            # whose end tag has been omitted.
+            if type == "StartTag":
+                # omit the thead and tfoot elements' end tag when they are
+                # immediately followed by a tbody element. See is_optional_end.
+                if previous and previous['type'] == 'EndTag' and \
+                  previous['name'] in ('tbody','thead','tfoot'):
+                    return False
+                return next["name"] == 'tr'
+            else:
+                return False
+        return False
+
+    def is_optional_end(self, tagname, next):
+        type = next and next["type"] or None
+        if tagname in ('html', 'head', 'body'):
+            # An html element's end tag may be omitted if the html element
+            # is not immediately followed by a space character or a comment.
+            return type not in ("Comment", "SpaceCharacters")
+        elif tagname in ('li', 'optgroup', 'option', 'tr'):
+            # A li element's end tag may be omitted if the li element is
+            # immediately followed by another li element or if there is
+            # no more content in the parent element.
+            # An optgroup element's end tag may be omitted if the optgroup
+            # element is immediately followed by another optgroup element,
+            # or if there is no more content in the parent element.
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if there is no more content in the parent element.
+            # A tr element's end tag may be omitted if the tr element is
+            # immediately followed by another tr element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] == tagname
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('dt', 'dd'):
+            # A dt element's end tag may be omitted if the dt element is
+            # immediately followed by another dt element or a dd element.
+            # A dd element's end tag may be omitted if the dd element is
+            # immediately followed by another dd element or a dt element,
+            # or if there is no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('dt', 'dd')
+            elif tagname == 'dd':
+                return type == "EndTag" or type is None
+            else:
+                return False
+        elif tagname == 'p':
+            # A p element's end tag may be omitted if the p element is
+            # immediately followed by an address, blockquote, dl, fieldset,
+            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+            # or ul  element, or if there is no more content in the parent
+            # element.
+            if type == "StartTag":
+                return next["name"] in ('address', 'blockquote', \
+                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
+                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname == 'colgroup':
+            # A colgroup element's end tag may be omitted if the colgroup
+            # element is not immediately followed by a space character or
+            # a comment.
+            if type in ("Comment", "SpaceCharacters"):
+                return False
+            elif type == "StartTag":
+                # XXX: we also look for an immediately following colgroup
+                # element. See is_optional_start.
+                return next["name"] != 'colgroup'
+            else:
+                return True
+        elif tagname in ('thead', 'tbody'):
+            # A thead element's end tag may be omitted if the thead element
+            # is immediately followed by a tbody or tfoot element.
+            # A tbody element's end tag may be omitted if the tbody element
+            # is immediately followed by a tbody or tfoot element, or if
+            # there is no more content in the parent element.
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # XXX: we never omit the end tag when the following element is
+            # a tbody. See is_optional_start.
+            if type == "StartTag":
+                return next["name"] in ['tbody', 'tfoot']
+            elif tagname == 'tbody':
+                return type == "EndTag" or type is None
+            else:
+                return False
+        elif tagname == 'tfoot':
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # XXX: we never omit the end tag when the following element is
+            # a tbody. See is_optional_start.
+            if type == "StartTag":
+                return next["name"] == 'tbody'
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('td', 'th'):
+            # A td element's end tag may be omitted if the td element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            # A th element's end tag may be omitted if the th element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('td', 'th')
+            else:
+                return type == "EndTag" or type is None
+        return False
@@ -7,6 +7,9 @@
 import gettext
 _ = gettext.gettext
 
+from filters.whitespace import Filter as WhitespaceFilter
+from filters.optionaltags import Filter as OptionalTagFilter
+
 from constants import voidElements, booleanAttributes, spaceCharacters
 
 spaceCharacters = u"".join(spaceCharacters)
@@ -48,183 +51,6 @@ def htmlentityreplace_errors(exc):
 def escape_text(text, encoding):
     return text.replace("&", "&amp;").encode(encoding, unicode_encode_errors)
 
-class OptionalTagFilter:
-    def __init__(self, source):
-        self.source = source
-
-    def slider(self):
-        previous1 = previous2 = None
-        for token in self.source:
-            if previous1 is not None:
-                yield previous2, previous1, token
-            previous2 = previous1
-            previous1 = token
-        yield previous2, previous1, None
-
-    def __iter__(self):
-        for previous, token, next in self.slider():
-            type = token["type"]
-            if type == "StartTag":
-                if token["data"] or not self.is_optional_start(token["name"], previous, next):
-                    yield token
-            elif type == "EndTag":
-                if not self.is_optional_end(token["name"], next):
-                    yield token
-            else:
-                yield token
-
-    def is_optional_start(self, tagname, previous, next):
-        type = next and next["type"] or None
-        if tagname in 'html':
-            # An html element's start tag may be omitted if the first thing
-            # inside the html element is not a space character or a comment.
-            return type not in ("Comment", "SpaceCharacters")
-        elif tagname == 'head':
-            # A head element's start tag may be omitted if the first thing
-            # inside the head element is an element.
-            return type == "StartTag"
-        elif tagname == 'body':
-            # A body element's start tag may be omitted if the first thing
-            # inside the body element is not a space character or a comment,
-            # except if the first thing inside the body element is a script
-            # or style element and the node immediately preceding the body
-            # element is a head element whose end tag has been omitted.
-            if type in ("Comment", "SpaceCharacters"):
-                return False
-            elif type == "StartTag":
-                # XXX: we do not look at the preceding event, so we never omit
-                # the body element's start tag if it's followed by a script or
-                # a style element.
-                return next["name"] not in ('script', 'style')
-            else:
-                return True
-        elif tagname == 'colgroup':
-            # A colgroup element's start tag may be omitted if the first thing
-            # inside the colgroup element is a col element, and if the element
-            # is not immediately preceeded by another colgroup element whose
-            # end tag has been omitted.
-            if type == "StartTag":
-                # XXX: we do not look at the preceding event, so instead we never
-                # omit the colgroup element's end tag when it is immediately
-                # followed by another colgroup element. See is_optional_end.
-                return next["name"] == "col"
-            else:
-                return False
-        elif tagname == 'tbody':
-            # A tbody element's start tag may be omitted if the first thing
-            # inside the tbody element is a tr element, and if the element is
-            # not immediately preceeded by a tbody, thead, or tfoot element
-            # whose end tag has been omitted.
-            if type == "StartTag":
-                # omit the thead and tfoot elements' end tag when they are
-                # immediately followed by a tbody element. See is_optional_end.
-                if previous and previous['type'] == 'EndTag' and \
-                  previous['name'] in ('tbody','thead','tfoot'):
-                    return False
-                return next["name"] == 'tr'
-            else:
-                return False
-        return False
-
-    def is_optional_end(self, tagname, next):
-        type = next and next["type"] or None
-        if tagname in ('html', 'head', 'body'):
-            # An html element's end tag may be omitted if the html element
-            # is not immediately followed by a space character or a comment.
-            return type not in ("Comment", "SpaceCharacters")
-        elif tagname in ('li', 'optgroup', 'option', 'tr'):
-            # A li element's end tag may be omitted if the li element is
-            # immediately followed by another li element or if there is
-            # no more content in the parent element.
-            # An optgroup element's end tag may be omitted if the optgroup
-            # element is immediately followed by another optgroup element,
-            # or if there is no more content in the parent element.
-            # An option element's end tag may be omitted if the option
-            # element is immediately followed by another option element,
-            # or if there is no more content in the parent element.
-            # A tr element's end tag may be omitted if the tr element is
-            # immediately followed by another tr element, or if there is
-            # no more content in the parent element.
-            if type == "StartTag":
-                return next["name"] == tagname
-            else:
-                return type == "EndTag" or type is None
-        elif tagname in ('dt', 'dd'):
-            # A dt element's end tag may be omitted if the dt element is
-            # immediately followed by another dt element or a dd element.
-            # A dd element's end tag may be omitted if the dd element is
-            # immediately followed by another dd element or a dt element,
-            # or if there is no more content in the parent element.
-            if type == "StartTag":
-                return next["name"] in ('dt', 'dd')
-            elif tagname == 'dd':
-                return type == "EndTag" or type is None
-            else:
-                return False
-        elif tagname == 'p':
-            # A p element's end tag may be omitted if the p element is
-            # immediately followed by an address, blockquote, dl, fieldset,
-            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
-            # or ul  element, or if there is no more content in the parent
-            # element.
-            if type == "StartTag":
-                return next["name"] in ('address', 'blockquote', \
-                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
-                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
-            else:
-                return type == "EndTag" or type is None
-        elif tagname == 'colgroup':
-            # A colgroup element's end tag may be omitted if the colgroup
-            # element is not immediately followed by a space character or
-            # a comment.
-            if type in ("Comment", "SpaceCharacters"):
-                return False
-            elif type == "StartTag":
-                # XXX: we also look for an immediately following colgroup
-                # element. See is_optional_start.
-                return next["name"] != 'colgroup'
-            else:
-                return True
-        elif tagname in ('thead', 'tbody'):
-            # A thead element's end tag may be omitted if the thead element
-            # is immediately followed by a tbody or tfoot element.
-            # A tbody element's end tag may be omitted if the tbody element
-            # is immediately followed by a tbody or tfoot element, or if
-            # there is no more content in the parent element.
-            # A tfoot element's end tag may be omitted if the tfoot element
-            # is immediately followed by a tbody element, or if there is no
-            # more content in the parent element.
-            # XXX: we never omit the end tag when the following element is
-            # a tbody. See is_optional_start.
-            if type == "StartTag":
-                return next["name"] in ['tbody', 'tfoot']
-            elif tagname == 'tbody':
-                return type == "EndTag" or type is None
-            else:
-                return False
-        elif tagname == 'tfoot':
-            # A tfoot element's end tag may be omitted if the tfoot element
-            # is immediately followed by a tbody element, or if there is no
-            # more content in the parent element.
-            # XXX: we never omit the end tag when the following element is
-            # a tbody. See is_optional_start.
-            if type == "StartTag":
-                return next["name"] == 'tbody'
-            else:
-                return type == "EndTag" or type is None
-        elif tagname in ('td', 'th'):
-            # A td element's end tag may be omitted if the td element is
-            # immediately followed by a td or th element, or if there is
-            # no more content in the parent element.
-            # A th element's end tag may be omitted if the th element is
-            # immediately followed by a td or th element, or if there is
-            # no more content in the parent element.
-            if type == "StartTag":
-                return next["name"] in ('td', 'th')
-            else:
-                return type == "EndTag" or type is None
-        return False
-
 class HTMLSerializer(object):
     cdata_elements = frozenset(("style", "script", "xmp", "iframe", "noembed", "noframes", "noscript"))
 
@@ -258,8 +84,10 @@ def serialize(self, treewalker, encoding=None):
         self.errors = []
         if encoding and self.inject_meta_charset:
             treewalker = self.filter_inject_meta_charset(treewalker, encoding)
+        # XXX: WhitespaceFilter should be used before OptionalTagFilter
+        # for maximum efficiently of this latter filter
         if self.strip_whitespace:
-            treewalker = self.filter_whitespace(treewalker)
+            treewalker = WhitespaceFilter(treewalker)
         if self.omit_optional_tags:
             treewalker = OptionalTagFilter(treewalker)
         for token in treewalker:
@@ -387,9 +215,6 @@ def filter_inject_meta_charset(self, treewalker, encoding):
                     "data": {"charset": encoding}}
             yield token
 
-    def filter_whitespace(self, treewalker):
-        raise NotImplementedError
-
 def SerializeError(Exception):
     """Error in serialized tree"""
     pass