|
| 1 | +import os |
| 2 | +import sys |
| 3 | +import itertools |
| 4 | +import copy |
| 5 | + |
| 6 | +#RELEASE remove |
| 7 | +# XXX Allow us to import the sibling module |
| 8 | +os.chdir(os.path.split(os.path.abspath(__file__))[0]) |
| 9 | +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src"))) |
| 10 | + |
| 11 | +import html5parser |
| 12 | +from treebuilders import simpletree |
| 13 | +#END RELEASE |
| 14 | + |
| 15 | +#RELEASE add |
| 16 | +#import html5lib |
| 17 | +#from html5lib import html5parser |
| 18 | +#from html5lib.treebuilders import simpletree |
| 19 | +#END RELEASE |
| 20 | + |
| 21 | +class HTMLSanitizer(object): |
| 22 | + |
| 23 | + default_acceptable_elements = ('a', 'abbr', 'acronym', 'address', 'area', |
| 24 | + 'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', |
| 25 | + 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', |
| 26 | + 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', |
| 27 | + 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', |
| 28 | + 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', |
| 29 | + 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', |
| 30 | + 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', |
| 31 | + 'ul', 'var') |
| 32 | + |
| 33 | + default_acceptable_attributes = ('abbr', 'accept', 'accept-charset', |
| 34 | + 'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', |
| 35 | + 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', |
| 36 | + 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', |
| 37 | + 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', |
| 38 | + 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', |
| 39 | + 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', |
| 40 | + 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', |
| 41 | + 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', |
| 42 | + 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', |
| 43 | + 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang') |
| 44 | + |
| 45 | + def __init__(self, acceptable_elements=None, acceptable_attributes=None): |
| 46 | + self.parser = html5parser.HTMLParser() |
| 47 | + if acceptable_elements is None: |
| 48 | + self.acceptable_elements = self.default_acceptable_elements |
| 49 | + else: |
| 50 | + self.acceptable_elements = acceptable_elements |
| 51 | + |
| 52 | + if acceptable_attributes is None: |
| 53 | + self.acceptable_attributes = self.default_acceptable_attributes |
| 54 | + else: |
| 55 | + self.acceptable_attributes = acceptable_attributes |
| 56 | + |
| 57 | + def _sanitizeTree(self, tree): |
| 58 | + tree_copy = copy.copy(tree) |
| 59 | + #Set up a correspondence between the nodes in the original tree and the |
| 60 | + #ones in the new tree |
| 61 | + for originalNode, copyNode in itertools.izip(tree, tree_copy): |
| 62 | + copyNode._orig = originalNode |
| 63 | + #Iterate over a copy of the tree |
| 64 | + for nodeCopy in tree_copy: |
| 65 | + node = nodeCopy._orig |
| 66 | + print node.name, node.name in self.acceptable_elements |
| 67 | + #XXX Need to nead with non-nodes |
| 68 | + if (isinstance(node, simpletree.TextNode) or |
| 69 | + isinstance(node, simpletree.DocumentFragment)): |
| 70 | + continue |
| 71 | + #XXX Need to remove the dependence on parent |
| 72 | + elif (node.name not in self.acceptable_elements): |
| 73 | + for child in node.childNodes: |
| 74 | + node.parent.insertBefore(child, node) |
| 75 | + node.parent.removeChild(node) |
| 76 | + |
| 77 | + for attrib in node.attributes.keys()[:]: |
| 78 | + if attrib not in self.acceptable_attributes: |
| 79 | + del node.attributes[attrib] |
| 80 | + |
| 81 | + return tree |
| 82 | + |
| 83 | + def sanitize(self, fragment): |
| 84 | + tree = self.parser.parseFragment(fragment) |
| 85 | + tree = self._sanitizeTree(tree) |
| 86 | + return tree.toxml() |
0 commit comments