8000 Sync options · awesome-python/html5lib-python@a69f4f5 · GitHub
[go: up one dir, main page]

Skip to content

Commit a69f4f5

Browse files
committed
Sync options
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40821
1 parent 77700c0 commit a69f4f5

File tree

4 files changed

+64
-49
lines changed

4 files changed

+64
-49
lines changed

parse.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,7 @@ def printOutput(parser, document, opts):
9191
sys.stdout.write(document.hilite("utf-8"))
9292
elif opts.html:
9393
kwargs = {}
94-
for opt in ['inject_meta_charset', 'strip_whitespace', 'sanitize',
95-
'omit_optional_tags', 'quote_attr_values', 'quote_char',
96-
'use_best_quote_char', 'minimize_boolean_attributes',
97-
'use_trailing_solidus', 'escape_lt_in_attrs',
98-
'escape_rcdata']:
94+
for opt in serializer.HTMLSerializer.options:
9995
kwargs[opt] = getattr(opts,opt)
10096
if not kwargs['quote_char']: del kwargs['quote_char']
10197
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
@@ -176,6 +172,11 @@ def getOptParser():
176172
default=False, dest="use_trailing_solidus",
177173
help="use trailing solidus")
178174

175+
parser.add_option("", "--space-before-trailing-solidus",
176+
action="store_true", default=False,
177+
dest="space_before_trailing_solidus",
178+
help="add space before trailing solidus")
179+
179180
parser.add_option("", "--escape-lt-in-attrs", action="store_true",
180181
default=False, dest="escape_lt_in_attrs",
181182
help="escape less than signs in attribute values")

src/html5lib/filters/sanitizer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import _base
2+
from html5lib.sanitizer import HTMLSanitizerMixin
3+
4+
class Filter(_base.Filter, HTMLSanitizerMixin):
5+
def __iter__(self):
6+
for token in _base.Filter.__iter__(self):
7+
token = self.sanitize_token(token)
8+
if token: yield token

src/html5lib/sanitizer.py

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from xml.sax.saxutils import escape, unescape
33
from tokenizer import HTMLTokenizer
44

5-
class HTMLSanitizer(HTMLTokenizer):
5+
class HTMLSanitizerMixin:
66
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
77

88
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
130130
# => <script> do_nasty_stuff() </script>
131131
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
132132
# => <a>Click here for $100</a>
133-
def __iter__(self):
134-
for token in HTMLTokenizer.__iter__(self):
135-
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
136-
if token["name"] in self.allowed_elements:
137-
if token.has_key("data"):
138-
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
139-
for attr in self.attr_val_is_uri:
140-
if not attrs.has_key(attr): continue
141-
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
142-
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
143-
del attrs[attr]
144-
if attrs.has_key('style'):
145-
attrs['style'] = self.sanitize_css(attrs['style'])
146-
token["data"] = [[name,val] for name,val in attrs.items()]
147-
yield token
148-
else:
149-
if token["type"] == "EndTag":
150-
token["data"] = "</%s>" % token["name"]
151-
elif token["data"]:
152-
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
153-
token["data"] = "<%s%s>" % (token["name"],attrs)
154-
else:
155-
token["data"] = "<%s>" % token["name"]
156-
if token["type"] == "EmptyTag":
157-
token["data"]=token["data"][:-1] + "/>"
158-
token["type"] = "Characters"
159-
del token["name"]
160-
yield token
161-
elif token["type"] == "Comment":
162-
pass
133+
def sanitize_token(self, token):
134+
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
135+
if token["name"] in self.allowed_elements:
136+
if token.has_key("data"):
137+
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
138+
for attr in self.attr_val_is_uri:
139+
if not attrs.has_key(attr): continue
140+
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
141+
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
142+
del attrs[attr]
143+
if attrs.has_key('style'):
144+
attrs['style'] = self.sanitize_css(attrs['style'])
145+
token["data"] = [[name,val] for name,val in attrs.items()]
146+
return token
163147
else:
164-
yield token
148+
if token["type"] == "EndTag":
149+
token["data"] = "</%s>" % token["name"]
150+
elif token["data"]:
151+
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
152+
token["data"] = "<%s%s>" % (token["name"],attrs)
153+
else:
154+
token["data"] = "<%s>" % token["name"]
155+
if token["type"] == "EmptyTag":
156+
token["data"]=token["data"][:-1] + "/>"
157+
token["type"] = "Characters"
158+
del token["name"]
159+
return token
160+
elif token["type"] == "Comment":
161+
pass
162+
else:
163+
return token
165164

166165
def sanitize_css(self, style):
167166
# disallow urls
@@ -187,3 +186,9 @@ def sanitize_css(self, style):
187186
clean.append(prop + ': ' + value + ';')
188187

189188
return ' '.join(clean)
189+
190+
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
191+
def __iter__(self):
192+
for token in HTMLTokenizer.__iter__(self):
193+
token = self.sanitize_token(token)
194+
if token: yield token

src/html5lib/serializer/htmlserializer.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@
77
import gettext
88
_ = gettext.gettext
99

10-
from html5lib.filters.whitespace import Filter as WhitespaceFilter
11-
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
12-
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
13-
1410
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
1511
from html5lib.constants import rcdataElements
1612

@@ -67,17 +63,16 @@ class HTMLSerializer(object):
6763
escape_lt_in_attrs = False
6864
escape_rcdata = False
6965

70-
omit_optional_tags = True
71-
72-
strip_whitespace = False
73-
7466
inject_meta_charset = True
67+
strip_whitespace = False
68+
sanitize = False
69+
omit_optional_tags = True
7570

7671
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
7772
"minimize_boolean_attributes", "use_trailing_solidus",
7873
"space_before_trailing_solidus", "omit_optional_tags",
7974
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
80-
"escape_rcdata")
75+
"escape_rcdata", 'use_trailing_solidus', "sanitize")
8176

8277
def __init__(self, **kwargs):
8378
if kwargs.has_key('quote_char'):
@@ -91,13 +86,19 @@ def serialize(self, treewalker, encoding=None):
9186
in_cdata = False
9287
self.errors = []
9388
if encoding and self.inject_meta_charset:
94-
treewalker = InjectMetaCharsetFilter(treewalker, encoding)
89+
from html5lib.filters.inject_meta_charset import Filter
90+
treewalker = Filter(treewalker, encoding)
9591
# XXX: WhitespaceFilter should be used before OptionalTagFilter
9692
# for maximum efficiency of this latter filter
9793
if self.strip_whitespace:
98-
treewalker = WhitespaceFilter(treewalker)
94+
from html5lib.filters.whitespace import Filter
95+
treewalker = Filter(treewalker)
96+
if self.sanitize:
97+
from html5lib.filters.sanitizer import Filter
98+
treewalker = Filter(treewalker)
9999
if self.omit_optional_tags:
100-
treewalker = OptionalTagFilter(treewalker)
100+
from html5lib.filters.optionaltags import Filter
101+
treewalker = Filter(treewalker)
101102
for token in treewalker:
102103
type = token["type"]
103104
if type == "Doctype":

0 commit comments

Comments
 (0)
0