Port sanitize tests from ruby to python (four currently fail) · awesome-python/html5lib-python@56e0bed · GitHub
[go: up one dir, main page]

Skip to content

Commit 56e0bed

Browse files
committed
Port sanitize tests from ruby to python (four currently fail)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40704
1 parent be4c9fe commit 56e0bed

File tree

4 files changed

+31
-136
lines changed

4 files changed

+31
-136
lines changed

src/sanitizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,15 +135,15 @@ def __iter__(self):
135135
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
136136
if token["name"] in self.allowed_elements:
137137
if token.has_key("data"):
138-
attrs = dict([[name,val] for name,val in token["data"][::-1] if name in self.allowed_attributes])
138+
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
139139
for attr in self.attr_val_is_uri:
140140
if not attrs.has_key(attr): continue
141141
val_unescaped = re.sub("[\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
142142
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
143143
del attrs[attr]
144144
if attrs.has_key('style'):
145145
attrs['style'] = self.sanitize_css(attrs['style'])
146-
token["data"] = attrs.items()
146+
token["data"] = [[name,val] for name,val in attrs.items()]
147147
yield token
148148
else:
149149
if token["type"] == "EndTag":

tests/support.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import os
2+
import sys
3+
4+
#Allow us to import the parent module
5+
os.chdir(os.path.split(os.path.abspath(__file__))[0])
6+
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
7+
8+
try:
9+
import simplejson
10+
except:
11+
import re
12+
class simplejson:
13+
def load(f):
14+
true, false = True, False
15+
input=re.sub(r'(".*?(?<!\\)")',r'u\1',f.read().decode('utf-8'))
16+
return eval(input.replace('\r',''))
17+
load = staticmethod(load)

tests/test_sanitizer.py

Lines changed: 11 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
1-
import os,sys,unittest
1+
import os,sys,unittest,glob
2+
from support import simplejson
23

34
#RELEASE remove
4-
if __name__ == '__main__':
5-
# XXX Allow us to import the sibling module
6-
os.chdir(os.path.split(os.path.abspath(__file__))[0])
7-
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
8-
95
import html5parser, sanitizer, constants
106
#END RELEASE
117

@@ -15,128 +11,22 @@
1511

1612
class SanitizeTest(unittest.TestCase):
1713
def addTest(cls, name, expected, input):
18-
setattr(cls, name,
19-
lambda self: self.assertEqual(expected, self.sanitize_html(input)))
14+
def test(self, expected=expected, input=input):
15+
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
16+
parseFragment(expected.encode('utf-8')).childNodes])
17+
self.assertEqual(expected, self.sanitize_html(input))
18+
setattr(cls, name, test)
2019
addTest = classmethod(addTest)
2120

2221
def sanitize_html(self,stream):
2322
return ''.join([token.toxml() for token in
2423
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
2524
parseFragment(stream).childNodes])
2625

27-
def test_should_allow_anchors(self):
28-
self.assertEqual("<a href=\"foo\">&lt;script&gt;baz&lt;/script&gt;</a>",
29-
self.sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>"))
30-
31-
# RFC 3986, sec 4.2
32-
def test_allow_colons_in_path_component(self):
33-
self.assertEqual("<a href=\"./this:that\">foo</a>",
34-
self.sanitize_html("<a href=\"./this:that\">foo</a>"))
35-
36-
def test_should_handle_non_html(self):
37-
self.assertEqual('abc', self.sanitize_html("abc"))
38-
39-
def test_should_handle_blank_text(self):
40-
self.assertEqual('', self.sanitize_html(''))
41-
42-
def test_should_sanitize_tag_broken_up_by_null(self):
43-
self.assertEqual(u"&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;", self.sanitize_html("""<scr\0ipt>alert(\"XSS\")</scr\0ipt>"""))
44-
45-
def test_should_sanitize_invalid_script_tag(self):
46-
self.assertEqual("&lt;script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;", self.sanitize_html("""<script/XSS SRC="http://ha.ckers.org/xss.js"></script>"""))
47-
48-
def test_should_sanitize_script_tag_with_multiple_open_brackets(self):
49-
self.assertEqual("&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;", self.sanitize_html("""<<script>alert("XSS");//<</script>"""))
50-
self.assertEqual("""&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;""", self.sanitize_html("""<iframe src=http://ha.ckers.org/scriptlet.html\n<"""))
51-
52-
def test_should_sanitize_unclosed_script(self):
53-
self.assertEqual("&lt;script src=\"http://ha.ckers.org/xss.js?\"&gt;<b/>", self.sanitize_html("""<script src=http://ha.ckers.org/xss.js?<b>"""))
54-
55-
def test_should_sanitize_half_open_scripts(self):
56-
self.assertEqual("<img/>", self.sanitize_html("""<img src="javascript:alert('XSS')"""))
57-
58-
def test_should_not_fall_for_ridiculous_hack(self):
59-
img_hack = """<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />"""
60-
self.assertEqual("<img/>", self.sanitize_html(img_hack))
61-
62-
def test_platypus(self):
63-
self.assertEqual("""<a style=\"display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;\" href=\"http://www.ragingplatypus.com/\">never trust your upstream platypus</a>""",
64-
self.sanitize_html("""<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>"""))
65-
66-
def test_xul(self):
67-
self.assertEqual("""<p style="">fubar</p>""",
68-
self.sanitize_html("""<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>"""))
69-
70-
def test_input_image(self):
71-
self.assertEqual("""<input type="image"/>""",
72-
self.sanitize_html("""<input type="image" src="javascript:alert('XSS');" />"""))
73-
74-
def test_non_alpha_non_digit(self):
75-
self.assertEqual(u"&lt;script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
76-
self.sanitize_html("""<script/XSS src="http://ha.ckers.org/xss.js"></script>"""))
77-
self.assertEqual("<a>foo</a>",
78-
self.sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>'))
79-
self.assertEqual("<img src=\"http://ha.ckers.org/xss.js\"/>",
80-
self.sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>'))
81-
82-
def test_img_dynsrc_lowsrc(self):
83-
self.assertEqual("<img/>",
84-
self.sanitize_html("""<img dynsrc="javascript:alert('XSS')" />"""))
85-
self.assertEqual("<img/>",
86-
self.sanitize_html("""<img lowsrc="javascript:alert('XSS')" />"""))
87-
88-
def test_div_background_image_unicode_encoded(self):
89-
self.assertEqual('<div style="">foo</div>',
90-
self.sanitize_html("""<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>"""))
91-
92-
def test_div_expression(self):
93-
self.assertEqual(u'<div style="">foo</div>',
94-
self.sanitize_html("""<div style="width: expression(alert('XSS'));">foo</div>"""))
95-
96-
def test_img_vbscript(self):
97-
self.assertEqual(u'<img/>',
98-
self.sanitize_html("""<img src='vbscript:msgbox("XSS")' />"""))
99-
10026
def test_should_handle_astral_plane_characters(self):
10127
self.assertEqual(u"<p>\U0001d4b5 \U0001d538</p>",
10228
self.sanitize_html("<p>&#x1d4b5; &#x1d538;</p>"))
10329

104-
105-
for i,img_hack in enumerate(
106-
["""<img src="javascript:alert('XSS');" />""",
107-
"""<img src=javascript:alert('XSS') />""",
108-
"""<img src="JaVaScRiPt:alert('XSS')" />""",
109-
"""<img src='javascript:alert(&quot;XSS&quot;)' />""",
110-
"""<img src='javascript:alert(String.fromCharCode(88,83,83))' />""",
111-
"""<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />""",
112-
"""<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />""",
113-
"""<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />""",
114-
"""<img src="jav\tascript:alert('XSS');" />""",
115-
"""<img src="jav&#x09;ascript:alert('XSS');" />""",
116-
"""<img src="jav&#x0A;ascript:alert('XSS');" />""",
117-
"""<img src="jav&#x0D;ascript:alert('XSS');" />""",
118-
"""<img src=" &#14; javascript:alert('XSS');" />""",
119-
"""<img src="&#x20;javascript:alert('XSS');" />""",
120-
"""<img src="&#xA0;javascript:alert('XSS');" />"""]):
121-
SanitizeTest.addTest("test_should_not_fall_for_xss_image_hack_#%d"%i,
122-
"<img/>", img_hack)
123-
124-
for tag, attr in [('img','src'), ('a','href')]:
125-
close = tag in constants.voidElements and "/>boo" or ">boo</%s>" % tag
126-
127-
SanitizeTest.addTest("test_should_strip_%s_attribute_in_%s_with_bad_protocols" % (attr,tag),
128-
"""<%s title="1"%s""" % (tag, close),
129-
"""<%s %s="javascript:XSS" title="1">boo</%s>""" % (tag,attr,tag))
130-
131-
SanitizeTest.addTest("test_should_strip_%s_attribute_in_%s_with_bad_protocols_and_whitespace" % (attr,tag),
132-
"""<%s title="1"%s""" % (tag, close),
133-
"""<%s %s=" javascript:XSS" title="1">boo</%s>""" % (tag,attr,tag))
134-
135-
for img_attr in ['src', 'width', 'height', 'alt']:
136-
SanitizeTest.addTest("test_should_allow_image_%s_attribute" % img_attr,
137-
"<img %s=\"foo\"/>" % img_attr,
138-
"<img %s='foo' onclick='bar' />" % img_attr)
139-
14030
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
14131
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: continue ### TODO
14232
if tag_name != tag_name.lower(): continue ### TODO
@@ -183,6 +73,10 @@ def test_should_handle_astral_plane_characters(self):
18373
"""<a href="%s">foo</a>""" % protocol)
18474

18575
def buildTestSuite():
76+
for filename in glob.glob("sanitizer/*.dat"):
77+
for test in simplejson.load(file(filename)):
78+
SanitizeTest.addTest('test_' + test['name'], test['output'], test['input'])
79+
18680
return unittest.TestLoader().loadTestsFromTestCase(SanitizeTest)
18781

18882
def main():

tests/test_tokenizer.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,8 @@
1-
import sys
2-
import os
31
import glob
42
import unittest
5-
6-
try:
7-
import simplejson
8-
except:
9-
import re
10-
class simplejson:
11-
def load(f):
12-
true, false = True, False
13-
input=re.sub(r'(".*?(?<!\\)")',r'u\1',f.read().decode('utf-8'))
14-
return eval(input.replace('\r',''))
15-
load = staticmethod(load)
3+
from support import simplejson
164

175
#RELEASE remove
18-
#Allow us to import the parent module
19-
os.chdir(os.path.split(os.path.abspath(__file__))[0])
20-
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
21-
226
from tokenizer import HTMLTokenizer
237
import constants
248
#END RELEASE

0 commit comments

Comments
 (0)
0