2
2
from xml .sax .saxutils import escape , unescape
3
3
from tokenizer import HTMLTokenizer
4
4
5
- class HTMLSanitizer ( HTMLTokenizer ) :
5
+ class HTMLSanitizerMixin :
6
6
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
7
7
8
8
acceptable_elements = ['a' , 'abbr' , 'acronym' , 'address' , 'area' , 'b' ,
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
130
130
# => <script> do_nasty_stuff() </script>
131
131
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
132
132
# => <a>Click here for $100</a>
133
def sanitize_token(self, token):
    """Sanitize one token from the token stream.

    Behavior by token type:
      * Start/End/Empty tags whose name is in ``self.allowed_elements``
        pass through with disallowed attributes stripped, URI-valued
        attributes restricted to ``self.allowed_protocols``, and any
        ``style`` attribute run through ``self.sanitize_css``.
      * Tags NOT in the allow list are converted to escaped character
        data (so ``<script>`` renders as literal text, not markup).
      * Comments are dropped (returns ``None``).
      * Every other token is returned unchanged.

    The token dict is mutated in place and returned, except for
    comments, where ``None`` is returned.
    """
    if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
        if token["name"] in self.allowed_elements:
            if "data" in token:
                # Reverse first so that, with duplicate attribute names,
                # the FIRST occurrence wins (later dict inserts overwrite
                # earlier ones).
                attrs = dict([(name, val) for name, val in token["data"][::-1]
                              if name in self.allowed_attributes])
                for attr in self.attr_val_is_uri:
                    if attr not in attrs:
                        continue
                    # Remove control characters and whitespace that
                    # browsers ignore inside URLs (defeats tricks like
                    # "java\0script:"), then compare the scheme against
                    # the protocol allow list.
                    val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
                                           unescape(attrs[attr])).lower()
                    if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                            val_unescaped.split(':')[0] not in self.allowed_protocols):
                        del attrs[attr]
                if 'style' in attrs:
                    attrs['style'] = self.sanitize_css(attrs['style'])
                token["data"] = [[name, val] for name, val in attrs.items()]
            return token
        else:
            # Disallowed element: re-serialize the tag as escaped text so
            # it is displayed rather than interpreted.
            if token["type"] == "EndTag":
                token["data"] = "</%s>" % token["name"]
            elif token.get("data"):
                attrs = ''.join([' %s="%s"' % (k, escape(v))
                                 for k, v in token["data"]])
                token["data"] = "<%s%s>" % (token["name"], attrs)
            else:
                token["data"] = "<%s>" % token["name"]
            if token["type"] == "EmptyTag":
                token["data"] = token["data"][:-1] + "/>"
            token["type"] = "Characters"
            del token["name"]
            return token
    elif token["type"] == "Comment":
        # Comments are silently dropped from the sanitized stream.
        pass
    else:
        return token
165
164
166
165
def sanitize_css (self , style ):
167
166
# disallow urls
@@ -187,3 +186,9 @@ def sanitize_css(self, style):
187
186
clean
F438
.append (prop + ': ' + value + ';' )
188
187
189
188
return ' ' .join (clean )
189
+
190
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer that yields only sanitized tokens.

    Runs every token produced by ``HTMLTokenizer`` through
    ``HTMLSanitizerMixin.sanitize_token`` and suppresses tokens the
    sanitizer rejects (e.g. comments, which it maps to ``None``).
    """

    def __iter__(self):
        for raw_token in HTMLTokenizer.__iter__(self):
            clean_token = self.sanitize_token(raw_token)
            if clean_token:
                yield clean_token
0 commit comments