refactor allowed_token and disallowed_token as new methods in HTMLSan… · valievkarim/html5lib-python@52f9ca6 · GitHub

Commit 52f9ca6

mfa authored and gsnedders committed
refactor allowed_token and disallowed_token as new methods in HTMLSanitizerMixin for usage in subclass.
1 parent 90aa9f4 commit 52f9ca6

File tree

1 file changed: +51 −45 lines changed

html5lib/sanitizer.py

Lines changed: 51 additions & 45 deletions
@@ -168,57 +168,63 @@ def sanitize_token(self, token):
         if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]):
             if token["name"] in self.allowed_elements:
-                if "data" in token:
-                    attrs = dict([(name,val) for name,val in
-                                  token["data"][::-1]
-                                  if name in self.allowed_attributes])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        #remove replacement characters from unescaped characters
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
-                            (val_unescaped.split(':')[0] not in
-                             self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token["name"] in self.svg_allow_local_href and
-                        'xlink:href' in attrs and re.search('^\s*[^#\s].*',
-                                                            attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token["data"] = [[name,val] for name,val in list(attrs.items())]
-                return token
+                return self.allowed_token(token, token_type)
             else:
-                if token_type == tokenTypes["EndTag"]:
-                    token["data"] = "</%s>" % token["name"]
-                elif token["data"]:
-                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                    token["data"] = "<%s%s>" % (token["name"],attrs)
-                else:
-                    token["data"] = "<%s>" % token["name"]
-                if token.get("selfClosing"):
-                    token["data"]=token["data"][:-1] + "/>"
-
-                if token["type"] in list(tokenTypes.keys()):
-                    token["type"] = "Characters"
-                else:
-                    token["type"] = tokenTypes["Characters"]
-
-                del token["name"]
-                return token
+                return self.disallowed_token(token, token_type)
         elif token_type == tokenTypes["Comment"]:
             pass
         else:
             return token
 
+    def allowed_token(self, token, token_type):
+        if "data" in token:
+            attrs = dict([(name,val) for name,val in
+                          token["data"][::-1]
+                          if name in self.allowed_attributes])
+            for attr in self.attr_val_is_uri:
+                if attr not in attrs:
+                    continue
+                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                #remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
+                    (val_unescaped.split(':')[0] not in
+                     self.allowed_protocols)):
+                    del attrs[attr]
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            if (token["name"] in self.svg_allow_local_href and
+                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
+                                                    attrs['xlink:href'])):
+                del attrs['xlink:href']
+            if 'style' in attrs:
+                attrs['style'] = self.sanitize_css(attrs['style'])
+            token["data"] = [[name,val] for name,val in list(attrs.items())]
+        return token
+
+    def disallowed_token(self, token, token_type):
+        if token_type == tokenTypes["EndTag"]:
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+            token["data"] = "<%s%s>" % (token["name"],attrs)
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            token["data"]=token["data"][:-1] + "/>"
+
+        if token["type"] in list(tokenTypes.keys()):
+            token["type"] = "Characters"
+        else:
+            token["type"] = tokenTypes["Characters"]
+
+        del token["name"]
+        return token
+
     def sanitize_css(self, style):
         # disallow urls
         style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
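
What the refactor enables: a subclass can now override allowed_token or disallowed_token individually instead of reimplementing all of sanitize_token. Below is a minimal sketch of such a subclass, assuming the html5lib API of this era, in which sanitizer.HTMLSanitizer combines HTMLTokenizer with HTMLSanitizerMixin and yields only tokens for which sanitize_token returns a truthy value. The DroppingSanitizer class and its drop-instead-of-escape policy are hypothetical illustrations, not part of this commit.

    import html5lib
    from html5lib import sanitizer

    class DroppingSanitizer(sanitizer.HTMLSanitizer):
        """Hypothetical subclass: drop disallowed start/end tags outright
        instead of escaping them to visible text, which is what the
        inherited disallowed_token does."""

        def disallowed_token(self, token, token_type):
            # Returning None makes the sanitizing tokenizer skip this
            # token (assuming falsy tokens are not yielded downstream).
            return None

    parser = html5lib.HTMLParser(tokenizer=DroppingSanitizer)
    # The <script> start/end tags are not in allowed_elements, so they are
    # dropped; their text content still flows through as Characters tokens,
    # while <b> passes through allowed_token with its attributes filtered.
    fragment = parser.parseFragment('<script>alert(1)</script><b>kept</b>')

The inherited disallowed_token, by contrast, rewrites the offending tag into a Characters token (e.g. <script> becomes the literal text "&lt;script&gt;" after serialization), so disallowed markup stays visible rather than being silently removed.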

0 commit comments