Merge branch 'main' of https://github.com/python/cpython · python/cpython@5ea2411 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5ea2411

Browse files
committed
Merge branch 'main' of https://github.com/python/cpython
2 parents 9748fb3 + ee76e36 commit 5ea2411

File tree

4 files changed

+94
-23
lines changed

4 files changed

+94
-23
lines changed

Doc/library/html.parser.rst

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ Example HTML Parser Application
4343

4444
As a basic example, below is a simple HTML parser that uses the
4545
:class:`HTMLParser` class to print out start tags, end tags, and data
46-
as they are encountered::
46+
as they are encountered:
47+
48+
.. testcode::
4749

4850
from html.parser import HTMLParser
4951

6365

6466
The output will then be:
6567

66-
.. code-block:: none
68+
.. testoutput::
6769

6870
Encountered a start tag: html
6971
Encountered a start tag: head
@@ -230,7 +232,9 @@ Examples
230232
--------
231233

232234
The following class implements a parser that will be used to illustrate more
233-
examples::
235+
examples:
236+
237+
.. testcode::
234238

235239
from html.parser import HTMLParser
236240
from html.entities import name2codepoint
@@ -266,13 +270,17 @@ examples::
266270

267271
parser = MyHTMLParser()
268272

269-
Parsing a doctype::
273+
Parsing a doctype:
274+
275+
.. doctest::
270276

271277
>>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
272278
... '"http://www.w3.org/TR/html4/strict.dtd">')
273279
Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
274280

275-
Parsing an element with a few attributes and a title::
281+
Parsing an element with a few attributes and a title:
282+
283+
.. doctest::
276284

277285
>>> parser.feed('<img src="python-logo.png" alt="The Python logo">')
278286
Start tag: img
@@ -285,7 +293,9 @@ Parsing an element with a few attributes and a title::
285293
End tag : h1
286294

287295
The content of ``script`` and ``style`` elements is returned as is, without
288-
further parsing::
296+
further parsing:
297+
298+
.. doctest::
289299

290300
>>> parser.feed('<style type="text/css">#python { color: green }</style>')
291301
Start tag: style
@@ -300,35 +310,48 @@ further parsing::
300310
Data : alert("<strong>hello!</strong>");
301311
End tag : script
302312

303-
Parsing comments::
313+
Parsing comments:
314+
315+
.. doctest::
304316

305-
>>> parser.feed('<!-- a comment -->'
317+
>>> parser.feed('<!--a comment-->'
306318
... '<!--[if IE 9]>IE-specific content<![endif]-->')
307-
Comment : a comment
319+
Comment : a comment
308320
Comment : [if IE 9]>IE-specific content<![endif]
309321

310322
Parsing named and numeric character references and converting them to the
311-
correct char (note: these 3 references are all equivalent to ``'>'``)::
323+
correct char (note: these 3 references are all equivalent to ``'>'``):
312324

325+
.. doctest::
326+
327+
>>> parser = MyHTMLParser()
328+
>>> parser.feed('&gt;&#62;&#x3E;')
329+
Data : >>>
330+
331+
>>> parser = MyHTMLParser(convert_charrefs=False)
313332
>>> parser.feed('&gt;&#62;&#x3E;')
314333
Named ent: >
315334
Num ent : >
316335
Num ent : >
317336

318337
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
319338
:meth:`~HTMLParser.handle_data` might be called more than once
320-
(unless *convert_charrefs* is set to ``True``)::
339+
(unless *convert_charrefs* is set to ``True``):
321340

322-
>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
341+
.. doctest::
342+
343+
>>> for chunk in ['<sp', 'an>buff', 'ered', ' text</s', 'pan>']:
323344
... parser.feed(chunk)
324345
...
325346
Start tag: span
326347
Data : buff
327348
Data : ered
328-
Data : text
349+
Data : text
329350
End tag : span
330351

331-
Parsing invalid HTML (e.g. unquoted attributes) also works::
352+
Parsing invalid HTML (e.g. unquoted attributes) also works:
353+
354+
.. doctest::
332355

333356
>>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>')
334357
Start tag: p

Lib/html/parser.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import _markupbase
1313

1414
from html import unescape
15+
from html.entities import html5 as html5_entities
1516

1617

1718
__all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
2324

2425
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2526
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27+
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2628

2729
starttagopen = re.compile('<[a-zA-Z]')
2830
piclose = re.compile('>')
@@ -57,6 +59,22 @@
5759
# </ and the tag name, so maybe this should be fixed
5860
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
5961

62+
# Character reference processing logic specific to attribute values
63+
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
64+
def _replace_attr_charref(match):
65+
ref = match.group(0)
66+
# Numeric / hex char refs must always be unescaped
67+
if ref.startswith('&#'):
68+
return unescape(ref)
69+
# Named character / entity references must only be unescaped
70+
# if they are an exact match, and they are not followed by an equals sign
71+
if not ref.endswith('=') and ref[1:] in html5_entities:
72+
return unescape(ref)
73+
# Otherwise do not unescape
74+
return ref
75+
76+
def _unescape_attrvalue(s):
77+
return attr_charref.sub(_replace_attr_charref, s)
6078

6179

6280
class HTMLParser(_markupbase.ParserBase):
@@ -323,7 +341,7 @@ def parse_starttag(self, i):
323341
attrvalue[:1] == '"' == attrvalue[-1:]:
324342
attrvalue = attrvalue[1:-1]
325343
if attrvalue:
326-
attrvalue = unescape(attrvalue)
344+
attrvalue = _unescape_attrvalue(attrvalue)
327345
attrs.append((attrname.lower(), attrvalue))
328346
k = m.end()
329347

Lib/test/test_htmlparser.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -348,18 +348,16 @@ def test_convert_charrefs(self):
348348
collector = lambda: EventCollectorCharrefs()
349349
self.assertTrue(collector().convert_charrefs)
350350
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
351-
# check charrefs in the middle of the text/attributes
352-
expected = [('starttag', 'a', [('href', 'foo"zar')]),
353-
('data', 'a"z'), ('endtag', 'a')]
351+
# check charrefs in the middle of the text
352+
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
354353
for charref in charrefs:
355-
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
354+
self._run_check('<a>a{0}z</a>'.format(charref),
356355
expected, collector=collector())
357-
# check charrefs at the beginning/end of the text/attributes
358-
expected = [('data', '"'),
359-
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
356+
# check charrefs at the beginning/end of the text
357+
expected = [('data', '"'), ('starttag', 'a', []),
360358
('data', '"'), ('endtag', 'a'), ('data', '"')]
361359
for charref in charrefs:
362-
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
360+
self._run_check('{0}<a>'
363361
'{0}</a>{0}'.format(charref),
364362
expected, collector=collector())
365363
# check charrefs in <script>/<style> elements
@@ -382,6 +380,35 @@ def test_convert_charrefs(self):
382380
self._run_check('no charrefs here', [('data', 'no charrefs here')],
383381
collector=collector())
384382

383+
def test_convert_charrefs_in_attribute_values(self):
384+
# default value for convert_charrefs is now True
385+
collector = lambda: EventCollectorCharrefs()
386+
self.assertTrue(collector().convert_charrefs)
387+
388+
# always unescape terminated entity refs, numeric and hex char refs:
389+
# - regardless whether they are at start, middle, end of attribute
390+
# - or followed by alphanumeric, non-alphanumeric, or equals char
391+
charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
392+
expected = [('starttag', 'a',
393+
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
394+
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
395+
('endtag', 'a')]
396+
for charref in charrefs:
397+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
398+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
399+
.format(charref), expected, collector=collector())
400+
401+
# only unescape unterminated entity matches if they are not followed by
402+
# an alphanumeric or an equals sign
403+
charref = '&cent'
404+
expected = [('starttag', 'a',
405+
[('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
406+
('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
407+
('endtag', 'a')]
408+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
409+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
410+
.format(charref), expected, collector=collector())
411+
385412
# the remaining tests were for the "tolerant" parser (which is now
386413
# the default), and check various kinds of broken markup
387414
def test_tolerant_parsing(self):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix :class:`html.parser.HTMLParser` to not unescape unterminated named character references in
2+
attribute values if they are followed by an ASCII alphanumeric or an equals
3+
sign.

0 commit comments

Comments
 (0)
0