From 2706182706222865ba6d8f76cd357a2bc1cc768f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 18 Jun 2025 13:28:15 +0300 Subject: [PATCH 1/5] gh-135661: Fix comment parsing in HTMLParser * "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<-->" and "<--->". --- Lib/html/parser.py | 18 +++++++++++- Lib/test/test_htmlparser.py | 28 +++++++++++++++++-- ...-06-18-13-28-08.gh-issue-135661.nADrzJ.rst | 3 ++ 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-135661.nADrzJ.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index ba416e7fa6e3fe..08651af9dbe132 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -29,7 +29,8 @@ starttagopen = re.compile('<[a-zA-Z]') endtagopen = re.compile('') -commentclose = re.compile(r'--\s*>') +commentclose = re.compile(r'--!?>') +commentabruptclose = re.compile(r'-?>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will @@ -309,6 +310,21 @@ def parse_html_declaration(self, i): else: return self.parse_bogus_comment(i) + # Internal -- parse comment, return length or -1 if not terminated + # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state + def parse_comment(self, i, report=True): + rawdata = self.rawdata + assert rawdata.startswith('" '' '' + '' '' + # abrupt-closing-of-empty-comment + '' + '' '' '' - '') + '' + '' + '' + '' + '' + # nested-comment + ' -->' + '' + '' + ) expected = [('comment', " I'm a valid comment "), ('comment', 'me too!'), ('comment', '--'), + ('comment', '-'), + ('comment', ''), + ('comment', ''), ('comment', ''), ('comment', '--I have many hyphens--'), ('comment', ' I have a > in the middle '), - ('comment', ' and I have -- in the middle! ')] + ('comment', ' and I have -- in the middle! '), + ('comment', 'incorrectly-closed-comment'), + ('comment', ''), + ('comment', '--!'), + ('comment', '-- >'), + ('comment', ' '), + ('comment', '`` now +ends the comment. ``-- >`` no longer ends the comment. Support abnormally +ended empty comments ``<-->`` and ``<--->``. From f1c9efa9bc28093b550bfcb723f627dc8b928b65 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 25 Jun 2025 15:00:33 +0300 Subject: [PATCH 2/5] Add more tests: "". --- Lib/test/test_htmlparser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 4325469c540bb4..ba6612a049172e 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -344,6 +344,8 @@ def test_comments(self): '' '' '' + '' + '' # nested-comment ' -->' '' @@ -363,6 +365,8 @@ def test_comments(self): ('comment', ''), ('comment', '--!'), ('comment', '-- >'), + ('comment', '-!>'), + ('comment', '!>'), ('comment', ' '), ('comment', ' Date: Wed, 25 Jun 2025 15:07:04 +0300 Subject: [PATCH 3/5] Move to gh-102555. Co-author: Kerim Kabirov --- ....nADrzJ.rst => 2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/Library/{2025-06-18-13-28-08.gh-issue-135661.nADrzJ.rst => 2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst} (100%) diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-135661.nADrzJ.rst b/Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-135661.nADrzJ.rst rename to Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst From b1462496c4b29a9bed84115c91aa0210e47a7557 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 3 Jul 2025 16:39:02 +0300 Subject: [PATCH 4/5] Move to Security. --- .../2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/{Library => Security}/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst (100%) diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst rename to Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst From c8d97403990a2c9040ca71adc3e093169b400a2b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 4 Jul 2025 09:02:13 +0300 Subject: [PATCH 5/5] Update Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst Co-authored-by: Ezio Melotti --- .../Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst index 6168f6f00cf33d..71d15ee0852ebd 100644 --- a/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst @@ -1,3 +1,3 @@ -Fix comment parsing in :class:`html.parser.HTMLParser`. ``--!>`` now -ends the comment. ``-- >`` no longer ends the comment. Support abnormally -ended empty comments ``<-->`` and ``<--->``. +Fix comment parsing in :class:`html.parser.HTMLParser` according to the +HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the +comment. Support abnormally ended empty comments ``<-->`` and ``<--->``.