From c91c5ce960e2c67b79cd094d22a5d4c82bfdca35 Mon Sep 17 00:00:00 2001 From: Kerim Kabirov Date: Sun, 31 Mar 2024 16:45:07 +0200 Subject: [PATCH 1/6] Increase HTML standard compliance for closing comment tags --- Lib/_markupbase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_markupbase.py b/Lib/_markupbase.py index 3ad7e279960f7e..2e2e00dd4b1f6f 100644 --- a/Lib/_markupbase.py +++ b/Lib/_markupbase.py @@ -9,7 +9,7 @@ _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match -_commentclose = re.compile(r'--\s*>') +_commentclose = re.compile(r'--!?>') _markedsectionclose = re.compile(r']\s*]\s*>') # An analysis of the MS-Word extensions is available at From 4536d8bbc618d3de3251450457220a9b9697a52a Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sun, 31 Mar 2024 14:57:20 +0000 Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Security/2024-03-31-14-57-20.gh-issue-102555.2P8jGn.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Security/2024-03-31-14-57-20.gh-issue-102555.2P8jGn.rst diff --git a/Misc/NEWS.d/next/Security/2024-03-31-14-57-20.gh-issue-102555.2P8jGn.rst b/Misc/NEWS.d/next/Security/2024-03-31-14-57-20.gh-issue-102555.2P8jGn.rst new file mode 100644 index 00000000000000..f031c37774433c --- /dev/null +++ b/Misc/NEWS.d/next/Security/2024-03-31-14-57-20.gh-issue-102555.2P8jGn.rst @@ -0,0 +1 @@ +Follow the HTML parsing recommendation and the HTML standard for closing comment tags in :mod:`html.parser`. Increased compliance leads to predictable behavior, thus enhancing security. 
From 9147ff64f87e8c268300be493889ab2b4aefc77f Mon Sep 17 00:00:00 2001 From: Kerim Kabirov Date: Sat, 6 Apr 2024 15:00:01 +0200 Subject: [PATCH 3/6] Add more edge cases and tests --- Lib/_markupbase.py | 4 ++-- Lib/test/test_htmlparser.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Lib/_markupbase.py b/Lib/_markupbase.py index 2e2e00dd4b1f6f..3c6986600eb64c 100644 --- a/Lib/_markupbase.py +++ b/Lib/_markupbase.py @@ -81,7 +81,7 @@ def parse_declaration(self, i): # A simple, practical version could look like: ((name|stringlit) S*) + '>' n = len(rawdata) if rawdata[j:j+2] == '--': #comment - # Locate --.*-- as the body of the comment + # Locate the body of the comment. return self.parse_comment(i) elif rawdata[j] == '[': #marked section # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section @@ -166,7 +166,7 @@ def parse_comment(self, i, report=1): rawdata = self.rawdata if rawdata[i:i+4] != '' '' '' - '') + '' + '' + '' + '' + '') expected = [('comment', " I'm a valid comment "), ('comment', 'me too!'), ('comment', '--'), ('comment', ''), ('comment', '--I have many hyphens--'), ('comment', ' I have a > in the middle '), - ('comment', ' and I have -- in the middle! ')] + ('comment', ' and I have -- in the middle! 
'), + ('comment', ''), + ('comment', ''), + ('comment', ' Date: Sat, 6 Apr 2024 15:18:49 +0200 Subject: [PATCH 4/6] Add invalid HTML comment closing tags test cases --- Lib/test/test_htmlparser.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 3c5c87d1ba72ba..937be33061e8ce 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -328,7 +328,10 @@ def test_comments(self): '' '' '' - '') + '' + '' + '' + '') expected = [('comment', " I'm a valid comment "), ('comment', 'me too!'), ('comment', '--'), @@ -339,7 +342,11 @@ def test_comments(self): ('comment', ''), ('comment', ''), ('comment', ''), + ('comment', 'Me too (invalid character) --x>'), + ('comment', 'Me too (invalid characters) --cheese>') + ] self._run_check(html, expected) def test_condcoms(self): From 9e687bf2c2aeb83cd7522e3d8ead17520ad27b8b Mon Sep 17 00:00:00 2001 From: Kerim Kabirov Date: Sat, 6 Apr 2024 15:27:32 +0200 Subject: [PATCH 5/6] Add html closing comment tags test cases Handle the test cases mentioned in #102555 --- Lib/test/test_htmlparser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 937be33061e8ce..62fc4403a1fe30 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -329,6 +329,8 @@ def test_comments(self): '' '' '' + '' '' '' '') @@ -343,6 +345,8 @@ def test_comments(self): ('comment', ''), ('comment', ''), ('comment', 'Me too (invalid character) --x>'), ('comment', 'Me too (invalid characters) --cheese>') From caba26781b36a0e2e7309d361b128434f6a64419 Mon Sep 17 00:00:00 2001 From: Kerim Kabirov Date: Sat, 6 Apr 2024 16:15:30 +0200 Subject: [PATCH 6/6] Add EOF abrupted comment tag case handling and tests --- Lib/_markupbase.py | 10 ++++++++-- Lib/html/parser.py | 2 +- Lib/test/test_htmlparser.py | 7 ++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git 
a/Lib/_markupbase.py b/Lib/_markupbase.py index 3c6986600eb64c..ac95f44b69a316 100644 --- a/Lib/_markupbase.py +++ b/Lib/_markupbase.py @@ -161,13 +161,19 @@ def parse_marked_section(self, i, report=1): self.unknown_decl(rawdata[i+3: j]) return match.end(0) - # Internal -- parse comment, return length or -1 if not terminated - def parse_comment(self, i, report=1): + # Internal -- parse comment + # if end is True, returns EOF location if no close tag is found, otherwise + # return length or -1 if not terminated + def parse_comment(self, i, report=1, end=False): rawdata = self.rawdata if rawdata[i:i+4] != '' '' '' - '') + '' + '