diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index f4777bb2462138..5842d05baf38ef 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -454,9 +454,10 @@ added matters. To illustrate:: The name is parsed as everything up to the '``:``' and returned - unmodified. The value is determined by stripping leading whitespace off - the remainder of the first line, joining all subsequent lines together, - and stripping any trailing carriage return or linefeed characters. + stripped of trailing whitespace. The value is determined by stripping + leading whitespace off the remainder of the first line, joining all + subsequent lines together, and stripping any trailing carriage + return or linefeed characters. .. method:: header_store_parse(name, value) diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py index c9cbadd2a80c48..a7c1310fefee22 100644 --- a/Lib/email/_policybase.py +++ b/Lib/email/_policybase.py @@ -292,15 +292,15 @@ def _sanitize_header(self, name, value): def header_source_parse(self, sourcelines): """+ - The name is parsed as everything up to the ':' and returned unmodified. - The value is determined by stripping leading whitespace off the - remainder of the first line, joining all subsequent lines together, and - stripping any trailing carriage return or linefeed characters. + The name is parsed as everything up to the ':' and returned stripped + of any trailing whitespace. The value is determined by stripping leading + whitespace off the remainder of the first line, joining all subsequent + lines together, and stripping any trailing carriage return or linefeed characters. 
""" name, value = sourcelines[0].split(':', 1) value = value.lstrip(' \t') + ''.join(sourcelines[1:]) - return (name, value.rstrip('\r\n')) + return (name.rstrip(' \t'), value.rstrip('\r\n')) def header_store_parse(self, name, value): """+ diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 06d6b4a3afcd07..46790b3cd63cbb 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -34,7 +34,7 @@ NLCRE_crack = re.compile(r'(\r\n|\r|\n)') # RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character # except controls, SP, and ":". -headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])') +headerRE = re.compile(r'^(From |[\041-\071\073-\176]*[ \t]*:|[\t ])') EMPTYSTRING = '' NL = '\n' boundaryendRE = re.compile( diff --git a/Lib/test/test_email/data/msg_48.txt b/Lib/test/test_email/data/msg_48.txt new file mode 100644 index 00000000000000..999b32b08c8c6a --- /dev/null +++ b/Lib/test/test_email/data/msg_48.txt @@ -0,0 +1,7 @@ +Subject: Regarding messages containing whitespace that follow field names +To: receiver@example.org +x-whitespace-after-fieldname : value +Date: Fri, 20 May 2022 18:13:19 +1200 +From: sender@example.org + +Field names can be followed by arbitrary whitespace diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 39d4ace8d4a1d8..e08149490fb6fd 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -431,6 +431,14 @@ def test_get_param_funky_continuation_lines(self): msg = self._msgobj('msg_22.txt') self.assertEqual(msg.get_payload(1).get_param('name'), 'wibble.JPG') + def test_whitespace_after_fieldname(self): + # As part of obsolete email syntax, fieldnames can be followed by arbitrary whitespace + msg = self._msgobj("msg_48.txt") + + self.assertEqual(msg["x-whitespace-after-fieldname"], "value") + self.assertEqual(msg.get_payload(), + "Field names can be followed by arbitrary whitespace\n") + # 
test_headerregistry.TestContentTypeHeader.semis_inside_quotes def test_get_param_with_semis_in_quotes(self): msg = email.message_from_string( diff --git a/Misc/NEWS.d/next/Library/2022-05-24-15-55-09.gh-issue-93158.KH0YPY.rst b/Misc/NEWS.d/next/Library/2022-05-24-15-55-09.gh-issue-93158.KH0YPY.rst new file mode 100644 index 00000000000000..7cbfbd7443a46e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-05-24-15-55-09.gh-issue-93158.KH0YPY.rst @@ -0,0 +1,2 @@ +The :mod:`email` library now parses messages that use obsolete email syntax where +whitespace appears between a header field name and the colon.