8000 bpo-21315: Fix parsing of encoded words with missing leading ws. · python/cpython@5f0f8f3 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5f0f8f3

Browse files
committed
bpo-21315: Fix parsing of encoded words with missing leading ws.
Because of missing leading whitespace, an encoded word would get parsed as an unstructured token. This patch fixes that by looking for encoded words when splitting tokens on whitespace. Missing trailing whitespace after an encoded word now registers a defect instead. Original patch suggestion by David R. Murray on bpo-21315.
1 parent 5c08ce9 commit 5f0f8f3

File tree

4 files changed

+36
-4
lines changed

4 files changed

+36
-4
lines changed

Lib/email/_header_value_parser.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
from email import _encoded_words as _ew
7676
from email import errors
7777
from email import utils
78-
78+
from email.header import ecre as rfc2047_matcher
7979
#
8080
# Useful constants and functions
8181
#
@@ -1049,6 +1049,10 @@ def get_encoded_word(value):
10491049
_validate_xtext(vtext)
10501050
ew.append(vtext)
10511051
text = ''.join(remainder)
1052+
# Encoded words should be followed by a LWS.
1053+
if value and value[0] != ' ':
1054+
ew.defects.append(errors.InvalidHeaderDefect(
1055+
"missing trailing whitespace after encoded-word"))
10521056
return ew, value
10531057

10541058
def get_unstructured(value):
@@ -1101,6 +1105,10 @@ def get_unstructured(value):
11011105
unstructured.append(token)
11021106
continue
11031107
tok, *remainder = _wsp_splitter(value, 1)
1108+
# Split in the middle of an atom if there is a rfc2047 encoded word
1109+
# which does not have WS on both sides.
1110+
if rfc2047_matcher.search(tok):
1111+
tok, *remainder = value.partition('=?')
11041112
vtext = ValueTerminal(tok, 'vtext')
11051113
_validate_xtext(vtext)
11061114
unstructured.append(vtext)

Lib/test/test_email/test__header_value_parser.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
118118
'=?us-ascii?q?first?==?utf-8?q?second?=',
119119
'first',
120120
'first',
121-
[],
121+
[errors.InvalidHeaderDefect],
122122
'=?utf-8?q?second?=')
123123

124124
def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
361361
'=?utf-8?q?foo?==?utf-8?q?bar?=',
362362
'foobar',
363363
'foobar',
364+
[errors.InvalidHeaderDefect,
365+
errors.InvalidHeaderDefect],
366+
'')
367+
368+
def test_get_unstructured_ew_without_leading_whitespace(self):
369+
self._test_get_x(
370+
self._get_unst,
371+
'nowhitespace=?utf-8?q?somevalue?=',
372+
'nowhitespacesomevalue',
373+
'nowhitespacesomevalue',
374+
[errors.InvalidHeaderDefect],
375+
'')
376+
377+
def test_get_unstructured_ew_without_trailing_whitespace(self):
378+
self._test_get_x(
379+
self._get_unst,
380+
'=?utf-8?q?somevalue?=nowhitespace',
381+
'somevaluenowhitespace',
382+
'somevaluenowhitespace',
364383
[errors.InvalidHeaderDefect],
365384
'')
366385

@@ -546,7 +565,8 @@ def test_encoded_word_inside_quotes(self):
546565
'"=?utf-8?Q?not_really_valid?="',
547566
'"not really valid"',
548567
'not really valid',
549-
[errors.InvalidHeaderDefect],
568+
[errors.InvalidHeaderDefect,
569+
errors.InvalidHeaderDefect],
550570
'')
551571

552572
# get_comment

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
11801180

11811181
'rfc2047_atom_in_quoted_string_is_decoded':
11821182
('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
1183-
[errors.InvalidHeaderDefect],
1183+
[errors.InvalidHeaderDefect,
1184+
errors.InvalidHeaderDefect],
11841185
'Éric <foo@example.com>',
11851186
'Éric',
11861187
'foo@example.com',
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Email headers containing 2047 encoded words with no leading whitespace are
2+
parsed correctly. Also, missing trailing whitespace now registers a defect
3+
instead of being silently ignored.

0 commit comments

Comments
 (0)
0