[3.12] gh-92081: Fix for email.generator.Generator with whitespace be… · python/cpython@ffe9ba0 · GitHub
[go: up one dir, main page]

Skip to content

Commit ffe9ba0

Browse files
[3.12] gh-92081: Fix for email.generator.Generator with whitespace between encoded words. (GH-92281) (#119246)
* Fix for email.generator.Generator with whitespace between encoded words. email.generator.Generator currently does not handle whitespace between encoded words correctly when the encoded words span multiple lines. The current generator will create an encoded word for each line. If the end of the line happens to correspond with the end of a real word in the plaintext, the generator will place an unencoded space at the start of the subsequent lines to represent the whitespace between the plaintext words. A compliant decoder will strip all the whitespace from between two encoded words, which leads to missing spaces in the round-tripped output. The fix for this is to make sure that whitespace between two encoded words ends up inside of one or the other of the encoded words. This fix places the space inside of the second encoded word. A second problem happens with continuation lines. A continuation line that starts with whitespace and is followed by a non-encoded word is fine because the newline between such continuation lines is defined as condensing to a single space character. When the continuation line starts with whitespace followed by an encoded word, however, the RFCs specify that the word is run together with the encoded word on the previous line. This is because normal words are folded on syntactic breaks but encoded words are not. The solution to this is to add the whitespace to the start of the encoded word on the continuation line. Test cases are from GH-92081. * Rename a variable so it's not confused with the final variable. (cherry picked from commit a6fdb31) Co-authored-by: Toshio Kuratomi <a.badger@gmail.com>
1 parent 386e492 commit ffe9ba0

File tree

4 files changed

+79
-8
lines changed
  • Misc/NEWS.d/next/Library
  • 4 files changed

    +79
    -8
    lines changed

    Lib/email/_header_value_parser.py

    Lines changed: 41 additions & 7 deletions
    Original file line numberDiff line numberDiff line change
    @@ -2784,11 +2784,15 @@ def _refold_parse_tree(parse_tree, *, policy):
    27842784
    # max_line_length 0/None means no limit, ie: infinitely long.
    27852785
    maxlen = policy.max_line_length or sys.maxsize
    27862786
    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    2787-
    lines = ['']
    2788-
    last_ew = None
    2787+
    lines = [''] # Folded lines to be output
    2788+
    leading_whitespace = '' # When we have whitespace between two encoded
    2789+
    # words, we may need to encode the whitespace
    2790+
    # at the beginning of the second word.
    2791+
    last_ew = None # Points to the last encoded character if there's an ew on
    2792+
    # the line
    27892793
    last_charset = None
    27902794
    wrap_as_ew_blocked = 0
    2791-
    want_encoding = False
    2795+
    want_encoding = False # This is set to True if we need to encode this part
    27922796
    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
    27932797
    parts = list(parse_tree)
    27942798
    while parts:
    @@ -2812,10 +2816,12 @@ def _refold_parse_tree(parse_tree, *, policy):
    28122816
    # 'charset' property on the policy.
    28132817
    charset = 'utf-8'
    28142818
    want_encoding = True
    2819+
    28152820
    if part.token_type == 'mime-parameters':
    28162821
    # Mime parameter folding (using RFC2231) is extra special.
    28172822
    _fold_mime_parameters(part, lines, maxlen, encoding)
    28182823
    continue
    2824+
    28192825
    if want_encoding and not wrap_as_ew_blocked:
    28202826
    if not part.as_ew_allowed:
    28212827
    want_encoding = False
    @@ -2847,21 +2853,38 @@ def _refold_parse_tree(parse_tree, *, policy):
    28472853
    last_charset == 'utf-8' and charset != 'us-ascii')):
    28482854
    last_ew = None
    28492855
    last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
    2850-
    part.ew_combine_allowed, charset)
    2856+
    part.ew_combine_allowed, charset, leading_whitespace)
    2857+
    # This whitespace has been added to the lines in _fold_as_ew()
    2858+
    # so clear it now.
    2859+
    leading_whitespace = ''
    28512860
    last_charset = charset
    28522861
    want_encoding = False
    28532862
    continue
    2863+
    28542864
    if len(tstr) <= maxlen - len(lines[-1]):
    28552865
    lines[-1] += tstr
    28562866
    continue
    2867+
    28572868
    # This part is too long to fit. The RFC wants us to break at
    28582869
    # "major syntactic breaks", so unless we don't consider this
    28592870
    # to be one, check if it will fit on the next line by itself.
    2871+
    leading_whitespace = ''
    28602872
    if (part.syntactic_break and
    28612873
    len(tstr) + 1 <= maxlen):
    28622874
    newline = _steal_trailing_WSP_if_exists(lines)
    28632875
    if newline or part.startswith_fws():
    2876+
    # We're going to fold the data onto a new line here. Due to
    2877+
    # the way encoded strings handle continuation lines, we need to
    2878+
    # be prepared to encode any whitespace if the next line turns
    2879+
    # out to start with an encoded word.
    28642880
    lines.append(newline + tstr)
    2881+
    2882+
    whitespace_accumulator = []
    2883+
    for char in lines[-1]:
    2884+
    if char not in WSP:
    2885+
    break
    2886+
    whitespace_accumulator.append(char)
    2887+
    leading_whitespace = ''.join(whitespace_accumulator)
    28652888
    last_ew = None
    28662889
    continue
    28672890
    if not hasattr(part, 'encode'):
    @@ -2885,9 +2908,10 @@ def _refold_parse_tree(parse_tree, *, policy):
    28852908
    else:
    28862909
    # We can't fold it onto the next line either...
    28872910
    lines[-1] += tstr
    2911+
    28882912
    return policy.linesep.join(lines) + policy.linesep
    28892913

    2890-
    def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    2914+
    def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
    28912915
    """Fold string to_encode into lines as encoded word, combining if allowed.
    28922916
    Return the new value for last_ew, or None if ew_combine_allowed is False.
    28932917
    @@ -2902,14 +2926,15 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    29022926
    to_encode = str(
    29032927
    get_unstructured(lines[-1][last_ew:] + to_encode))
    29042928
    lines[-1] = lines[-1][:last_ew]
    2905-
    if to_encode[0] in WSP:
    2929+
    elif to_encode[0] in WSP:
    29062930
    # We're joining this to non-encoded text, so don't encode
    29072931
    # the leading blank.
    29082932
    leading_wsp = to_encode[0]
    29092933
    to_encode = to_encode[1:]
    29102934
    if (len(lines[-1]) == maxlen):
    29112935
    lines.append(_steal_trailing_WSP_if_exists(lines))
    29122936
    lines[-1] += leading_wsp
    2937+
    29132938
    trailing_wsp = ''
    29142939
    if to_encode[-1] in WSP:
    29152940
    # Likewise for the trailing space.
    @@ -2929,11 +2954,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    29292954

    29302955
    while to_encode:
    29312956
    remaining_space = maxlen - len(lines[-1])
    2932-
    text_space = remaining_space - chrome_len
    2957+
    text_space = remaining_space - chrome_len - len(leading_whitespace)
    29332958
    if text_space <= 0:
    29342959
    lines.append(' ')
    29352960
    continue
    29362961

    2962+
    # If we are at the start of a continuation line, prepend whitespace
    2963+
    # (we only want to do this when the line starts with an encoded word
    2964+
    # but if we're folding in this helper function, then we know that we
    2965+
    # are going to be writing out an encoded word.)
    2966+
    if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
    2967+
    encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
    2968+
    lines[-1] += encoded_word
    2969+
    leading_whitespace = ''
    2970+
    29372971
    to_encode_word = to_encode[:text_space]
    29382972
    encoded_word = _ew.encode(to_encode_word, charset=encode_as)
    29392973
    excess = len(encoded_word) - remaining_space

    Lib/test/test_email/test_generator.py

    Lines changed: 35 additions & 0 deletions
    Original file line numberDiff line numberDiff line change
    @@ -281,6 +281,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
    281281
    ioclass = io.BytesIO
    282282
    typ = lambda self, x: x.encode('ascii')
    283283

    284+
    def test_defaults_handle_spaces_between_encoded_words_when_folded(self):
    285+
    source = ("Уведомление о принятии в работу обращения для"
    286+
    " подключения услуги")
    287+
    expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n'
    288+
    ' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n'
    289+
    ' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii')
    290+
    msg = EmailMessage()
    291+
    msg['Subject'] = source
    292+
    s = io.BytesIO()
    293+
    g = BytesGenerator(s)
    294+
    g.flatten(msg)
    295+
    self.assertEqual(s.getvalue(), expected)
    296+
    297+
    def test_defaults_handle_spaces_at_start_of_subject(self):
    298+
    source = " Уведомление"
    299+
    expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n"
    300+
    msg = EmailMessage()
    301+
    msg['Subject'] = source
    302+
    s = io.BytesIO()
    303+
    g = BytesGenerator(s)
    304+
    g.flatten(msg)
    305+
    self.assertEqual(s.getvalue(), expected)
    306+
    307+
    def test_defaults_handle_spaces_at_start_of_continuation_line(self):
    308+
    source = " ф ффффффффффффффффффф ф ф"
    309+
    expected = (b"Subject: "
    310+
    b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n"
    311+
    b" =?utf-8?b?INGEINGE?=\n\n")
    312+
    msg = EmailMessage()
    313+
    msg['Subject'] = source
    314+
    s = io.BytesIO()
    315+
    g = BytesGenerator(s)
    316+
    g.flatten(msg)
    317+
    self.assertEqual(s.getvalue(), expected)
    318+
    284319
    def test_cte_type_7bit_handles_unknown_8bit(self):
    285320
    source = ("Subject: Maintenant je vous présente mon "
    286321
    "collègue\n\n").encode('utf-8')

    Lib/test/test_email/test_headerregistry.py

    Lines changed: 2 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -7,6 +7,7 @@
    77
    from test.test_email import TestEmailBase, parameterize
    88
    from email import headerregistry
    99
    from email.headerregistry import Address, Group
    10+
    from email.header import decode_header
    1011
    from test.support import ALWAYS_EQ
    1112

    1213

    @@ -1648,7 +1649,7 @@ def test_address_display_names(self):
    16481649
    'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. '
    16491650
    'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.',
    16501651
    '=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c'
    1651-
    '=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse'
    1652+
    '=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse'
    16521653
    '_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8'
    16531654
    '?q?p=C3=B4tenti=2E?=',
    16541655
    ),
    Lines changed: 1 addition & 0 deletions
    Original file line numberDiff line numberDiff line change
    @@ -0,0 +1 @@
    1+
    Fix missing spaces in email headers when the spaces are mixed with encoded 8-bit characters.

    0 commit comments

    Comments
     (0)
    0