From 5f0f8f322c0609598cf7e801c51745a40e9500d5 Mon Sep 17 00:00:00 2001
From: Abhilash Raj <raj.abhilash1@gmail.com>
Date: Sun, 19 May 2019 10:49:55 -0700
Subject: [PATCH 1/2] bpo-21315: Fix parsing of encoded words with missing
 leading ws.

Because of the missing leading whitespace, an encoded word would get parsed
as part of an unstructured token. This patch fixes that by looking for
encoded words when splitting tokens on whitespace.

Missing trailing whitespace after an encoded word now registers a defect
instead of being silently accepted.

Original patch suggestion by David R. Murray on bpo-21315.
---
 Lib/email/_header_value_parser.py             | 10 +++++++-
 .../test_email/test__header_value_parser.py   | 24 +++++++++++++++++--
 Lib/test/test_email/test_headerregistry.py    |  3 ++-
 .../2019-05-19-10-48-46.bpo-21315.PgXVqF.rst  |  3 +++
 4 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 649f1539fa02ab..0251a84432dd2d 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -75,7 +75,7 @@
 from email import _encoded_words as _ew
 from email import errors
 from email import utils
-
+from email.header import ecre as rfc2047_matcher
 #
 # Useful constants and functions
 #
@@ -1049,6 +1049,10 @@ def get_encoded_word(value):
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
+    # Encoded words should be followed by a LWS.
+    if value and value[0] != ' ':
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
     return ew, value
 
 def get_unstructured(value):
@@ -1101,6 +1105,10 @@ def get_unstructured(value):
                 unstructured.append(token)
                 continue
         tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is a rfc2047 encoded word
+        # which does not have WS on both sides.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
         vtext = ValueTerminal(tok, 'vtext')
         _validate_xtext(vtext)
         unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 676732bb3d0261..693487bc960fc0 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
                          '=?us-ascii?q?first?==?utf-8?q?second?=',
                          'first',
                          'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                          '=?utf-8?q?second?=')
 
     def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
             '=?utf-8?q?foo?==?utf-8?q?bar?=',
             'foobar',
             'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
             [errors.InvalidHeaderDefect],
             '')
 
@@ -546,7 +565,8 @@ def test_encoded_word_inside_quotes(self):
             '"=?utf-8?Q?not_really_valid?="',
             '"not really valid"',
             'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
             '')
 
     # get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index d1007099f666c9..e6db3acedcc139 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
 
         'rfc2047_atom_in_quoted_string_is_decoded':
             ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
             'Éric <foo@example.com>',
             'Éric',
             'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
new file mode 100644
index 00000000000000..e58124477adf95
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,3 @@
+Email headers containing 2047 encoded words with no leading whitespace are
+parsed correctly. Also, missing trailing whitespaces now register a defect
+instead of silently ignoring.

From b7fa03664a497ec271b14ad5af2a0d3dc071b364 Mon Sep 17 00:00:00 2001
From: Abhilash Raj <raj.abhilash1@gmail.com>
Date: Wed, 22 May 2019 21:09:23 -0700
Subject: [PATCH 2/2] Changes as per review

---
 Lib/email/_header_value_parser.py             | 21 +++++++++++++++----
 .../2019-05-19-10-48-46.bpo-21315.PgXVqF.rst  |  7 ++++---
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 0251a84432dd2d..2b3b44b9a819cc 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -75,7 +75,7 @@
 from email import _encoded_words as _ew
 from email import errors
 from email import utils
-from email.header import ecre as rfc2047_matcher
+
 #
 # Useful constants and functions
 #
@@ -96,6 +96,18 @@
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
 
+# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
 #
 # TokenList and its subclasses
 #
@@ -1049,8 +1061,8 @@ def get_encoded_word(value):
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
-    # Encoded words should be followed by a LWS.
-    if value and value[0] != ' ':
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
         ew.defects.append(errors.InvalidHeaderDefect(
             "missing trailing whitespace after encoded-word"))
     return ew, value
@@ -1106,7 +1118,8 @@ def get_unstructured(value):
                 continue
         tok, *remainder = _wsp_splitter(value, 1)
         # Split in the middle of an atom if there is a rfc2047 encoded word
-        # which does not have WS on both sides.
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
         if rfc2047_matcher.search(tok):
             tok, *remainder = value.partition('=?')
         vtext = ValueTerminal(tok, 'vtext')
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
index e58124477adf95..dd0dd7f72c0a3f 100644
--- a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -1,3 +1,4 @@
-Email headers containing 2047 encoded words with no leading whitespace are
-parsed correctly. Also, missing trailing whitespaces now register a defect
-instead of silently ignoring.
+Email headers containing RFC2047 encoded words are parsed despite the missing
+whitespace, and a defect is registered. Also missing trailing whitespace after
+encoded words is now registered as a defect.
+