gh-94606: Fix error when message has Unicode surrogate but is not valid surrogateescaped string
python · serhiy-storchaka · Dec 11, 2023 · Jul 7, 2022 · Jul 7, 2022 · Jul 6, 2022
commit fea2ac97efdbbe6cbeb2b9d0730061e99d525ebb
@@ -290,7 +290,7 @@ def get_payload(self, i=None, decode=False):
         cte = str(self.get('content-transfer-encoding', '')).lower()
         # payload may be bytes here.
         if isinstance(payload, str):
-            if utils._has_surrogates(payload):
+            if utils._has_decoded_with_surrogateescape(payload):
                 bpayload = payload.encode('ascii', 'surrogateescape')
                 if not decode:
                     try:

@@ -49,16 +49,30 @@
 escapesre = re.compile(r'[\\"]')
 
 def _has_surrogates(s):
-    """Return True if s contains surrogate-escaped binary data."""
+    """Return True if s may contain surrogate-escaped binary data."""
     # This check is based on the fact that unless there are surrogates, utf8
     # (Python's default encoding) can encode any string.  This is the fastest
-    # way to check for surrogates, see issue 11454 for timings.
+    # way to check for surrogates, see issue 11454 (moved to GH 55663) for timings.
+    # This will pass some strings that are not valid for surrogateescape encoding.
     try:
         s.encode()
         return False
     except UnicodeEncodeError:
         return True
 
+def _has_decoded_with_surrogateescape(s):
+    """Return True if s is a valid str decoded using surrogateescape"""
+    # Slower test than _has_surrogates to be used when the string must
+    # be encodable with surrogateescape, but is no slower if the string
+    # does not have any unicode surrogate characters.
+    if _has_surrogates(s):
+        try:
+            s.encode('ascii', 'surrogateescape')
+        except UnicodeEncodeError:
+            return False
+        return True
+    return False
+
 # How to deal with a string containing bytes before handing it to the
 # application through the 'normal' interface.
 def _sanitize(string):

@@ -954,6 +954,42 @@ def test_get_body_malformed(self):
         # AttributeError: 'str' object has no attribute 'is_attachment'
         m.get_body()
 
+    def test_get_payload_unicode_surrogate1(self):
+        """test that fix for GH issue 94606 does not break this"""
+        msg = "String that could have been decod\udcc3\udcabd with surrogateescape"
+        expected = b'String that could have been decod\xc3\xabd with surrogateescape'
+        m = self._str_msg(msg)
+        payload = m.get_payload(decode=True)
+        self.assertEqual(expected, payload)
+
+    def test_get_payload_unicode_surrogate2(self):
+        """test that fix for GH issue 94606 does not break this"""
+        msg = "Unicode string with a utf-8 charactër"
+        expected = b'Unicode string with a utf-8 charact\xebr'
+        m = self._str_msg(msg)
+        payload = m.get_payload(decode=True)
+        self.assertEqual(expected, payload)
+
+    def test_get_payload_unicode_surrogate3(self):
+        """test for GH issue 94606"""
+        msg = "String that could not have been dëcod\udcc3\udcabd with surrogateescape"
+        expected = b'String that could not have been d\xebcod\\udcc3\\udcabd with surrogateescape'
+        m = self._str_msg(msg)
+        # In GH issue 94606, this would raise
+        # UnicodeEncodeError: 'ascii' codec can't encode character '\xeb' in position 33: ordinal not in range(128)
+        payload = m.get_payload(decode=True)
+        self.assertEqual(expected, payload)
+
+    def test_get_payload_unicode_surrogate4(self):
+        """test for GH issue 94606"""
+        msg = "Different reason \udfff could not have been decoded with surrogateescape"
+        expected = b'Different reason \\udfff could not have been decoded with surrogateescape'
+        m = self._str_msg(msg)
+        # In GH issue 94606, this would raise
+        # UnicodeEncodeError: 'ascii' codec can't encode character '\udfff' in position 17: ordinal not in range(128)
+        payload = m.get_payload(decode=True)
+        self.assertEqual(expected, payload)
+
 
 class TestMIMEPart(TestEmailMessageBase, TestEmailBase):
     # Doing the full test run here may seem a bit redundant, since the two