gh-150771: Fix email serialization for shift_jis and euc-jp

shamit · cursoragent · shamit · commit 5928e5b55842 · 2026-06-09T07:36:49.000+05:30
Decode surrogate-escaped payloads with the input charset before
re-encoding them to the output charset (iso-2022-jp), fixing the
UnicodeEncodeError raised when serializing messages created with
set_content().

Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
@@ -16,6 +16,7 @@
 import email.quoprimime
 
 from email import errors
+from email import utils
 from email.encoders import encode_7or8bit
 
 
@@ -438,5 +439,12 @@ def body_encode(self, string):
             return email.quoprimime.body_encode(string)
         else:
             if isinstance(string, str):
-                string = string.encode(self.output_charset).decode('ascii')
+                if utils._has_surrogates(string):
+                    string = string.encode('ascii', 'surrogateescape')
+                    if self.input_charset != self.output_charset:
+                        string = (string.decode(self.input_codec)
+                                  .encode(self.output_codec))
+                    string = string.decode('ascii', 'surrogateescape')
+                else:
+                    string = string.encode(self.output_charset).decode('ascii')
             return string
diff --git a/Lib/email/message.py b/Lib/email/message.py
@@ -352,7 +352,9 @@ def set_payload(self, payload, charset=None):
                 return
             if not isinstance(charset, Charset):
                 charset = Charset(charset)
-            payload = payload.encode(charset.output_charset, 'surrogateescape')
+            if not utils._has_surrogates(payload):
+                payload = payload.encode(charset.output_charset,
+                                          'surrogateescape')
         if hasattr(payload, 'decode'):
             self._payload = payload.decode('ascii', 'surrogateescape')
         else:
diff --git a/Lib/test/test_email/test_contentmanager.py b/Lib/test/test_email/test_contentmanager.py
@@ -355,6 +355,38 @@ def test_set_text_charset_cp949(self):
         self.assertEqual(m.get_payload(decode=True).decode('ks_c_5601-1987'), content)
         self.assertEqual(m.get_content(), content)
 
+    def test_set_text_charset_shift_jis(self):
+        m = self._make_message()
+        content = "\u65e5\u672c\u8a9e\n"
+        raw_data_manager.set_content(m, content, charset='shift_jis')
+        self.assertEqual(m['Content-Type'], 'text/plain; charset="shift_jis"')
+        self.assertEqual(m['Content-Transfer-Encoding'], '8bit')
+        self.assertEqual(m.get_payload(decode=True), content.encode('shift_jis'))
+        self.assertEqual(m.get_content(), content)
+        # Serialization converts the payload to iso-2022-jp for output.
+        self.assertEqual(str(m), textwrap.dedent("""\
+            Content-Type: text/plain; charset="iso-2022-jp"
+            Content-Transfer-Encoding: 7bit
+
+            \x1b$BF|K\\8l\x1b(B
+            """))
+
+    def test_set_text_charset_euc_jp(self):
+        m = self._make_message()
+        content = "\u65e5\u672c\u8a9e\n"
+        raw_data_manager.set_content(m, content, charset='euc-jp')
+        self.assertEqual(m['Content-Type'], 'text/plain; charset="euc-jp"')
+        self.assertEqual(m['Content-Transfer-Encoding'], '8bit')
+        self.assertEqual(m.get_payload(decode=True), content.encode('euc-jp'))
+        self.assertEqual(m.get_content(), content)
+        # Serialization converts the payload to iso-2022-jp for output.
+        self.assertEqual(str(m), textwrap.dedent("""\
+            Content-Type: text/plain; charset="iso-2022-jp"
+            Content-Transfer-Encoding: 7bit
+
+            \x1b$BF|K\\8l\x1b(B
+            """))
+
     def test_set_text_plain_long_line_heuristics(self):
         m = self._make_message()
         content = ("Simple but long message that is over 78 characters"
diff --git a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.rst b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.rst
@@ -0,0 +1,3 @@
+Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp``
+charsets.  Converting surrogate-escaped payloads to the required
+``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp``
	`2`	+charsets.  Converting surrogate-escaped payloads to the required
	`3`	+``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`.