Skip to content

Commit 5928e5b

Browse files
shamit and cursoragent
authored and committed
gh-150771: Fix email serialization for shift_jis and euc-jp
Convert surrogate-escaped payloads through the input charset before encoding to iso-2022-jp, fixing UnicodeEncodeError when printing messages created with set_content().

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 29a920e commit 5928e5b

4 files changed

Lines changed: 47 additions & 2 deletions

File tree

Lib/email/charset.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import email.quoprimime
1717

1818
from email import errors
19+
from email import utils
1920
from email.encoders import encode_7or8bit
2021

2122

@@ -438,5 +439,12 @@ def body_encode(self, string):
438439
return email.quoprimime.body_encode(string)
439440
else:
440441
if isinstance(string, str):
441-
string = string.encode(self.output_charset).decode('ascii')
442+
if utils._has_surrogates(string):
443+
string = string.encode('ascii', 'surrogateescape')
444+
if self.input_charset != self.output_charset:
445+
string = (string.decode(self.input_codec)
446+
.encode(self.output_codec))
447+
string = string.decode('ascii', 'surrogateescape')
448+
else:
449+
string = string.encode(self.output_charset).decode('ascii')
442450
return string

Lib/email/message.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,9 @@ def set_payload(self, payload, charset=None):
352352
return
353353
if not isinstance(charset, Charset):
354354
charset = Charset(charset)
355-
payload = payload.encode(charset.output_charset, 'surrogateescape')
355+
if not utils._has_surrogates(payload):
356+
payload = payload.encode(charset.output_charset,
357+
'surrogateescape')
356358
if hasattr(payload, 'decode'):
357359
self._payload = payload.decode('ascii', 'surrogateescape')
358360
else:

Lib/test/test_email/test_contentmanager.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,38 @@ def test_set_text_charset_cp949(self):
355355
self.assertEqual(m.get_payload(decode=True).decode('ks_c_5601-1987'), content)
356356
self.assertEqual(m.get_content(), content)
357357

358+
def test_set_text_charset_shift_jis(self):
359+
m = self._make_message()
360+
content = "\u65e5\u672c\u8a9e\n"
361+
raw_data_manager.set_content(m, content, charset='shift_jis')
362+
self.assertEqual(m['Content-Type'], 'text/plain; charset="shift_jis"')
363+
self.assertEqual(m['Content-Transfer-Encoding'], '8bit')
364+
self.assertEqual(m.get_payload(decode=True), content.encode('shift_jis'))
365+
self.assertEqual(m.get_content(), content)
366+
# Serialization converts the payload to iso-2022-jp for output.
367+
self.assertEqual(str(m), textwrap.dedent("""\
368+
Content-Type: text/plain; charset="iso-2022-jp"
369+
Content-Transfer-Encoding: 7bit
370+
371+
\x1b$BF|K\\8l\x1b(B
372+
"""))
373+
374+
def test_set_text_charset_euc_jp(self):
375+
m = self._make_message()
376+
content = "\u65e5\u672c\u8a9e\n"
377+
raw_data_manager.set_content(m, content, charset='euc-jp')
378+
self.assertEqual(m['Content-Type'], 'text/plain; charset="euc-jp"')
379+
self.assertEqual(m['Content-Transfer-Encoding'], '8bit')
380+
self.assertEqual(m.get_payload(decode=True), content.encode('euc-jp'))
381+
self.assertEqual(m.get_content(), content)
382+
# Serialization converts the payload to iso-2022-jp for output.
383+
self.assertEqual(str(m), textwrap.dedent("""\
384+
Content-Type: text/plain; charset="iso-2022-jp"
385+
Content-Transfer-Encoding: 7bit
386+
387+
\x1b$BF|K\\8l\x1b(B
388+
"""))
389+
358390
def test_set_text_plain_long_line_heuristics(self):
359391
m = self._make_message()
360392
content = ("Simple but long message that is over 78 characters"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp``
2+
charsets. Converting surrogate-escaped payloads to the required
3+
``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`.

0 commit comments

Comments
 (0)