# Patch summary. This preamble sits before the first "diff --git" header;
# the patch text below it is unchanged.
#
# Lib/email/charset.py
#   * Adds a module-level "from email import utils".
#   * Charset.body_encode(), in the "else:" branch that follows the
#     quoprimime return: a str argument that contains surrogates
#     (utils._has_surrogates) now takes a separate path.  It is encoded to
#     bytes with ('ascii', 'surrogateescape'); when input_charset differs
#     from output_charset the bytes are decoded with input_codec and
#     re-encoded with output_codec; the result is decoded back to str with
#     ('ascii', 'surrogateescape').  A str without surrogates keeps the
#     previous encode(output_charset).decode('ascii') behaviour.
#
# Lib/email/message.py
#   * set_payload(): the payload is encoded to charset.output_charset (with
#     'surrogateescape') only when it contains no surrogates; a payload that
#     does contain surrogates skips that encode step.
#
# Lib/test/test_email/test_contentmanager.py
#   * Adds test_set_text_charset_shift_jis and test_set_text_charset_euc_jp.
#     Each sets text content with the given charset, checks the Content-Type
#     and Content-Transfer-Encoding ('8bit') headers, checks the decoded
#     payload and get_content(), and checks that str(m) reports
#     charset="iso-2022-jp" with a 7bit transfer encoding.
#
# Misc/NEWS.d/next/Library/...gh-issue-150771...rst
#   * New NEWS entry describing the fix.
#
# NOTE(review): line breaks and indentation in the patch text below appear to
#   have been collapsed to single spaces; it is reproduced as-is, not repaired.
# NOTE(review): charset.py now imports email.utils at module level -- confirm
#   that email.utils does not import email.charset at module level, which
#   would make the import circular.  Cannot tell from this patch.
# NOTE(review): string.encode('ascii', 'surrogateescape') raises
#   UnicodeEncodeError if the str mixes surrogates with other non-ASCII
#   characters; presumably such input does not reach this branch -- verify.
# NOTE(review): the new branch uses input_codec/output_codec while the
#   retained branch uses output_charset -- confirm the difference is intended.
diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 5981791820e740..e3ee13c3912241 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -16,6 +16,7 @@ import email.quoprimime from email import errors +from email import utils from email.encoders import encode_7or8bit @@ -438,5 +439,12 @@ def body_encode(self, string): return email.quoprimime.body_encode(string) else: if isinstance(string, str): - string = string.encode(self.output_charset).decode('ascii') + if utils._has_surrogates(string): + string = string.encode('ascii', 'surrogateescape') + if self.input_charset != self.output_charset: + string = (string.decode(self.input_codec) + .encode(self.output_codec)) + string = string.decode('ascii', 'surrogateescape') + else: + string = string.encode(self.output_charset).decode('ascii') return string diff --git a/Lib/email/message.py b/Lib/email/message.py index 641fb2e944d431..dff113ea407f07 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -352,7 +352,9 @@ def set_payload(self, payload, charset=None): return if not isinstance(charset, Charset): charset = Charset(charset) - payload = payload.encode(charset.output_charset, 'surrogateescape') + if not utils._has_surrogates(payload): + payload = payload.encode(charset.output_charset, + 'surrogateescape') if hasattr(payload, 'decode'): self._payload = payload.decode('ascii', 'surrogateescape') else: diff --git a/Lib/test/test_email/test_contentmanager.py b/Lib/test/test_email/test_contentmanager.py index bc0e5d35618159..a28f9c1402984b 100644 --- a/Lib/test/test_email/test_contentmanager.py +++ b/Lib/test/test_email/test_contentmanager.py @@ -355,6 +355,38 @@ def test_set_text_charset_cp949(self): self.assertEqual(m.get_payload(decode=True).decode('ks_c_5601-1987'), content) self.assertEqual(m.get_content(), content) + def test_set_text_charset_shift_jis(self): + m = self._make_message() + content = "\u65e5\u672c\u8a9e\n" + raw_data_manager.set_content(m, content, charset='shift_jis') + 
self.assertEqual(m['Content-Type'], 'text/plain; charset="shift_jis"') + self.assertEqual(m['Content-Transfer-Encoding'], '8bit') + self.assertEqual(m.get_payload(decode=True), content.encode('shift_jis')) + self.assertEqual(m.get_content(), content) + # Serialization converts the payload to iso-2022-jp for output. + self.assertEqual(str(m), textwrap.dedent("""\ + Content-Type: text/plain; charset="iso-2022-jp" + Content-Transfer-Encoding: 7bit + + \x1b$BF|K\\8l\x1b(B + """)) + + def test_set_text_charset_euc_jp(self): + m = self._make_message() + content = "\u65e5\u672c\u8a9e\n" + raw_data_manager.set_content(m, content, charset='euc-jp') + self.assertEqual(m['Content-Type'], 'text/plain; charset="euc-jp"') + self.assertEqual(m['Content-Transfer-Encoding'], '8bit') + self.assertEqual(m.get_payload(decode=True), content.encode('euc-jp')) + self.assertEqual(m.get_content(), content) + # Serialization converts the payload to iso-2022-jp for output. + self.assertEqual(str(m), textwrap.dedent("""\ + Content-Type: text/plain; charset="iso-2022-jp" + Content-Transfer-Encoding: 7bit + + \x1b$BF|K\\8l\x1b(B + """)) + def test_set_text_plain_long_line_heuristics(self): m = self._make_message() content = ("Simple but long message that is over 78 characters" diff --git a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst new file mode 100644 index 00000000000000..79d724f354f237 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst @@ -0,0 +1,3 @@ +Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp`` +charsets. Converting surrogate-escaped payloads to the required +``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`.