Skip to content

Commit 50bfbb9

Browse files
committed
#19063: fix set_payload handling of non-ASCII string input.
This version of the fix raises an error instead of accepting the invalid input (ie: if a non-ASCII string is used but no charset is specified).
1 parent 34bd9fc commit 50bfbb9

6 files changed

Lines changed: 85 additions & 38 deletions

File tree

Doc/library/email.message.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,13 @@ Here are the methods of the :class:`Message` class:
196196

197197
Set the entire message object's payload to *payload*. It is the client's
198198
responsibility to ensure the payload invariants. Optional *charset* sets
199-
the message's default character set; see :meth:`set_charset` for details.
199+
the message's character set; see :meth:`set_charset` for details. If
200+
*payload* is a string containing non-ASCII characters, *charset* is
201+
required.
202+
203+
.. versionchanged:: 3.4
204+
Previous to 3.4 *charset* was not required when *payload* was a
205+
non-ASCII string, but omitting it produced nonsense results.
200206

201207
.. method:: set_charset(charset)
202208

Lib/email/charset.py

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -378,18 +378,19 @@ def _get_encoder(self, header_bytes):
378378
return None
379379

380380
def body_encode(self, string):
381-
"""Body-encode a string by converting it first to bytes.
381+
"""Body-encode a string, converting it first to bytes if needed.
382382
383383
The type of encoding (base64 or quoted-printable) will be based on
384-
self.body_encoding. If body_encoding is None, we assume the
385-
output charset is a 7bit encoding, so re-encoding the decoded
386-
string using the ascii codec produces the correct string version
387-
of the content.
384+
self.body_encoding. If body_encoding is None, we perform no CTE
385+
encoding (the CTE will be either 7bit or 8bit), we just encode the
386+
binary representation to ascii using the surrogateescape error handler,
387+
which will enable the Generators to produce the correct output.
388388
"""
389-
# 7bit/8bit encodings return the string unchanged (module conversions)
389+
if not string:
390+
return string
391+
if isinstance(string, str):
392+
string = string.encode(self.output_charset)
390393
if self.body_encoding is BASE64:
391-
if isinstance(string, str):
392-
string = string.encode(self.output_charset)
393394
return email.base64mime.body_encode(string)
394395
elif self.body_encoding is QP:
395396
# quopromime.body_encode takes a string, but operates on it as if
@@ -398,15 +399,7 @@ def body_encode(self, string):
398399
# character set, then, we must turn it into pseudo bytes via the
399400
# latin1 charset, which will encode any byte as a single code point
400401
# between 0 and 255, which is what body_encode is expecting.
401-
#
402-
# Note that this clause doesn't handle the case of a _payload that
403-
# is already bytes. It never did, and the semantics of _payload
404-
# being bytes has never been nailed down, so fixing that is a
405-
# longer term TODO.
406-
if isinstance(string, str):
407-
string = string.encode(self.output_charset).decode('latin1')
402+
string = string.decode('latin1')
408403
return email.quoprimime.body_encode(string)
409404
else:
410-
if isinstance(string, str):
411-
string = string.encode(self.output_charset).decode('ascii')
412-
return string
405+
return string.decode('ascii', 'surrogateescape')

Lib/email/message.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -301,9 +301,23 @@ def set_payload(self, payload, charset=None):
301301
Optional charset sets the message's default character set. See
302302
set_charset() for details.
303303
"""
304-
if isinstance(payload, bytes):
305-
payload = payload.decode('ascii', 'surrogateescape')
306-
self._payload = payload
304+
if hasattr(payload, 'encode'):
305+
if charset is None:
306+
try:
307+
payload.encode('ascii', 'surrogateescape')
308+
except UnicodeError:
309+
raise TypeError("charset argument must be specified"
310+
" when non-ASCII characters are used in the"
311+
" payload") from None
312+
self._payload = payload
313+
return
314+
if not isinstance(charset, Charset):
315+
charset = Charset(charset)
316+
payload = payload.encode(charset.output_charset)
317+
if hasattr(payload, 'decode'):
318+
self._payload = payload.decode('ascii', 'surrogateescape')
319+
else:
320+
self._payload = payload
307321
if charset is not None:
308322
self.set_charset(charset)
309323

@@ -342,7 +356,7 @@ def set_charset(self, charset):
342356
try:
343357
cte(self)
344358
except TypeError:
345-
self._payload = charset.body_encode(self._payload)
359+
self._payload = charset.body_encode(self.get_payload(decode=True))
346360
self.add_header('Content-Transfer-Encoding', cte)
347361

348362
def get_charset(self):

Lib/test/test_email/test_contentmanager.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,12 +208,11 @@ def test_get_text_plain_bad_utf8_quoted_printable_ignore_errors(self):
208208
"Basìc tëxt.\n")
209209

210210
def test_get_text_plain_utf8_base64_recoverable_bad_CTE_data(self):
211-
m = self._str_msg(textwrap.dedent("""\
211+
m = self._bytes_msg(textwrap.dedent("""\
212212
Content-Type: text/plain; charset="utf8"
213213
Content-Transfer-Encoding: base64
214214
215-
QmFzw6xjIHTDq3h0Lgo\xFF=
216-
"""))
215+
QmFzw6xjIHTDq3h0Lgo""").encode('ascii') + b'\xFF=\n')
217216
self.assertEqual(raw_data_manager.get_content(m, errors='ignore'),
218217
"Basìc tëxt.\n")
219218

Lib/test/test_email/test_email.py

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,44 @@ def test_set_payload_with_charset(self):
9292
msg.set_payload('This is a string payload', charset)
9393
self.assertEqual(msg.get_charset().input_charset, 'iso-8859-1')
9494

95+
def test_set_payload_with_8bit_data_and_charset(self):
96+
data = b'\xd0\x90\xd0\x91\xd0\x92'
97+
charset = Charset('utf-8')
98+
msg = Message()
99+
msg.set_payload(data, charset)
100+
self.assertEqual(msg['content-transfer-encoding'], 'base64')
101+
self.assertEqual(msg.get_payload(decode=True), data)
102+
self.assertEqual(msg.get_payload(), '0JDQkdCS\n')
103+
104+
def test_set_payload_with_non_ascii_and_charset_body_encoding_none(self):
105+
data = b'\xd0\x90\xd0\x91\xd0\x92'
106+
charset = Charset('utf-8')
107+
charset.body_encoding = None # Disable base64 encoding
108+
msg = Message()
109+
msg.set_payload(data.decode('utf-8'), charset)
110+
self.assertEqual(msg['content-transfer-encoding'], '8bit')
111+
self.assertEqual(msg.get_payload(decode=True), data)
112+
113+
def test_set_payload_with_8bit_data_and_charset_body_encoding_none(self):
114+
data = b'\xd0\x90\xd0\x91\xd0\x92'
115+
charset = Charset('utf-8')
116+
charset.body_encoding = None # Disable base64 encoding
117+
msg = Message()
118+
msg.set_payload(data, charset)
119+
self.assertEqual(msg['content-transfer-encoding'], '8bit')
120+
self.assertEqual(msg.get_payload(decode=True), data)
121+
122+
def test_set_payload_to_list(self):
123+
msg = Message()
124+
msg.set_payload([])
125+
self.assertEqual(msg.get_payload(), [])
126+
127+
def test_set_payload_with_non_ascii_and_no_charset_raises(self):
128+
data = b'\xd0\x90\xd0\x91\xd0\x92'.decode('utf-8')
129+
msg = Message()
130+
with self.assertRaises(TypeError):
131+
msg.set_payload(data)
132+
95133
def test_get_charsets(self):
96134
eq = self.assertEqual
97135

@@ -558,20 +596,10 @@ def test_broken_base64_payload(self):
558596
self.assertIsInstance(msg.defects[0],
559597
errors.InvalidBase64CharactersDefect)
560598

561-
def test_broken_unicode_payload(self):
562-
# This test improves coverage but is not a compliance test.
563-
# The behavior in this situation is currently undefined by the API.
564-
x = 'this is a br\xf6ken thing to do'
565-
msg = Message()
566-
msg['content-type'] = 'text/plain'
567-
msg['content-transfer-encoding'] = '8bit'
568-
msg.set_payload(x)
569-
self.assertEqual(msg.get_payload(decode=True),
570-
bytes(x, 'raw-unicode-escape'))
571-
572599
def test_questionable_bytes_payload(self):
573600
# This test improves coverage but is not a compliance test,
574-
# since it involves poking inside the black box.
601+
# since it involves poking inside the black box in a way
602+
# that actually breaks the model invariants.
575603
x = 'this is a quéstionable thing to do'.encode('utf-8')
576604
msg = Message()
577605
msg['content-type'] = 'text/plain; charset="utf-8"'

Misc/NEWS

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ Core and Builtins
2626
Library
2727
-------
2828

29+
- Issue #19063: if a Charset's body_encoding was set to None, the email
30+
package would generate a message claiming the Content-Transfer-Encoding
31+
was 7bit, and produce garbage output for the content. This now works.
32+
A couple of other set_payload mishandlings of non-ASCII are also fixed.
33+
In addition, calling set_payload with a string argument without
34+
specifying a charset now raises an error (this is a new error in 3.4).
35+
2936
- Issue #15475: Add __sizeof__ implementations for itertools objects.
3037

3138
- Issue #19880: Fix a reference leak in unittest.TestCase. Explicitly break

0 commit comments

Comments
 (0)