Skip to content

Commit 224ef3e

Browse files
committed
#24211: Add RFC6532 support to the email library.
This could use more edge case tests, but the basic functionality is tested. (Note that this changeset does not add tailored support for the RFC 6532 message/global MIME type, but the email package generic facilities will handle it.) Reviewed by Maciej Szulik.
1 parent c1ecef7 commit 224ef3e

7 files changed

Lines changed: 64 additions & 5 deletions

File tree

Doc/library/email.policy.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,14 @@ added matters. To illustrate::
378378
In addition to the settable attributes listed above that apply to all
379379
policies, this policy adds the following additional attributes:
380380

381+
.. attribute:: utf8
382+
383+
If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
384+
headers by encoding them as "encoded words". If ``True``, follow
385+
:rfc:`6532` and use ``utf-8`` encoding for headers. Messages
386+
formatted in this way may be passed to SMTP servers that support
387+
the ``SMTPUTF8`` extension (:rfc:`6531`).
388+
381389
.. attribute:: refold_source
382390

383391
If the value for a header in the ``Message`` object originated from a

Doc/whatsnew/3.5.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,12 @@ email
356356
header (``None`` if there is no such header). (Contributed by Abhilash Raj
357357
in :issue:`21083`.)
358358

359+
* A new policy option :attr:`~email.policy.EmailPolicy.utf8` can be set to
360+
``True`` to encode email headers using the utf-8 charset instead of using
361+
encoded words. This allows ``Messages`` to be formatted according to
362+
:rfc:`6532` and used with an SMTP server that supports the :rfc:`6531`
363+
``SMTPUTF8`` extension. (Contributed by R. David Murray in :issue:`24211`.)
364+
359365
glob
360366
----
361367

Lib/email/_header_value_parser.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -320,17 +320,18 @@ def cte_encode(self, charset, policy):
320320
return ''.join(res)
321321

322322
def _fold(self, folded):
323+
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
323324
for part in self.parts:
324325
tstr = str(part)
325326
tlen = len(tstr)
326327
try:
327-
str(part).encode('us-ascii')
328+
str(part).encode(encoding)
328329
except UnicodeEncodeError:
329330
if any(isinstance(x, errors.UndecodableBytesDefect)
330331
for x in part.all_defects):
331332
charset = 'unknown-8bit'
332333
else:
333-
# XXX: this should be a policy setting
334+
# XXX: this should be a policy setting when utf8 is False.
334335
charset = 'utf-8'
335336
tstr = part.cte_encode(charset, folded.policy)
336337
tlen = len(tstr)
@@ -394,11 +395,12 @@ class UnstructuredTokenList(TokenList):
394395

395396
def _fold(self, folded):
396397
last_ew = None
398+
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
397399
for part in self.parts:
398400
tstr = str(part)
399401
is_ew = False
400402
try:
401-
str(part).encode('us-ascii')
403+
str(part).encode(encoding)
402404
except UnicodeEncodeError:
403405
if any(isinstance(x, errors.UndecodableBytesDefect)
404406
for x in part.all_defects):
@@ -475,12 +477,13 @@ def _fold(self, folded):
475477
# comment that becomes a barrier across which we can't compose encoded
476478
# words.
477479
last_ew = None
480+
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
478481
for part in self.parts:
479482
tstr = str(part)
480483
tlen = len(tstr)
481484
has_ew = False
482485
try:
483-
str(part).encode('us-ascii')
486+
str(part).encode(encoding)
484487
except UnicodeEncodeError:
485488
if any(isinstance(x, errors.UndecodableBytesDefect)
486489
for x in part.all_defects):

Lib/email/policy.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ class EmailPolicy(Policy):
3535
In addition to the settable attributes listed above that apply to
3636
all Policies, this policy adds the following additional attributes:
3737
38+
utf8 -- if False (the default) message headers will be
39+
serialized as ASCII, using encoded words to encode
40+
any non-ASCII characters in the source strings. If
41+
True, the message headers will be serialized using
42+
utf8 and will not contain encoded words (see RFC
43+
6532 for more on this serialization format).
44+
3845
refold_source -- if the value for a header in the Message object
3946
came from the parsing of some source, this attribute
4047
indicates whether or not a generator should refold
@@ -72,6 +79,7 @@ class EmailPolicy(Policy):
7279
7380
"""
7481

82+
utf8 = False
7583
refold_source = 'long'
7684
header_factory = HeaderRegistry()
7785
content_manager = raw_data_manager
@@ -175,9 +183,13 @@ def fold_binary(self, name, value):
175183
refold_header setting, since there is no way to know whether the binary
176184
data consists of single byte characters or multibyte characters.
177185
186+
If utf8 is true, headers are encoded to utf8, otherwise to ascii with
187+
non-ASCII unicode rendered as encoded words.
188+
178189
"""
179190
folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
180-
return folded.encode('ascii', 'surrogateescape')
191+
charset = 'utf8' if self.utf8 else 'ascii'
192+
return folded.encode(charset, 'surrogateescape')
181193

182194
def _fold(self, name, value, refold_binary=False):
183195
if hasattr(value, 'name'):
@@ -199,3 +211,4 @@ def _fold(self, name, value, refold_binary=False):
199211
strict = default.clone(raise_on_defect=True)
200212
SMTP = default.clone(linesep='\r\n')
201213
HTTP = default.clone(linesep='\r\n', max_line_length=None)
214+
SMTPUTF8 = SMTP.clone(utf8=True)

Lib/test/test_email/test_generator.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import textwrap
33
import unittest
44
from email import message_from_string, message_from_bytes
5+
from email.message import EmailMessage
56
from email.generator import Generator, BytesGenerator
67
from email import policy
78
from test.test_email import TestEmailBase, parameterize
@@ -194,6 +195,27 @@ def test_cte_type_7bit_transforms_8bit_cte(self):
194195
g.flatten(msg)
195196
self.assertEqual(s.getvalue(), expected)
196197

198+
def test_smtputf8_policy(self):
    # Flattening with policy.SMTPUTF8 (the SMTP policy cloned with
    # utf8=True) must emit non-ASCII header text as raw UTF-8 bytes
    # rather than as RFC 2047 encoded words, with CRLF line endings.
    msg = EmailMessage()
    msg['From'] = "Páolo <főo@bar.com>"
    msg['To'] = 'Dinsdale'
    # Use the eight-digit \U escape: '\u1F609' would be parsed as U+1F60
    # followed by a literal '9', not U+1F609 (WINKING FACE), so the test
    # would never exercise a character outside the BMP.
    msg['Subject'] = 'Nudge nudge, wink, wink \U0001F609'
    msg.set_content("oh là là, know what I mean, know what I mean?")
    expected = textwrap.dedent("""\
        From: Páolo <főo@bar.com>
        To: Dinsdale
        Subject: Nudge nudge, wink, wink \U0001F609
        Content-Type: text/plain; charset="utf-8"
        Content-Transfer-Encoding: 8bit
        MIME-Version: 1.0

        oh là là, know what I mean, know what I mean?
        """).encode('utf-8').replace(b'\n', b'\r\n')
    s = io.BytesIO()
    g = BytesGenerator(s, policy=policy.SMTPUTF8)
    g.flatten(msg)
    self.assertEqual(s.getvalue(), expected)
218+
197219

198220
if __name__ == '__main__':
199221
unittest.main()

Lib/test/test_email/test_policy.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class PolicyAPITests(unittest.TestCase):
2727
# If any of these defaults change, the docs must be updated.
2828
policy_defaults = compat32_defaults.copy()
2929
policy_defaults.update({
30+
'utf8': False,
3031
'raise_on_defect': False,
3132
'header_factory': email.policy.EmailPolicy.header_factory,
3233
'refold_source': 'long',
@@ -42,6 +43,9 @@ class PolicyAPITests(unittest.TestCase):
4243
email.policy.default: make_defaults(policy_defaults, {}),
4344
email.policy.SMTP: make_defaults(policy_defaults,
4445
{'linesep': '\r\n'}),
46+
email.policy.SMTPUTF8: make_defaults(policy_defaults,
47+
{'linesep': '\r\n',
48+
'utf8': True}),
4549
email.policy.HTTP: make_defaults(policy_defaults,
4650
{'linesep': '\r\n',
4751
'max_line_length': None}),

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ Core and Builtins
4747
Library
4848
-------
4949

50+
- Issue #24211: The email library now supports RFC 6532: it can generate
51+
headers using utf-8 instead of encoded words.
52+
5053
- Issue #16314: Added support for the LZMA compression in distutils.
5154

5255
- Issue #21804: poplib now supports RFC 6856 (UTF8).

0 commit comments

Comments
 (0)