Use strict error handler

python · vstinner · Jun 7, 2024 · Jun 7, 2024 · Jun 11, 2024 · Jun 11, 2024
commit e830944769b3ee25bc251bc1b88a71f1c95d1e7c
@@ -387,7 +387,7 @@ APIs:
    arguments, calculate the size of the resulting Python Unicode string and return
    a string with the values formatted into it.  The variable arguments must be C
    types and must correspond exactly to the format characters in the *format*
-   string. The *format* string is decoded from UTF-8 with the "replace" error
+   string. The *format* string is decoded from UTF-8 with the "strict" error
    handler.
 
    A conversion specifier contains two or more characters and has the following

@@ -262,7 +262,7 @@ Porting to Python 3.14
 ----------------------
 
 * :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with
-  the "replace" error handler, instead of decoding it from ASCII.
+  the "strict" error handler, instead of decoding it from ASCII.
   (Contributed by Victor Stinner in :gh:`119182`.)
 
 Deprecated

diff --git a/Lib/test/test_capi/test_exceptions.py b/Lib/test/test_capi/test_exceptions.py
@@ -264,21 +264,18 @@ def test_format(self):
         PyErr_Format = getattr(pythonapi, name)
         PyErr_Format.argtypes = (py_object, c_char_p,)
         PyErr_Format.restype = py_object
-
         with self.assertRaises(ZeroDivisionError) as e:
             PyErr_Format(ZeroDivisionError, b'%s %d', b'error', c_int(42))
         self.assertEqual(e.exception.args, ('error 42',))
-
-        with self.assertRaises(ZeroDivisionError) as e:
-            PyErr_Format(ZeroDivisionError, b'invalid \xff')
-        self.assertEqual(e.exception.args, ('invalid \ufffd',))
-
         with self.assertRaises(ZeroDivisionError) as e:
             PyErr_Format(ZeroDivisionError, b'%s', 'помилка'.encode())
         self.assertEqual(e.exception.args, ('помилка',))
 
         with self.assertRaisesRegex(OverflowError, 'not in range'):
             PyErr_Format(ZeroDivisionError, b'%c', c_int(-1))
+        with self.assertRaisesRegex(ValueError, 'format string') as cm:
+            PyErr_Format(ZeroDivisionError, b'\xff')
+        self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError)
 
         self.assertRaises(SystemError, PyErr_Format, list, b'error')
         # CRASHES PyErr_Format(ZeroDivisionError, NULL)
@@ -382,7 +379,7 @@ def test_err_formatunraisable(self):
             self.assertEqual(str(cm.unraisable.exc_value), 'oops!')
             self.assertEqual(cm.unraisable.exc_traceback.tb_lineno,
                              firstline + 15)
-            self.assertEqual(cm.unraisable.err_msg, 'undecodable \ufffd')
+            self.assertIsNone(cm.unraisable.err_msg)
             self.assertIsNone(cm.unraisable.object)
 
         with support.catch_unraisable_exception() as cm:
@@ -406,8 +403,7 @@ def test_err_formatunraisable(self):
               support.captured_stderr() as stderr):
             formatunraisable(CustomError('oops!'), b'undecodable \xff')
         lines = stderr.getvalue().splitlines()
-        self.assertEqual(lines[0], 'undecodable \ufffd:')
-        self.assertEqual(lines[1], 'Traceback (most recent call last):')
+        self.assertEqual(lines[0], 'Traceback (most recent call last):')
         self.assertEqual(lines[-1], f'{__name__}.CustomError: oops!')
 
         with (support.swap_attr(sys, 'unraisablehook', None),

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -384,12 +384,13 @@ def check_format(expected, format, *args):
         check_format('ascii\x7f=unicode\xe9',
                      b'ascii\x7f=%U', 'unicode\xe9')
 
-        # Non-ASCII format and non-ASCII arguments are both decoded
-        # from UTF-8/replace
-        check_format('unicode\xe9=\u20ac',
-                     'unicode\xe9=%s'.encode(), '\u20ac'.encode())
-        check_format('invalid\ufffd=abc\ufffd',
-                     b'invalid\xe9=%s', b'abc\xe9')
+        # The %s arguments are decoded from UTF-8/replace.
+        # The format string is decoded from UTF-8/strict.
+        check_format('value=utf8 \u20ac',
+                     'value=%s'.encode(), 'utf8 \u20ac'.encode())
+        with self.assertRaisesRegex(ValueError, 'format string') as cm:
+            PyUnicode_FromFormat(b'invalid format string\xff: %s', b'abc')
+        self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError)
 
         # test "%c"
         check_format('\uabcd',

diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst
@@ -1,3 +1,3 @@
 :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with
-the "replace" error handler, instead of decoding it from ASCII. Patch by
+the "strict" error handler, instead of decoding it from ASCII. Patch by
 Victor Stinner.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -2905,7 +2905,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
             }
 
             if (unicode_decode_utf8_writer(&writer, f, len,
-                                           _Py_ERROR_REPLACE, "replace") < 0) {
+                                           _Py_ERROR_STRICT, "strict") < 0) {
+                PyObject *exc = PyErr_GetRaisedException();
+                PyErr_Format(PyExc_ValueError,
+                    "PyUnicode_FromFormatV() expects a valid UTF-8-encoded "
+                    "format string, got an invalid UTF-8 string");
+                _PyErr_ChainExceptions1(exc);
                 goto fail;
             }