python · vstinner · Jun 7, 2024 · Jun 7, 2024 · Jun 11, 2024 · Jun 11, 2024
@@ -387,7 +387,7 @@ APIs:
    arguments, calculate the size of the resulting Python Unicode string and return
    a string with the values formatted into it.  The variable arguments must be C
    types and must correspond exactly to the format characters in the *format*
-   ASCII-encoded string.
+   string. The *format* string is decoded from UTF-8.
 
    A conversion specifier contains two or more characters and has the following
    components, which must occur in this order:
@@ -487,7 +487,8 @@ APIs:
 
       * - ``s``
         - :c:expr:`const char*` or :c:expr:`const wchar_t*`
-        - A null-terminated C character array.
+        - A null-terminated C character array. :c:expr:`const char*` is decoded
+          from UTF-8 with the "replace" error handler.
 
       * - ``p``
         - :c:expr:`const void*`
@@ -576,6 +577,9 @@ APIs:
    .. versionchanged:: 3.13
       Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added.
 
+   .. versionchanged:: 3.14
+      The format string is now decoded from UTF-8 instead of ASCII.
+
 
 .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs)
 

@@ -261,6 +261,10 @@ New Features
 Porting to Python 3.14
 ----------------------
 
+* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8
+  instead of ASCII.
+  (Contributed by Victor Stinner in :gh:`119182`.)
+
 Deprecated
 ----------
 

diff --git a/Lib/test/test_capi/test_exceptions.py b/Lib/test/test_capi/test_exceptions.py
@@ -273,8 +273,10 @@ def test_format(self):
 
         with self.assertRaisesRegex(OverflowError, 'not in range'):
             PyErr_Format(ZeroDivisionError, b'%c', c_int(-1))
-        with self.assertRaisesRegex(ValueError, 'format string'):
+        with self.assertRaisesRegex(ValueError, 'format string') as cm:
             PyErr_Format(ZeroDivisionError, b'\xff')
+        self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError)
+
         self.assertRaises(SystemError, PyErr_Format, list, b'error')
         # CRASHES PyErr_Format(ZeroDivisionError, NULL)
         # CRASHES PyErr_Format(py_object(), b'error')

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -380,16 +380,23 @@ def check_format(expected, format, *args):
             text = PyUnicode_FromFormat(format, *args)
             self.assertEqual(expected, text)
 
-        # ascii format, non-ascii argument
+        # ASCII format, non-ASCII %U argument
         check_format('ascii\x7f=unicode\xe9',
                      b'ascii\x7f=%U', 'unicode\xe9')
 
-        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
-        # raises an error
-        self.assertRaisesRegex(ValueError,
-            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
-            'string, got a non-ASCII byte: 0xe9$',
-            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
+        # The %s arguments are decoded from UTF-8/replace.
+        # The format string is decoded from UTF-8/strict.
+        check_format('value=utf8 \u20ac',
+                     'value=%s'.encode(), 'utf8 \u20ac'.encode())
+        with self.assertRaisesRegex(ValueError, 'format string') as cm:
+            PyUnicode_FromFormat(b'invalid format string\xff: %s', b'abc')
+        self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError)
+
+        # Truncated UTF-8 format strings
+        with self.assertRaisesRegex(ValueError, 'format string'):
+            PyUnicode_FromFormat(b'truncated utf8: \xc3')
+        with self.assertRaisesRegex(ValueError, 'format string'):
+            PyUnicode_FromFormat(b'truncated utf8: \xe2\x82')
 
         # test "%c"
         check_format('\uabcd',

diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst
@@ -0,0 +1,2 @@
+:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8
+instead of ASCII. Patch by Victor Stinner.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                            const char *s, Py_ssize_t size,
-                           _Py_error_handler error_handler, const char *errors,
-                           Py_ssize_t *consumed);
+                           _Py_error_handler error_handler, const char *errors);
 #ifdef Py_DEBUG
 static inline int unicode_is_finalizing(void);
 static int unicode_is_singleton(PyObject *unicode);
@@ -2402,7 +2401,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
 
     if (width < 0) {
         return unicode_decode_utf8_writer(writer, str, length,
-                                          _Py_ERROR_REPLACE, "replace", NULL);
+                                          _Py_ERROR_REPLACE, "replace");
     }
 
     PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
@@ -2896,28 +2895,26 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
             const char *p;
             Py_ssize_t len;
 
-            p = f;
-            do
-            {
-                if ((unsigned char)*p > 127) {
-                    PyErr_Format(PyExc_ValueError,
-                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
-                        "string, got a non-ASCII byte: 0x%02x",
-                        (unsigned char)*p);
-                    goto fail;
-                }
-                p++;
+            p = strchr(f, '%');
+            if (p != NULL) {
+                len = p - f;
             }
-            while (*p != '\0' && *p != '%');
-            len = p - f;
-
-            if (*p == '\0')
+            else {
+                len = strlen(f);
                 writer.overallocate = 0;
+            }
 
-            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
+            if (unicode_decode_utf8_writer(&writer, f, len,
+                                           _Py_ERROR_STRICT, "strict") < 0) {
+                PyObject *exc = PyErr_GetRaisedException();
+                PyErr_SetString(PyExc_ValueError,
+                    "PyUnicode_FromFormatV() expects a valid UTF-8-encoded "
+                    "format string, got an invalid UTF-8 string");
+                _PyErr_ChainExceptions1(exc);
                 goto fail;
+            }
 
-            f = p;
+            f += len;
         }
     }
     va_end(vargs2);
@@ -4930,13 +4927,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                            const char *s, Py_ssize_t size,
-                           _Py_error_handler error_handler, const char *errors,
-                           Py_ssize_t *consumed)
+                           _Py_error_handler error_handler, const char *errors)
 {
     if (size == 0) {
-        if (consumed) {
-            *consumed = 0;
-        }
         return 0;
     }
 
@@ -4954,17 +4947,14 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
         writer->pos += decoded;
 
         if (decoded == size) {
-            if (consumed) {
-                *consumed = size;
-            }
             return 0;
         }
         s += decoded;
         size -= decoded;
     }
 
     return unicode_decode_utf8_impl(writer, starts, s, end,
-                                    error_handler, errors, consumed);
+                                    error_handler, errors, NULL);
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8,
		instead of ASCII. Patch by Victor Stinner.