Skip to content
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Doc/c-api/unicode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ APIs:
arguments, calculate the size of the resulting Python Unicode string and return
a string with the values formatted into it. The variable arguments must be C
types and must correspond exactly to the format characters in the *format*
ASCII-encoded string.
string. The *format* string is decoded from UTF-8.

A conversion specifier contains two or more characters and has the following
components, which must occur in this order:
Expand Down Expand Up @@ -487,7 +487,8 @@ APIs:

* - ``s``
- :c:expr:`const char*` or :c:expr:`const wchar_t*`
- A null-terminated C character array.
- A null-terminated C character array. :c:expr:`const char*` is decoded
from UTF-8 with the "replace" error handler.

* - ``p``
- :c:expr:`const void*`
Expand Down Expand Up @@ -576,6 +577,9 @@ APIs:
.. versionchanged:: 3.13
Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added.

.. versionchanged:: 3.14
The format string is now decoded from UTF-8 instead of ASCII.


.. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs)

Expand Down
4 changes: 4 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,10 @@ New Features
Porting to Python 3.14
----------------------

* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8,
instead of ASCII.
(Contributed by Victor Stinner in :gh:`119182`.)

Deprecated
----------

Expand Down
4 changes: 3 additions & 1 deletion Lib/test/test_capi/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,10 @@ def test_format(self):

with self.assertRaisesRegex(OverflowError, 'not in range'):
PyErr_Format(ZeroDivisionError, b'%c', c_int(-1))
with self.assertRaisesRegex(ValueError, 'format string'):
with self.assertRaisesRegex(ValueError, 'format string') as cm:
PyErr_Format(ZeroDivisionError, b'\xff')
self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError)

self.assertRaises(SystemError, PyErr_Format, list, b'error')
# CRASHES PyErr_Format(ZeroDivisionError, NULL)
# CRASHES PyErr_Format(py_object(), b'error')
Expand Down
21 changes: 14 additions & 7 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,16 +380,23 @@ def check_format(expected, format, *args):
text = PyUnicode_FromFormat(format, *args)
self.assertEqual(expected, text)

# ascii format, non-ascii argument
# ASCII format, non-ASCII %U argument
check_format('ascii\x7f=unicode\xe9',
b'ascii\x7f=%U', 'unicode\xe9')

# non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
# raises an error
self.assertRaisesRegex(ValueError,
r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
'string, got a non-ASCII byte: 0xe9$',
PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
# The %s arguments are decoded from UTF-8/replace.
# The format string is decoded from UTF-8/strict.
check_format('value=utf8 \u20ac',
'value=%s'.encode(), 'utf8 \u20ac'.encode())
with self.assertRaisesRegex(ValueError, 'format string') as cm:
PyUnicode_FromFormat(b'invalid format string\xff: %s', b'abc')
Comment thread
vstinner marked this conversation as resolved.
self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError)

# Truncated UTF-8 format strings
with self.assertRaisesRegex(ValueError, 'format string'):
PyUnicode_FromFormat(b'truncated utf8: \xc3')
with self.assertRaisesRegex(ValueError, 'format string'):
PyUnicode_FromFormat(b'truncated utf8: \xe2\x82')

# test "%c"
check_format('\uabcd',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8,
instead of ASCII. Patch by Victor Stinner.
48 changes: 19 additions & 29 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
Comment thread
vstinner marked this conversation as resolved.
_Py_error_handler error_handler, const char *errors);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
Expand Down Expand Up @@ -2402,7 +2401,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,

if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
_Py_ERROR_REPLACE, "replace", NULL);
_Py_ERROR_REPLACE, "replace");
}

PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
Expand Down Expand Up @@ -2896,28 +2895,26 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
const char *p;
Py_ssize_t len;

p = f;
do
{
if ((unsigned char)*p > 127) {
PyErr_Format(PyExc_ValueError,
"PyUnicode_FromFormatV() expects an ASCII-encoded format "
"string, got a non-ASCII byte: 0x%02x",
(unsigned char)*p);
goto fail;
}
p++;
p = strchr(f, '%');
if (p != NULL) {
len = p - f;
}
while (*p != '\0' && *p != '%');
len = p - f;

if (*p == '\0')
else {
len = strlen(f);
writer.overallocate = 0;
}

if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
if (unicode_decode_utf8_writer(&writer, f, len,
_Py_ERROR_STRICT, "strict") < 0) {
PyObject *exc = PyErr_GetRaisedException();
PyErr_SetString(PyExc_ValueError,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why raise ValueError explicitly? If you want a ValueError for compatibility, UnicodeDecodeError is a subclass of ValueError, so this is a backward compatible change. Other functions which take const char * do not raise ValueError explicitly.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message helps debug such issues: it points directly to the format string.

"PyUnicode_FromFormatV() expects a valid UTF-8-encoded "
"format string, got an invalid UTF-8 string");
_PyErr_ChainExceptions1(exc);
goto fail;
}

f = p;
f += len;
}
}
va_end(vargs2);
Expand Down Expand Up @@ -4930,13 +4927,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
_Py_error_handler error_handler, const char *errors)
{
if (size == 0) {
if (consumed) {
*consumed = 0;
}
return 0;
}

Expand All @@ -4954,17 +4947,14 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
writer->pos += decoded;

if (decoded == size) {
if (consumed) {
*consumed = size;
}
return 0;
}
s += decoded;
size -= decoded;
}

return unicode_decode_utf8_impl(writer, starts, s, end,
error_handler, errors, consumed);
error_handler, errors, NULL);
}


Expand Down