Skip to content

Commit 8f674cc

Browse files
committed
Close python#17694: Add minimum length to _PyUnicodeWriter
* Also add a min_char attribute to the _PyUnicodeWriter structure (currently unused)
* _PyUnicodeWriter_Init() no longer takes any argument other than the writer itself: min_length and overallocate must be set explicitly
* In error handlers, only enable overallocation if the replacement string is longer than 1 character
* CJK decoders no longer use overallocation
* Set min_length, instead of preallocating memory using _PyUnicodeWriter_Prepare(), in many decoders
* _PyUnicode_DecodeUnicodeInternal() checks for integer overflow
1 parent e84a51c commit 8f674cc

7 files changed

Lines changed: 81 additions & 71 deletions

File tree

Include/unicodeobject.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -898,22 +898,28 @@ typedef struct {
898898
Py_UCS4 maxchar;
899899
Py_ssize_t size;
900900
Py_ssize_t pos;
901-
/* minimum length of the buffer when overallocation is enabled,
902-
see _PyUnicodeWriter_Init() */
901+
902+
/* minimum number of allocated characters (default: 0) */
903903
Py_ssize_t min_length;
904+
905+
/* minimum character (default: 127, ASCII) */
906+
Py_UCS4 min_char;
907+
908+
/* If non-zero, overallocate the buffer by 25% (default: 0). */
904909
unsigned char overallocate;
910+
905911
/* If readonly is 1, buffer is a shared string (cannot be modified)
906912
and size is set to 0. */
907913
unsigned char readonly;
908914
} _PyUnicodeWriter ;
909915

910916
/* Initialize a Unicode writer.
911-
912-
If min_length is greater than zero, _PyUnicodeWriter_Prepare()
913-
overallocates the buffer and min_length is the minimum length in characters
914-
of the buffer. */
917+
*
918+
* By default, the minimum buffer size is 0 characters and overallocation is
919+
* disabled. Set min_length, min_char and overallocate attributes to control
920+
* the allocation of the buffer. */
915921
PyAPI_FUNC(void)
916-
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
922+
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
917923

918924
/* Prepare the buffer to write 'length' characters
919925
with the specified maximum character.

Modules/cjkcodecs/multibytecodec.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -633,7 +633,8 @@ MultibyteCodec_Decode(MultibyteCodecObject *self,
633633
return make_tuple(PyUnicode_New(0, 0), 0);
634634
}
635635

636-
_PyUnicodeWriter_Init(&buf.writer, datalen);
636+
_PyUnicodeWriter_Init(&buf.writer);
637+
buf.writer.min_length = datalen;
637638
buf.excobj = NULL;
638639
buf.inbuf = buf.inbuf_top = (unsigned char *)data;
639640
buf.inbuf_end = buf.inbuf_top + datalen;
@@ -839,7 +840,7 @@ decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data,
839840
{
840841
buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
841842
buf->inbuf_end = buf->inbuf_top + size;
842-
_PyUnicodeWriter_Init(&buf->writer, size);
843+
buf->writer.min_length += size;
843844
return 0;
844845
}
845846

@@ -1037,7 +1038,7 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
10371038
data = pdata.buf;
10381039
size = pdata.len;
10391040

1040-
_PyUnicodeWriter_Init(&buf.writer, 1);
1041+
_PyUnicodeWriter_Init(&buf.writer);
10411042
buf.excobj = NULL;
10421043
origpending = self->pendingsize;
10431044

@@ -1241,7 +1242,7 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self,
12411242
if (sizehint == 0)
12421243
return PyUnicode_New(0, 0);
12431244

1244-
_PyUnicodeWriter_Init(&buf.writer, 1);
1245+
_PyUnicodeWriter_Init(&buf.writer);
12451246
buf.excobj = NULL;
12461247
cres = NULL;
12471248

Objects/complexobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObject* args)
705705
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
706706
return NULL;
707707

708-
_PyUnicodeWriter_Init(&writer, 0);
708+
_PyUnicodeWriter_Init(&writer);
709709
ret = _PyComplex_FormatAdvancedWriter(
710710
&writer,
711711
self,

Objects/floatobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject *args)
17111711
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
17121712
return NULL;
17131713

1714-
_PyUnicodeWriter_Init(&writer, 0);
1714+
_PyUnicodeWriter_Init(&writer);
17151715
ret = _PyFloat_FormatAdvancedWriter(
17161716
&writer,
17171717
self,

Objects/longobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject *args)
43794379
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
43804380
return NULL;
43814381

4382-
_PyUnicodeWriter_Init(&writer, 0);
4382+
_PyUnicodeWriter_Init(&writer);
43834383
ret = _PyLong_FormatAdvancedWriter(
43844384
&writer,
43854385
self,

Objects/stringlib/unicode_format.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -906,7 +906,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
906906
int recursion_depth, AutoNumber *auto_number)
907907
{
908908
_PyUnicodeWriter writer;
909-
Py_ssize_t minlen;
910909

911910
/* check the recursion level */
912911
if (recursion_depth <= 0) {
@@ -915,8 +914,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
915914
return NULL;
916915
}
917916

918-
minlen = PyUnicode_GET_LENGTH(input->str) + 100;
919-
_PyUnicodeWriter_Init(&writer, minlen);
917+
_PyUnicodeWriter_Init(&writer);
918+
writer.overallocate = 1;
919+
writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
920920

921921
if (!do_markup(input, args, kwargs, &writer, recursion_depth,
922922
auto_number)) {

0 commit comments

Comments
 (0)