Skip to content

Commit 9f4b1e9

Browse files
author
Victor Stinner
committed
Fix and deprecate the unicode_internal codec
The unicode_internal codec uses Py_UNICODE instead of the real internal representation (PEP 393: Py_UCS1, Py_UCS2 or Py_UCS4) for backward compatibility.
1 parent 240c55f commit 9f4b1e9

4 files changed

Lines changed: 42 additions & 10 deletions

File tree

Doc/library/codecs.rst

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1173,6 +1173,8 @@ particular, the following variants typically exist:
11731173
| unicode_internal | | Return the internal |
11741174
| | | representation of the |
11751175
| | | operand |
1176+
| | | |
1177+
| | | .. deprecated:: 3.3 |
11761178
+--------------------+---------+---------------------------+
11771179

11781180
The following codecs provide bytes-to-bytes mappings.

Doc/whatsnew/3.3.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,8 @@ versions.
250250

251251
(:issue:`12100`)
252252

253+
The ``unicode_internal`` codec has been deprecated.
254+
253255
crypt
254256
-----
255257

Modules/_codecsmodule.c

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -675,18 +675,30 @@ unicode_internal_encode(PyObject *self,
675675
PyObject *obj;
676676
const char *errors = NULL;
677677
const char *data;
678-
Py_ssize_t size;
678+
Py_ssize_t len, size;
679679

680680
if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
681681
&obj, &errors))
682682
return NULL;
683683

684684
if (PyUnicode_Check(obj)) {
685+
Py_UNICODE *u;
686+
685687
if (PyUnicode_READY(obj) < 0)
686688
return NULL;
687-
data = PyUnicode_AS_DATA(obj);
688-
size = PyUnicode_GET_DATA_SIZE(obj);
689-
return codec_tuple(PyBytes_FromStringAndSize(data, size),
689+
690+
if (PyErr_WarnEx(PyExc_DeprecationWarning,
691+
"unicode_internal codecs has been deprecated",
692+
1))
693+
return NULL;
694+
695+
u = PyUnicode_AsUnicodeAndSize(obj, &len);
696+
if (u == NULL)
697+
return NULL;
698+
if (len > PY_SSIZE_T_MAX / sizeof(Py_UNICODE))
699+
return PyErr_NoMemory();
700+
size = len * sizeof(Py_UNICODE);
701+
return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size),
690702
PyUnicode_GET_LENGTH(obj));
691703
}
692704
else {

Objects/unicodeobject.c

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6237,6 +6237,11 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
62376237
PyObject *errorHandler = NULL;
62386238
PyObject *exc = NULL;
62396239

6240+
if (PyErr_WarnEx(PyExc_DeprecationWarning,
6241+
"unicode_internal codecs has been deprecated",
6242+
1))
6243+
return NULL;
6244+
62406245
/* XXX overflow detection missing */
62416246
v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
62426247
if (v == NULL)
@@ -6270,15 +6275,26 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
62706275
errors, &errorHandler,
62716276
"unicode_internal", reason,
62726277
&starts, &end, &startinpos, &endinpos, &exc, &s,
6273-
&v, &outpos)) {
6278+
&v, &outpos))
62746279
goto onError;
6275-
}
6280+
continue;
62766281
}
6277-
else {
6278-
if (unicode_putchar(&v, &outpos, ch) < 0)
6279-
goto onError;
6280-
s += Py_UNICODE_SIZE;
6282+
6283+
s += Py_UNICODE_SIZE;
6284+
#ifndef Py_UNICODE_WIDE
6285+
if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6286+
{
6287+
Py_UCS4 ch2 = *(Py_UNICODE*)s;
6288+
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF)
6289+
{
6290+
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
6291+
s += Py_UNICODE_SIZE;
6292+
}
62816293
}
6294+
#endif
6295+
6296+
if (unicode_putchar(&v, &outpos, ch) < 0)
6297+
goto onError;
62826298
}
62836299

62846300
if (PyUnicode_Resize(&v, outpos) < 0)

0 commit comments

Comments
 (0)