Skip to content

Commit febcdd3

Browse files
author
martin.v.loewis
committed
Patch #1455898: Incremental mode for "mbcs" codec.
git-svn-id: http://svn.python.org/projects/python/trunk@46945 6015fed2-1504-0410-9fe1-9d1591cc4771
1 parent d03fa0e commit febcdd3

6 files changed

Lines changed: 211 additions & 47 deletions

File tree

Doc/api/concrete.tex

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,6 +1431,18 @@ \subsubsection{Built-in Codecs \label{builtinCodecs}}
14311431
raised by the codec.
14321432
\end{cfuncdesc}
14331433

1434+
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
1435+
Py_ssize_t size,
1436+
const char *errors,
1437+
Py_ssize_t *consumed}
1438+
If \var{consumed} is \NULL{}, behave like
1439+
\cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
1440+
\cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing lead
1441+
byte, and the number of bytes that have been decoded will be stored in
1442+
\var{consumed}.
1443+
\versionadded{2.5}
1444+
\end{cfuncdesc}
1445+
14341446
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
14351447
Py_ssize_t size,
14361448
const char *errors}

Include/unicodeobject.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
938938
const char *errors /* error handling */
939939
);
940940

941+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
942+
const char *string, /* MBCS encoded string */
943+
Py_ssize_t length, /* size of string */
944+
const char *errors, /* error handling */
945+
Py_ssize_t *consumed /* bytes consumed */
946+
);
947+
941948
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
942949
PyObject *unicode /* Unicode object */
943950
);

Lib/encodings/mbcs.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
2222
def encode(self, input, final=False):
2323
return codecs.mbcs_encode(input,self.errors)[0]
2424

25-
class IncrementalDecoder(codecs.IncrementalDecoder):
26-
def decode(self, input, final=False):
27-
return codecs.mbcs_decode(input,self.errors)[0]
25+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
26+
def _buffer_decode(self, input, errors, final):
27+
return codecs.mbcs_decode(input,self.errors,final)
28+
2829
class StreamWriter(Codec,codecs.StreamWriter):
2930
pass
3031

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ Extension Modules
156156
Library
157157
-------
158158

159+
- Patch #1455898: The MBCS codec now supports the incremental mode for
160+
double-byte encodings.
161+
159162
- ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
160163
guarantee that adjacent triples in the return list always describe
161164
non-adjacent blocks. Previously, a pair of matching blocks could end

Modules/_codecsmodule.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -479,15 +479,20 @@ mbcs_decode(PyObject *self,
479479
PyObject *args)
480480
{
481481
const char *data;
482-
Py_ssize_t size;
482+
Py_ssize_t size, consumed;
483483
const char *errors = NULL;
484+
int final = 1;
485+
PyObject *decoded;
484486

485-
if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
486-
&data, &size, &errors))
487+
if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
488+
&data, &size, &errors, &final))
487489
return NULL;
488490

489-
return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
490-
size);
491+
decoded = PyUnicode_DecodeMBCSStateful(
492+
data, size, errors, final ? NULL : &consumed);
493+
if (!decoded)
494+
return NULL;
495+
return codec_tuple(decoded, final ? size : consumed);
491496
}
492497

493498
#endif /* MS_WINDOWS */

Objects/unicodeobject.c

Lines changed: 175 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
28202820

28212821
/* --- MBCS codecs for Windows -------------------------------------------- */
28222822

2823-
PyObject *PyUnicode_DecodeMBCS(const char *s,
2824-
Py_ssize_t size,
2825-
const char *errors)
2823+
#if SIZEOF_INT < SIZEOF_SSIZE_T
2824+
#define NEED_RETRY
2825+
#endif
2826+
2827+
/* XXX This code is limited to "true" double-byte encodings, as
2828+
a) it assumes an incomplete character consists of a single byte, and
2829+
b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2830+
encodings, see IsDBCSLeadByteEx documentation. */
2831+
2832+
static int is_dbcs_lead_byte(const char *s, int offset)
2833+
{
2834+
const char *curr = s + offset;
2835+
2836+
if (IsDBCSLeadByte(*curr)) {
2837+
const char *prev = CharPrev(s, curr);
2838+
return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
2839+
}
2840+
return 0;
2841+
}
2842+
2843+
/*
2844+
* Decode an MBCS string into a unicode object. If 'final' is set, converts
2845+
* a trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
2846+
*/
2847+
static int decode_mbcs(PyUnicodeObject **v,
2848+
const char *s, /* MBCS string */
2849+
int size, /* sizeof MBCS string */
2850+
int final)
28262851
{
2827-
PyUnicodeObject *v;
28282852
Py_UNICODE *p;
2829-
DWORD usize;
2853+
Py_ssize_t n = 0;
2854+
int usize = 0;
2855+
2856+
assert(size >= 0);
2857+
2858+
/* Skip trailing lead-byte unless 'final' is set */
2859+
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2860+
--size;
28302861

28312862
/* First get the size of the result */
2832-
assert(size < INT_MAX);
2833-
usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
2834-
if (size > 0 && usize==0)
2835-
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2863+
if (size > 0) {
2864+
usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2865+
if (usize == 0) {
2866+
PyErr_SetFromWindowsErrWithFilename(0, NULL);
2867+
return -1;
2868+
}
2869+
}
28362870

2837-
v = _PyUnicode_New(usize);
2838-
if (v == NULL)
2839-
return NULL;
2840-
if (usize == 0)
2841-
return (PyObject *)v;
2842-
p = PyUnicode_AS_UNICODE(v);
2843-
if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
2844-
Py_DECREF(v);
2845-
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2871+
if (*v == NULL) {
2872+
/* Create unicode object */
2873+
*v = _PyUnicode_New(usize);
2874+
if (*v == NULL)
2875+
return -1;
2876+
}
2877+
else {
2878+
/* Extend unicode object */
2879+
n = PyUnicode_GET_SIZE(*v);
2880+
if (_PyUnicode_Resize(v, n + usize) < 0)
2881+
return -1;
2882+
}
2883+
2884+
/* Do the conversion */
2885+
if (size > 0) {
2886+
p = PyUnicode_AS_UNICODE(*v) + n;
2887+
if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2888+
PyErr_SetFromWindowsErrWithFilename(0, NULL);
2889+
return -1;
2890+
}
2891+
}
2892+
2893+
return size;
2894+
}
2895+
2896+
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2897+
Py_ssize_t size,
2898+
const char *errors,
2899+
Py_ssize_t *consumed)
2900+
{
2901+
PyUnicodeObject *v = NULL;
2902+
int done;
2903+
2904+
if (consumed)
2905+
*consumed = 0;
2906+
2907+
#ifdef NEED_RETRY
2908+
retry:
2909+
if (size > INT_MAX)
2910+
done = decode_mbcs(&v, s, INT_MAX, 0);
2911+
else
2912+
#endif
2913+
done = decode_mbcs(&v, s, (int)size, !consumed);
2914+
2915+
if (done < 0) {
2916+
Py_XDECREF(v);
2917+
return NULL;
2918+
}
2919+
2920+
if (consumed)
2921+
*consumed += done;
2922+
2923+
#ifdef NEED_RETRY
2924+
if (size > INT_MAX) {
2925+
s += done;
2926+
size -= done;
2927+
goto retry;
28462928
}
2929+
#endif
28472930

28482931
return (PyObject *)v;
28492932
}
28502933

2851-
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2934+
PyObject *PyUnicode_DecodeMBCS(const char *s,
28522935
Py_ssize_t size,
28532936
const char *errors)
28542937
{
2855-
PyObject *repr;
2856-
char *s;
2857-
DWORD mbcssize;
2938+
return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2939+
}
28582940

2859-
/* If there are no characters, bail now! */
2860-
if (size==0)
2861-
return PyString_FromString("");
2941+
/*
2942+
* Convert unicode into string object (MBCS).
2943+
* Returns 0 on success, -1 otherwise.
2944+
*/
2945+
static int encode_mbcs(PyObject **repr,
2946+
const Py_UNICODE *p, /* unicode */
2947+
int size) /* size of unicode */
2948+
{
2949+
int mbcssize = 0;
2950+
Py_ssize_t n = 0;
2951+
2952+
assert(size >= 0);
28622953

28632954
/* First get the size of the result */
2864-
assert(size<INT_MAX);
2865-
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
2866-
if (mbcssize==0)
2867-
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2955+
if (size > 0) {
2956+
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2957+
if (mbcssize == 0) {
2958+
PyErr_SetFromWindowsErrWithFilename(0, NULL);
2959+
return -1;
2960+
}
2961+
}
28682962

2869-
repr = PyString_FromStringAndSize(NULL, mbcssize);
2870-
if (repr == NULL)
2871-
return NULL;
2872-
if (mbcssize == 0)
2873-
return repr;
2963+
if (*repr == NULL) {
2964+
/* Create string object */
2965+
*repr = PyString_FromStringAndSize(NULL, mbcssize);
2966+
if (*repr == NULL)
2967+
return -1;
2968+
}
2969+
else {
2970+
/* Extend string object */
2971+
n = PyString_Size(*repr);
2972+
if (_PyString_Resize(repr, n + mbcssize) < 0)
2973+
return -1;
2974+
}
28742975

28752976
/* Do the conversion */
2876-
s = PyString_AS_STRING(repr);
2877-
assert(size < INT_MAX);
2878-
if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
2879-
Py_DECREF(repr);
2880-
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2977+
if (size > 0) {
2978+
char *s = PyString_AS_STRING(*repr) + n;
2979+
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2980+
PyErr_SetFromWindowsErrWithFilename(0, NULL);
2981+
return -1;
2982+
}
28812983
}
2984+
2985+
return 0;
2986+
}
2987+
2988+
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2989+
Py_ssize_t size,
2990+
const char *errors)
2991+
{
2992+
PyObject *repr = NULL;
2993+
int ret;
2994+
2995+
#ifdef NEED_RETRY
2996+
retry:
2997+
if (size > INT_MAX)
2998+
ret = encode_mbcs(&repr, p, INT_MAX);
2999+
else
3000+
#endif
3001+
ret = encode_mbcs(&repr, p, (int)size);
3002+
3003+
if (ret < 0) {
3004+
Py_XDECREF(repr);
3005+
return NULL;
3006+
}
3007+
3008+
#ifdef NEED_RETRY
3009+
if (size > INT_MAX) {
3010+
p += INT_MAX;
3011+
size -= INT_MAX;
3012+
goto retry;
3013+
}
3014+
#endif
3015+
28823016
return repr;
28833017
}
28843018

@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
28933027
NULL);
28943028
}
28953029

3030+
#undef NEED_RETRY
3031+
28963032
#endif /* MS_WINDOWS */
28973033

28983034
/* --- Character Mapping Codec -------------------------------------------- */

0 commit comments

Comments
 (0)