Patch #1455898: Incremental mode for "mbcs" codec.

martin.v.loewis · martin.v.loewis · commit febcdd3a084b · 2006-06-14T05:21:04.000Z
git-svn-id: http://svn.python.org/projects/python/trunk@46945 6015fed2-1504-0410-9fe1-9d1591cc4771
diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex
@@ -1431,6 +1431,18 @@ \subsubsection{Built-in Codecs \label{builtinCodecs}}
   raised by the codec.
 \end{cfuncdesc}
 
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
+                                               Py_ssize_t size,
+                                               const char *errors,
+                                               Py_ssize_t *consumed}
+  If \var{consumed} is \NULL{}, behave like
+  \cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
+  \cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing
+  lead byte, and the number of bytes that have been decoded will be
+  stored in \var{consumed}.
+  \versionadded{2.5}
+\end{cfuncdesc}
+
 \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
                                                Py_ssize_t size,
                                                const char *errors}
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
     const char *errors          /* error handling */
     );
 
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
+    const char *string,         /* MBCS encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
     PyObject *unicode           /* Unicode object */
     );
diff --git a/Lib/encodings/mbcs.py b/Lib/encodings/mbcs.py
@@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
     def encode(self, input, final=False):
         return codecs.mbcs_encode(input,self.errors)[0]
 
-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.mbcs_decode(input,self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.mbcs_decode(input,self.errors,final)
+
 class StreamWriter(Codec,codecs.StreamWriter):
     pass
 
diff --git a/Misc/NEWS b/Misc/NEWS
@@ -156,6 +156,9 @@ Extension Modules
 Library
 -------
 
+- Patch #1455898: The MBCS codec now supports the incremental mode for
+  double-byte encodings.
+
 - ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
   guarantee that adjacent triples in the return list always describe
   non-adjacent blocks.  Previously, a pair of matching blocks could end
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
@@ -479,15 +479,20 @@ mbcs_decode(PyObject *self,
 	    PyObject *args)
 {
     const char *data;
-    Py_ssize_t size;
+    Py_ssize_t size, consumed;
     const char *errors = NULL;
+    int final = 1;
+    PyObject *decoded;
 
-    if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
-			  &data, &size, &errors))
+    if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
+			  &data, &size, &errors, &final))
 	return NULL;
 
-    return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
-		       size);
+    decoded = PyUnicode_DecodeMBCSStateful(
+	data, size, errors, final ? NULL : &consumed);
+    if (!decoded)
+	return NULL;
+    return codec_tuple(decoded, final ? size : consumed);
 }
 
 #endif /* MS_WINDOWS */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
 
 /* --- MBCS codecs for Windows -------------------------------------------- */
 
-PyObject *PyUnicode_DecodeMBCS(const char *s,
-				Py_ssize_t size,
-				const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+   a) it assumes an incomplete character consists of a single byte, and
+   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+      encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+    const char *curr = s + offset;
+
+    if (IsDBCSLeadByte(*curr)) {
+	const char *prev = CharPrev(s, curr);
+	return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+    }
+    return 0;
+}
+
+/*
+ * Decode an MBCS string into a unicode object. If 'final' is set, also
+ * convert a trailing lead byte. Returns the consumed size, or -1 on error.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+			const char *s, /* MBCS string */
+			int size, /* sizeof MBCS string */
+			int final)
 {
-    PyUnicodeObject *v;
     Py_UNICODE *p;
-    DWORD usize;
+    Py_ssize_t n = 0;
+    int usize = 0;
+
+    assert(size >= 0);
+
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+	--size;
 
     /* First get the size of the result */
-    assert(size < INT_MAX);
-    usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
-    if (size > 0 && usize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+	if (usize == 0) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
 
-    v = _PyUnicode_New(usize);
-    if (v == NULL)
-        return NULL;
-    if (usize == 0)
-	return (PyObject *)v;
-    p = PyUnicode_AS_UNICODE(v);
-    if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
-        Py_DECREF(v);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (*v == NULL) {
+	/* Create unicode object */
+	*v = _PyUnicode_New(usize);
+	if (*v == NULL)
+	    return -1;
+    }
+    else {
+	/* Extend unicode object */
+	n = PyUnicode_GET_SIZE(*v);
+	if (_PyUnicode_Resize(v, n + usize) < 0)
+	    return -1;
+    }
+
+    /* Do the conversion */
+    if (size > 0) {
+	p = PyUnicode_AS_UNICODE(*v) + n;
+	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
+
+    return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+					Py_ssize_t size,
+					const char *errors,
+					Py_ssize_t *consumed)
+{
+    PyUnicodeObject *v = NULL;
+    int done;
+
+    if (consumed)
+	*consumed = 0;
+
+#ifdef NEED_RETRY
+  retry:
+    if (size > INT_MAX)
+	done = decode_mbcs(&v, s, INT_MAX, 0);
+    else
+#endif
+	done = decode_mbcs(&v, s, (int)size, !consumed);
+
+    if (done < 0) {
+        Py_XDECREF(v);
+	return NULL;
+    }
+
+    if (consumed)
+	*consumed += done;
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+	s += done;
+	size -= done;
+	goto retry;
     }
+#endif
 
     return (PyObject *)v;
 }
 
-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
 				Py_ssize_t size,
 				const char *errors)
 {
-    PyObject *repr;
-    char *s;
-    DWORD mbcssize;
+    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
 
-    /* If there are no characters, bail now! */
-    if (size==0)
-	    return PyString_FromString("");
+/*
+ * Convert unicode into a string object (MBCS).
+ * Returns 0 on success, -1 otherwise.
+ */
+static int encode_mbcs(PyObject **repr,
+			const Py_UNICODE *p, /* unicode */
+			int size) /* size of unicode */
+{
+    int mbcssize = 0;
+    Py_ssize_t n = 0;
+
+    assert(size >= 0);
 
     /* First get the size of the result */
-    assert(size<INT_MAX);
-    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
-    if (mbcssize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+	if (mbcssize == 0) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
 
-    repr = PyString_FromStringAndSize(NULL, mbcssize);
-    if (repr == NULL)
-        return NULL;
-    if (mbcssize == 0)
-        return repr;
+    if (*repr == NULL) {
+	/* Create string object */
+	*repr = PyString_FromStringAndSize(NULL, mbcssize);
+	if (*repr == NULL)
+	    return -1;
+    }
+    else {
+	/* Extend string object */
+	n = PyString_Size(*repr);
+	if (_PyString_Resize(repr, n + mbcssize) < 0)
+	    return -1;
+    }
 
     /* Do the conversion */
-    s = PyString_AS_STRING(repr);
-    assert(size < INT_MAX);
-    if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
-        Py_DECREF(repr);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	char *s = PyString_AS_STRING(*repr) + n;
+	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
     }
+
+    return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+				Py_ssize_t size,
+				const char *errors)
+{
+    PyObject *repr = NULL;
+    int ret;
+
+#ifdef NEED_RETRY
+ retry:
+    if (size > INT_MAX)
+	ret = encode_mbcs(&repr, p, INT_MAX);
+    else
+#endif
+	ret = encode_mbcs(&repr, p, (int)size);
+
+    if (ret < 0) {
+	Py_XDECREF(repr);
+	return NULL;
+    }
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+	p += INT_MAX;
+	size -= INT_MAX;
+	goto retry;
+    }
+#endif
+
     return repr;
 }
 
@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
 				NULL);
 }
 
+#undef NEED_RETRY
+
 #endif /* MS_WINDOWS */
 
 /* --- Character Mapping Codec -------------------------------------------- */