Check consistency of newly created strings using _PyUnicode_CheckConsistency(str, 1)

vstinner · vstinner · commit 8f825060f1c1 · 2012-04-27T13:55:39.000+02:00
* In debug mode, fill the string data with invalid characters
 * Also simplify reference counting in PyCodec_BackslashReplaceErrors()
   and PyCodec_XMLCharRefReplaceErrors()
diff --git a/Modules/_json.c b/Modules/_json.c
@@ -246,6 +246,7 @@ ascii_escape_unicode(PyObject *pystr)
         }
     }
     output[chars++] = '"';
+    assert(_PyUnicode_CheckConsistency(rval, 1));
     return rval;
 }
 
diff --git a/Modules/md5module.c b/Modules/md5module.c
@@ -397,6 +397,7 @@ MD5_hexdigest(MD5object *self, PyObject *unused)
         c = (digest[i] & 0xf);
         hex_digest[j++] = Py_hexdigits[c];
     }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
     return retval;
 }
 
diff --git a/Modules/sha1module.c b/Modules/sha1module.c
@@ -373,6 +373,7 @@ SHA1_hexdigest(SHA1object *self, PyObject *unused)
         c = (digest[i] & 0xf);
         hex_digest[j++] = Py_hexdigits[c];
     }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
     return retval;
 }
 
diff --git a/Modules/sha256module.c b/Modules/sha256module.c
@@ -466,6 +466,7 @@ SHA256_hexdigest(SHAobject *self, PyObject *unused)
         c = (digest[i] & 0xf);
         hex_digest[j++] = Py_hexdigits[c];
     }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
     return retval;
 }
 
diff --git a/Modules/sha512module.c b/Modules/sha512module.c
@@ -532,6 +532,7 @@ SHA512_hexdigest(SHAobject *self, PyObject *unused)
         c = (digest[i] & 0xf);
         hex_digest[j++] = Py_hexdigits[c];
     }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
     return retval;
 }
 
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
@@ -626,6 +626,7 @@ PyBytes_Repr(PyObject *obj, int smartquotes)
             *p++ = c;
     }
     *p++ = quote;
+    assert(_PyUnicode_CheckConsistency(v, 1));
     return v;
 }
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -967,7 +967,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     PyObject *obj;
     PyCompactUnicodeObject *unicode;
     void *data;
-    int kind_state;
+    enum PyUnicode_Kind kind;
     int is_sharing, is_ascii;
     Py_ssize_t char_size;
     Py_ssize_t struct_size;
@@ -986,17 +986,17 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     is_sharing = 0;
     struct_size = sizeof(PyCompactUnicodeObject);
     if (maxchar < 128) {
-        kind_state = PyUnicode_1BYTE_KIND;
+        kind = PyUnicode_1BYTE_KIND;
         char_size = 1;
         is_ascii = 1;
         struct_size = sizeof(PyASCIIObject);
     }
     else if (maxchar < 256) {
-        kind_state = PyUnicode_1BYTE_KIND;
+        kind = PyUnicode_1BYTE_KIND;
         char_size = 1;
     }
     else if (maxchar < 65536) {
-        kind_state = PyUnicode_2BYTE_KIND;
+        kind = PyUnicode_2BYTE_KIND;
         char_size = 2;
         if (sizeof(wchar_t) == 2)
             is_sharing = 1;
@@ -1007,7 +1007,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
                             "invalid maximum character passed to PyUnicode_New");
             return NULL;
         }
-        kind_state = PyUnicode_4BYTE_KIND;
+        kind = PyUnicode_4BYTE_KIND;
         char_size = 4;
         if (sizeof(wchar_t) == 4)
             is_sharing = 1;
@@ -1041,27 +1041,27 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     _PyUnicode_LENGTH(unicode) = size;
     _PyUnicode_HASH(unicode) = -1;
     _PyUnicode_STATE(unicode).interned = 0;
-    _PyUnicode_STATE(unicode).kind = kind_state;
+    _PyUnicode_STATE(unicode).kind = kind;
     _PyUnicode_STATE(unicode).compact = 1;
     _PyUnicode_STATE(unicode).ready = 1;
     _PyUnicode_STATE(unicode).ascii = is_ascii;
     if (is_ascii) {
         ((char*)data)[size] = 0;
         _PyUnicode_WSTR(unicode) = NULL;
     }
-    else if (kind_state == PyUnicode_1BYTE_KIND) {
+    else if (kind == PyUnicode_1BYTE_KIND) {
         ((char*)data)[size] = 0;
         _PyUnicode_WSTR(unicode) = NULL;
         _PyUnicode_WSTR_LENGTH(unicode) = 0;
         unicode->utf8 = NULL;
         unicode->utf8_length = 0;
-        }
+    }
     else {
         unicode->utf8 = NULL;
         unicode->utf8_length = 0;
-        if (kind_state == PyUnicode_2BYTE_KIND)
+        if (kind == PyUnicode_2BYTE_KIND)
             ((Py_UCS2*)data)[size] = 0;
-        else /* kind_state == PyUnicode_4BYTE_KIND */
+        else /* kind == PyUnicode_4BYTE_KIND */
             ((Py_UCS4*)data)[size] = 0;
         if (is_sharing) {
             _PyUnicode_WSTR_LENGTH(unicode) = size;
@@ -1072,6 +1072,13 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
             _PyUnicode_WSTR(unicode) = NULL;
         }
     }
+#ifdef Py_DEBUG
+    /* Fill the data with invalid characters to detect bugs earlier.
+       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
+       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
+       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
+    memset(data, 0xff, size * kind);
+#endif
     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
     return obj;
 }
diff --git a/Python/codecs.c b/Python/codecs.c
@@ -534,6 +534,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
         data = PyUnicode_DATA(res);
         for (i = 0; i < len; ++i)
             PyUnicode_WRITE(kind, data, i, '?');
+        assert(_PyUnicode_CheckConsistency(res, 1));
         return Py_BuildValue("(Nn)", res, end);
     }
     else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
@@ -559,6 +560,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
         data = PyUnicode_DATA(res);
         for (i=0; i < len; i++)
             PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
+        assert(_PyUnicode_CheckConsistency(res, 1));
         return Py_BuildValue("(Nn)", res, end);
     }
     else {
@@ -652,8 +654,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
             }
             *outp++ = ';';
         }
-        restuple = Py_BuildValue("(On)", res, end);
-        Py_DECREF(res);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        restuple = Py_BuildValue("(Nn)", res, end);
         Py_DECREF(object);
         return restuple;
     }
@@ -720,8 +722,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
             *outp++ = Py_hexdigits[c&0xf];
         }
 
-        restuple = Py_BuildValue("(On)", res, end);
-        Py_DECREF(res);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        restuple = Py_BuildValue("(Nn)", res, end);
         Py_DECREF(object);
         return restuple;
     }
diff --git a/Python/compile.c b/Python/compile.c
@@ -263,6 +263,7 @@ _Py_Mangle(PyObject *privateobj, PyObject *ident)
         Py_DECREF(result);
         return NULL;
     }
+    assert(_PyUnicode_CheckConsistency(result, 1));
     return result;
 }
 
diff --git a/Python/import.c b/Python/import.c
@@ -992,6 +992,7 @@ make_source_pathname(PyObject *path)
                              (j = dot0-right));
     PyUnicode_WRITE(kind, data, i+j,   'p');
     PyUnicode_WRITE(kind, data, i+j+1, 'y');
+    assert(_PyUnicode_CheckConsistency(result, 1));
     return result;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -246,6 +246,7 @@ ascii_escape_unicode(PyObject *pystr)`
`246`	`246`	`}`
`247`	`247`	`}`
`248`	`248`	`output[chars++] = '"';`
	`249`	`+ assert(_PyUnicode_CheckConsistency(rval, 1));`
`249`	`250`	`return rval;`
`250`	`251`	`}`
`251`	`252`
Original file line number	Diff line number	Diff line change
`@@ -397,6 +397,7 @@ MD5_hexdigest(MD5object *self, PyObject *unused)`
`397`	`397`	`c = (digest[i] & 0xf);`
`398`	`398`	`hex_digest[j++] = Py_hexdigits[c];`
`399`	`399`	`}`
	`400`	`+ assert(_PyUnicode_CheckConsistency(retval, 1));`
`400`	`401`	`return retval;`
`401`	`402`	`}`
`402`	`403`
Original file line number	Diff line number	Diff line change
`@@ -373,6 +373,7 @@ SHA1_hexdigest(SHA1object *self, PyObject *unused)`
`373`	`373`	`c = (digest[i] & 0xf);`
`374`	`374`	`hex_digest[j++] = Py_hexdigits[c];`
`375`	`375`	`}`
	`376`	`+ assert(_PyUnicode_CheckConsistency(retval, 1));`
`376`	`377`	`return retval;`
`377`	`378`	`}`
`378`	`379`
Original file line number	Diff line number	Diff line change
`@@ -466,6 +466,7 @@ SHA256_hexdigest(SHAobject *self, PyObject *unused)`
`466`	`466`	`c = (digest[i] & 0xf);`
`467`	`467`	`hex_digest[j++] = Py_hexdigits[c];`
`468`	`468`	`}`
	`469`	`+ assert(_PyUnicode_CheckConsistency(retval, 1));`
`469`	`470`	`return retval;`
`470`	`471`	`}`
`471`	`472`
Original file line number	Diff line number	Diff line change
`@@ -532,6 +532,7 @@ SHA512_hexdigest(SHAobject *self, PyObject *unused)`
`532`	`532`	`c = (digest[i] & 0xf);`
`533`	`533`	`hex_digest[j++] = Py_hexdigits[c];`
`534`	`534`	`}`
	`535`	`+ assert(_PyUnicode_CheckConsistency(retval, 1));`
`535`	`536`	`return retval;`
`536`	`537`	`}`
`537`	`538`
Original file line number	Diff line number	Diff line change
`@@ -626,6 +626,7 @@ PyBytes_Repr(PyObject *obj, int smartquotes)`
`626`	`626`	`*p++ = c;`
`627`	`627`	`}`
`628`	`628`	`*p++ = quote;`
	`629`	`+ assert(_PyUnicode_CheckConsistency(v, 1));`
`629`	`630`	`return v;`
`630`	`631`	`}`
`631`	`632`
Original file line number	Diff line number	Diff line change
`@@ -534,6 +534,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)`
`534`	`534`	`data = PyUnicode_DATA(res);`
`535`	`535`	`for (i = 0; i < len; ++i)`
`536`	`536`	`PyUnicode_WRITE(kind, data, i, '?');`
	`537`	`+ assert(_PyUnicode_CheckConsistency(res, 1));`
`537`	`538`	`return Py_BuildValue("(Nn)", res, end);`
`538`	`539`	`}`
`539`	`540`	`else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {`
`@@ -559,6 +560,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)`
`559`	`560`	`data = PyUnicode_DATA(res);`
`560`	`561`	`for (i=0; i < len; i++)`
`561`	`562`	`PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);`
	`563`	`+ assert(_PyUnicode_CheckConsistency(res, 1));`
`562`	`564`	`return Py_BuildValue("(Nn)", res, end);`
`563`	`565`	`}`
`564`	`566`	`else {`
`@@ -652,8 +654,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)`
`652`	`654`	`}`
`653`	`655`	`*outp++ = ';';`
`654`	`656`	`}`
`655`		`- restuple = Py_BuildValue("(On)", res, end);`
`656`		`- Py_DECREF(res);`
	`657`	`+ assert(_PyUnicode_CheckConsistency(res, 1));`
	`658`	`+ restuple = Py_BuildValue("(Nn)", res, end);`
`657`	`659`	`Py_DECREF(object);`
`658`	`660`	`return restuple;`
`659`	`661`	`}`
`@@ -720,8 +722,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)`
`720`	`722`	`*outp++ = Py_hexdigits[c&0xf];`
`721`	`723`	`}`
`722`	`724`
`723`		`- restuple = Py_BuildValue("(On)", res, end);`
`724`		`- Py_DECREF(res);`
	`725`	`+ assert(_PyUnicode_CheckConsistency(res, 1));`
	`726`	`+ restuple = Py_BuildValue("(Nn)", res, end);`
`725`	`727`	`Py_DECREF(object);`
`726`	`728`	`return restuple;`
`727`	`729`	`}`
Original file line number	Diff line number	Diff line change
`@@ -263,6 +263,7 @@ _Py_Mangle(PyObject *privateobj, PyObject *ident)`
`263`	`263`	`Py_DECREF(result);`
`264`	`264`	`return NULL;`
`265`	`265`	`}`
	`266`	`+ assert(_PyUnicode_CheckConsistency(result, 1));`
`266`	`267`	`return result;`
`267`	`268`	`}`
`268`	`269`
Original file line number	Diff line number	Diff line change
`@@ -992,6 +992,7 @@ make_source_pathname(PyObject *path)`
`992`	`992`	`(j = dot0-right));`
`993`	`993`	`PyUnicode_WRITE(kind, data, i+j, 'p');`
`994`	`994`	`PyUnicode_WRITE(kind, data, i+j+1, 'y');`
	`995`	`+ assert(_PyUnicode_CheckConsistency(result, 1));`
`995`	`996`	`return result;`
`996`	`997`	`}`
`997`	`998`