@@ -1967,6 +1967,216 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
19671967 return Py_BuildValue ("(Nn)" , result , utf8_len );
19681968}
19691969
1970+ static PyObject *
1971+ unicode_getutf8buffer (PyObject * self , PyObject * args )
1972+ {
1973+ PyObject * unicode ;
1974+ const char * errors = NULL ;
1975+ if (!PyArg_ParseTuple (args , "O|s" , & unicode , & errors )) {
1976+ return NULL ;
1977+ }
1978+
1979+ Py_buffer buffer ;
1980+ if (_PyUnicode_GetUTF8Buffer (unicode , errors , & buffer ) < 0 ) {
1981+ return NULL ;
1982+ }
1983+
1984+ assert (buffer .obj != NULL );
1985+ assert (buffer .obj == unicode || PyBytes_CheckExact (buffer .obj ));
1986+
1987+ PyObject * result = PyBytes_FromStringAndSize (buffer .buf , buffer .len );
1988+ PyBuffer_Release (& buffer );
1989+ return result ;
1990+ }
1991+
1992+ static PyObject *
1993+ unicode_test_getutf8buffer (PyObject * self , PyObject * Py_UNUSED (ignored ))
1994+ {
1995+ Py_buffer buf ;
1996+
1997+ // Test 1: ASCII string
1998+ PyObject * str = PyUnicode_FromString ("hello" );
1999+ if (str == NULL ) {
2000+ return NULL ;
2001+ }
2002+ Py_ssize_t refcnt = Py_REFCNT (str );
2003+
2004+ // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
2005+ int ret = _PyUnicode_GetUTF8Buffer (str , NULL , & buf );
2006+ assert (ret == 0 );
2007+
2008+ if (buf .obj != str ) {
2009+ PyErr_Format (TestError ,
2010+ "buf.obj must be equal to str. (%s:%d)" ,
2011+ __FILE__ , __LINE__ );
2012+ PyBuffer_Release (& buf );
2013+ Py_DECREF (str );
2014+ return NULL ;
2015+ }
2016+
2017+ if (buf .len != PyUnicode_GET_LENGTH (str )) {
2018+ PyErr_Format (TestError ,
2019+ "buf.len must be equal to len(str). (%s:%d)" ,
2020+ __FILE__ , __LINE__ );
2021+ PyBuffer_Release (& buf );
2022+ Py_DECREF (str );
2023+ return NULL ;
2024+ }
2025+ assert (((const char * )buf .buf )[5 ] == '\0' );
2026+
2027+ if ((Py_UCS1 * )buf .buf != PyUnicode_1BYTE_DATA (str )) {
2028+ PyErr_Format (TestError ,
2029+ "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)" ,
2030+ __FILE__ , __LINE__ );
2031+ PyBuffer_Release (& buf );
2032+ Py_DECREF (str );
2033+ return NULL ;
2034+ }
2035+
2036+ if (refcnt + 1 != Py_REFCNT (str )) {
2037+ PyErr_Format (TestError ,
2038+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2039+ refcnt + 1 , Py_REFCNT (str ),
2040+ __FILE__ , __LINE__ );
2041+ PyBuffer_Release (& buf );
2042+ Py_DECREF (str );
2043+ return NULL ;
2044+ }
2045+
2046+ PyBuffer_Release (& buf );
2047+
2048+ if (refcnt != Py_REFCNT (str )) {
2049+ PyErr_Format (TestError ,
2050+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2051+ refcnt , Py_REFCNT (str ),
2052+ __FILE__ , __LINE__ );
2053+ Py_DECREF (str );
2054+ return NULL ;
2055+ }
2056+
2057+ Py_DECREF (str );
2058+
2059+ // Test 2: non-ASCII string
2060+
2061+ // "hello" in Japanese. len(str)==5, len(str.encode()) == 15.
2062+ str = PyUnicode_FromString ("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf" );
2063+ if (str == NULL ) {
2064+ return NULL ;
2065+ }
2066+ refcnt = Py_REFCNT (str );
2067+ assert (PyUnicode_GET_LENGTH (str ) == 5 );
2068+
2069+ if (_PyUnicode_GetUTF8Buffer (str , NULL , & buf ) < 0 ) {
2070+ Py_DECREF (str );
2071+ if (!PyErr_Occurred ()) {
2072+ PyErr_Format (TestError ,
2073+ "_PyUnicode_GetUTF8Buffer() returned nonzero "
2074+ "without exception set. (%s:%d)" ,
2075+ __FILE__ , __LINE__ );
2076+ }
2077+ return NULL ;
2078+ }
2079+
2080+ if (!PyBytes_CheckExact (buf .obj )) {
2081+ PyErr_Format (TestError ,
2082+ "buf.obj must be a bytes object, got %R (%s:%d)" ,
2083+ buf .obj , __FILE__ , __LINE__ );
2084+ PyBuffer_Release (& buf );
2085+ Py_DECREF (str );
2086+ return NULL ;
2087+ }
2088+
2089+ if (buf .len != 15 ) {
2090+ PyErr_Format (TestError ,
2091+ "Expected buf.len == 15, actual %zd (%s:%d)" ,
2092+ buf .len , __FILE__ , __LINE__ );
2093+ PyBuffer_Release (& buf );
2094+ Py_DECREF (str );
2095+ return NULL ;
2096+ }
2097+ assert (((const char * )buf .buf )[15 ] == '\0' );
2098+
2099+ if (refcnt != Py_REFCNT (str )) {
2100+ PyErr_Format (TestError ,
2101+ "Py_REFCNT(str) must not be changed. (%s:%d)" ,
2102+ __FILE__ , __LINE__ );
2103+ // Do not DECREF here because refcnt is broken.
2104+ return NULL ;
2105+ }
2106+
2107+ PyBuffer_Release (& buf );
2108+
2109+ // Test 3: There is a UTF-8 cache
2110+ // Reuse str of the previoss test.
2111+
2112+ const char * cache = PyUnicode_AsUTF8 (str );
2113+ if (cache == NULL ) {
2114+ return NULL ;
2115+ }
2116+
2117+ if (_PyUnicode_GetUTF8Buffer (str , NULL , & buf ) < 0 ) {
2118+ Py_DECREF (str );
2119+ if (!PyErr_Occurred ()) {
2120+ PyErr_Format (TestError ,
2121+ "_PyUnicode_GetUTF8Buffer() returned nonzero "
2122+ "without exception set. (%s:%d)" ,
2123+ __FILE__ , __LINE__ );
2124+ }
2125+ return NULL ;
2126+ }
2127+
2128+ if (buf .obj != str ) {
2129+ PyErr_Format (TestError ,
2130+ "buf.obj must be equal to str. (%s:%d)" ,
2131+ __FILE__ , __LINE__ );
2132+ PyBuffer_Release (& buf );
2133+ Py_DECREF (str );
2134+ return NULL ;
2135+ }
2136+
2137+ if (buf .buf != cache ) {
2138+ PyErr_Format (TestError ,
2139+ "buf.buf must be equal to the UTF-8 cache (%s:%d)" ,
2140+ __FILE__ , __LINE__ );
2141+ PyBuffer_Release (& buf );
2142+ Py_DECREF (str );
2143+ return NULL ;
2144+ }
2145+
2146+ if (buf .len != 15 ) {
2147+ PyErr_Format (TestError ,
2148+ "Expected buf.len == 15, actual %zd (%s:%d)" ,
2149+ buf .len , __FILE__ , __LINE__ );
2150+ PyBuffer_Release (& buf );
2151+ Py_DECREF (str );
2152+ return NULL ;
2153+ }
2154+ assert (((const char * )buf .buf )[15 ] == '\0' );
2155+
2156+ if (refcnt + 1 != Py_REFCNT (str )) {
2157+ PyErr_Format (TestError ,
2158+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2159+ refcnt + 1 , Py_REFCNT (str ),
2160+ __FILE__ , __LINE__ );
2161+ // Do not DECREF here because refcnt is broken.
2162+ return NULL ;
2163+ }
2164+
2165+ PyBuffer_Release (& buf );
2166+
2167+ if (refcnt != Py_REFCNT (str )) {
2168+ PyErr_Format (TestError ,
2169+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2170+ refcnt , Py_REFCNT (str ),
2171+ __FILE__ , __LINE__ );
2172+ // Do not DECREF here because refcnt is broken.
2173+ return NULL ;
2174+ }
2175+
2176+ Py_DECREF (str );
2177+ Py_RETURN_NONE ;
2178+ }
2179+
19702180static PyObject *
19712181unicode_findchar (PyObject * self , PyObject * args )
19722182{
@@ -5392,6 +5602,8 @@ static PyMethodDef TestMethods[] = {
53925602 {"unicode_asucs4" , unicode_asucs4 , METH_VARARGS },
53935603 {"unicode_asutf8" , unicode_asutf8 , METH_VARARGS },
53945604 {"unicode_asutf8andsize" , unicode_asutf8andsize , METH_VARARGS },
5605+ {"unicode_getutf8buffer" , unicode_getutf8buffer , METH_VARARGS },
5606+ {"unicode_test_getutf8buffer" , unicode_test_getutf8buffer , METH_NOARGS },
53955607 {"unicode_findchar" , unicode_findchar , METH_VARARGS },
53965608 {"unicode_copycharacters" , unicode_copycharacters , METH_VARARGS },
53975609 {"unicode_encodedecimal" , unicode_encodedecimal , METH_VARARGS },
0 commit comments