From 120b75cc1b50bd8d25c827d665dac16868b6aec9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 19 Sep 2025 13:17:16 +0200 Subject: [PATCH 1/4] gh-139156: Use PyBytesWriter in UTF-32 encoder Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API. --- Objects/unicodeobject.c | 80 +++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c8d2c68615e13e..7b47814fc3eb0c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6089,45 +6089,33 @@ _PyUnicode_EncodeUTF32(PyObject *str, const char *errors, int byteorder) { - int kind; - const void *data; - Py_ssize_t len; - PyObject *v; - uint32_t *out; -#if PY_LITTLE_ENDIAN - int native_ordering = byteorder <= 0; -#else - int native_ordering = byteorder >= 0; -#endif - const char *encoding; - Py_ssize_t nsize, pos; - PyObject *errorHandler = NULL; - PyObject *exc = NULL; - PyObject *rep = NULL; - if (!PyUnicode_Check(str)) { PyErr_BadArgument(); return NULL; } - kind = PyUnicode_KIND(str); - data = PyUnicode_DATA(str); - len = PyUnicode_GET_LENGTH(str); + int kind = PyUnicode_KIND(str); + const void *data = PyUnicode_DATA(str); + Py_ssize_t len = PyUnicode_GET_LENGTH(str); if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) return PyErr_NoMemory(); - nsize = len + (byteorder == 0); - v = PyBytes_FromStringAndSize(NULL, nsize * 4); - if (v == NULL) + Py_ssize_t nsize = len + (byteorder == 0); + PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4); + if (writer == NULL) { return NULL; + } /* output buffer is 4-bytes aligned */ - assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); - out = (uint32_t *)PyBytes_AS_STRING(v); - if (byteorder == 0) + assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4)); + uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer); + if (byteorder == 0) { *out++ = 0xFEFF; - if (len == 0) - goto done; + } + if (len == 0) { + return PyBytesWriter_Finish(writer); + } + const char *encoding; if (byteorder == -1) encoding = "utf-32-le"; else if (byteorder == 1) @@ -6135,13 +6123,21 @@ _PyUnicode_EncodeUTF32(PyObject *str, else encoding = "utf-32"; +#if PY_LITTLE_ENDIAN + int native_ordering = byteorder <= 0; +#else + int native_ordering = byteorder >= 0; +#endif if (kind == PyUnicode_1BYTE_KIND) { ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); - goto done; + return PyBytesWriter_Finish(writer); } - pos = 0; - while (pos < len) { + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; + + for (Py_ssize_t pos = 0; pos < len; ) { Py_ssize_t newpos, repsize, moreunits; if (kind == PyUnicode_2BYTE_KIND) { @@ -6188,21 +6184,18 @@ _PyUnicode_EncodeUTF32(PyObject *str, /* four bytes are reserved for each surrogate */ if (moreunits > 0) { - Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); - if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { - /* integer overflow */ - PyErr_NoMemory(); + out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out); + if (out == NULL) { goto error; } - if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) - goto error; - out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; } if (PyBytes_Check(rep)) { memcpy(out, PyBytes_AS_STRING(rep), repsize); out += repsize / 4; - } else /* rep is unicode */ { + } + else { + /* rep is unicode */ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, &out, native_ordering); @@ -6214,18 +6207,17 @@ _PyUnicode_EncodeUTF32(PyObject *str, /* Cut back to size actually needed. This is necessary for, for example, encoding of a string containing isolated surrogates and the 'ignore' handler is used. */ - nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); - if (nsize != PyBytes_GET_SIZE(v)) - _PyBytes_Resize(&v, nsize); + nsize = (unsigned char*) out - (unsigned char*) PyBytesWriter_GetData(writer); + PyObject *bytes = PyBytesWriter_FinishWithSize(writer, nsize); Py_XDECREF(errorHandler); Py_XDECREF(exc); - done: - return v; + return bytes; + error: Py_XDECREF(rep); Py_XDECREF(errorHandler); Py_XDECREF(exc); - Py_XDECREF(v); + PyBytesWriter_Discard(writer); return NULL; } From 9b5deeb77110ffc00f490576a3888259bbcab131 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 22 Sep 2025 08:05:42 +0200 Subject: [PATCH 2/4] Add UCS1 fast path --- Objects/unicodeobject.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7b47814fc3eb0c..0a391b3a5f0c9c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6100,6 +6100,33 @@ _PyUnicode_EncodeUTF32(PyObject *str, if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) return PyErr_NoMemory(); Py_ssize_t nsize = len + (byteorder == 0); + +#if PY_LITTLE_ENDIAN + int native_ordering = byteorder <= 0; +#else + int native_ordering = byteorder >= 0; +#endif + if (kind == PyUnicode_1BYTE_KIND) { + // gh-139156: Don't use PyBytesWriter API here since it has an overhead + // on short strings + PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4); + if (v == NULL) { + return NULL; + } + + /* output buffer is 4-bytes aligned */ + assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); + uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v); + if (byteorder == 0) { + *out++ = 0xFEFF; + } + if (len > 0) { + ucs1lib_utf32_encode((const Py_UCS1 *)data, len, + &out, native_ordering); + } + return v; + } + PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4); if (writer == NULL) { return NULL; @@ -6123,16 +6150,6 @@ _PyUnicode_EncodeUTF32(PyObject *str, else encoding = "utf-32"; -#if PY_LITTLE_ENDIAN - int native_ordering = byteorder <= 0; -#else - int native_ordering = byteorder >= 0; -#endif - if (kind == PyUnicode_1BYTE_KIND) { - ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); - return PyBytesWriter_Finish(writer); - } - PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *rep = NULL; From 4a4b9ff9bf390177086e7cf5445a6da56527dc9a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 22 Sep 2025 14:28:10 +0200 Subject: [PATCH 3/4] Use PyBytesWriter_FinishWithPointer() --- Objects/unicodeobject.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0a391b3a5f0c9c..5d8ee80332dd53 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6221,14 +6221,13 @@ _PyUnicode_EncodeUTF32(PyObject *str, Py_CLEAR(rep); } + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + /* Cut back to size actually needed. This is necessary for, for example, encoding of a string containing isolated surrogates and the 'ignore' handler is used. */ - nsize = (unsigned char*) out - (unsigned char*) PyBytesWriter_GetData(writer); - PyObject *bytes = PyBytesWriter_FinishWithSize(writer, nsize); - Py_XDECREF(errorHandler); - Py_XDECREF(exc); - return bytes; + return PyBytesWriter_FinishWithPointer(writer, out); error: Py_XDECREF(rep); From 1c54b6a872010e2c3a59d3ece3df926397a6aac2 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 22 Sep 2025 14:28:59 +0200 Subject: [PATCH 4/4] Cleanup --- Objects/unicodeobject.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5d8ee80332dd53..934faf236cf3c1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6106,6 +6106,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, #else int native_ordering = byteorder >= 0; #endif + if (kind == PyUnicode_1BYTE_KIND) { // gh-139156: Don't use PyBytesWriter API here since it has an overhead // on short strings @@ -6155,8 +6156,6 @@ _PyUnicode_EncodeUTF32(PyObject *str, PyObject *rep = NULL; for (Py_ssize_t pos = 0; pos < len; ) { - Py_ssize_t newpos, repsize, moreunits; - if (kind == PyUnicode_2BYTE_KIND) { pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, &out, native_ordering); @@ -6169,6 +6168,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, if (pos == len) break; + Py_ssize_t newpos; rep = unicode_encode_call_errorhandler( errors, &errorHandler, encoding, "surrogates not allowed", @@ -6176,6 +6176,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, if (!rep) goto error; + Py_ssize_t repsize, moreunits; if (PyBytes_Check(rep)) { repsize = PyBytes_GET_SIZE(rep); if (repsize & 3) {