From 120b75cc1b50bd8d25c827d665dac16868b6aec9 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Fri, 19 Sep 2025 13:17:16 +0200
Subject: [PATCH 1/4] gh-139156: Use PyBytesWriter in UTF-32 encoder

Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the
PyBytesWriter API.
---
 Objects/unicodeobject.c | 80 +++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 44 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c8d2c68615e13e..7b47814fc3eb0c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6089,45 +6089,33 @@ _PyUnicode_EncodeUTF32(PyObject *str,
                        const char *errors,
                        int byteorder)
 {
-    int kind;
-    const void *data;
-    Py_ssize_t len;
-    PyObject *v;
-    uint32_t *out;
-#if PY_LITTLE_ENDIAN
-    int native_ordering = byteorder <= 0;
-#else
-    int native_ordering = byteorder >= 0;
-#endif
-    const char *encoding;
-    Py_ssize_t nsize, pos;
-    PyObject *errorHandler = NULL;
-    PyObject *exc = NULL;
-    PyObject *rep = NULL;
-
     if (!PyUnicode_Check(str)) {
         PyErr_BadArgument();
         return NULL;
     }
-    kind = PyUnicode_KIND(str);
-    data = PyUnicode_DATA(str);
-    len = PyUnicode_GET_LENGTH(str);
+    int kind = PyUnicode_KIND(str);
+    const void *data = PyUnicode_DATA(str);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
 
     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
         return PyErr_NoMemory();
-    nsize = len + (byteorder == 0);
-    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
-    if (v == NULL)
+    Py_ssize_t nsize = len + (byteorder == 0);
+    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
+    if (writer == NULL) {
         return NULL;
+    }
 
     /* output buffer is 4-bytes aligned */
-    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
-    out = (uint32_t *)PyBytes_AS_STRING(v);
-    if (byteorder == 0)
+    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
+    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
+    if (byteorder == 0) {
         *out++ = 0xFEFF;
-    if (len == 0)
-        goto done;
+    }
+    if (len == 0) {
+        return PyBytesWriter_Finish(writer);
+    }
 
+    const char *encoding;
     if (byteorder == -1)
         encoding = "utf-32-le";
     else if (byteorder == 1)
@@ -6135,13 +6123,21 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     else
         encoding = "utf-32";
 
+#if PY_LITTLE_ENDIAN
+    int native_ordering = byteorder <= 0;
+#else
+    int native_ordering = byteorder >= 0;
+#endif
     if (kind == PyUnicode_1BYTE_KIND) {
         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
-        goto done;
+        return PyBytesWriter_Finish(writer);
     }
 
-    pos = 0;
-    while (pos < len) {
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *rep = NULL;
+
+    for (Py_ssize_t pos = 0; pos < len; ) {
         Py_ssize_t newpos, repsize, moreunits;
 
         if (kind == PyUnicode_2BYTE_KIND) {
@@ -6188,21 +6184,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 
         /* four bytes are reserved for each surrogate */
         if (moreunits > 0) {
-            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
-            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
-                /* integer overflow */
-                PyErr_NoMemory();
+            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
+            if (out == NULL) {
                 goto error;
             }
-            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
-                goto error;
-            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
         }
 
         if (PyBytes_Check(rep)) {
             memcpy(out, PyBytes_AS_STRING(rep), repsize);
             out += repsize / 4;
-        } else /* rep is unicode */ {
+        }
+        else {
+            /* rep is unicode */
             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                  &out, native_ordering);
@@ -6214,18 +6207,17 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     /* Cut back to size actually needed. This is necessary for, for example,
        encoding of a string containing isolated surrogates and the 'ignore'
        handler is used. */
-    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
-    if (nsize != PyBytes_GET_SIZE(v))
-      _PyBytes_Resize(&v, nsize);
+    nsize = (unsigned char*) out - (unsigned char*) PyBytesWriter_GetData(writer);
+    PyObject *bytes = PyBytesWriter_FinishWithSize(writer, nsize);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
-  done:
-    return v;
+    return bytes;
+
   error:
     Py_XDECREF(rep);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
-    Py_XDECREF(v);
+    PyBytesWriter_Discard(writer);
     return NULL;
 }
 

From 9b5deeb77110ffc00f490576a3888259bbcab131 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 22 Sep 2025 08:05:42 +0200
Subject: [PATCH 2/4] Add UCS1 fast path

---
 Objects/unicodeobject.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7b47814fc3eb0c..0a391b3a5f0c9c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6100,6 +6100,33 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
         return PyErr_NoMemory();
     Py_ssize_t nsize = len + (byteorder == 0);
+
+#if PY_LITTLE_ENDIAN
+    int native_ordering = byteorder <= 0;
+#else
+    int native_ordering = byteorder >= 0;
+#endif
+    if (kind == PyUnicode_1BYTE_KIND) {
+        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
+        // on short strings
+        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
+        if (v == NULL) {
+            return NULL;
+        }
+
+        /* output buffer is 4-bytes aligned */
+        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
+        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
+        if (byteorder == 0) {
+            *out++ = 0xFEFF;
+        }
+        if (len > 0) {
+            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
+                                 &out, native_ordering);
+        }
+        return v;
+    }
+
     PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
     if (writer == NULL) {
         return NULL;
@@ -6123,16 +6150,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     else
         encoding = "utf-32";
 
-#if PY_LITTLE_ENDIAN
-    int native_ordering = byteorder <= 0;
-#else
-    int native_ordering = byteorder >= 0;
-#endif
-    if (kind == PyUnicode_1BYTE_KIND) {
-        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
-        return PyBytesWriter_Finish(writer);
-    }
-
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *rep = NULL;

From 4a4b9ff9bf390177086e7cf5445a6da56527dc9a Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 22 Sep 2025 14:28:10 +0200
Subject: [PATCH 3/4] Use PyBytesWriter_FinishWithPointer()

---
 Objects/unicodeobject.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0a391b3a5f0c9c..5d8ee80332dd53 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6221,14 +6221,13 @@ _PyUnicode_EncodeUTF32(PyObject *str,
         Py_CLEAR(rep);
     }
 
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+
     /* Cut back to size actually needed. This is necessary for, for example,
        encoding of a string containing isolated surrogates and the 'ignore'
        handler is used. */
-    nsize = (unsigned char*) out - (unsigned char*) PyBytesWriter_GetData(writer);
-    PyObject *bytes = PyBytesWriter_FinishWithSize(writer, nsize);
-    Py_XDECREF(errorHandler);
-    Py_XDECREF(exc);
-    return bytes;
+    return PyBytesWriter_FinishWithPointer(writer, out);
 
   error:
     Py_XDECREF(rep);

From 1c54b6a872010e2c3a59d3ece3df926397a6aac2 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 22 Sep 2025 14:28:59 +0200
Subject: [PATCH 4/4] Cleanup

---
 Objects/unicodeobject.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5d8ee80332dd53..934faf236cf3c1 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6106,6 +6106,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 #else
     int native_ordering = byteorder >= 0;
 #endif
+
     if (kind == PyUnicode_1BYTE_KIND) {
         // gh-139156: Don't use PyBytesWriter API here since it has an overhead
         // on short strings
@@ -6155,8 +6156,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
     PyObject *rep = NULL;
 
     for (Py_ssize_t pos = 0; pos < len; ) {
-        Py_ssize_t newpos, repsize, moreunits;
-
         if (kind == PyUnicode_2BYTE_KIND) {
             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                         &out, native_ordering);
@@ -6169,6 +6168,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
         if (pos == len)
             break;
 
+        Py_ssize_t newpos;
         rep = unicode_encode_call_errorhandler(
                 errors, &errorHandler,
                 encoding, "surrogates not allowed",
@@ -6176,6 +6176,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
         if (!rep)
             goto error;
 
+        Py_ssize_t repsize, moreunits;
         if (PyBytes_Check(rep)) {
             repsize = PyBytes_GET_SIZE(rep);
             if (repsize & 3) {