From fde5c160cf0df584d060b360b53610b9b263dad0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 12 Mar 2026 12:17:43 +0200 Subject: [PATCH 1/4] gh-145850: Change some implementation details in struct.Struct * calling it with non-ASCII string format will now raise a ValueError instead of UnicodeEncodeError * calling it with non-ASCII bytes format will now raise a UnicodeDecodeError instead of struct.error * getting the format attribute of uninitialized object will now raise an AttributeError instead of RuntimeError. --- Lib/test/test_struct.py | 21 ++++-- ...-03-12-12-17-39.gh-issue-145850.uW3stt.rst | 6 ++ Modules/_struct.c | 68 +++++++------------ 3 files changed, 45 insertions(+), 50 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py index e3e02097b1f550..4d477e02ebb63f 100644 --- a/Lib/test/test_struct.py +++ b/Lib/test/test_struct.py @@ -605,7 +605,7 @@ def test_Struct_reinitialization(self): self.assertEqual(s.unpack(packed), (1, 2)) with self.assertWarnsRegex(FutureWarning, msg): - with self.assertRaises(UnicodeEncodeError): + with self.assertRaises(ValueError): s.__init__('\udc00') self.assertEqual(s.format, '>hh') self.assertEqual(s.pack(1, 2), packed) @@ -872,10 +872,10 @@ def __init__(self, *args, **kwargs): with self.assertWarnsRegex(DeprecationWarning, warnmsg + 'bad char'): my_struct = MyStruct(format='$') self.assertEqual(my_struct.pack(12345), b'\x30\x39') - with self.assertWarnsRegex(DeprecationWarning, warnmsg + ".*can't encode"): + with self.assertWarnsRegex(DeprecationWarning, warnmsg + "non-ASCII"): my_struct = MyStruct('\udc00') self.assertEqual(my_struct.pack(12345), b'\x30\x39') - with self.assertWarnsRegex(DeprecationWarning, warnmsg + ".*can't encode"): + with self.assertWarnsRegex(DeprecationWarning, warnmsg + "non-ASCII"): my_struct = MyStruct(format='\udc00') self.assertEqual(my_struct.pack(12345), b'\x30\x39') @@ -928,11 +928,16 @@ def __init__(self, newargs, initargs): with self.assertWarns(FutureWarning): with self.assertRaises(struct.error): MyStruct(('>h',), ('$',)) - with self.assertRaises(UnicodeEncodeError): + with self.assertRaises(ValueError): MyStruct(('\udc00',), ('>h',)) + with self.assertRaises(UnicodeDecodeError): + MyStruct((b'\xa4',), ('>h',)) with self.assertWarns(FutureWarning): - with self.assertRaises(UnicodeEncodeError): + with self.assertRaises(ValueError): MyStruct(('>h',), ('\udc00',)) + with self.assertWarns(FutureWarning): + with self.assertRaises(UnicodeDecodeError): + MyStruct(('>h',), (b'\xa4',)) with self.assertWarns(FutureWarning): my_struct = MyStruct(('>h',), ('h', 42) with self.assertRaises(TypeError): @@ -1004,7 +1011,7 @@ def test_operations_on_half_initialized_Struct(self): self.assertRaises(RuntimeError, S.pack_into, spam, 1) self.assertRaises(RuntimeError, S.unpack, spam) self.assertRaises(RuntimeError, S.unpack_from, spam) - self.assertRaises(RuntimeError, getattr, S, 'format') + self.assertRaises(AttributeError, getattr, S, 'format') self.assertRaises(RuntimeError, S.__sizeof__) self.assertRaises(RuntimeError, repr, S) self.assertEqual(S.size, -1) diff --git a/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst b/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst new file mode 100644 index 00000000000000..c9d22c291097ba --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst @@ -0,0 +1,6 @@ +Changed some implementation details in :class:`struct.Struct`: calling it +with non-ASCII string format will now raise a :exc:`ValueError` instead of +:exc:`UnicodeEncodeError`, calling it with non-ASCII bytes format will now +raise a :exc:`UnicodeDecodeError` instead of :exc:`struct.error`, getting +the :attr:`!format` attribute of uninitialized object will now raise an +:exc:`AttributeError` instead of :exc:`RuntimeError`. diff --git a/Modules/_struct.c b/Modules/_struct.c index 7eddc9bdc38a89..4d311c1f1af441 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -12,6 +12,7 @@ #include "pycore_lock.h" // _PyOnceFlag_CallOnce() #include "pycore_long.h" // _PyLong_AsByteArray() #include "pycore_moduleobject.h" // _PyModule_GetState() +#include "pycore_unicodeobject.h" // _PyUnicode_Copy() #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include // offsetof() @@ -1635,8 +1636,9 @@ prepare_s(PyStructObject *self, PyObject *format) _structmodulestate *state = get_struct_state_structinst(self); - fmt = PyBytes_AS_STRING(format); - if (strlen(fmt) != (size_t)PyBytes_GET_SIZE(format)) { + assert(PyUnicode_IS_ASCII(format)); + fmt = (const char *)PyUnicode_1BYTE_DATA(format); + if (strlen(fmt) != (size_t)PyUnicode_GET_LENGTH(format)) { PyErr_SetString(state->StructError, "embedded null character"); return -1; @@ -1780,12 +1782,15 @@ static int set_format(PyStructObject *self, PyObject *format) { if (PyUnicode_Check(format)) { - format = PyUnicode_AsASCIIString(format); - if (format == NULL) + if (!PyUnicode_IS_ASCII(format)) { + PyErr_SetString(PyExc_ValueError, "non-ASCII character in struct format"); return -1; + } + format = _PyUnicode_Copy(format); } else if (PyBytes_Check(format)) { - Py_INCREF(format); + format = PyUnicode_DecodeASCII(PyBytes_AS_STRING(format), + PyBytes_GET_SIZE(format), NULL); } else { PyErr_Format(PyExc_TypeError, @@ -1793,6 +1798,9 @@ set_format(PyStructObject *self, PyObject *format) "not %T", format); return -1; } + if (format == NULL) { + return -1; + } if (prepare_s(self, format)) { Py_DECREF(format); return -1; @@ -1821,7 +1829,7 @@ Struct_impl(PyTypeObject *type, PyObject *format) if (self == NULL) { return NULL; } - self->s_format = Py_NewRef(Py_None); + self->s_format = NULL; self->s_codes = NULL; self->s_size = -1; self->s_len = -1; @@ -1878,7 +1886,7 @@ s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (self == NULL) { return NULL; } - self->s_format = Py_NewRef(Py_None); + self->s_format = NULL; self->s_codes = NULL; self->s_size = -1; self->s_len = -1; @@ -1892,7 +1900,7 @@ s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return NULL; } PyObject *exc = PyErr_GetRaisedException(); - Py_SETREF(self->s_format, Py_NewRef(Py_None)); + Py_CLEAR(self->s_format); if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "Invalid 'format' argument for Struct.__new__(): %S", exc)) { @@ -1910,8 +1918,8 @@ s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) static bool same_format(PyStructObject *s, PyObject *format) { - Py_ssize_t size = PyBytes_GET_SIZE(s->s_format); - const void *data = PyBytes_AS_STRING(s->s_format); + Py_ssize_t size = PyUnicode_GET_LENGTH(s->s_format); + const void *data = PyUnicode_1BYTE_DATA(s->s_format); if (PyUnicode_Check(format) && PyUnicode_IS_ASCII(format)) { return PyUnicode_GET_LENGTH(format) == size && memcmp(PyUnicode_1BYTE_DATA(format), data, size) == 0; @@ -1938,7 +1946,7 @@ static int Struct___init___impl(PyStructObject *self, PyObject *format) /*[clinic end generated code: output=b8e80862444e92d0 input=1af78a5f57d82cec]*/ { - if (self->s_format == Py_None) { + if (self->s_format == NULL) { if (set_format(self, format) < 0) { return -1; } @@ -1965,7 +1973,7 @@ s_init(PyObject *self, PyObject *args, PyObject *kwargs) { if (!((PyStructObject *)self)->init_called && Py_TYPE(self)->tp_init == s_init - && ((PyStructObject *)self)->s_format != Py_None) + && ((PyStructObject *)self)->s_format != NULL) { /* Struct.__init__() was called implicitly. * __new__() already did all the work. */ @@ -2508,22 +2516,6 @@ Struct_pack_into_impl(PyStructObject *self, Py_buffer *buffer, Py_RETURN_NONE; } -static PyObject * -s_get_format(PyObject *op, void *Py_UNUSED(closure)) -{ - PyStructObject *self = PyStructObject_CAST(op); - ENSURE_STRUCT_IS_READY(self); - return PyUnicode_FromStringAndSize(PyBytes_AS_STRING(self->s_format), - PyBytes_GET_SIZE(self->s_format)); -} - -static PyObject * -s_get_size(PyObject *op, void *Py_UNUSED(closure)) -{ - PyStructObject *self = PyStructObject_CAST(op); - return PyLong_FromSsize_t(self->s_size); -} - /*[clinic input] Struct.__sizeof__ [clinic start generated code]*/ @@ -2545,14 +2537,7 @@ s_repr(PyObject *op) { PyStructObject *self = PyStructObject_CAST(op); ENSURE_STRUCT_IS_READY(self); - PyObject* fmt = PyUnicode_FromStringAndSize( - PyBytes_AS_STRING(self->s_format), PyBytes_GET_SIZE(self->s_format)); - if (fmt == NULL) { - return NULL; - } - PyObject* s = PyUnicode_FromFormat("%s(%R)", _PyType_Name(Py_TYPE(self)), fmt); - Py_DECREF(fmt); - return s; + return PyUnicode_FromFormat("%s(%R)", _PyType_Name(Py_TYPE(self)), self->s_format); } /* List of functions */ @@ -2569,15 +2554,13 @@ static struct PyMethodDef s_methods[] = { static PyMemberDef s_members[] = { {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(PyStructObject, weakreflist), Py_READONLY}, + {"format", Py_T_OBJECT_EX, offsetof(PyStructObject, s_format), + Py_READONLY, PyDoc_STR("struct format string")}, + {"size", Py_T_PYSSIZET, offsetof(PyStructObject, s_size), Py_READONLY, + PyDoc_STR("struct size in bytes")}, {NULL} /* sentinel */ }; -static PyGetSetDef s_getsetlist[] = { - {"format", s_get_format, NULL, PyDoc_STR("struct format string"), NULL}, - {"size", s_get_size, NULL, PyDoc_STR("struct size in bytes"), NULL}, - {NULL} /* sentinel */ -}; - static PyType_Slot PyStructType_slots[] = { {Py_tp_dealloc, s_dealloc}, {Py_tp_getattro, PyObject_GenericGetAttr}, @@ -2588,7 +2571,6 @@ static PyType_Slot PyStructType_slots[] = { {Py_tp_clear, s_clear}, {Py_tp_methods, s_methods}, {Py_tp_members, s_members}, - {Py_tp_getset, s_getsetlist}, {Py_tp_init, s_init}, {Py_tp_new, s_new}, {0, 0}, From f22d34d6ad3e1f9839fe4f1fff247f13004441b4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 12 Mar 2026 14:16:30 +0200 Subject: [PATCH 2/4] Use PyUnicode_FromObject() instead of _PyUnicode_Copy(). --- Modules/_struct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/_struct.c b/Modules/_struct.c index 4d311c1f1af441..c8ec5072a7bde2 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -12,7 +12,6 @@ #include "pycore_lock.h" // _PyOnceFlag_CallOnce() #include "pycore_long.h" // _PyLong_AsByteArray() #include "pycore_moduleobject.h" // _PyModule_GetState() -#include "pycore_unicodeobject.h" // _PyUnicode_Copy() #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include // offsetof() @@ -1786,7 +1785,7 @@ set_format(PyStructObject *self, PyObject *format) PyErr_SetString(PyExc_ValueError, "non-ASCII character in struct format"); return -1; } - format = _PyUnicode_Copy(format); + format = PyUnicode_FromObject(format); } else if (PyBytes_Check(format)) { format = PyUnicode_DecodeASCII(PyBytes_AS_STRING(format), From 330181b674994ae674bfdbbf16e2a71eb59da8d0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 12 Mar 2026 14:22:18 +0200 Subject: [PATCH 3/4] Raise ValueError instead of UnicodeDecodeError. --- Lib/test/test_struct.py | 6 +++--- .../2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst | 2 +- Modules/_struct.c | 11 +++++------ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py index 4d477e02ebb63f..c7dc69defded50 100644 --- a/Lib/test/test_struct.py +++ b/Lib/test/test_struct.py @@ -930,13 +930,13 @@ def __init__(self, newargs, initargs): MyStruct(('>h',), ('$',)) with self.assertRaises(ValueError): MyStruct(('\udc00',), ('>h',)) - with self.assertRaises(UnicodeDecodeError): + with self.assertRaises(ValueError): MyStruct((b'\xa4',), ('>h',)) with self.assertWarns(FutureWarning): with self.assertRaises(ValueError): MyStruct(('>h',), ('\udc00',)) with self.assertWarns(FutureWarning): - with self.assertRaises(UnicodeDecodeError): + with self.assertRaises(ValueError): MyStruct(('>h',), (b'\xa4',)) with self.assertWarns(FutureWarning): my_struct = MyStruct(('>h',), ('h', 42) diff --git a/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst b/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst index c9d22c291097ba..35ba57a95b0e7e 100644 --- a/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst +++ b/Misc/NEWS.d/next/Library/2026-03-12-12-17-39.gh-issue-145850.uW3stt.rst @@ -1,6 +1,6 @@ Changed some implementation details in :class:`struct.Struct`: calling it with non-ASCII string format will now raise a :exc:`ValueError` instead of :exc:`UnicodeEncodeError`, calling it with non-ASCII bytes format will now -raise a :exc:`UnicodeDecodeError` instead of :exc:`struct.error`, getting +raise a :exc:`ValueError` instead of :exc:`struct.error`, getting the :attr:`!format` attribute of uninitialized object will now raise an :exc:`AttributeError` instead of :exc:`RuntimeError`. diff --git a/Modules/_struct.c b/Modules/_struct.c index c8ec5072a7bde2..2059218029ea34 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -1635,7 +1635,10 @@ prepare_s(PyStructObject *self, PyObject *format) _structmodulestate *state = get_struct_state_structinst(self); - assert(PyUnicode_IS_ASCII(format)); + if (!PyUnicode_IS_ASCII(format)) { + PyErr_SetString(PyExc_ValueError, "non-ASCII character in struct format"); + return -1; + } fmt = (const char *)PyUnicode_1BYTE_DATA(format); if (strlen(fmt) != (size_t)PyUnicode_GET_LENGTH(format)) { PyErr_SetString(state->StructError, @@ -1781,15 +1784,11 @@ static int set_format(PyStructObject *self, PyObject *format) { if (PyUnicode_Check(format)) { - if (!PyUnicode_IS_ASCII(format)) { - PyErr_SetString(PyExc_ValueError, "non-ASCII character in struct format"); - return -1; - } format = PyUnicode_FromObject(format); } else if (PyBytes_Check(format)) { format = PyUnicode_DecodeASCII(PyBytes_AS_STRING(format), - PyBytes_GET_SIZE(format), NULL); + PyBytes_GET_SIZE(format), "surrogateescape"); } else { PyErr_Format(PyExc_TypeError, From 1cce527deb2e332d75611b32453fb66e65a8b1ab Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 12 Mar 2026 18:58:59 +0200 Subject: [PATCH 4/4] Fix the fuzzing tests. --- Modules/_xxtestfuzz/fuzzer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Modules/_xxtestfuzz/fuzzer.c b/Modules/_xxtestfuzz/fuzzer.c index f3a22f3f6a87cb..6cb11562476e40 100644 --- a/Modules/_xxtestfuzz/fuzzer.c +++ b/Modules/_xxtestfuzz/fuzzer.c @@ -133,6 +133,10 @@ static int fuzz_struct_unpack(const char* data, size_t size) { if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) { PyErr_Clear(); } + /* Ignore any ValueError, these are triggered by non-ASCII format. */ + if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { + PyErr_Clear(); + } /* Ignore any struct.error exceptions, these can be caused by invalid formats or incomplete buffers both of which are common. */ if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {