From 6eea3f1ee598468f1cff9f437f634f5c9c9e9ab0 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 9 Aug 2024 13:02:37 +0200 Subject: [PATCH 1/3] gh-122854: Add Py_HashBuffer() function --- Doc/c-api/hash.rst | 18 ++++++++++++ Doc/whatsnew/3.14.rst | 3 ++ Include/cpython/pyhash.h | 2 ++ Include/internal/pycore_pyhash.h | 3 -- Lib/test/test_capi/test_hash.py | 10 +++++++ ...-08-09-13-12-20.gh-issue-122854.-1OgvU.rst | 2 ++ Modules/_datetimemodule.c | 2 +- Modules/_hashopenssl.c | 3 +- Modules/_sre/sre.c | 2 +- Modules/_testcapi/hash.c | 29 ++++++++++++++++--- Modules/_xxtestfuzz/fuzzer.c | 3 +- Objects/bytesobject.c | 2 +- Objects/codeobject.c | 4 +-- Objects/memoryobject.c | 2 +- Objects/unicodeobject.c | 2 +- Python/import.c | 2 +- Python/pyhash.c | 15 ++++++---- 17 files changed, 79 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/C_API/2024-08-09-13-12-20.gh-issue-122854.-1OgvU.rst diff --git a/Doc/c-api/hash.rst b/Doc/c-api/hash.rst index 7345a048a4128b..6d06a4dde49286 100644 --- a/Doc/c-api/hash.rst +++ b/Doc/c-api/hash.rst @@ -89,6 +89,24 @@ See also the :c:member:`PyTypeObject.tp_hash` member and :ref:`numeric-hash`. .. versionadded:: 3.13 + +.. c:function:: Py_hash_t Py_HashBuffer(const void *ptr, Py_ssize_t len) + + Compute and return the hash value of a buffer of *len* bytes + starting at address *ptr*. The hash is guaranteed to match that of + :class:`bytes`, :class:`memoryview`, and other built-in objects + that implement the :ref:`buffer protocol <bufferobjects>`. + + Use this function to implement hashing for immutable objects whose + ``tp_richcompare`` function compares to another object's buffer. + + *len* must be greater than or equal to ``0``. + + The function cannot fail: it cannot return ``-1``. + + .. versionadded:: 3.14 + + .. 
c:function:: Py_hash_t PyObject_GenericHash(PyObject *obj) Generic hashing function that is meant to be put into a type diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index ee7d333e2b208e..b2fdb6f66b847d 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -471,6 +471,9 @@ New Features an interned string and deallocate it during module shutdown. (Contribued by Eddie Elizondo in :gh:`113601`.) +* Add :c:func:`Py_HashBuffer` to compute and return the hash value of a buffer. + (Contributed by Antoine Pitrou and Victor Stinner in :gh:`122854`.) + Porting to Python 3.14 ---------------------- diff --git a/Include/cpython/pyhash.h b/Include/cpython/pyhash.h index 825c034a8d8474..876a7f0ea44f4d 100644 --- a/Include/cpython/pyhash.h +++ b/Include/cpython/pyhash.h @@ -45,3 +45,5 @@ PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void); PyAPI_FUNC(Py_hash_t) Py_HashPointer(const void *ptr); PyAPI_FUNC(Py_hash_t) PyObject_GenericHash(PyObject *); + +PyAPI_FUNC(Py_hash_t) Py_HashBuffer(const void *ptr, Py_ssize_t len); diff --git a/Include/internal/pycore_pyhash.h b/Include/internal/pycore_pyhash.h index 0ce08900e96f0b..9414e7761171d2 100644 --- a/Include/internal/pycore_pyhash.h +++ b/Include/internal/pycore_pyhash.h @@ -20,9 +20,6 @@ _Py_HashPointerRaw(const void *ptr) return (Py_hash_t)x; } -// Export for '_datetime' shared extension -PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t); - /* Hash secret * * memory layout on 64 bit systems diff --git a/Lib/test/test_capi/test_hash.py b/Lib/test/test_capi/test_hash.py index cb2b3635f01328..da85ebb2d90d3f 100644 --- a/Lib/test/test_capi/test_hash.py +++ b/Lib/test/test_capi/test_hash.py @@ -78,6 +78,16 @@ def python_hash_pointer(x): VOID_P_MAX = -1 & (2 ** (8 * SIZEOF_VOID_P) - 1) self.assertEqual(hash_pointer(VOID_P_MAX), -2) + def test_hash_bytes(self): + hash_bytes = _testcapi.hash_bytes + + def check(data): + self.assertEqual(hash_bytes(data), hash(data)) + + check(b'') + check(b'abc') + 
check(b'x' * 1024) + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/C_API/2024-08-09-13-12-20.gh-issue-122854.-1OgvU.rst b/Misc/NEWS.d/next/C_API/2024-08-09-13-12-20.gh-issue-122854.-1OgvU.rst new file mode 100644 index 00000000000000..b94d8f4e483bda --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2024-08-09-13-12-20.gh-issue-122854.-1OgvU.rst @@ -0,0 +1,2 @@ +Add :c:func:`Py_HashBuffer` to compute and return the hash value of a buffer. +Patch by Antoine Pitrou and Victor Stinner. diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 79314e06c82f69..4da175f252935f 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -3842,7 +3842,7 @@ datetime_date_replace_impl(PyDateTime_Date *self, int year, int month, static Py_hash_t generic_hash(unsigned char *data, int len) { - return _Py_HashBytes(data, len); + return Py_HashBuffer(data, len); } diff --git a/Modules/_hashopenssl.c b/Modules/_hashopenssl.c index 14d9c186151232..2c9a9feecc79f0 100644 --- a/Modules/_hashopenssl.c +++ b/Modules/_hashopenssl.c @@ -25,7 +25,6 @@ #include #include "Python.h" #include "pycore_hashtable.h" -#include "pycore_pyhash.h" // _Py_HashBytes() #include "pycore_strhex.h" // _Py_strhex() #include "hashlib.h" @@ -186,7 +185,7 @@ static const py_hashentry_t py_hashes[] = { static Py_uhash_t py_hashentry_t_hash_name(const void *key) { - return _Py_HashBytes(key, strlen((const char *)key)); + return Py_HashBuffer(key, strlen((const char *)key)); } static int diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 01420d1a10b1cf..8c05ee5895bb61 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -2944,7 +2944,7 @@ pattern_hash(PatternObject *self) return -1; } - hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize); + hash2 = Py_HashBuffer(self->code, sizeof(self->code[0]) * self->codesize); hash ^= hash2; hash ^= self->flags; diff --git a/Modules/_testcapi/hash.c b/Modules/_testcapi/hash.c index 
809d537bfef0d3..9457ffcbf247c9 100644 --- a/Modules/_testcapi/hash.c +++ b/Modules/_testcapi/hash.c @@ -45,6 +45,14 @@ hash_getfuncdef(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)) } +static PyObject * +long_from_hash(Py_hash_t hash) +{ + Py_BUILD_ASSERT(sizeof(long long) >= sizeof(hash)); + return PyLong_FromLongLong(hash); +} + + static PyObject * hash_pointer(PyObject *Py_UNUSED(module), PyObject *arg) { @@ -54,8 +62,21 @@ hash_pointer(PyObject *Py_UNUSED(module), PyObject *arg) } Py_hash_t hash = Py_HashPointer(ptr); - Py_BUILD_ASSERT(sizeof(long long) >= sizeof(hash)); - return PyLong_FromLongLong(hash); + return long_from_hash(hash); +} + + +static PyObject * +hash_bytes(PyObject *Py_UNUSED(module), PyObject *args) +{ + char *ptr; + Py_ssize_t len; + if (!PyArg_ParseTuple(args, "y#", &ptr, &len)) { + return NULL; + } + + Py_hash_t hash = Py_HashBuffer(ptr, len); + return long_from_hash(hash); } @@ -64,14 +85,14 @@ object_generichash(PyObject *Py_UNUSED(module), PyObject *arg) { NULLABLE(arg); Py_hash_t hash = PyObject_GenericHash(arg); - Py_BUILD_ASSERT(sizeof(long long) >= sizeof(hash)); - return PyLong_FromLongLong(hash); + return long_from_hash(hash); } static PyMethodDef test_methods[] = { {"hash_getfuncdef", hash_getfuncdef, METH_NOARGS}, {"hash_pointer", hash_pointer, METH_O}, + {"hash_bytes", hash_bytes, METH_VARARGS}, {"object_generichash", object_generichash, METH_O}, {NULL}, }; diff --git a/Modules/_xxtestfuzz/fuzzer.c b/Modules/_xxtestfuzz/fuzzer.c index 6ea9f64d628530..a04f1412eefda1 100644 --- a/Modules/_xxtestfuzz/fuzzer.c +++ b/Modules/_xxtestfuzz/fuzzer.c @@ -15,7 +15,6 @@ #endif #include -#include "pycore_pyhash.h" // _Py_HashBytes() #include #include @@ -45,7 +44,7 @@ static int fuzz_builtin_int(const char* data, size_t size) { /* Pick a random valid base. (When the fuzzed function takes extra parameters, it's somewhat normal to hash the input to generate those parameters. We want to exercise all code paths, so we do so here.) 
*/ - int base = _Py_HashBytes(data, size) % 37; + int base = Py_HashBuffer(data, size) % 37; if (base == 1) { // 1 is the only number between 0 and 36 that is not a valid base. base = 0; diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index e88b199d89f758..ffe031abb978ff 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1598,7 +1598,7 @@ _Py_COMP_DIAG_PUSH _Py_COMP_DIAG_IGNORE_DEPR_DECLS if (a->ob_shash == -1) { /* Can't fail */ - a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a)); + a->ob_shash = Py_HashBuffer(a->ob_sval, Py_SIZE(a)); } return a->ob_shash; _Py_COMP_DIAG_POP diff --git a/Objects/codeobject.c b/Objects/codeobject.c index ef24b51b961eeb..6f0b3f8b9a3262 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2561,12 +2561,12 @@ hash_const(const void *key) if (PySlice_Check(op)) { PySliceObject *s = (PySliceObject *)op; PyObject *data[3] = { s->start, s->stop, s->step }; - return _Py_HashBytes(&data, sizeof(data)); + return Py_HashBuffer(&data, sizeof(data)); } else if (PyTuple_CheckExact(op)) { Py_ssize_t size = PyTuple_GET_SIZE(op); PyObject **data = _PyTuple_ITEMS(op); - return _Py_HashBytes(data, sizeof(PyObject *) * size); + return Py_HashBuffer(data, sizeof(PyObject *) * size); } Py_hash_t h = PyObject_Hash(op); if (h == -1) { diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c index 226bd6defdec5a..498a37c1a3d869 100644 --- a/Objects/memoryobject.c +++ b/Objects/memoryobject.c @@ -3087,7 +3087,7 @@ memory_hash(PyObject *_self) } /* Can't fail */ - self->hash = _Py_HashBytes(mem, view->len); + self->hash = Py_HashBuffer(mem, view->len); if (mem != view->buf) PyMem_Free(mem); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 148d3e55bf830e..cda89069e040de 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11672,7 +11672,7 @@ unicode_hash(PyObject *self) if (hash != -1) { return hash; } - x = _Py_HashBytes(PyUnicode_DATA(self), + x = 
Py_HashBuffer(PyUnicode_DATA(self), PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x); diff --git a/Python/import.c b/Python/import.c index f4c0d544fbdefa..c9212ec1ac0d7c 100644 --- a/Python/import.c +++ b/Python/import.c @@ -1174,7 +1174,7 @@ hashtable_key_from_2_strings(PyObject *str1, PyObject *str2, const char sep) static Py_uhash_t hashtable_hash_str(const void *key) { - return _Py_HashBytes(key, strlen((const char *)key)); + return Py_HashBuffer(key, strlen((const char *)key)); } static int diff --git a/Python/pyhash.c b/Python/pyhash.c index 1504fa201c9902..216f437dd9a2d4 100644 --- a/Python/pyhash.c +++ b/Python/pyhash.c @@ -22,7 +22,7 @@ extern PyHash_FuncDef PyHash_Func; static PyHash_FuncDef PyHash_Func; #endif -/* Count _Py_HashBytes() calls */ +/* Count Py_HashBuffer() calls */ #ifdef Py_HASH_STATS #define Py_HASH_STATS_MAX 32 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; @@ -146,9 +146,8 @@ PyObject_GenericHash(PyObject *obj) } Py_hash_t -_Py_HashBytes(const void *src, Py_ssize_t len) +Py_HashBuffer(const void *ptr, Py_ssize_t len) { - Py_hash_t x; /* We make the hash of the empty string be 0, rather than using (prefix ^ suffix), since this slightly obfuscates the hash secret @@ -161,11 +160,12 @@ _Py_HashBytes(const void *src, Py_ssize_t len) hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++; #endif + Py_hash_t x; #if Py_HASH_CUTOFF > 0 if (len < Py_HASH_CUTOFF) { /* Optimize hashing of very small strings with inline DJBX33A. 
*/ Py_uhash_t hash; - const unsigned char *p = src; + const unsigned char *p = ptr; hash = 5381; /* DJBX33A starts with 5381 */ switch(len) { @@ -186,10 +186,13 @@ _Py_HashBytes(const void *src, Py_ssize_t len) } else #endif /* Py_HASH_CUTOFF */ - x = PyHash_Func.hash(src, len); + { + x = PyHash_Func.hash(ptr, len); + } - if (x == -1) + if (x == -1) { return -2; + } return x; } From ef25088d02cbbb6566c17517a8dbb264fb40098f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 27 Aug 2024 14:22:15 +0200 Subject: [PATCH 2/3] Rename hash_bytes() to hash_buffer() in tests --- Lib/test/test_capi/test_hash.py | 6 +++--- Modules/_testcapi/hash.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_capi/test_hash.py b/Lib/test/test_capi/test_hash.py index da85ebb2d90d3f..f553ffb0d90f33 100644 --- a/Lib/test/test_capi/test_hash.py +++ b/Lib/test/test_capi/test_hash.py @@ -78,11 +78,11 @@ def python_hash_pointer(x): VOID_P_MAX = -1 & (2 ** (8 * SIZEOF_VOID_P) - 1) self.assertEqual(hash_pointer(VOID_P_MAX), -2) - def test_hash_bytes(self): - hash_bytes = _testcapi.hash_bytes + def test_hash_buffer(self): + hash_buffer = _testcapi.hash_buffer def check(data): - self.assertEqual(hash_bytes(data), hash(data)) + self.assertEqual(hash_buffer(data), hash(data)) check(b'') check(b'abc') diff --git a/Modules/_testcapi/hash.c b/Modules/_testcapi/hash.c index 9457ffcbf247c9..1525344a93fbcf 100644 --- a/Modules/_testcapi/hash.c +++ b/Modules/_testcapi/hash.c @@ -67,7 +67,7 @@ hash_pointer(PyObject *Py_UNUSED(module), PyObject *arg) static PyObject * -hash_bytes(PyObject *Py_UNUSED(module), PyObject *args) +hash_buffer(PyObject *Py_UNUSED(module), PyObject *args) { char *ptr; Py_ssize_t len; @@ -92,7 +92,7 @@ object_generichash(PyObject *Py_UNUSED(module), PyObject *arg) static PyMethodDef test_methods[] = { {"hash_getfuncdef", hash_getfuncdef, METH_NOARGS}, {"hash_pointer", hash_pointer, METH_O}, - {"hash_bytes", hash_bytes, METH_VARARGS}, + 
{"hash_buffer", hash_buffer, METH_VARARGS}, {"object_generichash", object_generichash, METH_O}, {NULL}, }; From d5620b13521b5a75e3050d4874c8d62a8be1e8c9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 27 Aug 2024 14:23:25 +0200 Subject: [PATCH 3/3] Address Erlend's review --- Doc/c-api/hash.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Doc/c-api/hash.rst b/Doc/c-api/hash.rst index 6d06a4dde49286..00f8cb887dc7eb 100644 --- a/Doc/c-api/hash.rst +++ b/Doc/c-api/hash.rst @@ -98,11 +98,12 @@ See also the :c:member:`PyTypeObject.tp_hash` member and :ref:`numeric-hash`. that implement the :ref:`buffer protocol <bufferobjects>`. Use this function to implement hashing for immutable objects whose - ``tp_richcompare`` function compares to another object's buffer. + :c:member:`~PyTypeObject.tp_richcompare` function compares to another + object's buffer. *len* must be greater than or equal to ``0``. - The function cannot fail: it cannot return ``-1``. + This function always succeeds. .. versionadded:: 3.14