gh-106931: Intern Statically Allocated Strings Globally (gh-107272) · python/cpython@b72947a · GitHub
[go: up one dir, main page]

Skip to content

Commit b72947a

Browse files
gh-106931: Intern Statically Allocated Strings Globally (gh-107272)
We tried this before with a dict and for all interned strings. That ran into problems due to interpreter isolation. However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning. Here we circle back to using a global cache, but only for statically allocated strings. We also use a more-basic _Py_hashtable_t for that global cache instead of a dict. Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter. Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter.
1 parent 4f67921 commit b72947a

File tree

10 files changed

+123
-4
lines changed

10 files changed

+123
-4
lines changed

Include/cpython/unicodeobject.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,11 @@ typedef struct {
140140
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
141141
set, use the PyASCIIObject structure. */
142142
unsigned int ascii:1;
143+
/* The object is statically allocated. */
144+
unsigned int statically_allocated:1;
143145
/* Padding to ensure that PyUnicode_DATA() is always aligned to
144146
4 bytes (see issue #19537 on m68k). */
145-
unsigned int :25;
147+
unsigned int :24;
146148
} state;
147149
} PyASCIIObject;
148150

Include/internal/pycore_global_objects.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ extern "C" {
88
# error "this header requires Py_BUILD_CORE define"
99
#endif
1010

11+
#include "pycore_hashtable.h" // _Py_hashtable_t
1112
#include "pycore_gc.h" // PyGC_Head
1213
#include "pycore_global_strings.h" // struct _Py_global_strings
1314
#include "pycore_hamt.h" // PyHamtNode_Bitmap
@@ -28,6 +29,11 @@ extern "C" {
2829
#define _Py_SINGLETON(NAME) \
2930
_Py_GLOBAL_OBJECT(singletons.NAME)
3031

32+
struct _Py_cached_objects {
33+
// XXX We could statically allocate the hashtable.
34+
_Py_hashtable_t *interned_strings;
35+
};
36+
3137
struct _Py_static_objects {
3238
struct {
3339
/* Small integers are preallocated in this array so that they

Include/internal/pycore_hashtable.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ PyAPI_FUNC(int) _Py_hashtable_foreach(
106106
void *user_data);
107107

108108
PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht);
109+
PyAPI_FUNC(size_t) _Py_hashtable_len(const _Py_hashtable_t *ht);
109110

110111
/* Add a new entry to the hash. The key must not be present in the hash table.
111112
Return 0 on success, -1 on memory error. */

Include/internal/pycore_runtime.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ typedef struct pyruntimestate {
249249
struct _types_runtime_state types;
250250

251251
/* All the objects that are shared by the runtime's interpreters. */
252+
struct _Py_cached_objects cached_objects;
252253
struct _Py_static_objects static_objects;
253254

254255
/* The following fields are here to avoid allocation during init.

Include/internal/pycore_runtime_init.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ extern PyTypeObject _PyExc_MemoryError;
214214
.kind = 1, \
215215
.compact = 1, \
216216
.ascii = (ASCII), \
217+
.statically_allocated = 1, \
217218
}, \
218219
}
219220
#define _PyASCIIObject_INIT(LITERAL) \

Lib/test/test_sys.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from test.support.script_helper import assert_python_ok, assert_python_failure
1515
from test.support import threading_helper
1616
from test.support import import_helper
17+
from test.support import interpreters
1718
import textwrap
1819
import unittest
1920
import warnings
@@ -699,6 +700,35 @@ def __hash__(self):
699700

700701
self.assertRaises(TypeError, sys.intern, S("abc"))
701702

703+
def test_subinterp_intern_dynamically_allocated(self):
704+
global INTERN_NUMRUNS
705+
INTERN_NUMRUNS += 1
706+
s = "never interned before" + str(INTERN_NUMRUNS)
707+
t = sys.intern(s)
708+
self.assertIs(t, s)
709+
710+
interp = interpreters.create()
711+
interp.run(textwrap.dedent(f'''
712+
import sys
713+
t = sys.intern({s!r})
714+
assert id(t) != {id(s)}, (id(t), {id(s)})
715+
assert id(t) != {id(t)}, (id(t), {id(t)})
716+
'''))
717+
718+
def test_subinterp_intern_statically_allocated(self):
719+
# See Tools/build/generate_global_objects.py for the list
720+
# of strings that are always statically allocated.
721+
s = '__init__'
722+
t = sys.intern(s)
723+
724+
print('------------------------')
725+
interp = interpreters.create()
726+
interp.run(textwrap.dedent(f'''
727+
import sys
728+
t = sys.intern({s!r})
729+
assert id(t) == {id(t)}, (id(t), {id(t)})
730+
'''))
731+
702732
def test_sys_flags(self):
703733
self.assertTrue(sys.flags)
704734
attrs = ("debug",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Statically allocated string objects are now interned globally instead of
2+
per-interpreter. This fixes a situation where such a string would only be
3+
interned in a single interpreter. Normal string objects are unaffected.

Objects/unicodeobject.c

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -236,15 +236,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
236236
return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
237237
}
238238

239+
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
240+
239241
Py_ssize_t
240242
_PyUnicode_InternedSize(void)
241243
{
242-
return PyObject_Length(get_interned_dict(_PyInterpreterState_GET()));
244+
PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
245+
return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
246+
}
247+
248+
static Py_hash_t unicode_hash(PyObject *);
249+
static int unicode_compare_eq(PyObject *, PyObject *);
250+
251+
static Py_uhash_t
252+
hashtable_unicode_hash(const void *key)
253+
{
254+
return unicode_hash((PyObject *)key);
255+
}
256+
257+
static int
258+
hashtable_unicode_compare(const void *key1, const void *key2)
259+
{
260+
PyObject *obj1 = (PyObject *)key1;
261+
PyObject *obj2 = (PyObject *)key2;
262+
if (obj1 != NULL && obj2 != NULL) {
263+
return unicode_compare_eq(obj1, obj2);
264+
}
265+
else {
266+
return obj1 == obj2;
267+
}
243268
}
244269

245270
static int
246271
init_interned_dict(PyInterpreterState *interp)
247272
{
273+
if (_Py_IsMainInterpreter(interp)) {
274+
assert(INTERNED_STRINGS == NULL);
275+
_Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
276+
INTERNED_STRINGS = _Py_hashtable_new_full(
277+
hashtable_unicode_hash,
278+
hashtable_unicode_compare,
279+
NULL,
280+
NULL,
281+
&hashtable_alloc
282+
);
283+
if (INTERNED_STRINGS == NULL) {
284+
return -1;
285+
}
286+
}
248287
assert(get_interned_dict(interp) == NULL);
249288
PyObject *interned = interned = PyDict_New();
250289
if (interned == NULL) {
@@ -263,6 +302,10 @@ clear_interned_dict(PyInterpreterState *interp)
263302
Py_DECREF(interned);
264303
_Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
265304
}
305+
if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
306+
_Py_hashtable_destroy(INTERNED_STRINGS);
307+
INTERNED_STRINGS = NULL;
308+
}
266309
}
267310

268311
#define _Py_RETURN_UNICODE_EMPTY() \
@@ -1223,6 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
12231266
_PyUnicode_STATE(unicode).kind = kind;
12241267
_PyUnicode_STATE(unicode).compact = 1;
12251268
_PyUnicode_STATE(unicode).ascii = is_ascii;
1269+
_PyUnicode_STATE(unicode).statically_allocated = 0;
12261270
if (is_ascii) {
12271271
((char*)data)[size] = 0;
12281272
}
@@ -1553,7 +1597,9 @@ unicode_dealloc(PyObject *unicode)
15531597
* we accidentally decref an immortal string out of existence. Since
15541598
* the string is an immortal object, just re-set the reference count.
15551599
*/
1556-
if (PyUnicode_CHECK_INTERNED(unicode)) {
1600+
if (PyUnicode_CHECK_INTERNED(unicode)
1601+
|| _PyUnicode_STATE(unicode).statically_allocated)
1602+
{
15571603
_Py_SetImmortal(unicode);
15581604
return;
15591605
}
@@ -14503,6 +14549,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
1450314549
_PyUnicode_STATE(self).kind = kind;
1450414550
_PyUnicode_STATE(self).compact = 0;
1450514551
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14552+
_PyUnicode_STATE(self).statically_allocated = 0;
1450614553
_PyUnicode_UTF8_LENGTH(self) = 0;
1450714554
_PyUnicode_UTF8(self) = NULL;
1450814555
_PyUnicode_DATA_ANY(self) = NULL;
@@ -14726,6 +14773,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
1472614773
return;
1472714774
}
1472814775

14776+
/* Look in the global cache first. */
14777+
PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14778+
if (r != NULL && r != s) {
14779+
Py_SETREF(*p, Py_NewRef(r));
14780+
return;
14781+
}
14782+
14783+
/* Handle statically allocated strings. */
14784+
if (_PyUnicode_STATE(s).statically_allocated) {
14785+
assert(_Py_IsImmortal(s));
14786+
if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
14787+
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14788+
}
14789+
return;
14790+
}
14791+
14792+
/* Look in the per-interpreter cache. */
1472914793
PyObject *interned = get_interned_dict(interp);
1473014794
assert(interned != NULL);
1473114795

@@ -14741,9 +14805,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
1474114805
}
1474214806

1474314807
if (_Py_IsImmortal(s)) {
14808+
// XXX Restrict this to the main interpreter?
1474414809
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14745-
return;
14810+
return;
1474614811
}
14812+
1474714813
#ifdef Py_REF_DEBUG
1474814814
/* The reference count value excluding the 2 references from the
1474914815
interned dictionary should be excluded from the RefTotal. The

Python/hashtable.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,13 @@ _Py_hashtable_size(const _Py_hashtable_t *ht)
129129
}
130130

131131

132+
size_t
133+
_Py_hashtable_len(const _Py_hashtable_t *ht)
134+
{
135+
return ht->nentries;
136+
}
137+
138+
132139
_Py_hashtable_entry_t *
133140
_Py_hashtable_get_entry_generic(_Py_hashtable_t *ht, const void *key)
134141
{

Tools/build/deepfreeze.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ def generate_unicode(self, name: str, s: str) -> str:
208208
self.write(".kind = 1,")
209209
self.write(".compact = 1,")
210210
self.write(".ascii = 1,")
211+
self.write(".statically_allocated = 1,")
211212
self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
212213
return f"& {name}._ascii.ob_base"
213214
else:
@@ -220,6 +221,7 @@ def generate_unicode(self, name: str, s: str) -> str:
220221
self.write(f".kind = {kind},")
221222
self.write(".compact = 1,")
222223
self.write(".ascii = 0,")
224+
self.write(".statically_allocated = 1,")
223225
utf8 = s.encode('utf-8')
224226
self.write(f'.utf8 = {make_string_literal(utf8)},')
225227
self.write(f'.utf8_length = {len(utf8)},')

0 commit comments

Comments
 (0)
0