diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
index 97dda73f9b584d..fd097a0ef75c7e 100644
--- a/Include/internal/pycore_unicodeobject.h
+++ b/Include/internal/pycore_unicodeobject.h
@@ -10,6 +10,7 @@ extern "C" {
 
 #include "pycore_fileutils.h"     // _Py_error_handler
 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
+#include "pycore_global_objects.h" // _Py_INTERP_CACHED_OBJECT
 
 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
 
@@ -358,6 +359,19 @@ extern PyTypeObject _PyUnicodeASCIIIter_Type;
 
 // All these are "ref-neutral", like the public PyUnicode_InternInPlace.
 
+/* This hashtable holds statically allocated interned strings.
+ * See InternalDocs/string_interning.md for details.
+ */
+#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
+
+/* This dictionary holds per-interpreter interned strings.
+ * See InternalDocs/string_interning.md for details.
+ */
+static inline PyObject *get_interned_dict(PyInterpreterState *interp)
+{
+    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
+}
+
 // Explicit interning routines:
 PyAPI_FUNC(void) _PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **);
 PyAPI_FUNC(void) _PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **);
diff --git a/Lib/test/support/constants_helper.py b/Lib/test/support/constants_helper.py
new file mode 100644
index 00000000000000..5dbf418110a325
--- /dev/null
+++ b/Lib/test/support/constants_helper.py
@@ -0,0 +1,26 @@
+import re
+from typing import Iterable
+
+from test.support.project_files_helper import iter_all_c_files
+
+
+# Copied from 'Tools/build/generate_global_objects.py'.
+def iter_global_strings() -> Iterable[str]:
+    """Yield the global strings referenced in the C sources.
+
+    ``_Py_ID(name)`` yields ``name``; ``_Py_DECLARE_STR(name, "text")``
+    yields ``text`` exactly as it is spelled in the C file.  One item is
+    yielded per occurrence, so the same string may be yielded many times.
+    """
+    id_regex = re.compile(r"\b_Py_ID\((\w+)\)")
+    str_regex = re.compile(r'\b_Py_DECLARE_STR\((?:\w+), "(.*?)"\)')
+    for path in iter_all_c_files():
+        if not path.exists():
+            # The file must have been a temporary file.
+            continue
+        with path.open(encoding="utf-8") as source:
+            for line in source:
+                for m in id_regex.finditer(line):
+                    yield m.group(1)
+                for m in str_regex.finditer(line):
+                    yield m.group(1)
diff --git a/Lib/test/support/project_files_helper.py b/Lib/test/support/project_files_helper.py
new file mode 100644
index 00000000000000..59f278821e1702
--- /dev/null
+++ b/Lib/test/support/project_files_helper.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+from typing import Iterable
+
+# Directory containing Lib/, i.e. the root of the source checkout.
+ROOT = Path(__file__).resolve().parents[3]
+
+
+# Copied from 'Tools/build/generate_global_objects.py'.
+def iter_all_c_files() -> Iterable[Path]:
+    """Yield every ``.c`` and ``.h`` file below the listed source dirs."""
+    for top_directory_name in (
+        "Modules",
+        "Objects",
+        "Parser",
+        "PC",
+        "Programs",
+        "Python",
+    ):
+        for dirname, _, files in (ROOT / top_directory_name).walk():
+            for name in files:
+                if not name.endswith((".c", ".h")):
+                    continue
+                yield dirname / name
diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py
index 655f5a9be7fa31..fd5e09edfddfe3 100644
--- a/Lib/test/test_code.py
+++ b/Lib/test/test_code.py
@@ -211,6 +211,7 @@
 from test.support import (cpython_only,
                           check_impl_detail, requires_debug_ranges,
                           gc_collect, Py_GIL_DISABLED)
+from test.support.constants_helper import iter_global_strings
 from test.support.script_helper import assert_python_ok
 from test.support import threading_helper, import_helper
 from test.support.bytecode_helper import instructions_with_positions
@@ -1251,6 +1252,48 @@ class MyInt(int):
         self.assertIsInstance(code.co_consts[1], Unhashable)
         self.assertEqual(code.co_consts[2], code.co_consts[3])
 
+    @cpython_only
+    @unittest.skipIf(Py_GIL_DISABLED, "free-threaded build interns all string constants")
+    def test__Py_DECLARE_STR_is_interned(self):
+        # Global strings (_Py_ID / _Py_DECLARE_STR) used as code constants
+        # must be interned.
+        for global_string in iter_global_strings():
+            with self.subTest(global_string=global_string):
+                self.assertIsInterned(eval(f"'{global_string}'"))
+
+    # One sample per line; the tests below expect none of these to be
+    # interned unless sys.intern() was called on them first.
+    noninternable_by_default = textwrap.dedent(f'''
+        not-internable
+        not.internable
+        не_интернируемый
+        str with spaces
+        {chr(0x011111)}
+        {chr(0x9999)}
+        {chr(0x100)}
+    ''')
+
+    @cpython_only
+    @unittest.skipIf(Py_GIL_DISABLED, "free-threaded build interns all string constants")
+    def test_non_internable_strings_not_interned(self):
+        for noninternable in self.noninternable_by_default.strip().splitlines():
+            with self.subTest(noninternable=noninternable):
+                self.assertIsNotInterned(eval(f"'{noninternable}'"))
+
+    @cpython_only
+    @unittest.skipIf(Py_GIL_DISABLED, "free-threaded build interns all string constants")
+    def test_explicitly_interned_strings(self):
+        # Once a string was interned explicitly, an equal code constant
+        # must resolve to that very object.
+        for noninternable in self.noninternable_by_default.strip().splitlines():
+            with self.subTest(noninternable=noninternable):
+                self.assertIsNotInterned(noninternable)
+                sys.intern(noninternable)
+                self.assertIsInterned(noninternable)
+                interned_from_code = eval(f"'{noninternable}'")
+                self.assertIsInterned(interned_from_code)
+                self.assertIs(noninternable, interned_from_code)
+
 
 class CodeWeakRefTest(unittest.TestCase):
 
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 3aea2038fd17e7..620eaa0f082629 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -113,7 +113,7 @@ PyCode_ClearWatcher(int watcher_id)
 
 #define _PyCodeObject_CAST(op) (assert(PyCode_Check(op)), (PyCodeObject *)(op))
 
-static int
+static inline int
 should_intern_string(PyObject *o)
 {
 #ifdef Py_GIL_DISABLED
@@ -196,6 +196,14 @@ intern_strings(PyObject *tuple)
     return 0;
 }
 
+/* Set the optional out-flag to 1 to report that the tuple was changed. */
+#define _constants_tuple_modified(modified) \
+    do {                                    \
+        if (modified) {                     \
+            *(modified) = 1;                \
+        }                                   \
+    } while (0)
+
 /* Intern constants. In the default build, this interns selected string
    constants. In the free-threaded build, this also interns non-string
    constants. */
@@ -206,14 +214,32 @@ intern_constants(PyObject *tuple, int *modified)
     for (Py_ssize_t i = PyTuple_GET_SIZE(tuple); --i >= 0; ) {
         PyObject *v = PyTuple_GET_ITEM(tuple, i);
         if (PyUnicode_CheckExact(v)) {
+            if (PyUnicode_CHECK_INTERNED(v) != 0) {
+                continue;
+            }
+#if !defined(Py_GIL_DISABLED)
+            /* Prefer an equal string that is already interned, either
+               statically or in this interpreter, over `v` itself. */
+            PyObject *interned = _Py_hashtable_get(INTERNED_STRINGS, v);
+            if (interned == NULL) {
+                interned = PyDict_GetItemWithError(get_interned_dict(interp), v);
+                if (PyErr_Occurred()) {
+                    return -1;
+                }
+            }
+            if (interned != NULL && interned != v) {
+                Py_INCREF(interned);
+                PyTuple_SET_ITEM(tuple, i, interned);
+                Py_DECREF(v);
+                _constants_tuple_modified(modified);
+            } else
+#endif
             if (should_intern_string(v)) {
                 PyObject *w = v;
                 _PyUnicode_InternMortal(interp, &v);
                 if (w != v) {
                     PyTuple_SET_ITEM(tuple, i, v);
-                    if (modified) {
-                        *modified = 1;
-                    }
+                    _constants_tuple_modified(modified);
                 }
             }
         }
@@ -242,9 +268,7 @@ intern_constants(PyObject *tuple, int *modified)
 
                 PyTuple_SET_ITEM(tuple, i, v);
                 Py_DECREF(w);
-                if (modified) {
-                    *modified = 1;
-                }
+                _constants_tuple_modified(modified);
             }
             Py_DECREF(tmp);
         }
@@ -273,9 +297,7 @@ intern_constants(PyObject *tuple, int *modified)
                 }
                 PyTuple_SET_ITEM(tuple, i, v);
                 Py_DECREF(slice);
-                if (modified) {
-                    *modified = 1;
-                }
+                _constants_tuple_modified(modified);
             }
             Py_DECREF(tmp);
         }
@@ -293,9 +315,7 @@ intern_constants(PyObject *tuple, int *modified)
             else if (interned != v) {
                 PyTuple_SET_ITEM(tuple, i, interned);
                 Py_SETREF(v, interned);
-                if (modified) {
-                    *modified = 1;
-                }
+                _constants_tuple_modified(modified);
             }
         }
 #endif
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f737a885f197a0..711f828eb603e5 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -207,18 +207,6 @@ _PyUnicode_GetEmpty(void)
     return &_Py_STR(empty);
 }
 
-/* This dictionary holds per-interpreter interned strings.
- * See InternalDocs/string_interning.md for details.
- */
-static inline PyObject *get_interned_dict(PyInterpreterState *interp)
-{
-    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
-}
-
-/* This hashtable holds statically allocated interned strings.
- * See InternalDocs/string_interning.md for details.
- */
-#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
 
 /* Get number of all interned strings for the current interpreter. */
 Py_ssize_t