From c213b3756764f2a094f523b327ae00badd326a2a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 4 Apr 2022 08:56:59 +0300 Subject: [PATCH 1/8] gh-91524: Speed up the regular expression substitution Functions re.sub() and re.subn() and corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. --- Doc/whatsnew/3.11.rst | 5 + Lib/re/__init__.py | 22 +-- Lib/re/_constants.py | 2 +- Lib/re/_parser.py | 46 ++---- Modules/_sre/clinic/sre.c.h | 35 +++- Modules/_sre/sre.c | 298 +++++++++++++++++++++++++++++++---- Modules/_sre/sre.h | 10 ++ Modules/_sre/sre_constants.h | 2 +- 8 files changed, 335 insertions(+), 85 deletions(-) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index d803801f273c55..401bcaef0d892b 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -512,6 +512,11 @@ Optimizations becomes 272 bytes from 352 bytes on 64bit platform. (Contributed by Inada Naoki in :issue:`46845`.) +* Speed up the regular expression substitution (functions :func:`re.sub` and + :func:`re.subn` and corresponding :class:`re.Pattern` methods) for + replacement strings containing group references by 2--3 times. + (Contributed by Serhiy Storchaka in :issue:`91524`.) + Faster CPython ============== diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index c9b511422f1e59..1ec7c0a40c70e2 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -124,6 +124,7 @@ import enum from . 
import _compiler, _parser import functools +import _sre # public symbols @@ -227,7 +228,7 @@ def compile(pattern, flags=0): def purge(): "Clear the regular expression caches" _cache.clear() - _compile_repl.cache_clear() + _compile_template.cache_clear() # SPECIAL_CHARS # closing ')', '}' and ']' @@ -282,24 +283,9 @@ def _compile(pattern, flags): return p @functools.lru_cache(_MAXCACHE) -def _compile_repl(repl, pattern): +def _compile_template(pattern, repl): # internal: compile replacement pattern - return _parser.parse_template(repl, pattern) - -def _expand(pattern, match, template): - # internal: Match.expand implementation hook - template = _parser.parse_template(template, pattern) - return _parser.expand_template(template, match) - -def _subx(pattern, template): - # internal: Pattern.sub/subn implementation helper - template = _compile_repl(template, pattern) - if not template[0] and len(template[1]) == 1: - # literal replacement - return template[1][0] - def filter(match, template=template): - return _parser.expand_template(template, match) - return filter + return _sre.template(pattern, _parser.parse_template(repl, pattern)) # register myself for pickling diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index 5e999dea337d3d..d3ad4ebdebe67f 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20220402 +MAGIC = 20220409 from _sre import MAXREPEAT, MAXGROUPS diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index f191f809a1491e..fb80895eef6870 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -980,24 +980,28 @@ def parse(str, flags=0, state=None): return p -def parse_template(source, state): +def parse_template(source, pattern): # parse 're' replacement string into list of literals and # group references s = Tokenizer(source) sget = s.get - groups = [] - literals = [] + result = [] literal = [] lappend = literal.append + def addliteral(): + if s.istext: + 
result.append(''.join(literal)) + else: + # The tokenizer implicitly decodes bytes objects as latin-1, we must + # therefore re-encode the final representation. + result.append(''.join(literal).encode('latin-1')) + del literal[:] def addgroup(index, pos): - if index > state.groups: + if index > pattern.groups: raise s.error("invalid group reference %d" % index, pos) - if literal: - literals.append(''.join(literal)) - del literal[:] - groups.append((len(literals), index)) - literals.append(None) - groupindex = state.groupindex + addliteral() + result.append(index) + groupindex = pattern.groupindex while True: this = sget() if this is None: @@ -1006,7 +1010,6 @@ def addgroup(index, pos): # group c = this[1] if c == "g": - name = "" if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") @@ -1057,22 +1060,5 @@ def addgroup(index, pos): lappend(this) else: lappend(this) - if literal: - literals.append(''.join(literal)) - if not isinstance(source, str): - # The tokenizer implicitly decodes bytes objects as latin-1, we must - # therefore re-encode the final representation. 
- literals = [None if s is None else s.encode('latin-1') for s in literals] - return groups, literals - -def expand_template(template, match): - g = match.group - empty = match.string[:0] - groups, literals = template - literals = literals[:] - try: - for index, group in groups: - literals[index] = g(group) or empty - except IndexError: - raise error("invalid group reference %d" % index) from None - return empty.join(literals) + addliteral() + return result diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index 34cbe21f14071b..b9ac14054bebfb 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -625,6 +625,39 @@ _sre_compile(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject return return_value; } +PyDoc_STRVAR(_sre_template__doc__, +"template($module, pattern, template, /)\n" +"--\n" +"\n"); + +#define _SRE_TEMPLATE_METHODDEF \ + {"template", (PyCFunction)(void(*)(void))_sre_template, METH_FASTCALL, _sre_template__doc__}, + +static PyObject * +_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template); + +static PyObject * +_sre_template(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + PyObject *pattern; + PyObject *template; + + if (!_PyArg_CheckPositional("template", nargs, 2, 2)) { + goto exit; + } + pattern = args[0]; + if (!PyList_Check(args[1])) { + _PyArg_BadArgument("template", "argument 2", "list", args[1]); + goto exit; + } + template = args[1]; + return_value = _sre_template_impl(module, pattern, template); + +exit: + return return_value; +} + PyDoc_STRVAR(_sre_SRE_Match_expand__doc__, "expand($self, /, template)\n" "--\n" @@ -923,4 +956,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const exit: return return_value; } -/*[clinic end generated code: output=9d7510a57a157a38 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=9454416028280667 input=a9049054013a1b77]*/ diff --git 
a/Modules/_sre/sre.c b/Modules/_sre/sre.c index bd9204da428af8..7e2e02d0884440 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -51,13 +51,6 @@ static const char copyright[] = #include -/* name of this module, minus the leading underscore */ -#if !defined(SRE_MODULE) -#define SRE_MODULE "sre" -#endif - -#define SRE_PY_MODULE "re" - /* defining this one enables tracing */ #undef VERBOSE @@ -254,6 +247,8 @@ typedef struct { PyTypeObject *Pattern_Type; PyTypeObject *Match_Type; PyTypeObject *Scanner_Type; + PyTypeObject *Template_Type; + PyObject *compile_template; // reference to re._compile_template } _sremodulestate; static _sremodulestate * @@ -769,15 +764,12 @@ _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls, } static PyObject* -call(const char* module, const char* function, PyObject* args) +import(const char* module, const char* function) { PyObject* name; PyObject* mod; PyObject* func; - PyObject* result; - if (!args) - return NULL; name = PyUnicode_FromString(module); if (!name) return NULL; @@ -787,12 +779,7 @@ call(const char* module, const char* function, PyObject* args) return NULL; func = PyObject_GetAttrString(mod, function); Py_DECREF(mod); - if (!func) - return NULL; - result = PyObject_CallObject(func, args); - Py_DECREF(func); - Py_DECREF(args); - return result; + return func; } /*[clinic input] @@ -1057,6 +1044,57 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, } +static PyObject * +compile_template(_sremodulestate *module_state, + PatternObject *pattern, PyObject *template) +{ + /* delegate to Python code */ + PyObject *func = module_state->compile_template; + if (func == NULL) { + func = import("re", "_compile_template"); + if (func == NULL) { + return NULL; + } + Py_XSETREF(module_state->compile_template, func); + } + + PyObject *args[] = {(PyObject *)pattern, template}; + PyObject *result = PyObject_Vectorcall(func, args, 2, NULL); + + if (result == NULL && 
PyErr_ExceptionMatches(PyExc_TypeError)) { + /* If the replacement string is unhashable (e.g. bytearray), + * convert it to the basic type (str or bytes) and repeat. */ + if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) { + PyErr_Clear(); + template = _PyUnicode_Copy(template); + } + else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) { + PyErr_Clear(); + template = PyBytes_FromObject(template); + } + else { + return NULL; + } + if (template == NULL) { + return NULL; + } + args[1] = template; + result = PyObject_Vectorcall(func, args, 2, NULL); + Py_DECREF(template); + } + + if (result != NULL && Py_TYPE(result) != module_state->Template_Type) { + PyErr_Format(PyExc_RuntimeError, + "the result of compiling a replacement string is %.200s", + Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; + } + return result; +} + +static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */ + static PyObject* pattern_subx(_sremodulestate* module_state, PatternObject* self, @@ -1076,14 +1114,14 @@ pattern_subx(_sremodulestate* module_state, Py_ssize_t n; Py_ssize_t i, b, e; int isbytes, charsize; - int filter_is_callable; + int filter_type; Py_buffer view; if (PyCallable_Check(ptemplate)) { /* sub/subn takes either a function or a template */ filter = ptemplate; Py_INCREF(filter); - filter_is_callable = 1; + filter_type = -1; } else { /* if not callable, check if it's a literal string */ int literal; @@ -1103,16 +1141,22 @@ pattern_subx(_sremodulestate* module_state, if (literal) { filter = ptemplate; Py_INCREF(filter); - filter_is_callable = 0; + filter_type = 0; } else { /* not a literal; hand it over to the template compiler */ - filter = call( - SRE_PY_MODULE, "_subx", - PyTuple_Pack(2, self, ptemplate) - ); + filter = compile_template(module_state, self, ptemplate); if (!filter) return NULL; - filter_is_callable = PyCallable_Check(filter); + + assert(Py_TYPE(filter) == module_state->Template_Type); + if 
(Py_SIZE(filter) == 0) { + Py_INCREF(((TemplateObject *)filter)->literal); + Py_SETREF(filter, ((TemplateObject *)filter)->literal); + filter_type = 0; + } + else { + filter_type = 1; + } } } @@ -1163,12 +1207,18 @@ pattern_subx(_sremodulestate* module_state, } - if (filter_is_callable) { + if (filter_type) { /* pass match object through filter */ match = pattern_new_match(module_state, self, &state, 1); if (!match) goto error; - item = PyObject_CallOneArg(filter, match); + if (filter_type > 0) { + item = expand_template((TemplateObject *)filter, + (MatchObject *)match); + } + else { + item = PyObject_CallOneArg(filter, match); + } Py_DECREF(match); if (!item) goto error; @@ -1503,6 +1553,62 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, return (PyObject*) self; } +/*[clinic input] +_sre.template + + pattern: object + template: object(subclass_of="&PyList_Type") + / + +[clinic start generated code]*/ + +static PyObject * +_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template) +/*[clinic end generated code: output=d51290e596ebca86 input=56d2d1895cd04d9a]*/ +{ + _sremodulestate *module_state = get_sre_module_state(module); + TemplateObject *self = NULL; + Py_ssize_t n = PyList_GET_SIZE(template); + if ((n & 1) == 0 || n < 1) { + goto bad_template; + } + n /= 2; + self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, n); + if (!self) + return NULL; + self->chunks = 1 + 2*n; + self->literal = PyList_GET_ITEM(template, 0); + Py_INCREF(self->literal); + for (Py_ssize_t i = 0; i < n; i++) { + Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*i+1)); + if (index == -1 && PyErr_Occurred()) { + Py_DECREF(self); + return NULL; + } + if (index < 0) { + goto bad_template; + } + self->items[i].index = index; + + PyObject *literal = PyList_GET_ITEM(template, 2*i+2); + // Skip empty literals. 
+ if ((PyUnicode_Check(literal) && !PyUnicode_GET_LENGTH(literal)) || + (PyBytes_Check(literal) && !PyBytes_GET_SIZE(literal))) + { + literal = NULL; + self->chunks--; + } + Py_XINCREF(literal); + self->items[i].literal = literal; + } + return (PyObject*) self; + +bad_template: + PyErr_SetString(PyExc_TypeError, "invalid template"); + Py_XDECREF(self); + return NULL; +} + /* -------------------------------------------------------------------- */ /* Code validation */ @@ -2126,11 +2232,14 @@ static PyObject * _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template) /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/ { - /* delegate to Python code */ - return call( - SRE_PY_MODULE, "_expand", - PyTuple_Pack(3, self->pattern, self, template) - ); + _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self)); + PyObject *filter = compile_template(module_state, self->pattern, template); + if (filter == NULL) { + return NULL; + } + PyObject *result = expand_template((TemplateObject *)filter, self); + Py_DECREF(filter); + return result; } static PyObject* @@ -2715,6 +2824,107 @@ pattern_scanner(_sremodulestate *module_state, return (PyObject*) scanner; } +/* -------------------------------------------------------------------- */ +/* template methods */ + +static int +template_traverse(TemplateObject *self, visitproc visit, void *arg) +{ + Py_VISIT(Py_TYPE(self)); + Py_VISIT(self->literal); + for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) { + Py_VISIT(self->items[i].literal); + } + return 0; +} + +static int +template_clear(TemplateObject *self) +{ + Py_CLEAR(self->literal); + for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) { + Py_CLEAR(self->items[i].literal); + } + return 0; +} + +static void +template_dealloc(TemplateObject *self) +{ + PyTypeObject *tp = Py_TYPE(self); + + PyObject_GC_UnTrack(self); + (void)template_clear(self); + tp->tp_free(self); + Py_DECREF(tp); +} + +static PyObject * 
+expand_template(TemplateObject *self, MatchObject *match) +{ + if (Py_SIZE(self) == 0) { + Py_INCREF(self->literal); + return self->literal; + } + + PyObject *buffer[10]; + PyObject **out = buffer; + Py_ssize_t count = 0; + PyObject *list = NULL; + if (self->chunks > 10 || !PyUnicode_Check(self->literal)) { + list = PyList_New(self->chunks); + if (!list) { + return NULL; + } + out = &PyList_GET_ITEM(list, 0); + } + + Py_INCREF(self->literal); + out[count++] = self->literal; + for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) { + Py_ssize_t index = self->items[i].index; + if (index >= match->groups) { + PyErr_SetString(PyExc_IndexError, "no such group"); + Py_DECREF(list); + return NULL; + } + PyObject *item = match_getslice_by_index(match, index, Py_None); + if (item == NULL) { + Py_DECREF(list); + } + if (item != Py_None) { + Py_INCREF(item); + out[count++] = item; + } + Py_DECREF(item); + + PyObject *literal = self->items[i].literal; + if (literal != NULL) { + Py_INCREF(literal); + out[count++] = literal; + } + } + + PyObject *result; + if (PyUnicode_Check(self->literal)) { + result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count); + } + else { + Py_SET_SIZE(list, count); + result = _PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list); + } + if (list) { + Py_DECREF(list); + } + else { + for (Py_ssize_t i = 0; i < count; i++) { + Py_DECREF(out[i]); + } + } + return result; +} + + static Py_hash_t pattern_hash(PatternObject *self) { @@ -2937,15 +3147,32 @@ static PyType_Slot scanner_slots[] = { }; static PyType_Spec scanner_spec = { - .name = "_" SRE_MODULE ".SRE_Scanner", + .name = "_sre.SRE_Scanner", .basicsize = sizeof(ScannerObject), .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC), .slots = scanner_slots, }; +static PyType_Slot template_slots[] = { + {Py_tp_dealloc, template_dealloc}, + {Py_tp_traverse, template_traverse}, + {Py_tp_clear, template_clear}, + {0, NULL}, +}; + +static 
PyType_Spec template_spec = { + .name = "_sre.SRE_Template", + .basicsize = sizeof(TemplateObject), + .itemsize = sizeof(((TemplateObject *)0)->items[0]), + .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE | + Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC), + .slots = template_slots, +}; + static PyMethodDef _functions[] = { _SRE_COMPILE_METHODDEF + _SRE_TEMPLATE_METHODDEF _SRE_GETCODESIZE_METHODDEF _SRE_ASCII_ISCASED_METHODDEF _SRE_UNICODE_ISCASED_METHODDEF @@ -2962,6 +3189,7 @@ sre_traverse(PyObject *module, visitproc visit, void *arg) Py_VISIT(state->Pattern_Type); Py_VISIT(state->Match_Type); Py_VISIT(state->Scanner_Type); + Py_VISIT(state->Template_Type); return 0; } @@ -2974,6 +3202,7 @@ sre_clear(PyObject *module) Py_CLEAR(state->Pattern_Type); Py_CLEAR(state->Match_Type); Py_CLEAR(state->Scanner_Type); + Py_CLEAR(state->Template_Type); return 0; } @@ -3014,6 +3243,7 @@ sre_exec(PyObject *m) CREATE_TYPE(m, state->Pattern_Type, &pattern_spec); CREATE_TYPE(m, state->Match_Type, &match_spec); CREATE_TYPE(m, state->Scanner_Type, &scanner_spec); + CREATE_TYPE(m, state->Template_Type, &template_spec); if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) { goto error; @@ -3043,7 +3273,7 @@ static PyModuleDef_Slot sre_slots[] = { static struct PyModuleDef sremodule = { .m_base = PyModuleDef_HEAD_INIT, - .m_name = "_" SRE_MODULE, + .m_name = "_sre", .m_size = sizeof(_sremodulestate), .m_methods = _functions, .m_slots = sre_slots, diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index 129f5595269f5b..39e788ba7f9deb 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -54,6 +54,16 @@ typedef struct { Py_ssize_t mark[1]; } MatchObject; +typedef struct { + PyObject_VAR_HEAD + Py_ssize_t chunks; + PyObject *literal; + struct { + Py_ssize_t index; + PyObject *literal; + } items[0]; +} TemplateObject; + typedef struct SRE_REPEAT_T { Py_ssize_t count; const SRE_CODE* pattern; /* points to REPEAT operator arguments */ diff --git 
a/Modules/_sre/sre_constants.h b/Modules/_sre/sre_constants.h index b1ef27eccc8397..2b8378d7eeb19d 100644 --- a/Modules/_sre/sre_constants.h +++ b/Modules/_sre/sre_constants.h @@ -11,7 +11,7 @@ * See the sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20220402 +#define SRE_MAGIC 20220409 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 From 50c435f724f86764aba590806d3735b5fec57b41 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 14 Apr 2022 08:37:17 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst diff --git a/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst b/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst new file mode 100644 index 00000000000000..2fc35a3980fe3f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst @@ -0,0 +1,3 @@ +Speed up the regular expression substitution (functions :func:`re.sub` and +:func:`re.subn` and corresponding :class:`re.Pattern` methods) for +replacement strings containing group references by 2--3 times. From a47446dfe05aa6d353c7cde27f63bc303e7aa2ce Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Apr 2022 09:32:34 +0300 Subject: [PATCH 3/8] Address review comments. 
--- Modules/_sre/sre.c | 38 +++++++++++++++++++++++++------------- Modules/_sre/sre.h | 5 +++-- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 7e2e02d0884440..12100bf3960286 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -1114,14 +1114,14 @@ pattern_subx(_sremodulestate* module_state, Py_ssize_t n; Py_ssize_t i, b, e; int isbytes, charsize; - int filter_type; + enum {LITERAL, TEMPLATE, CALLABLE} filter_type; Py_buffer view; if (PyCallable_Check(ptemplate)) { /* sub/subn takes either a function or a template */ filter = ptemplate; Py_INCREF(filter); - filter_type = -1; + filter_type = CALLABLE; } else { /* if not callable, check if it's a literal string */ int literal; @@ -1141,7 +1141,7 @@ pattern_subx(_sremodulestate* module_state, if (literal) { filter = ptemplate; Py_INCREF(filter); - filter_type = 0; + filter_type = LITERAL; } else { /* not a literal; hand it over to the template compiler */ filter = compile_template(module_state, self, ptemplate); @@ -1152,10 +1152,10 @@ pattern_subx(_sremodulestate* module_state, if (Py_SIZE(filter) == 0) { Py_INCREF(((TemplateObject *)filter)->literal); Py_SETREF(filter, ((TemplateObject *)filter)->literal); - filter_type = 0; + filter_type = LITERAL; } else { - filter_type = 1; + filter_type = TEMPLATE; } } } @@ -1207,16 +1207,17 @@ pattern_subx(_sremodulestate* module_state, } - if (filter_type) { + if (filter_type != LITERAL) { /* pass match object through filter */ match = pattern_new_match(module_state, self, &state, 1); if (!match) goto error; - if (filter_type > 0) { + if (filter_type == TEMPLATE) { item = expand_template((TemplateObject *)filter, (MatchObject *)match); } else { + assert(filter_type == CALLABLE); item = PyObject_CallOneArg(filter, match); } Py_DECREF(match); @@ -1566,6 +1567,10 @@ static PyObject * _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template) /*[clinic end generated code: output=d51290e596ebca86 
input=56d2d1895cd04d9a]*/ { + /* template is a list containing interleaved literal strings (str or bytes) + * and group indices (int), as returned by _parser.parse_template: + * [literal1, group1, literal2, ..., literalN]. + */ _sremodulestate *module_state = get_sre_module_state(module); TemplateObject *self = NULL; Py_ssize_t n = PyList_GET_SIZE(template); @@ -2867,11 +2872,16 @@ expand_template(TemplateObject *self, MatchObject *match) return self->literal; } + PyObject *result = NULL; + Py_ssize_t count = 0; // the number of non-empty chunks + /* For small number of strings use a buffer allocated on the stack, + * otherwise use a list object. */ PyObject *buffer[10]; PyObject **out = buffer; - Py_ssize_t count = 0; PyObject *list = NULL; - if (self->chunks > 10 || !PyUnicode_Check(self->literal)) { + if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) || + !PyUnicode_Check(self->literal)) + { list = PyList_New(self->chunks); if (!list) { return NULL; @@ -2885,12 +2895,11 @@ expand_template(TemplateObject *self, MatchObject *match) Py_ssize_t index = self->items[i].index; if (index >= match->groups) { PyErr_SetString(PyExc_IndexError, "no such group"); - Py_DECREF(list); - return NULL; + goto cleanup; } PyObject *item = match_getslice_by_index(match, index, Py_None); if (item == NULL) { - Py_DECREF(list); + goto cleanup; } if (item != Py_None) { Py_INCREF(item); @@ -2905,7 +2914,6 @@ expand_template(TemplateObject *self, MatchObject *match) } } - PyObject *result; if (PyUnicode_Check(self->literal)) { result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count); } @@ -2913,6 +2921,8 @@ expand_template(TemplateObject *self, MatchObject *match) Py_SET_SIZE(list, count); result = _PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list); } + +cleanup: if (list) { Py_DECREF(list); } @@ -3190,6 +3200,7 @@ sre_traverse(PyObject *module, visitproc visit, void *arg) Py_VISIT(state->Match_Type); Py_VISIT(state->Scanner_Type); Py_VISIT(state->Template_Type); + 
Py_VISIT(state->compile_template); return 0; } @@ -3203,6 +3214,7 @@ sre_clear(PyObject *module) Py_CLEAR(state->Match_Type); Py_CLEAR(state->Scanner_Type); Py_CLEAR(state->Template_Type); + Py_CLEAR(state->compile_template); return 0; } diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index 39e788ba7f9deb..469b06bd324d6d 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -56,11 +56,12 @@ typedef struct { typedef struct { PyObject_VAR_HEAD - Py_ssize_t chunks; + Py_ssize_t chunks; /* the number of group references and non-NULL literals + * self->chunks <= 2*Py_SIZE(self) + 1 */ PyObject *literal; struct { Py_ssize_t index; - PyObject *literal; + PyObject *literal; /* NULL if empty */ } items[0]; } TemplateObject; From 78dc0276b19b3bea21781c3a4324a05432c062e0 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Sun, 23 Oct 2022 20:41:53 +0000 Subject: [PATCH 4/8] remove ^M dos line endings --- .../Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst b/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst index 2fc35a3980fe3f..b3f01755eaa33c 100644 --- a/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst +++ b/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst @@ -1,3 +1,3 @@ -Speed up the regular expression substitution (functions :func:`re.sub` and -:func:`re.subn` and corresponding :class:`re.Pattern` methods) for +Speed up the regular expression substitution (functions :func:`re.sub` and +:func:`re.subn` and corresponding :class:`re.Pattern` methods) for replacement strings containing group references by 2--3 times. From 7ab7ade481109ccc42856c97010f52cdf4254200 Mon Sep 17 00:00:00 2001 From: "Gregory P. 
Smith [Google]" Date: Sun, 23 Oct 2022 20:47:01 +0000 Subject: [PATCH 5/8] correct mis-merge in import() --- Modules/_sre/sre.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 7165c82338af7d..70a6119d1665e4 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -757,13 +757,9 @@ import(const char* module, const char* function) { PyObject* func; - name = PyUnicode_FromString(module); - if (!name) - return NULL; func = _PyImport_GetModuleAttrString(module, function); if (!func) return NULL; - Py_DECREF(mod); return func; } From 7eebbffb55c2a81f0cffd6d54c8b5cd4283f1ec1 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Sun, 23 Oct 2022 20:50:59 +0000 Subject: [PATCH 6/8] remove import(), an internal API exists for that. --- Modules/_sre/sre.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 70a6119d1665e4..bf5fd52b1c0e75 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -752,17 +752,6 @@ _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls, return match; } -static PyObject* -import(const char* module, const char* function) -{ - PyObject* func; - - func = _PyImport_GetModuleAttrString(module, function); - if (!func) - return NULL; - return func; -} - /*[clinic input] _sre.SRE_Pattern.findall @@ -1032,7 +1021,7 @@ compile_template(_sremodulestate *module_state, /* delegate to Python code */ PyObject *func = module_state->compile_template; if (func == NULL) { - func = import("re", "_compile_template"); + func = _PyImport_GetModuleAttrString("re", "_compile_template"); if (func == NULL) { return NULL; } From 00d5e12ade861d5186738c71d47aab290db26d50 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Sun, 23 Oct 2022 20:57:11 +0000 Subject: [PATCH 7/8] improve the _sre.template docstring. 
--- Modules/_sre/clinic/sre.c.h | 10 ++++++++-- Modules/_sre/sre.c | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index ca416229faa1b4..da641081ce9e3c 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -1071,7 +1071,13 @@ _sre_compile(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject PyDoc_STRVAR(_sre_template__doc__, "template($module, pattern, template, /)\n" "--\n" -"\n"); +"\n" +"\n" +"\n" +" template\n" +" A list containing interleaved literal strings (str or bytes) and group\n" +" indices (int), as returned by re._parser.parse_template():\n" +" [literal1, group1, literal2, ..., literalN]"); #define _SRE_TEMPLATE_METHODDEF \ {"template", _PyCFunction_CAST(_sre_template), METH_FASTCALL, _sre_template__doc__}, @@ -1454,4 +1460,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const } return _sre_SRE_Scanner_search_impl(self, cls); } -/*[clinic end generated code: output=e90b95b6691bdbfa input=a9049054013a1b77]*/ +/*[clinic end generated code: output=e3ba72156dd71572 input=a9049054013a1b77]*/ diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index bf5fd52b1c0e75..aae02652664f72 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -1529,13 +1529,16 @@ _sre.template pattern: object template: object(subclass_of="&PyList_Type") + A list containing interleaved literal strings (str or bytes) and group + indices (int), as returned by re._parser.parse_template(): + [literal1, group1, literal2, ..., literalN] / [clinic start generated code]*/ static PyObject * _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template) -/*[clinic end generated code: output=d51290e596ebca86 input=56d2d1895cd04d9a]*/ +/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/ { /* template is a list containing interleaved literal strings (str or bytes) * and group indices (int), as returned by
_parser.parse_template: From 1c817444aa4e33fe48af09f673ed1a38dd859573 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Sun, 23 Oct 2022 21:03:17 +0000 Subject: [PATCH 8/8] :gh: instead of :issue: --- Doc/whatsnew/3.12.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index d67c983cea88c3..8f8a994615103d 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -208,7 +208,7 @@ Optimizations * Speed up the regular expression substitution (functions :func:`re.sub` and :func:`re.subn` and corresponding :class:`re.Pattern` methods) for replacement strings containing group references by 2--3 times. - (Contributed by Serhiy Storchaka in :issue:`91524`.) + (Contributed by Serhiy Storchaka in :gh:`91524`.) CPython bytecode changes