8000 gh-111259: Optimize complementary character sets in RE (GH-120742) · python/cpython@8bc76ae · GitHub
[go: up one dir, main page]

Skip to content

Commit 8bc76ae

Browse files
gh-111259: Optimize complementary character sets in RE (GH-120742)
Patterns like "[\s\S]" or "\s|\S" which match any character are now compiled to the same effective code as a dot with the DOTALL modifier ("(?s:.)").
1 parent 3846fcf commit 8bc76ae

File tree

4 files changed

+50
-13
lines changed

4 files changed

+50
-13
lines changed

Lib/re/_compiler.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
2929
}
3030

31+
_CHARSET_ALL = [(NEGATE, None)]
32+
3133
def _combine_flags(flags, add_flags, del_flags,
3234
TYPE_FLAGS=_parser.TYPE_FLAGS):
3335
if add_flags & TYPE_FLAGS:
@@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
8486
code[skip] = _len(code) - skip
8587
elif op is IN:
8688
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
87-
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
88-
emit(IN_LOC_IGNORE)
89-
elif not hascased:
90-
emit(IN)
91-
elif not fixes: # ascii
92-
emit(IN_IGNORE)
89+
if not charset:
90+
emit(FAILURE)
91+
elif charset == _CHARSET_ALL:
92+
emit(ANY_ALL)
9393
else:
94-
emit(IN_UNI_IGNORE)
95-
skip = _len(code); emit(0)
96-
_compile_charset(charset, flags, code)
97-
code[skip] = _len(code) - skip
94+
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
95+
emit(IN_LOC_IGNORE)
96+
elif not hascased:
97+
emit(IN)
98+
elif not fixes: # ascii
99+
emit(IN_IGNORE)
100+
else:
101+
emit(IN_UNI_IGNORE)
102+
skip = _len(code); emit(0)
103+
_compile_charset(charset, flags, code)
104+
code[skip] = _len(code) - skip
98105
elif op is ANY:
99106
if flags & SRE_FLAG_DOTALL:
100107
emit(ANY_ALL)
@@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
277284
charmap[i] = 1
278285
elif op is NEGATE:
279286
out.append((op, av))
287+
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
288+
# Optimize [\s\S] etc.
289+
out = [] if out else _CHARSET_ALL
290+
return out, False
280291
else:
281292
tail.append((op, av))
282293
except IndexError:
@@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags):
519530
# look for a literal prefix
520531
prefix = []
521532
prefix_skip = 0
522-
charset = [] # not used
533+
charset = None # not used
523534
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
524535
# look for literal prefix
525536
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
526537
# if no prefix, look for charset prefix
527538
if not prefix:
528539
charset = _get_charset_prefix(pattern, flags)
540+
if charset:
541+
charset, hascased = _optimize_charset(charset)
542+
assert not hascased
543+
if charset == _CHARSET_ALL:
544+
charset = None
529545
## if prefix:
530546
## print("*** PREFIX", prefix, prefix_skip)
531547
## if charset:
@@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags):
560576
# generate overlap table
561577
code.extend(_generate_overlap_table(prefix))
562578
elif charset:
563-
charset, hascased = _optimize_charset(charset)
564-
assert not hascased
565579
_compile_charset(charset, flags, code)
566580
code[skip] = len(code) - skip
567581

Lib/re/_constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,8 @@ def _makecodes(*names):
206206
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
207207
}
208208

209+
CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))
210+
209211
# flags
210212
SRE_FLAG_IGNORECASE = 2 # case insensitive
211213
SRE_FLAG_LOCALE = 4 # honour system locale

Lib/test/test_re.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2473,6 +2473,24 @@ def test_regression_gh94675(self):
24732473
def test_fail(self):
24742474
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
24752475

2476+
def test_character_set_any(self):
2477+
# The union of complementary character sets mathes any character
2478+
# and is equivalent to "(?s:.)".
2479+
s = '1x\n'
2480+
for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
2481+
with self.subTest(pattern=p):
2482+
self.assertEqual(re.findall(p, s), list(s))
2483+
self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
2484+
2485+
def test_character_set_none(self):
2486+
# Negation of the union of complementary character sets does not match
2487+
# any character.
2488+
s = '1x\n'
2489+
for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
2490+
with self.subTest(pattern=p):
2491+
self.assertIsNone(re.search(p, s))
2492+
self.assertIsNone(re.search('(?s:.)' + p, s))
2493+
24762494

24772495
def get_debug_out(pat):
24782496
with captured_stdout() as out:
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
:mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"`` which match
2+
any character as effectively as a dot with the ``DOTALL`` modifier
3+
(``"(?s:.)"``).

0 commit comments

Comments
 (0)
0