8000 gh-122288: Improve performances of `fnmatch.translate` by picnixz · Pull Request #122289 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-122288: Improve performances of fnmatch.translate #122289

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 42 additions & 48 deletions Lib/fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,24 +77,30 @@ def translate(pat):
There is no way to quote meta-characters.
"""

STAR = object()
parts = _translate(pat, STAR, '.')
return _join_translated_parts(parts, STAR)
parts, indices = _translate(pat, '*', '.')
return _join_translated_parts(parts, indices)

_re_setops_sub = re.compile(r'([&~|])').sub
_re_escape = functools.lru_cache(maxsize=512)(re.escape)

def _translate(pat, STAR, QUESTION_MARK):
def _translate(pat, star, question_mark):
res = []
add = res.append
star_indices = []

i, n = 0, len(pat)
while i < n:
c = pat[i]
i = i+1
if c == '*':
# store the position of the wildcard
star_indices.append(len(res))
add(star)
# compress consecutive `*` into one
if (not res) or res[-1] is not STAR:
add(STAR)
while i < n and pat[i] == '*':
i += 1
elif c == '?':
add(QUESTION_MARK)
add(question_mark)
elif c == '[':
j = i
if j < n and pat[j] == '!':
Expand Down Expand Up @@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
# Hyphens that create ranges shouldn't be escaped.
stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
for s in chunks)
# Escape set operations (&&, ~~ and ||).
stuff = re.sub(r'([&~|])', r'\\\1', stuff)
i = j+1
if not stuff:
# Empty range: never match.
Expand All @@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
# Negated empty range: match any character.
add('.')
else:
# Escape set operations (&&, ~~ and ||).
stuff = _re_setops_sub(r'\\\1', stuff)
if stuff[0] == '!':
stuff = '^' + stuff[1:]
elif stuff[0] in ('^', '['):
stuff = '\\' + stuff
add(f'[{stuff}]')
else:
add(re.escape(c))
assert i == n
return res


def _join_translated_parts(inp, STAR):
# Deal with STARs.
res = []
add = res.append
i, n = 0, len(inp)
# Fixed pieces at the start?
while i < n and inp[i] is not STAR:
add(inp[i])
i += 1
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
while i < n:
assert inp[i] is STAR
i += 1
if i == n:
add(".*")
break
assert inp[i] is not STAR
fixed = []
while i < n and inp[i] is not STAR:
fixed.append(inp[i])
i += 1
fixed = "".join(fixed)
if i == n:
add(".*")
add(fixed)
else:
add(f"(?>.*?{fixed})")
add(_re_escape(c))
assert i == n
res = "".join(res)
return res, star_indices


def _join_translated_parts(parts, star_indices):
if not star_indices:
return fr'(?s:{"".join(parts)})\Z'
iter_star_indices = iter(star_indices)
j = next(iter_star_indices)
buffer = parts[:j] # fixed pieces at the start
append, extend = buffer.append, buffer.extend
i = j + 1
for j in iter_star_indices:
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
append('(?>.*?')
extend(parts[i:j])
append(')')
i = j + 1
append('.*')
extend(parts[i:])
res = ''.join(buffer)
return fr'(?s:{res})\Z'
2 changes: 1 addition & 1 deletion Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
if part:
if not include_hidden and part[0] in '*?':
results.append(r'(?!\.)')
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])
if idx < last_part_idx:
results.append(any_sep)
res = ''.join(results)
Expand Down
69 changes: 69 additions & 0 deletions Lib/test/test_fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,75 @@ def test_translate(self):
self.assertTrue(re.match(fatre, 'cbabcaxc'))
self.assertFalse(re.match(fatre, 'dabccbad'))

def test_translate_wildcards(self):
for pattern, expect in [
('ab*', r'(?s:ab.*)\Z'),
('ab*cd', r'(?s:ab.*cd)\Z'),
('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
]:
with self.subTest(pattern):
translated = translate(pattern)
self.assertEqual(translated, expect, pattern)

for pattern, expect in [
('*ab', r'(?s:.*ab)\Z'),
('*ab*', r'(?s:(?>.*?ab).*)\Z'),
('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
]:
with self.subTest(pattern):
translated = translate(pattern)
self.assertEqual(translated, expect, pattern)

def test_translate_expressions(self):
for pattern, expect in [
('[', r'(?s:\[)\Z'),
('[!', r'(?s:\[!)\Z'),
('[]', r'(?s:\[\])\Z'),
('[abc', r'(?s:\[abc)\Z'),
('[!abc', r'(?s:\[!abc)\Z'),
('[abc]', r'(?s:[abc])\Z'),
('[!abc]', r'(?s:[^abc])\Z'),
('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
# with [[
('[[', r'(?s:\[\[)\Z'),
('[[a', r'(?s:\[\[a)\Z'),
('[[]', r'(?s:[\[])\Z'),
('[[]a', r'(?s:[\[]a)\Z'),
('[[]]', r'(?s:[\[]\])\Z'),
('[[]a]', r'(?s:[\[]a\])\Z'),
('[[a]', r'(?s:[\[a])\Z'),
('[[a]]', r'(?s:[\[a]\])\Z'),
('[[a]b', r'(?s:[\[a]b)\Z'),
# backslashes
('[\\', r'(?s:\[\\)\Z'),
(r'[\]', r'(?s:[\\])\Z'),
(r'[\\]', r'(?s:[\\\\])\Z'),
]:
with self.subTest(pattern):
translated = translate(pattern)
self.assertEqual(translated, expect, pattern)

def test_indices_locations(self):
from fnmatch import _translate

blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
parts, indices = _translate(''.join(blocks), '*', '.')
expect_parts = ['a', r'\^', 'b', '*',
'.', '.', '[a-z]', '[1-9]', '*',
r'\+', r'\+', r'\[', r'\[', 'a']
self.assertListEqual(parts, expect_parts)
self.assertListEqual(indices, [3, 8])


class FilterTestCase(unittest.TestCase):

def test_filter(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve the performances of :func:`fnmatch.translate` by a factor 1.7. Patch
by Bénédikt Tran.
Loading
0