8000 GH-117586: Speed up `pathlib.Path.glob()` by working with strings by barneygale · Pull Request #117589 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

GH-117586: Speed up pathlib.Path.glob() by working with strings #117589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Move class into glob module.
  • Loading branch information
barneygale committed Apr 6, 2024
commit 26d2c03f3a46e797ed1b8d4edd19e0837bbf1cd2
185 changes: 185 additions & 0 deletions Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import os
import re
import fnmatch
import functools
import itertools
import operator
import stat
import sys

Expand Down Expand Up @@ -256,7 +258,9 @@ def escape(pathname):
return drive + pathname


_special_parts = ('', '.', '..')
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
_no_recurse_symlinks = object()


def translate(pat, *, recursive=False, include_hidden=False, seps=None):
Expand Down Expand Up @@ -312,3 +316,184 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
results.append(any_sep)
res = ''.join(results)
return fr'(?s:{res})\Z'


@functools.lru_cache(maxsize=512)
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
"""Compile given glob pattern to a re.Pattern object (observing case
sensitivity)."""
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
return re.compile(regex, flags=flags).match


if os.name == 'nt':
def _add_slash(pathname):
tail = os.path.splitroot(pathname)[2]
if not tail or tail[-1] in '\\/':
return pathname
return f'{pathname}\\'
else:
def _add_slash(pathname):
if not pathname or pathname[-1] == '/':
return pathname
return f'{pathname}/'


class _Globber:
"""Class providing shell-style pattern matching and globbing.
"""

def __init__(self, sep, case_sensitive, recursive=False):
self.sep = sep
self.case_sensitive = case_sensitive
self.recursive = recursive

# Low-level methods

lstat = staticmethod(os.lstat)
scandir = staticmethod(os.scandir)
add_slash = staticmethod(_add_slash)
concat_path = operator.add
parse_entry = operator.attrgetter('path')

# High-level methods

def compile(self, pat):
return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)

def selector(self, parts):
"""Returns a function that selects from a given path, walking and
filtering according to the glob-style pattern parts in *parts*.
"""
if not parts:
return self.select_exists
part = parts.pop()
if self.recursive and part == '**':
selector = self.recursive_selector
elif part in _special_parts:
selector = self.special_selector
else:
selector = self.wildcard_selector
return selector(part, parts)

def special_selector(self, part, parts):
"""Returns a function that selects special children of the given path.
"""
select_next = self.selector(parts)

def select_special(path, exists=False):
path = self.concat_path(self.add_slash(path), part)
return select_next(path, exists)
return select_special

def wildcard_selector(self, part, parts):
"""Returns a function that selects direct children of a given path,
filtering by pattern.
"""

match = None if part == '*' else self.compile(part)
dir_only = bool(parts)
if dir_only:
select_next = self.selector(parts)

def select_wildcard(path, exists=False):
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
with self.scandir(path) as scandir_it:
entries = list(scandir_it)
for entry in entries:
if match is None or match(entry.name):
if dir_only:
try:
if not entry.is_dir():
continue
except OSError:
continue
entry_path = self.parse_entry(entry)
if dir_only:
yield from select_next(entry_path, exists=True)
else:
yield entry_path
except OSError:
pass
return select_wildcard

def recursive_selector(self, part, parts):
"""Returns a function that selects a given path and all its children,
recursively, filtering by pattern.
"""
# Optimization: consume following '**' parts, which have no effect.
while parts and parts[-1] == '**':
parts.pop()

# Optimization: consume and join any following non-special parts here,
# rather than leaving them for the next selector. They're used to
# build a regular expression, which we use to filter the results of
# the recursive walk. As a result, non-special pattern segments
# following a '**' wildcard don't require additional filesystem access
# to expand.
follow_symlinks = self.recursive is not _no_recurse_symlinks
if follow_symlinks:
while parts and parts[-1] not in _special_parts:
part += self.sep + parts.pop()

match = None if part == '**' else self.compile(part)
dir_only = bool(parts)
select_next = self.selector(parts)

def select_recursive(path, exists=False):
path = self.add_slash(path)
match_pos = len(str(path))
if match is None or match(str(path), match_pos):
yield from select_next(path, exists)
stack = [path]
while stack:
yield from select_recursive_step(stack, match_pos)

def select_recursive_step(stack, match_pos):
path = stack.pop()
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
with self.scandir(path) as scandir_it:
entries = list(scandir_it)
except OSError:
pass
else:
for entry in entries:
is_dir = False
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
is_dir = True
except OSError:
pass

if is_dir or not dir_only:
entry_path = self.parse_entry(entry)
if match is None or match(str(entry_path), match_pos):
if dir_only:
yield from select_next(entry_path, exists=True)
else:
# Optimization: directly yield the path if this is
# last pattern part.
yield entry_path
if is_dir:
stack.append(entry_path)

return select_recursive

def select_exists(self, path, exists=False):
"""Yields the given path, if it exists.
"""
if exists:
# Optimization: this path is already known to exist, e.g. because
# it was returned from os.scandir(), so we skip calling lstat().
yield path
else:
try:
self.lstat(path)
yield path
except OSError:
pass
5 changes: 3 additions & 2 deletions Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
operating systems.
"""

import glob
import io
import ntpath
import os
Expand All @@ -23,7 +24,7 @@
except ImportError:
grp = None

from . import _abc, _glob
from . import _abc


__all__ = [
Expand Down Expand Up @@ -111,7 +112,7 @@ class PurePath(_abc.PurePathBase):
'_hash',
)
parser = os.path
_globber = _glob.Globber
_globber = glob._Globber

def __new__(cls, *args, **kwargs):
"""Construct a PurePath from one or several strings and or existing
Expand Down
11 changes: 3 additions & 8 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
"""

import functools
import glob
import operator
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO

from . import _glob

#
# Internals
#
Expand All @@ -43,12 +42,8 @@ def _ignore_error(exception):
def _is_case_sensitive(parser):
return parser.normcase('Aa') == 'Aa'

#
# Globbing helpers
#


class Globber(_glob.Globber):
class Globber(glob._Globber):
lstat = operator.methodcaller('lstat')
scandir = operator.methodcaller('_scandir')
add_slash = operator.methodcaller('joinpath', '')
Expand Down Expand Up @@ -699,7 +694,7 @@ def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
return iter([])
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
recursive = True if recurse_symlinks else _glob.no_recurse_symlinks
recursive = True if recurse_symlinks else glob._no_recurse_symlinks
globber = self._globber(self.parser.sep, case_sensitive, recursive)
return globber.selector(parts)

Expand Down
Loading
0