8000 GH-117586: Speed up `pathlib.Path.glob()` by working with strings by barneygale · Pull Request #117589 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

GH-117586: Speed up pathlib.Path.glob() by working with strings #117589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
GH-117586: Speed up pathlib.Path.glob() by working with strings
Move pathlib globbing implementation to a new module and class:
`pathlib._glob.Globber`. This class implements fast string-based globbing.
It's called by `pathlib.Path.glob()`, which then converts strings back to
path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that
works with `PathBase` objects rather than strings, and calls user-defined
path methods like `PathBase.stat()` rather than `os.stat()`.

This sets the stage for two more improvements:

- GH-115060: Query non-wildcard segments with `lstat()`
- GH-116380: Move `pathlib._glob` to `glob` (unify implementations).
  • Loading branch information
barneygale committed Apr 6, 2024
commit fe2c46c9f07bbfaf8391cb9fe4179bc6e25665b2
41 changes: 16 additions & 25 deletions Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
except ImportError:
grp = None

from . import _abc
from . import _abc, _glob


__all__ = [
Expand Down Expand Up @@ -111,6 +111,7 @@ class PurePath(_abc.PurePathBase):
'_hash',
)
parser = os.path
_globber = _glob.Globber

def __new__(cls, *args, **kwargs):
"""Construct a PurePath from one or several strings and or existing
Expand Down Expand Up @@ -453,21 +454,6 @@ def as_uri(self):
from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))

@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
parts = self._tail.copy()
pattern = self._raw_path
if self.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
elif pattern[-1] in (self.parser.sep, self.parser.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
parts.reverse()
return parts

@property
def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching."""
Expand Down Expand Up @@ -587,13 +573,9 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)

def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
return entry.name if str(self) == '.' else entry.path

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = self._direntry_str(entry)
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
Expand Down Expand Up @@ -626,8 +608,18 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
sys.audit("pathlib.Path.glob", self, pattern)
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
if pattern.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
parts = pattern._tail.copy()
if not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
raw = pattern._raw_path
if raw[-1] in (self.parser.sep, self.parser.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
parts.reverse()
select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
return map(self.with_segments, select(str(self), exists=True))

def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
"""Recursively yield all existing files (of any kind, including
Expand All @@ -638,8 +630,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk()."""
Expand Down
195 changes: 34 additions & 161 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
"""

import functools
import operator
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO

from . import _glob

#
# Internals
#
Expand Down Expand Up @@ -44,105 +47,21 @@ def _is_case_sensitive(parser):
# Globbing helpers
#

re = glob = None


@functools.lru_cache(maxsize=512)
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
"""Compile given glob pattern to a re.Pattern object (observing case
sensitivity)."""
global re, glob
if re is None:
import re, glob

flags = re.NOFLAG if case_sensitive else re.IGNORECASE
regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep)
return re.compile(regex, flags=flags).match


def _select_special(paths, part):
"""Yield special literal children of the given paths."""
for path in paths:
yield path._make_child_relpath(part)

class Globber(_glob.Globber):
lstat = operator.methodcaller('lstat')
scandir = operator.methodcaller('_scandir')
add_slash = operator.methodcaller('joinpath', '')

def _select_children(parent_paths, dir_only, match):
"""Yield direct children of given paths, filtering by name and type."""
for parent_path in parent_paths:
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
with parent_path._scandir() as scandir_it:
entries = list(scandir_it)
except OSError:
pass
else:
for entry in entries:
if dir_only:
try:
if not entry.is_dir():
continue
except OSError:
continue
# Avoid cost of making a path object for non-matching paths by
# matching against the os.DirEntry.name string.
if match is None or match(entry.name):
yield parent_path._make_child_direntry(entry)

def concat_path(self, path, text):
"""Appends text to the given path.
"""
return path.with_segments(path._raw_path + text)

def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
"""Yield given paths and all their children, recursively, filtering by
string and type.
"""
for parent_path in parent_paths:
if match is not None:
# If we're filtering paths through a regex, record the length of
# the parent path. We'll pass it to match(path, pos=...) later.
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
paths = [parent_path._make_child_relpath('')]
while paths:
path = paths.pop()
if match is None or match(str(path), parent_len):
# Yield *directory* path that matches pattern (if any).
yield path
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
with path._scandir() as scandir_it:
entries = list(scandir_it)
except OSError:
pass
else:
for entry in entries:
# Handle directory entry.
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
# Recurse into this directory.
paths.append(path._make_child_direntry(entry))
continue
except OSError:
pass

# Handle file entry.
if not dir_only:
# Avoid cost of making a path object for non-matching
# files by matching against the os.DirEntry object.
if match is None or match(path._direntry_str(entry), parent_len):
# Yield *file* path that matches pattern (if any).
yield path._make_child_direntry(entry)


def _select_unique(paths):
"""Yields the given paths, filtering out duplicates."""
yielded = set()
try:
for path in paths:
path_str = str(path)
if path_str not in yielded:
yield path
yielded.add(path_str)
finally:
yielded.clear()
def parse_entry(self, entry):
"""Returns the path of an entry yielded from scandir().
"""
return entry


class UnsupportedOperation(NotImplementedError):
Expand Down Expand Up @@ -218,6 +137,7 @@ class PurePathBase:
'_resolving',
)
parser = ParserBase()
_globber = Globber

def __init__(self, path, *paths):
self._raw_path = self.parser.join(path, *paths) if paths else path
Expand Down Expand Up @@ -454,14 +374,6 @@ def is_absolute(self):
a drive)."""
return self.parser.isabs(self._raw_path)

@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
anchor, parts = self._stack
if anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
return parts

@property
def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching."""
Expand All @@ -487,8 +399,9 @@ def match(self, path_pattern, *, case_sensitive=None):
return False
if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
return False
globber = self._globber(sep, case_sensitive)
for path_part, pattern_part in zip(path_parts, pattern_parts):
match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False)
match = globber.compile(pattern_part)
if match(path_part) is None:
return False
return True
Expand All @@ -502,7 +415,8 @@ def full_match(self, pattern, *, case_sensitive=None):
pattern = self.with_segments(pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive)
globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True)
match = globber.compile(pattern._pattern_str)
return match(self._pattern_str) is not None


Expand Down Expand Up @@ -772,11 +686,6 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
# PathBase._scandir() yields PathBase objects, so use str().
return str(entry)

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
# PathBase._scandir() yields PathBase objects, so this is a no-op.
Expand All @@ -785,62 +694,26 @@ def _make_child_direntry(self, entry):
def _make_child_relpath(self, name):
return self.joinpath(name)

def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
if not self.is_dir():
return iter([])
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
recursive = True if recurse_symlinks else _glob.no_recurse_symlinks
globber = self._globber(self.parser.sep, case_sensitive, recursive)
return globber.selector(parts)

def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
"""Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
"""
if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern)
if case_sensitive is None:
# TODO: evaluate case-sensitivity of each directory in _select_children().
case_sensitive = _is_case_sensitive(self.parser)

stack = pattern._pattern_stack
specials = ('', '.', '..')
deduplicate_paths = False
sep = self.parser.sep
paths = iter([self] if self.is_dir() else [])
while stack:
part = stack.pop()
if part in specials:
# Join special component (e.g. '..') onto paths.
paths = _select_special(paths, part)

elif part == '**':
# Consume following '**' components, which have no effect.
while stack and stack[-1] == '**':
stack.pop()

# Consume following non-special components, provided we're
# treating symlinks consistently. Each component is joined
# onto 'part', which is used to generate an re.Pattern object.
if recurse_symlinks:
while stack and stack[-1] not in specials:
part += sep + stack.pop()

# If the previous loop consumed pattern components, compile an
# re.Pattern object based on those components.
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None

# Recursively walk directories, filtering by type and regex.
paths = _select_recursive(paths, bool(stack), recurse_symlinks, match)

# De-duplicate if we've already seen a '**' component.
if deduplicate_paths:
paths = _select_unique(paths)
deduplicate_paths = True

elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")

else:
# If the pattern component isn't '*', compile an re.Pattern
# object based on the component.
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None

# Iterate over directories' children filtering by type and regex.
paths = _select_children(paths, bool(stack), match)
return paths
anchor, parts = pattern._stack
if anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
return select(self, exists=True)

def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
"""Recursively yield all existing files (of any kind, including
Expand Down
Loading
0