From 8bf4600d524fc3e1f6bdad10843b06805f870c0d Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 6 Mar 2023 17:08:13 +0000 Subject: [PATCH 01/10] GH-101362: Omit path anchor from `pathlib.PurePath()._parts` Improve performance of path construction by skipping the addition of the path anchor (`drive + root`) to the internal `_parts` list. This change allows us to simplify the implementations of `joinpath()`, `name`, `parent`, and `parents` a little. The public `parts` tuple is unaffected. --- Lib/pathlib.py | 40 ++++++++++++------------- Lib/test/test_pathlib.py | 64 ++++++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index d375529ff5f767..bae72c8151d3e0 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -221,10 +221,7 @@ def __init__(self, path): self._parts = path._parts def __len__(self): - if self._drv or self._root: - return len(self._parts) - 1 - else: - return len(self._parts) + return len(self._parts) def __getitem__(self, idx): if isinstance(idx, slice): @@ -269,7 +266,7 @@ def __new__(cls, *args): def __reduce__(self): # Using the parts tuple helps share interned path parts # when pickling related paths. - return (self.__class__, tuple(self._parts)) + return (self.__class__, self.parts) @classmethod def _parse_parts(cls, parts): @@ -295,8 +292,7 @@ def _parse_parts(cls, parts): if drv.startswith(sep): # pathlib assumes that UNC paths always have a root. 
root = sep - unfiltered_parsed = [drv + root] + rel.split(sep) - parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.'] + parsed = [sys.intern(x) for x in rel.split(sep) if x and x != '.'] return drv, root, parsed @classmethod @@ -318,10 +314,11 @@ def _from_parsed_parts(cls, drv, root, parts): @classmethod def _format_parsed_parts(cls, drv, root, parts): + tail = cls._flavour.sep.join(parts) if drv or root: - return drv + root + cls._flavour.sep.join(parts[1:]) + return f'{drv}{root}{tail}' else: - return cls._flavour.sep.join(parts) + return tail def __str__(self): """Return the string representation of the path, suitable for @@ -376,7 +373,7 @@ def _parts_normcase(self): try: return self._parts_normcase_cached except AttributeError: - self._parts_normcase_cached = [self._flavour.normcase(p) for p in self._parts] + self._parts_normcase_cached = [self._flavour.normcase(p) for p in self.parts] return self._parts_normcase_cached def __eq__(self, other): @@ -427,7 +424,7 @@ def anchor(self): def name(self): """The final path component, if any.""" parts = self._parts - if len(parts) == (1 if (self._drv or self._root) else 0): + if not parts: return '' return parts[-1] @@ -551,7 +548,10 @@ def parts(self): try: return self._parts_tuple except AttributeError: - self._parts_tuple = tuple(self._parts) + if self._drv or self._root: + self._parts_tuple = (self._drv + self._root,) + tuple(self._parts) + else: + self._parts_tuple = tuple(self._parts) return self._parts_tuple def joinpath(self, *args): @@ -564,13 +564,13 @@ def joinpath(self, *args): drv2, root2, parts2 = self._parse_parts(args) if root2: if not drv2 and drv1: - return self._from_parsed_parts(drv1, root2, [drv1 + root2] + parts2[1:]) + return self._from_parsed_parts(drv1, root2, parts2) else: return self._from_parsed_parts(drv2, root2, parts2) elif drv2: if drv2 == drv1 or self._flavour.normcase(drv2) == self._flavour.normcase(drv1): # Same drive => second path is relative to the first. 
- return self._from_parsed_parts(drv1, root1, parts1 + parts2[1:]) + return self._from_parsed_parts(drv1, root1, parts1 + parts2) else: return self._from_parsed_parts(drv2, root2, parts2) else: @@ -595,7 +595,7 @@ def parent(self): drv = self._drv root = self._root parts = self._parts - if len(parts) == 1 and (drv or root): + if not parts: return self return self._from_parsed_parts(drv, root, parts[:-1]) @@ -622,7 +622,7 @@ def is_reserved(self): # (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not # exist). We err on the side of caution and return True for paths # which are not considered reserved by Windows. - if self._parts[0].startswith('\\\\'): + if self._drv.startswith('\\\\'): # UNC paths are never reserved. return False name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') @@ -632,12 +632,12 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - path_pattern = self._flavour.normcase(path_pattern) - drv, root, pat_parts = self._parse_parts((path_pattern,)) + pat = type(self)(path_pattern) + pat_parts = pat._parts_normcase if not pat_parts: raise ValueError("empty pattern") parts = self._parts_normcase - if drv or root: + if pat._drv or pat._root: if len(pat_parts) != len(parts): return False elif len(pat_parts) > len(parts): @@ -806,7 +806,7 @@ def absolute(self): cwd = self._flavour.abspath(self._drv) else: cwd = os.getcwd() - return self._from_parts([cwd] + self._parts) + return self._from_parts((cwd,) + self.parts) def resolve(self, strict=False): """ diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index df9c1f6ba65deb..32825bace518b1 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -56,12 +56,12 @@ def test_parse_parts_common(self): check(['a', '.', 'b'], ('', '', ['a', 'b'])) check(['a', '.', '.'], ('', '', ['a'])) # The first part is anchored. 
- check(['/a/b'], ('', sep, [sep, 'a', 'b'])) - check(['/a', 'b'], ('', sep, [sep, 'a', 'b'])) - check(['/a/', 'b'], ('', sep, [sep, 'a', 'b'])) + check(['/a/b'], ('', sep, ['a', 'b'])) + check(['/a', 'b'], ('', sep, ['a', 'b'])) + check(['/a/', 'b'], ('', sep, ['a', 'b'])) # Ignoring parts before an anchored part. - check(['a', '/b', 'c'], ('', sep, [sep, 'b', 'c'])) - check(['a', '/b', '/c'], ('', sep, [sep, 'c'])) + check(['a', '/b', 'c'], ('', sep, ['b', 'c'])) + check(['a', '/b', '/c'], ('', sep, ['c'])) class PosixFlavourTest(_BaseFlavourTest, unittest.TestCase): @@ -72,9 +72,9 @@ def test_parse_parts(self): check = self._check_parse_parts # Collapsing of excess leading slashes, except for the double-slash # special case. - check(['//a', 'b'], ('', '//', ['//', 'a', 'b'])) - check(['///a', 'b'], ('', '/', ['/', 'a', 'b'])) - check(['////a', 'b'], ('', '/', ['/', 'a', 'b'])) + check(['//a', 'b'], ('', '//', ['a', 'b'])) + check(['///a', 'b'], ('', '/', ['a', 'b'])) + check(['////a', 'b'], ('', '/', ['a', 'b'])) # Paths which look like NT paths aren't treated specially. check(['c:a'], ('', '', ['c:a'])) check(['c:\\a'], ('', '', ['c:\\a'])) @@ -88,40 +88,40 @@ class NTFlavourTest(_BaseFlavourTest, unittest.TestCase): def test_parse_parts(self): check = self._check_parse_parts # First part is anchored. - check(['c:'], ('c:', '', ['c:'])) - check(['c:/'], ('c:', '\\', ['c:\\'])) - check(['/'], ('', '\\', ['\\'])) - check(['c:a'], ('c:', '', ['c:', 'a'])) - check(['c:/a'], ('c:', '\\', ['c:\\', 'a'])) - check(['/a'], ('', '\\', ['\\', 'a'])) + check(['c:'], ('c:', '', [])) + check(['c:/'], ('c:', '\\', [])) + check(['/'], ('', '\\', [])) + check(['c:a'], ('c:', '', ['a'])) + check(['c:/a'], ('c:', '\\', ['a'])) + check(['/a'], ('', '\\', ['a'])) # UNC paths. 
- check(['//a/b'], ('\\\\a\\b', '\\', ['\\\\a\\b\\'])) - check(['//a/b/'], ('\\\\a\\b', '\\', ['\\\\a\\b\\'])) - check(['//a/b/c'], ('\\\\a\\b', '\\', ['\\\\a\\b\\', 'c'])) + check(['//a/b'], ('\\\\a\\b', '\\', [])) + check(['//a/b/'], ('\\\\a\\b', '\\', [])) + check(['//a/b/c'], ('\\\\a\\b', '\\', ['c'])) # Second part is anchored, so that the first part is ignored. - check(['a', 'Z:b', 'c'], ('Z:', '', ['Z:', 'b', 'c'])) - check(['a', 'Z:/b', 'c'], ('Z:', '\\', ['Z:\\', 'b', 'c'])) + check(['a', 'Z:b', 'c'], ('Z:', '', ['b', 'c'])) + check(['a', 'Z:/b', 'c'], ('Z:', '\\', ['b', 'c'])) # UNC paths. - check(['a', '//b/c', 'd'], ('\\\\b\\c', '\\', ['\\\\b\\c\\', 'd'])) + check(['a', '//b/c', 'd'], ('\\\\b\\c', '\\', ['d'])) # Collapsing and stripping excess slashes. - check(['a', 'Z://b//c/', 'd/'], ('Z:', '\\', ['Z:\\', 'b', 'c', 'd'])) + check(['a', 'Z://b//c/', 'd/'], ('Z:', '\\', ['b', 'c', 'd'])) # UNC paths. - check(['a', '//b/c//', 'd'], ('\\\\b\\c', '\\', ['\\\\b\\c\\', 'd'])) + check(['a', '//b/c//', 'd'], ('\\\\b\\c', '\\', ['d'])) # Extended paths. - check(['//?/c:/'], ('\\\\?\\c:', '\\', ['\\\\?\\c:\\'])) - check(['//?/c:/a'], ('\\\\?\\c:', '\\', ['\\\\?\\c:\\', 'a'])) - check(['//?/c:/a', '/b'], ('\\\\?\\c:', '\\', ['\\\\?\\c:\\', 'b'])) + check(['//?/c:/'], ('\\\\?\\c:', '\\', [])) + check(['//?/c:/a'], ('\\\\?\\c:', '\\', ['a'])) + check(['//?/c:/a', '/b'], ('\\\\?\\c:', '\\', ['b'])) # Extended UNC paths (format is "\\?\UNC\server\share"). - check(['//?/UNC/b/c'], ('\\\\?\\UNC\\b\\c', '\\', ['\\\\?\\UNC\\b\\c\\'])) - check(['//?/UNC/b/c/d'], ('\\\\?\\UNC\\b\\c', '\\', ['\\\\?\\UNC\\b\\c\\', 'd'])) + check(['//?/UNC/b/c'], ('\\\\?\\UNC\\b\\c', '\\', [])) + check(['//?/UNC/b/c/d'], ('\\\\?\\UNC\\b\\c', '\\', ['d'])) # Second part has a root but not drive. 
- check(['a', '/b', 'c'], ('', '\\', ['\\', 'b', 'c'])) - check(['Z:/a', '/b', 'c'], ('Z:', '\\', ['Z:\\', 'b', 'c'])) - check(['//?/Z:/a', '/b', 'c'], ('\\\\?\\Z:', '\\', ['\\\\?\\Z:\\', 'b', 'c'])) + check(['a', '/b', 'c'], ('', '\\', ['b', 'c'])) + check(['Z:/a', '/b', 'c'], ('Z:', '\\', ['b', 'c'])) + check(['//?/Z:/a', '/b', 'c'], ('\\\\?\\Z:', '\\', ['b', 'c'])) # Joining with the same drive => the first path is appended to if # the second path is relative. - check(['c:/a/b', 'c:x/y'], ('c:', '\\', ['c:\\', 'a', 'b', 'x', 'y'])) - check(['c:/a/b', 'c:/x/y'], ('c:', '\\', ['c:\\', 'x', 'y'])) + check(['c:/a/b', 'c:x/y'], ('c:', '\\', ['a', 'b', 'x', 'y'])) + check(['c:/a/b', 'c:/x/y'], ('c:', '\\', ['x', 'y'])) # From e7a58c33f1a8f12d0c264cc222f7cacb6345d30b Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 6 Mar 2023 18:50:03 +0000 Subject: [PATCH 02/10] Add news blurb. --- .../next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst diff --git a/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst new file mode 100644 index 00000000000000..87617a503c0dba --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst @@ -0,0 +1,2 @@ +Speed up :class:`pathlib.Path` construction by omitting the path anchor from +the internal list of path parts. From 1d58aeda3fe46f4a62aaf2554402e2bdc834c6ec Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 6 Mar 2023 19:26:50 +0000 Subject: [PATCH 03/10] Rename `_parts` to `_tail` for clarity. 
--- Lib/pathlib.py | 74 +++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index bae72c8151d3e0..cb3044b629a465 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -211,17 +211,17 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase): class _PathParents(Sequence): """This object provides sequence-like access to the logical ancestors of a path. Don't try to construct it yourself.""" - __slots__ = ('_pathcls', '_drv', '_root', '_parts') + __slots__ = ('_pathcls', '_drv', '_root', '_tail') def __init__(self, path): # We don't store the instance to avoid reference cycles self._pathcls = type(path) self._drv = path._drv self._root = path._root - self._parts = path._parts + self._tail = path._tail def __len__(self): - return len(self._parts) + return len(self._tail) def __getitem__(self, idx): if isinstance(idx, slice): @@ -232,7 +232,7 @@ def __getitem__(self, idx): if idx < 0: idx += len(self) return self._pathcls._from_parsed_parts(self._drv, self._root, - self._parts[:-idx - 1]) + self._tail[:-idx - 1]) def __repr__(self): return "<{}.parents>".format(self._pathcls.__name__) @@ -248,7 +248,7 @@ class PurePath(object): directly, regardless of your system. 
""" __slots__ = ( - '_drv', '_root', '_parts', + '_drv', '_root', '_tail', '_str', '_hash', '_parts_tuple', '_parts_normcase_cached', ) _flavour = os.path @@ -298,23 +298,23 @@ def _parse_parts(cls, parts): @classmethod def _from_parts(cls, args): self = object.__new__(cls) - drv, root, parts = self._parse_parts(args) + drv, root, tail = self._parse_parts(args) self._drv = drv self._root = root - self._parts = parts + self._tail = tail return self @classmethod - def _from_parsed_parts(cls, drv, root, parts): + def _from_parsed_parts(cls, drv, root, tail): self = object.__new__(cls) self._drv = drv self._root = root - self._parts = parts + self._tail = tail return self @classmethod - def _format_parsed_parts(cls, drv, root, parts): - tail = cls._flavour.sep.join(parts) + def _format_parsed_parts(cls, drv, root, tail): + tail = cls._flavour.sep.join(tail) if drv or root: return f'{drv}{root}{tail}' else: @@ -327,7 +327,7 @@ def __str__(self): return self._str except AttributeError: self._str = self._format_parsed_parts(self._drv, self._root, - self._parts) or '.' + self._tail) or '.' 
return self._str def __fspath__(self): @@ -423,10 +423,10 @@ def anchor(self): @property def name(self): """The final path component, if any.""" - parts = self._parts - if not parts: + tail = self._tail + if not tail: return '' - return parts[-1] + return tail[-1] @property def suffix(self): @@ -474,7 +474,7 @@ def with_name(self, name): if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail): raise ValueError("Invalid name %r" % (name)) return self._from_parsed_parts(self._drv, self._root, - self._parts[:-1] + [name]) + self._tail[:-1] + [name]) def with_stem(self, stem): """Return a new path with the stem changed.""" @@ -499,7 +499,7 @@ def with_suffix(self, suffix): else: name = name[:-len(old_suffix)] + suffix return self._from_parsed_parts(self._drv, self._root, - self._parts[:-1] + [name]) + self._tail[:-1] + [name]) def relative_to(self, other, /, *_deprecated, walk_up=False): """Return the relative path to another path identified by the passed @@ -549,9 +549,9 @@ def parts(self): return self._parts_tuple except AttributeError: if self._drv or self._root: - self._parts_tuple = (self._drv + self._root,) + tuple(self._parts) + self._parts_tuple = (self._drv + self._root,) + tuple(self._tail) else: - self._parts_tuple = tuple(self._parts) + self._parts_tuple = tuple(self._tail) return self._parts_tuple def joinpath(self, *args): @@ -560,22 +560,22 @@ def joinpath(self, *args): paths) or a totally different path (if one of the arguments is anchored). 
""" - drv1, root1, parts1 = self._drv, self._root, self._parts - drv2, root2, parts2 = self._parse_parts(args) + drv1, root1, tail1 = self._drv, self._root, self._tail + drv2, root2, tail2 = self._parse_parts(args) if root2: if not drv2 and drv1: - return self._from_parsed_parts(drv1, root2, parts2) + return self._from_parsed_parts(drv1, root2, tail2) else: - return self._from_parsed_parts(drv2, root2, parts2) + return self._from_parsed_parts(drv2, root2, tail2) elif drv2: if drv2 == drv1 or self._flavour.normcase(drv2) == self._flavour.normcase(drv1): # Same drive => second path is relative to the first. - return self._from_parsed_parts(drv1, root1, parts1 + parts2) + return self._from_parsed_parts(drv1, root1, tail1 + tail2) else: - return self._from_parsed_parts(drv2, root2, parts2) + return self._from_parsed_parts(drv2, root2, tail2) else: # Second path is non-anchored (common case). - return self._from_parsed_parts(drv1, root1, parts1 + parts2) + return self._from_parsed_parts(drv1, root1, tail1 + tail2) def __truediv__(self, key): try: @@ -585,7 +585,7 @@ def __truediv__(self, key): def __rtruediv__(self, key): try: - return self._from_parts([key] + self._parts) + return self._from_parts([key] + self._tail) except TypeError: return NotImplemented @@ -594,10 +594,10 @@ def parent(self): """The logical parent of the path.""" drv = self._drv root = self._root - parts = self._parts - if not parts: + tail = self._tail + if not tail: return self - return self._from_parsed_parts(drv, root, parts[:-1]) + return self._from_parsed_parts(drv, root, tail[:-1]) @property def parents(self): @@ -615,7 +615,7 @@ def is_absolute(self): def is_reserved(self): """Return True if the path contains one of the special names reserved by the system, if any.""" - if self._flavour is posixpath or not self._parts: + if self._flavour is posixpath or not self._tail: return False # NOTE: the rules for reserved names seem somewhat complicated @@ -625,7 +625,7 @@ def is_reserved(self): if 
self._drv.startswith('\\\\'): # UNC paths are never reserved. return False - name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') + name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES def match(self, path_pattern): @@ -698,8 +698,8 @@ def __new__(cls, *args, **kwargs): def _make_child_relpath(self, part): # This is an optimization used for dir walking. `part` must be # a single part relative to this path. - parts = self._parts + [part] - return self._from_parsed_parts(self._drv, self._root, parts) + tail = self._tail + [part] + return self._from_parsed_parts(self._drv, self._root, tail) def __enter__(self): # In previous versions of pathlib, __exit__() marked this path as @@ -1184,11 +1184,11 @@ def expanduser(self): (as returned by os.path.expanduser) """ if (not (self._drv or self._root) and - self._parts and self._parts[0][:1] == '~'): - homedir = self._flavour.expanduser(self._parts[0]) + self._tail and self._tail[0][:1] == '~'): + homedir = self._flavour.expanduser(self._tail[0]) if homedir[:1] == "~": raise RuntimeError("Could not determine home directory.") - return self._from_parts([homedir] + self._parts[1:]) + return self._from_parts([homedir] + self._tail[1:]) return self From cce5d7ba5087120f742b3b213a8165fc9b8108c2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 10 Mar 2023 18:16:11 +0000 Subject: [PATCH 04/10] Undo some changes to _format_parsed_parts() --- Lib/pathlib.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 77ea5b448c519f..d11ecde1781621 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -314,14 +314,11 @@ def _from_parsed_parts(cls, drv, root, tail): @classmethod def _format_parsed_parts(cls, drv, root, tail): - sep = cls._flavour.sep - tail = sep.join(tail) if drv or root: - return f'{drv}{root}{tail}' - elif cls._flavour.splitdrive(tail)[0]: - return f'.{sep}{tail}' - else: - return 
tail + return drv + root + cls._flavour.sep.join(tail) + elif tail and cls._flavour.splitdrive(tail[0])[0]: + tail = ['.'] + tail + return cls._flavour.sep.join(tail) def __str__(self): """Return the string representation of the path, suitable for From 0c0f5bd6ec2c6779cf2b80d220f18bc1cd83f770 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 3 Apr 2023 21:51:44 +0100 Subject: [PATCH 05/10] Move `str()` call out of initialiser. --- Lib/pathlib.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 5b4e4fdf51fe90..3bba852d683eeb 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -274,10 +274,7 @@ def __init__(self, *args): path = os.fspath(args[0]) else: path = self._flavour.join(*args) - if isinstance(path, str): - # Force-cast str subclasses to str (issue #21127) - path = str(path) - else: + if not isinstance(path, str): raise TypeError( "argument should be a str or an os.PathLike " "object where __fspath__ returns a str, " @@ -296,7 +293,7 @@ def _parse_path(cls, path): if drv.startswith(sep): # pathlib assumes that UNC paths always have a root. 
root = sep - parsed = [sys.intern(x) for x in rel.split(sep) if x and x != '.'] + parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.'] return drv, root, parsed def _load_parts(self): From 7a4e92fdf1d87d5f6b560c118e3ce827082619cd Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 3 Apr 2023 22:40:48 +0100 Subject: [PATCH 06/10] Optimize `_make_child_relpath()` --- Lib/pathlib.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 3bba852d683eeb..8870bb0682d145 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -120,7 +120,7 @@ def __init__(self, name, child_parts, flavour): def _select_from(self, parent_path, is_dir, exists, scandir, normcase): try: - path = parent_path._make_child_relpath(self.name) + path = parent_path._make_child(self.name) if (is_dir if self.dironly else exists)(path): for p in self.successor._select_from(path, is_dir, exists, scandir, normcase): yield p @@ -154,7 +154,7 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase): continue name = entry.name if self.match(normcase(name)): - path = parent_path._make_child_relpath(name) + path = parent_path._make_child(name) for p in self.successor._select_from(path, is_dir, exists, scandir, normcase): yield p except PermissionError: @@ -181,7 +181,7 @@ def _iterate_directories(self, parent_path, is_dir, scandir): if not _ignore_error(e): raise if entry_is_dir and not entry.is_symlink(): - path = parent_path._make_child_relpath(entry.name) + path = parent_path._make_child(entry.name) for p in self._iterate_directories(path, is_dir, scandir): yield p except PermissionError: @@ -703,11 +703,21 @@ def __new__(cls, *args, **kwargs): cls = WindowsPath if os.name == 'nt' else PosixPath return object.__new__(cls) - def _make_child_relpath(self, part): - # This is an optimization used for dir walking. `part` must be - # a single part relative to this path. 
- tail = self._tail + [part] - return self._from_parsed_parts(self.drive, self.root, tail) + def _make_child(self, name): + path_str = str(self) + tail = self._tail + if tail: + path_str = f'{path_str}{self._flavour.sep}{name}' + elif path_str != '.': + path_str = f'{path_str}{name}' + else: + path_str = name + path = type(self)(path_str) + path._str = path_str + path._drv = self.drive + path._root = self.root + path._tail_cached = tail + [name] + return path def __enter__(self): # In previous versions of pathlib, __exit__() marked this path as @@ -762,7 +772,7 @@ def iterdir(self): special entries '.' and '..' are not included. """ for name in os.listdir(self): - yield self._make_child_relpath(name) + yield self._make_child(name) def _scandir(self): # bpo-24132: a future version of pathlib will support subclassing of @@ -1244,7 +1254,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False): else: paths.append((path, dirnames, filenames)) - paths += [path._make_child_relpath(d) for d in reversed(dirnames)] + paths += [path._make_child(d) for d in reversed(dirnames)] class PosixPath(Path, PurePosixPath): From c8d4b384e6106cb486c5f5e4cf1be1fc99225ebf Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 4 Apr 2023 00:54:55 +0100 Subject: [PATCH 07/10] Remove caching of `Path.parts` --- Lib/pathlib.py | 18 ++++++------------ Lib/test/test_pathlib.py | 2 -- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 8870bb0682d145..1cd1968a83469f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -248,7 +248,7 @@ class PurePath(object): """ __slots__ = ( '_raw_path', '_drv', '_root', '_tail_cached', - '_str', '_hash', '_parts_tuple', '_parts_normcase_cached', + '_str', '_hash', '_parts_normcase_cached', ) _flavour = os.path @@ -544,7 +544,7 @@ def relative_to(self, other, /, *_deprecated, walk_up=False): raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors") if step and not walk_up: raise 
ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}") - parts = ('..',) * step + self.parts[len(path.parts):] + parts = ['..'] * step + self._tail[len(path._tail):] return path_cls(*parts) def is_relative_to(self, other, /, *_deprecated): @@ -563,16 +563,10 @@ def is_relative_to(self, other, /, *_deprecated): def parts(self): """An object providing sequence-like access to the components in the filesystem path.""" - # We cache the tuple to avoid building a new one each time .parts - # is accessed. XXX is this necessary? - try: - return self._parts_tuple - except AttributeError: - if self.drive or self.root: - self._parts_tuple = (self.drive + self.root,) + tuple(self._tail) - else: - self._parts_tuple = tuple(self._tail) - return self._parts_tuple + if self.drive or self.root: + return (self.drive + self.root,) + tuple(self._tail) + else: + return tuple(self._tail) def joinpath(self, *args): """Combine this path with one or several arguments, and return a diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 3fa6db649e7e7f..8b70489ecc28c2 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -416,8 +416,6 @@ def test_parts_common(self): p = P('a/b') parts = p.parts self.assertEqual(parts, ('a', 'b')) - # The object gets reused. - self.assertIs(parts, p.parts) # When the path is absolute, the anchor is a separate part. p = P('/a/b') parts = p.parts From d2a578db100e7aa50212e9ca6f31433a4c36aad2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 6 Apr 2023 11:38:01 +0100 Subject: [PATCH 08/10] Cache case-normalized string. 
--- Lib/pathlib.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 1cd1968a83469f..32e5aa05b09c20 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -249,6 +249,7 @@ class PurePath(object): __slots__ = ( '_raw_path', '_drv', '_root', '_tail_cached', '_str', '_hash', '_parts_normcase_cached', + '_str_normcase_cached', ) _flavour = os.path @@ -367,25 +368,34 @@ def as_uri(self): path = str(self) return prefix + urlquote_from_bytes(os.fsencode(path)) + @property + def _str_normcase(self): + # String with normalized case, for hashing and equality checks + try: + return self._str_normcase_cached + except AttributeError: + self._str_normcase_cached = self._flavour.normcase(str(self)) + return self._str_normcase_cached + @property def _parts_normcase(self): - # Cached parts with normalized case, for hashing and comparison. + # Cached parts with normalized case, for comparisons. try: return self._parts_normcase_cached except AttributeError: - self._parts_normcase_cached = [self._flavour.normcase(p) for p in self.parts] + self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented - return self._parts_normcase == other._parts_normcase and self._flavour is other._flavour + return self._str_normcase == other._str_normcase and self._flavour is other._flavour def __hash__(self): try: return self._hash except AttributeError: - self._hash = hash(tuple(self._parts_normcase)) + self._hash = hash(self._str_normcase) return self._hash def __lt__(self, other): @@ -632,9 +642,9 @@ def match(self, path_pattern): Return True if this path matches the given pattern. 
""" pat = type(self)(path_pattern) - pat_parts = pat._parts_normcase - if not pat_parts: + if not pat.parts: raise ValueError("empty pattern") + pat_parts = pat._parts_normcase parts = self._parts_normcase if pat.drive or pat.root: if len(pat_parts) != len(parts): From de4df7adb6efdc9721ba8bdd8a9b8017812f6199 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 7 Apr 2023 14:18:19 +0100 Subject: [PATCH 09/10] Rename `_make_child` back to `_make_child_relpath` --- Lib/pathlib.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 32e5aa05b09c20..14c5d9a3f22830 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -120,7 +120,7 @@ def __init__(self, name, child_parts, flavour): def _select_from(self, parent_path, is_dir, exists, scandir, normcase): try: - path = parent_path._make_child(self.name) + path = parent_path._make_child_relpath(self.name) if (is_dir if self.dironly else exists)(path): for p in self.successor._select_from(path, is_dir, exists, scandir, normcase): yield p @@ -154,7 +154,7 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase): continue name = entry.name if self.match(normcase(name)): - path = parent_path._make_child(name) + path = parent_path._make_child_relpath(name) for p in self.successor._select_from(path, is_dir, exists, scandir, normcase): yield p except PermissionError: @@ -181,7 +181,7 @@ def _iterate_directories(self, parent_path, is_dir, scandir): if not _ignore_error(e): raise if entry_is_dir and not entry.is_symlink(): - path = parent_path._make_child(entry.name) + path = parent_path._make_child_relpath(entry.name) for p in self._iterate_directories(path, is_dir, scandir): yield p except PermissionError: @@ -707,7 +707,7 @@ def __new__(cls, *args, **kwargs): cls = WindowsPath if os.name == 'nt' else PosixPath return object.__new__(cls) - def _make_child(self, name): + def _make_child_relpath(self, name): path_str = str(self) tail = self._tail if tail: 
@@ -776,7 +776,7 @@ def iterdir(self): special entries '.' and '..' are not included. """ for name in os.listdir(self): - yield self._make_child(name) + yield self._make_child_relpath(name) def _scandir(self): # bpo-24132: a future version of pathlib will support subclassing of @@ -1258,7 +1258,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False): else: paths.append((path, dirnames, filenames)) - paths += [path._make_child(d) for d in reversed(dirnames)] + paths += [path._make_child_relpath(d) for d in reversed(dirnames)] class PosixPath(Path, PurePosixPath): From 5a34f7e3046e158597e450249cf1b1d6f8a74940 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 8 Apr 2023 16:37:51 +0100 Subject: [PATCH 10/10] Add comments to `PurePath.__slots__` --- Lib/pathlib.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 14c5d9a3f22830..4ae1fae6f4b358 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -246,10 +246,41 @@ class PurePath(object): PureWindowsPath object. You can also instantiate either of these classes directly, regardless of your system. """ + __slots__ = ( - '_raw_path', '_drv', '_root', '_tail_cached', - '_str', '_hash', '_parts_normcase_cached', + # The `_raw_path` slot stores an unnormalized string path. This is set + # in the `__init__()` method. + '_raw_path', + + # The `_drv`, `_root` and `_tail_cached` slots store parsed and + # normalized parts of the path. They are set when any of the `drive`, + # `root` or `_tail` properties are accessed for the first time. The + # three-part division corresponds to the result of + # `os.path.splitroot()`, except that the tail is further split on path + # separators (i.e. it is a list of strings), and that the root and + # tail are normalized. 
+ '_drv', '_root', '_tail_cached', + + # The `_str` slot stores the string representation of the path, + # computed from the drive, root and tail when `__str__()` is called + # for the first time. It's used to implement `_str_normcase`. + '_str', + + # The `_str_normcase_cached` slot stores the string path with + # normalized case. It is set when the `_str_normcase` property is + # accessed for the first time. It's used to implement `__eq__()`, + # `__hash__()`, and `_parts_normcase`. '_str_normcase_cached', + + # The `_parts_normcase_cached` slot stores the case-normalized + # string path after splitting on path separators. It's set when the + # `_parts_normcase` property is accessed for the first time. It's used + # to implement comparison methods like `__lt__()`. + '_parts_normcase_cached', + + # The `_hash` slot stores the hash of the case-normalized string + # path. It's set when `__hash__()` is called for the first time. + '_hash', ) _flavour = os.path