bpo-33695 shutil.copytree() + os.scandir() cache (#7874) · python/cpython@19c46a4 · GitHub
[go: up one dir, main page]

Skip to content

Commit 19c46a4

Browse files
authored
bpo-33695 shutil.copytree() + os.scandir() cache (#7874)
1 parent cd44980 commit 19c46a4

File tree

3 files changed

+96
-56
lines changed

3 files changed

+96
-56
lines changed

Doc/whatsnew/3.8.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,14 @@ Optimizations
277277
See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
278278
(Contributed by Giampaolo Rodola' in :issue:`25427`.)
279279

280+
* :func:`shutil.copytree` uses :func:`os.scandir` function and all copy
281+
functions depending on it use cached :func:`os.stat` values. The speedup
282+
for copying a directory with 8000 files is around +9% on Linux, +20% on
283+
Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
284+
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
285+
on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
286+
287+
280288
* The default protocol in the :mod:`pickle` module is now Protocol 4,
281289
first introduced in Python 3.4. It offers better performance and smaller
282290
size compared to Protocol 3 available since Python 3.0.

Lib/shutil.py

Lines changed: 81 additions & 56 deletions
-
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
200200

201201
def _samefile(src, dst):
202202
# Macintosh, Unix.
203+
if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
204+
try:
205+
return os.path.samestat(src.stat(), os.stat(dst))
206+
except OSError:
207+
return False
208+
203209
if hasattr(os.path, 'samefile'):
204210
try:
205211
return os.path.samefile(src, dst)
@@ -210,6 +216,12 @@ def _samefile(src, dst):
210216
return (os.path.normcase(os.path.abspath(src)) ==
211217
os.path.normcase(os.path.abspath(dst)))
212218

219+
def _stat(fn):
220+
return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
221+
222+
def _islink(fn):
223+
return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
224+
213225
def copyfile(src, dst, *, follow_symlinks=True):
214226
"""Copy data from src to dst in the most efficient way possible.
215227
@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
223235
file_size = 0
224236
for i, fn in enumerate([src, dst]):
225237
try:
226-
st = os.stat(fn)
238+
st = _stat(fn)
227239
except OSError:
228240
# File most likely does not exist
229241
pass
230242
else:
231243
# XXX What about other special files? (sockets, devices...)
232244
if stat.S_ISFIFO(st.st_mode):
245+
fn = fn.path if isinstance(fn, os.DirEntry) else fn
233246
raise SpecialFileError("`%s` is a named pipe" % fn)
234247
if _WINDOWS and i == 0:
235248
file_size = st.st_size
236249

237-
if not follow_symlinks and os.path.islink(src):
250+
if not follow_symlinks and _islink(src):
238251
os.symlink(os.readlink(src), dst)
239252
else:
240253
with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
270283
(e.g. Linux) this method does nothing.
271284
272285
"""
273-
if not follow_symlinks and os.path.islink(src) and os.path.islink(dst):
286+
if not follow_symlinks and _islink(src) and os.path.islink(dst):
274287
if hasattr(os, 'lchmod'):
275288
stat_func, chmod_func = os.lstat, os.lchmod
276289
else:
277290
return
278291
elif hasattr(os, 'chmod'):
279-
stat_func, chmod_func = os.stat, os.chmod
292+
stat_func, chmod_func = _stat, os.chmod
280293
else:
281294
return
282295

@@ -325,7 +338,7 @@ def _nop(*args, ns=None, follow_symlinks=None):
325338
pass
326339

327340
# follow symlinks (aka don't not follow symlinks)
328-
follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst))
341+
follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
329342
if follow:
330343
# use the real function if it exists
331344
def lookup(name):
@@ -339,7 +352,10 @@ def lookup(name):
339352
return fn
340353
return _nop
341354

342-
st = lookup("stat")(src, follow_symlinks=follow)
355+
if isinstance(src, os.DirEntry):
356+
st = src.stat(follow_symlinks=follow)
357+
else:
358+
st = lookup("stat")(src, follow_symlinks=follow)
343359
mode = stat.S_IMODE(st.st_mode)
344360
lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
345361
follow_symlinks=follow)
@@ -415,79 +431,47 @@ def _ignore_patterns(path, names):
415431
return set(ignored_names)
416432
return _ignore_patterns
417433

418-
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
419-
ignore_dangling_symlinks=False):
420-
"""Recursively copy a directory tree.
421-
422-
The destination directory must not already exist.
423-
If exception(s) occur, an Error is raised with a list of reasons.
424
425-
If the optional symlinks flag is true, symbolic links in the
426-
source tree result in symbolic links in the destination tree; if
427-
it is false, the contents of the files pointed to by symbolic
428-
links are copied. If the file pointed by the symlink doesn't
429-
exist, an exception will be added in the list of errors raised in
430-
an Error exception at the end of the copy process.
431-
432-
You can set the optional ignore_dangling_symlinks flag to true if you
433-
want to silence this exception. Notice that this has no effect on
434-
platforms that don't support os.symlink.
435-
436-
The optional ignore argument is a callable. If given, it
437-
is called with the `src` parameter, which is the directory
438-
being visited by copytree(), and `names` which is the list of
439-
`src` contents, as returned by os.listdir():
440-
441-
callable(src, names) -> ignored_names
442-
443-
Since copytree() is called recursively, the callable will be
444-
called once for each directory that is copied. It returns a
445-
list of names relative to the `src` directory that should
446-
not be copied.
447-
448-
The optional copy_function argument is a callable that will be used
449-
to copy each file. It will be called with the source path and the
450-
destination path as arguments. By default, copy2() is used, but any
451-
function that supports the same signature (like copy()) can be used.
452-
453-
"""
454-
names = os.listdir(src)
434+
def _copytree(entries, src, dst, symlinks, ignore, copy_function,
435+
ignore_dangling_symlinks):
455436
if ignore is not None:
456-
ignored_names = ignore(src, names)
437+
ignored_names = ignore(src, set(os.listdir(src)))
457438
else:
458439
ignored_names = set()
459440

460441
os.makedirs(dst)
461442
errors = []
462-
for name in names:
463-
if name in ignored_names:
443+
use_srcentry = copy_function is copy2 or copy_function is copy
444+
445+
for srcentry in entries:
446+
if srcentry.name in ignored_names:
464447
continue
465-
srcname = os.path.join(src, name)
466-
dstname = os.path.join(dst, name)
448+
srcname = os.path.join(src, srcentry.name)
449+
dstname = os.path.join(dst, srcentry.name)
450+
srcobj = srcentry if use_srcentry else srcname
467451
try:
468-
if os.path.islink(srcname):
452+
if srcentry.is_symlink():
469453
linkto = os.readlink(srcname)
470454
if symlinks:
471455
# We can't just leave it to `copy_function` because legacy
472456
# code with a custom `copy_function` may rely on copytree
473457
# doing the right thing.
474458
os.symlink(linkto, dstname)
475-
copystat(srcname, dstname, follow_symlinks=not symlinks)
459+
copystat(srcobj, dstname, follow_symlinks=not symlinks)
476460
else:
477461
# ignore dangling symlink if the flag is on
478462
if not os.path.exists(linkto) and ignore_dangling_symlinks:
479463
continue
480464
# otherwise let the copy occurs. copy2 will raise an error
481-
if os.path.isdir(srcname):
482-
copytree(srcname, dstname, symlinks, ignore,
465+
if srcentry.is_dir():
466+
copytree(srcobj, dstname, symlinks, ignore,
483467
copy_function)
484468
else:
485-
copy_function(srcname, dstname)
486-
elif os.path.isdir(srcname):
487-
copytree(srcname, dstname, symlinks, ignore, copy_function)
469+
copy_function(srcobj, dstname)
470+
elif srcentry.is_dir():
471+
copytree(srcobj, dstname, symlinks, ignore, copy_function)
488472
else:
489473
# Will raise a SpecialFileError for unsupported file types
490-
copy_function(srcname, dstname)
474+
copy_function(srcentry, dstname)
491475
# catch the Error from the recursive copytree so that we can
492476
# continue with other files
493477
except Error as err:
@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
504488
raise Error(errors)
505489
return dst
506490

491+
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
492+
ignore_dangling_symlinks=False):
493+
"""Recursively copy a directory tree.
494+
495+
The destination directory must not already exist.
496+
If exception(s) occur, an Error is raised with a list of reasons.
497+
498+
If the optional symlinks flag is true, symbolic links in the
499+
source tree result in symbolic links in the destination tree; if
500+
it is false, the contents of the files pointed to by symbolic
501+
links are copied. If the file pointed by the symlink doesn't
502+
exist, an exception will be added in the list of errors raised in
503+
an Error exception at the end of the copy process.
504+
505+
You can set the optional ignore_dangling_symlinks flag to true if you
506+
want to silence this exception. Notice that this has no effect on
507+
platforms that don't support os.symlink.
508+
509+
The optional ignore argument is a callable. If given, it
510+
is called with the `src` parameter, which is the directory
511+
being visited by copytree(), and `names` which is the list of
512+
`src` contents, as returned by os.listdir():
513+
514+
callable(src, names) -> ignored_names
515+
516+
Since copytree() is called recursively, the callable will be
517+
called once for each directory that is copied. It returns a
518+
list of names relative to the `src` directory that should
519+
not be copied.
520+
521+
The optional copy_function argument is a callable that will be used
522+
to copy each file. It will be called with the source path and the
523+
destination path as arguments. By default, copy2() is used, but any
524+
function that supports the same signature (like copy()) can be used.
525+
526+
"""
527+
with os.scandir(src) as entries:
528+
return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
529+
ignore=ignore, copy_function=copy_function,
530+
ignore_dangling_symlinks=ignore_dangling_symlinks)
531+
507532
# version vulnerable to race conditions
508533
def _rmtree_unsafe(path, onerror):
509534
try:
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
:func:`shutil.copytree` uses :func:`os.scandir` function and all copy
2+
functions depending on it use cached :func:`os.stat` values. The speedup
3+
for copying a directory with 8000 files is around +9% on Linux, +20% on
4+
Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
5+
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
6+
on network filesystems.
7+
(Contributed by Giampaolo Rodola' in :issue:`33695`.)

0 commit comments

Comments
 (0)
0