gh-102120: [TarFile] Add an iter function that doesn't cache (GH-102128) · python/cpython@50fce89 · GitHub
[go: up one dir, main page]

Skip to content

Commit 50fce89

Browse files
authored
gh-102120: [TarFile] Add an iter function that doesn't cache (GH-102128)
1 parent 097b783 commit 50fce89

File tree

4 files changed

+42
-7
lines changed

4 files changed

+42
-7
lines changed

Doc/library/tarfile.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ be finalized; only the internally used file object will be closed. See the
318318
.. versionadded:: 3.2
319319
Added support for the context management protocol.
320320

321-
.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1)
321+
.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1, stream=False)
322322

323323
All following arguments are optional and can be accessed as instance attributes
324324
as well.
@@ -369,6 +369,9 @@ be finalized; only the internally used file object will be closed. See the
369369
The *pax_headers* argument is an optional dictionary of strings which
370370
will be added as a pax global header if *format* is :const:`PAX_FORMAT`.
371371

372+
If *stream* is set to :const:`True`, then information about the files in the
373+
archive is not cached while the archive is being read, which saves memory.
374+
372375
.. versionchanged:: 3.2
373376
Use ``'surrogateescape'`` as the default for the *errors* argument.
374377

@@ -378,6 +381,8 @@ be finalized; only the internally used file object will be closed. See the
378381
.. versionchanged:: 3.6
379382
The *name* parameter accepts a :term:`path-like object`.
380383

384+
.. versionchanged:: 3.13
385+
Add the *stream* parameter.
381386

382387
.. classmethod:: TarFile.open(...)
383388

Lib/tarfile.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1633,7 +1633,7 @@ class TarFile(object):
16331633
def __init__(self, name=None, mode="r", fileobj=None, format=None,
16341634
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
16351635
errors="surrogateescape", pax_headers=None, debug=None,
1636-
errorlevel=None, copybufsize=None):
1636+
errorlevel=None, copybufsize=None, stream=False):
16371637
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
16381638
read from an existing archive, 'a' to append data to an existing
16391639
file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1665,6 +1665,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
16651665
self.name = os.path.abspath(name) if name else None
16661666
self.fileobj = fileobj
16671667

1668+
self.stream = stream
1669+
16681670
# Init attributes.
16691671
if format is not None:
16701672
self.format = format
@@ -2631,7 +2633,9 @@ def next(self):
26312633
break
26322634

26332635
if tarinfo is not None:
2634-
self.members.append(tarinfo)
2636+
# if streaming the file we do not want to cache the tarinfo
2637+
if not self.stream:
2638+
self.members.append(tarinfo)
26352639
else:
26362640
self._loaded = True
26372641

@@ -2682,11 +2686,12 @@ def _getmember(self, name, tarinfo=None, normalize=False):
26822686

26832687
def _load(self):
26842688
"""Read through the entire archive file and look for readable
2685-
members.
2689+
members. This should not run if the file is set to stream.
26862690
"""
2687-
while self.next() is not None:
2688-
pass
2689-
self._loaded = True
2691+
if not self.stream:
2692+
while self.next() is not None:
2693+
pass
2694+
self._loaded = True
26902695

26912696
def _check(self, mode=None):
26922697
"""Check if TarFile is still open, and if the operation's mode

Lib/test/test_tarfile.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,14 @@ def setUp(self):
100100
def tearDown(self):
101101
self.tar.close()
102102

103+
class StreamModeTest(ReadTest):
104+
105+
# Only needs to change how the tarfile is opened to set
106+
# stream mode
107+
def setUp(self):
108+
self.tar = tarfile.open(self.tarname, mode=self.mode,
109+
encoding="iso8859-1",
110+
stream=True)
103111

104112
class UstarReadTest(ReadTest, unittest.TestCase):
105113

@@ -852,6 +860,21 @@ class Bz2StreamReadTest(Bz2Test, StreamReadTest):
852860
class LzmaStreamReadTest(LzmaTest, StreamReadTest):
853861
pass
854862

863+
class TarStreamModeReadTest(StreamModeTest, unittest.TestCase):
864+
865+
def test_stream_mode_no_cache(self):
866+
for _ in self.tar:
867+
pass
868+
self.assertEqual(self.tar.members, [])
869+
870+
class GzipStreamModeReadTest(GzipTest, TarStreamModeReadTest):
871+
pass
872+
873+
class Bz2StreamModeReadTest(Bz2Test, TarStreamModeReadTest):
874+
pass
875+
876+
class LzmaStreamModeReadTest(LzmaTest, TarStreamModeReadTest):
877+
pass
855878

856879
class DetectReadTest(TarTest, unittest.TestCase):
857880
def _testfunc_file(self, name, mode):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Added a stream mode to ``tarfile`` that allows for reading
2+
archives without caching info about the inner files.

0 commit comments

Comments
 (0)
0