8000 Strip only entries immediately following a referenced entry · python/cpython@c80d21b · GitHub
[go: up one dir, main page]

Skip to content

Commit c80d21b

Browse files
committed
Strip only entries immediately following a referenced entry
- The previous implementation might cause [archive decryption header] and/or [archive extra data record] preceeding [central directory] be stripped.
1 parent f8fade1 commit c80d21b

File tree

3 files changed

+178
-55
lines changed

3 files changed

+178
-55
lines changed

Doc/library/zipfile.rst

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -550,17 +550,21 @@ ZipFile Objects
550550
will be removed.
551551

552552
If *removed* is not provided, local file entries no longer referenced in the
553-
central directory will be removed. The algorithm assumes that local file
554-
entries are stored consecutively. Extra bytes between entries will also be
555-
removed. Data before the first referenced entry is preserved unless it
556-
appears to be a sequence of consecutive local file entries.
553+
central directory will be removed. The algorithm assumes that local file
554+
entries are stored consecutively:
555+
#. Data before the first referenced entry is removed only when it appears to
556+
be a sequence of consecutive entries with no extra following bytes; extra
557+
preceeding bytes are preserved.
558+
#. Data between referenced entries is removed only when it appears to
559+
be a sequence of consecutive entries with no extra preceding bytes; extra
560+
following bytes are preserved.
557561

558562
``strict_descriptor=True`` can be provided to skip the slower scan for an
559563
unsigned data descriptor (deprecated in the latest ZIP specification and is
560564
only used by legacy tools) when checking for bytes resembling a valid local
561-
file entry before the first referenced entry. This improves performance,
562-
but may cause some stale local file entries to be preserved, as any entry
563-
using an unsigned descriptor cannot be detected.
565+
file entry. This improves performance, but may cause some stale local file
566+
entries to be preserved, as any entry using an unsigned descriptor cannot
567+
be detected.
564568

565569
*chunk_size* may be specified to control the buffer size when moving
566570
entry data (default is 1 MiB).

Lib/test/test_zipfile/test_core.py

Lines changed: 98 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1836,22 +1836,31 @@ def test_repack_file_entry_before_first_file(self):
18361836
with zipfile.ZipFile(TESTFN) as zh:
18371837
self.assertIsNone(zh.testzip())
18381838

1839-
def test_repack_bytes_between_files(self):
1840-
"""Should remove bytes between local file entries."""
1839+
def test_repack_bytes_before_removed_files(self):
1840+
"""Should preserve if there are bytes before stale local file entries."""
18411841
for ii in ([1], [1, 2], [2]):
18421842
with self.subTest(remove=ii):
18431843
# calculate the expected results
1844-
test_files = [data for j, data in enumerate(self.test_files) if j not in ii]
1845-
expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files)
1844+
with open(TESTFN, 'wb') as fh:
1845+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
1846+
for i, (file, data) in enumerate(self.test_files):
1847+
if i 6D40 == ii[0]:
1848+
fh.write(b' dummy bytes ')
1849+
zh.start_dir = fh.tell()
1850+
zh.writestr(file, data)
1851+
for i in ii:
1852+
zh.remove(self.test_files[i][0])
1853+
expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()]
18461854
expected_size = os.path.getsize(TESTFN)
18471855

18481856
# do the removal and check the result
18491857
with open(TESTFN, 'wb') as fh:
18501858
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
18511859
for i, (file, data) in enumerate(self.test_files):
1860+
if i == ii[0]:
1861+
fh.write(b' dummy bytes ')
1862+
zh.start_dir = fh.tell()
18521863
zh.writestr(file, data)
1853-
fh.write(b' dummy bytes ')
1854-
zh.start_dir = fh.tell()
18551864
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
18561865
for i in ii:
18571866
zh.remove(self.test_files[i][0])
@@ -1870,6 +1879,87 @@ def test_repack_bytes_between_files(self):
18701879
with zipfile.ZipFile(TESTFN) as zh:
18711880
self.assertIsNone(zh.testzip())
18721881

1882+
def test_repack_bytes_after_removed_files(self):
1883+
"""Should keep extra bytes if there are bytes after stale local file entries."""
1884+
for ii in ([1], [1, 2], [2]):
1885+
with self.subTest(remove=ii):
1886+
# calculate the expected results
1887+
with open(TESTFN, 'wb') as fh:
1888+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
1889+
for i, (file, data) in enumerate(self.test_files):
1890+
if i not in ii:
1891+
zh.writestr(file, data)
1892+
if i == ii[-1]:
1893+
fh.write(b' dummy bytes ')
1894+
zh.start_dir = fh.tell()
1895+
expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()]
1896+
expected_size = os.path.getsize(TESTFN)
1897+
1898+
# do the removal and check the result
1899+
with open(TESTFN, 'wb') as fh:
1900+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
1901+
for i, (file, data) in enumerate(self.test_files):
1902+
zh.writestr(file, data)
1903+
if i == ii[-1]:
1904+
fh.write(b' dummy bytes ')
1905+
zh.start_dir = fh.tell()
1906+
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
1907+
for i in ii:
1908+
zh.remove(self.test_files[i][0])
1909+
zh.repack()
1910+
1911+
# check infolist
1912+
self.assertEqual(
1913+
[ComparableZipInfo(zi) for zi in zh.infolist()],
1914+
expected_zinfos,
1915+
)
1916+
1917+
# check file size
1918+
self.assertEqual(os.path.getsize(TESTFN), expected_size)
1919+
1920+
# make sure the zip file is still valid
1921+
with zipfile.ZipFile(TESTFN) as zh:
1922+
self.assertIsNone(zh.testzip())
1923+
1924+
def test_repack_bytes_between_removed_files(self):
1925+
"""Should strip only local file entries before random bytes."""
1926+
# calculate the expected results
1927+
with open(TESTFN, 'wb') as fh:
1928+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
1929+
zh.writestr(*self.test_files[0])
1930+
fh.write(b' dummy bytes ')
1931+
zh.start_dir = fh.tell()
1932+
zh.writestr(*self.test_files[2])
1933+
zh.remove(self.test_files[2][0])
1934+
expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()]
1935+
expected_size = os.path.getsize(TESTFN)
1936+
1937+
# do the removal and check the result
1938+
with open(TESTFN, 'wb') as fh:
1939+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
1940+
zh.writestr(*self.test_files[0])
1941+
zh.writestr(*self.test_files[1])
1942+
fh.write(b' dummy bytes ')
1943+
zh.start_dir = fh.tell()
1944+
zh.writestr(*self.test_files[2])
1945+
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
1946+
zh.remove(self.test_files[1][0])
1947+
zh.remove(self.test_files[2][0])
1948+
zh.repack()
1949+
1950+
# check infolist
1951+
self.assertEqual(
1952+
[ComparableZipInfo(zi) for zi in zh.infolist()],
1953+
expected_zinfos,
1954+
)
1955+
1956+
# check file size
1957+
self.assertEqual(os.path.getsize(TESTFN), expected_size)
1958+
1959+
# make sure the zip file is still valid
1960+
with zipfile.ZipFile(TESTFN) as zh:
1961+
self.assertIsNone(zh.testzip())
1962+
18731963
def test_repack_zip64(self):
18741964
"""Should correctly handle file entries with zip64."""
18751965
for ii in ([0], [0, 1], [1], [2]):
@@ -2011,7 +2101,7 @@ def test_repack_data_descriptor_no_sig_strict(self):
20112101
if self.compression not in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA):
20122102
self.skipTest('require unsupported decompression method')
20132103

2014-
for ii in ([0], [0, 1]):
2104+
for ii in ([0], [0, 1], [1], [2]):
20152105
with self.subTest(remove=ii):
20162106
# calculate the expected results
20172107
with open(TESTFN, 'wb') as fh:
@@ -2055,7 +2145,7 @@ def test_repack_data_descriptor_no_sig_strict_by_decompressoin(self):
20552145
if self.compression in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA):
20562146
self.skipTest('require supported decompression method')
20572147

2058-
for ii in ([0], [0, 1]):
2148+
for ii in ([0], [0, 1], [1], [2]):
20592149
with self.subTest(remove=ii):
20602150
# calculate the expected results
20612151
test_files = [data for j, data in enumerate(self.test_files) if j not in ii]

Lib/zipfile/__init__.py

Lines changed: 69 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,66 +1385,78 @@ def repack(self, zfile, removed=None):
13851385
Assumes that local file entries are stored consecutively, with no gaps
13861386
or overlaps.
13871387
1388-
Stripping occurs in two phases:
1388+
Behavior:
13891389
1390-
1. Before the first recorded file entry:
1391-
- If a sequence of valid local file entries (starting with
1392-
`PK\x03\x04`) is found immediately before the first recorded
1393-
entry, it is stripped.
1394-
- Otherwise, all leading bytes are preserved (e.g., in cases such
1395-
as self-extracting archives or embedded ZIP payloads).
1390+
1. If any referenced entry overlaps with another, a `BadZipFile` error
1391+
is raised since safe repacking cannot be guaranteed.
13961392
1397-
2. Between or after the recorded entries:
1398-
- Any bytes between two recorded entries, or between the last
1399-
recorded and the central directory, are removed—regardless of
1400-
whether they resemble valid entries.
1393+
2. Data before the first referenced entry is stripped only when it
1394+
appears to be a sequence of consecutive entries with no extra
1395+
following bytes; extra preceeding bytes are preserved.
1396+
1397+
3. Data between referenced entries is stripped only when it appears to
1398+
be a sequence of consecutive entries with no extra preceding bytes;
1399+
extra following bytes are preserved.
1400+
1401+
4. This is to prevent an unexpected data removal (false positive),
1402+
though a false negative may happen in certain rare cases.
14011403
14021404
Examples:
14031405
1404-
Stripping before first recorded entry:
1406+
Stripping before the first referenced entry:
14051407
14061408
[random bytes]
1407-
[unreferenced local file entry 1]
1408-
[unreferenced local file entry 2]
1409+
[unreferenced local file entry]
14091410
[random bytes]
14101411
<-- stripping start
1411-
[unreferenced local file entry 3]
1412-
[unreferenced local file entry 4]
1412+
[unreferenced local file entry]
1413+
[unreferenced local file entry]
14131414
<-- stripping end
1414-
[recorded local file entry 1]
1415+
[local file entry 1] (or central directory)
14151416
...
1416-
[central directory]
14171417
1418-
Stripping between recorded entries:
1418+
Stripping between referenced entries:
14191419
14201420
...
1421-
[recorded local file entry 5]
1421+
[local file entry]
14221422
<-- stripping start
1423+
[unreferenced local file entry]
1424+
[unreferenced local file entry]
1425+
<-- stripping end
14231426
[random bytes]
14241427
[unreferenced local file entry]
14251428
[random bytes]
1426-
<-- stripping end
1427-
[recorded local file entry 6]
1429+
[local file entry] (or central directory)
14281430
...
1429-
[recorded local file entry n]
1430-
<-- stripping start
1431+
1432+
No stripping:
1433+
14311434
[unreferenced local file entry]
1432-
<-- stripping end
1433-
[central directory]
1435+
[random bytes]
1436+
[local file entry 1] (or central directory)
1437+
...
14341438
14351439
No stripping:
14361440
1437-
[unreferenced local file entry 1]
1438-
[unreferenced local file entry 2]
14391441
...
1440-
[unreferenced local file entry n]
1442+
[local file entry]
14411443
[random bytes]
1442-
[recorded local file entry 1]
1444+
[unreferenced local file entry]
1445+
[local file entry] (or central directory)
14431446
...
14441447
1445-
removed: None or a sequence of ZipInfo instances representing removed
1446-
entries. When provided, only their corresponding local file
1447-
entries are stripped.
1448+
Side effects:
1449+
- Modifies the ZIP file in place.
1450+
- Updates zfile.start_dir to account for removed data.
1451+
- Sets zfile._didModify to True.
1452+
- Adjusts header_offset and _end_offset of referenced ZipInfo
1453+
instances.
1454+
1455+
Parameters:
1456+
zfile: A ZipFile object representing the archive to repack.
1457+
removed: Optional. A sequence of ZipInfo instances representing
1458+
the previously removed entries. When provided, only their
1459+
corresponding local file entries are stripped.
14481460
"""
14491461
removed_zinfos = set(removed or ())
14501462

@@ -1512,17 +1524,34 @@ def repack(self, zfile, removed=None):
15121524
entry_offset += used_entry_size
15131525

15141526
else:
1527+
old_header_offset = zinfo.header_offset
1528+
zinfo.header_offset -= entry_offset
1529+
15151530
if entry_offset > 0:
1516-
old_header_offset = zinfo.header_offset
1517-
zinfo.header_offset -= entry_offset
15181531
self._copy_bytes(fp, old_header_offset, zinfo.header_offset, used_entry_size)
15191532

1520-
if zinfo._end_offset is not None:
1521-
zinfo._end_offset = zinfo.header_offset + used_entry_size
1522-
1523-
# update entry_offset for subsequent files to follow
15241533
if used_entry_size < entry_size:
1525-
entry_offset += entry_size - used_entry_size
1534+
stale_entry_size = self._validate_local_file_entry_sequence(
1535+
fp,
1536+
old_header_offset + used_entry_size,
1537+
old_header_offset + entry_size,
1538+
)
1539+
else:
1540+
stale_entry_size = 0
1541+
1542+
if stale_entry_size > 0:
1543+
self._copy_bytes(
1544+
fp,
1545+
old_header_offset + used_entry_size + stale_entry_size,
1546+
zinfo.header_offset + used_entry_size,
1547+
entry_size - used_entry_size - stale_entry_size,
1548+
)
1549+
1550+
# update entry_offset for subsequent files to follow
1551+
entry_offset += stale_entry_size
1552+
1553+
if zinfo._end_offset is not None:
1554+
zinfo._end_offset = zinfo.header_offset + entry_size - stale_entry_size
15261555

15271556
# update state
15281557
zfile.start_dir -= entry_offset

0 commit comments

Comments
 (0)
0