bpo-43613: Faster implementation of gzip.compress and gzip.decompress by rhpvorderman · Pull Request #27941 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-43613: Faster implementation of gzip.compress and gzip.decompress #27941

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Sep 2, 2021
Merged
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
608876c
Add test for zlib.compress wbits
rhpvorderman Mar 24, 2021
12fb5a5
Add wbits argument to zlib.compress
rhpvorderman Mar 24, 2021
1b34510
Use clinic to generate input
rhpvorderman Mar 24, 2021
4e90311
Update documentation for zlib
rhpvorderman Mar 24, 2021
9717252
Add blurb news entry for zlib.compress wbits parameter
rhpvorderman Mar 24, 2021
d70390e
Fix doc typo
rhpvorderman Mar 24, 2021
c819e3e
Remove unnecessary whitespace, add punctionation and complete sentences.
rhpvorderman Jun 1, 2021
1f3481f
Break line to comply with PEP-7
rhpvorderman Jun 1, 2021
8019932
Update blurb to include :func: reference
rhpvorderman Jun 1, 2021
0ea98cf
Remove erroneous double backticks
rhpvorderman Jun 1, 2021
d1c86dc
Faster gzip.compress implementation
rhpvorderman Aug 24, 2021
19a0358
More efficiently decompress gzip files in memory
rhpvorderman Aug 24, 2021
5155857
Ensure correct endianness
rhpvorderman Aug 24, 2021
fa188a6
Remove redundant line
rhpvorderman Aug 24, 2021
4e76cf5
Fix typos and test errors
rhpvorderman Aug 24, 2021
0280c95
Revert changing default on compress for backwards compatibility
rhpvorderman Aug 25, 2021
8ddee29
Update documentation with gzip speed improvements
rhpvorderman Aug 25, 2021
77f79fd
Add a blurb for gzip speed improvements
rhpvorderman Aug 25, 2021
ca3e543
Use + instead of bytes.join() method
rhpvorderman Aug 25, 2021
f881f7e
Reword docstring for read_gzip_header
rhpvorderman Aug 30, 2021
97a8100
Update docstring for gzip.compress
rhpvorderman Aug 30, 2021
eeb7766
Use subtest for zlib.compress/decompress test.
rhpvorderman Aug 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
More efficiently decompress gzip files in memory
  • Loading branch information
rhpvorderman committed Aug 25, 2021
commit 19a03589391d643a4c1fccdd2a106e8e9e0ccb08
121 changes: 73 additions & 48 deletions Lib/gzip.py
< 8000 td class="blob-code blob-code-deletion js-file-line"> while len(data) < n:
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,58 @@ def __iter__(self):
return self._buffer.__iter__()


def _read_exact(fp, n):
    '''Read exactly *n* bytes from `fp`

    This function is required because fp may be unbuffered,
    i.e. return short reads.

    Returns the *n* bytes read.  Raises EOFError if `fp` is exhausted
    before *n* bytes could be collected.
    '''
    data = fp.read(n)
    while len(data) < n:
        # Keep asking for the remainder.  This is a module-level function,
        # so the stream is the `fp` argument (there is no `self` here).
        b = fp.read(n - len(data))
        if not b:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        data += b
    return data


def _read_gzip_header(fp):
    '''Read a gzip header from the file object `fp` and advance the
    stream to the end of the header.

    Returns the mtime stored in the header, or None if the stream was
    already at EOF (no header present).

    Raises BadGzipFile if the magic number is wrong or the compression
    method is not 8, and EOFError (from _read_exact) if the stream ends
    inside a fixed-size header field.
    '''
    magic = fp.read(2)
    if magic == b'':
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    # Method byte, flag byte, 4-byte little-endian mtime; the two trailing
    # bytes (extra flags and OS in RFC 1952) are skipped.
    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Read & discard the extra field, if present
        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)
    if flag & FNAME:
        # Read and discard a null-terminated string containing the filename
        while True:
            s = fp.read(1)
            if not s or s == b'\000':
                break
    if flag & FCOMMENT:
        # Read and discard a null-terminated string containing a comment
        while True:
            s = fp.read(1)
            if not s or s == b'\000':
                break
    if flag & FHCRC:
        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
    return last_mtime


class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
Expand All @@ -415,53 +467,11 @@ def _init_read(self):
self._crc = zlib.crc32(b"")
self._stream_size = 0 # Decompressed size of unconcatenated stream

def _read_exact(self, n):
    '''Read exactly *n* bytes from `self._fp`

    This method is required because self._fp may be unbuffered,
    i.e. return short reads.

    Returns the *n* bytes read.  Raises EOFError if the stream is
    exhausted before *n* bytes could be collected.
    '''

    data = self._fp.read(n)
    # Only retry while bytes are still missing; without this guard a
    # complete first read would be followed by a zero-length read and
    # mistaken for end of file.
    while len(data) < n:
        b = self._fp.read(n - len(data))
        if not b:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        data += b
    return data

def _read_gzip_header(self):
    '''Parse the next gzip member header from `self._fp`.

    Delegates to the module-level _read_gzip_header() helper.  Returns
    False if no header was present (the helper returned None);
    otherwise stores the header's mtime in `self._last_mtime` and
    returns True.
    '''
    # The wrapped stream is kept in `_fp`, which is the attribute every
    # other read in this class goes through.
    last_mtime = _read_gzip_header(self._fp)
    if last_mtime is None:
        return False
    self._last_mtime = last_mtime
    return True

def read(self, size=-1):
Expand Down Expand Up @@ -524,7 +534,7 @@ def _read_eof(self):
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32, isize = struct.unpack("<II", self._read_exact(8))
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
if crc32 != self._crc:
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
hex(self._crc)))
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.

    All concatenated gzip members in *data* are decompressed and joined.
    Raises BadGzipFile if a member's CRC or length trailer does not match
    the decompressed data, and EOFError if a member is truncated.
    """
    decompressed_members = []
    while True:
        # Wrap the *remaining* bytes on every pass: `data` is rebound to
        # the unconsumed tail below, so the header parser and fp.tell()
        # must refer to that same buffer.
        fp = io.BytesIO(data)
        if _read_gzip_header(fp) is None:
            return b"".join(decompressed_members)
        # Use a zlib raw deflate decompressor
        do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        # Read all the data except the header
        decompressed = do.decompress(data[fp.tell():])
        # The 8-byte trailer (CRC32 + size) follows the deflate stream and
        # ends up in unused_data; without it the member is truncated.
        if not do.eof or len(do.unused_data) < 8:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        checksum, length = struct.unpack("<II", do.unused_data[:8])
        crc = zlib.crc32(decompressed)
        if crc != checksum:
            raise BadGzipFile("CRC check failed")
        # The stored size is the true size mod 2**32.
        if length != (len(decompressed) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")
        decompressed_members.append(decompressed)
        # Skip zero padding before the next member.  A bare lstrip() would
        # strip ASCII whitespace instead and leave NUL bytes in place.
        data = do.unused_data[8:].lstrip(b"\x00")


def main():
Expand Down
0