8000 bpo-41486: Faster bz2/lzma/zlib via new output buffering (GH-21740) · python/cpython@f9bedb6 · GitHub
[go: up one dir, main page]

Skip to content

Commit f9bedb6

Browse files
author
Ma Lin
authored
bpo-41486: Faster bz2/lzma/zlib via new output buffering (GH-21740)
Faster bz2/lzma/zlib via new output buffering. Also adds .readall() function to _compression.DecompressReader class to take best advantage of this in the consume-all-output at once scenario. Often a 5-20% speedup in common scenarios due to less data copying. Contributed by Ma Lin.
1 parent a5e6444 commit f9bedb6

File tree

7 files changed

+670
-254
lines changed

7 files changed

+670
-254
lines changed

Doc/whatsnew/3.10.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1243,6 +1243,12 @@ Optimizations
12431243
for more details. (Contributed by Victor Stinner and Pablo Galindo in
12441244
:issue:`38980`.)
12451245
1246+
* Use a new output buffer management code for :mod:`bz2` / :mod:`lzma` /
1247+
:mod:`zlib` modules, and add ``.readall()`` function to
1248+
``_compression.DecompressReader`` class. bz2 decompression 1.09x ~ 1.17x
1249+
faster, lzma decompression 1.20x ~ 1.32x faster, ``GzipFile.read(-1)`` 1.11x
1250+
~ 1.18x faster. (Contributed by Ma Lin, reviewed by Gregory P. Smith, in :issue:`41486`)
1251+
12461252
* Function parameters and their annotations are no longer computed at runtime,
12471253
but rather at compilation time. They are stored as a tuple of strings at the
12481254
bytecode level. It is now around 2 times faster to create a function with
Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
/*
2+
_BlocksOutputBuffer is used to maintain an output buffer
3+
that has unpredictable size. Suitable for compression/decompression
4+
API (bz2/lzma/zlib) that has stream->next_out and stream->avail_out:
5+
6+
stream->next_out: point to the next output position.
7+
stream->avail_out: the number of available bytes left in the buffer.
8+
9+
It maintains a list of bytes object, so there is no overhead of resizing
10+
the buffer.
11+
12+
Usage:
13+
14+
1, Initialize the struct instance like this:
15+
_BlocksOutputBuffer buffer = {.list = NULL};
16+
Set .list to NULL for _BlocksOutputBuffer_OnError()
17+
18+
2, Initialize the buffer use one of these functions:
19+
_BlocksOutputBuffer_InitAndGrow()
20+
_BlocksOutputBuffer_InitWithSize()
21+
22+
3, If (avail_out == 0), grow the buffer:
23+
_BlocksOutputBuffer_Grow()
24+
25+
4, Get the current outputted data size:
26+
_BlocksOutputBuffer_GetDataSize()
27+
28+
5, Finish the buffer, and return a bytes object:
29+
_BlocksOutputBuffer_Finish()
30+
31+
6, Clean up the buffer when an error occurred:
32+
_BlocksOutputBuffer_OnError()
33+
*/
34+
35+
#ifndef Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H
36+
#define Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H
37+
#ifdef __cplusplus
38+
extern "C" {
39+
#endif
40+
41+
#include "Python.h"
42+
43+
typedef struct {
44+
// List of bytes objects
45+
PyObject *list;
46+
// Number of whole allocated size
47+
Py_ssize_t allocated;
48+
// Max length of the buffer, negative number means unlimited length.
49+
Py_ssize_t max_length;
50+
} _BlocksOutputBuffer;
51+
52+
static const char unable_allocate_msg[] = "Unable to allocate output buffer.";
53+
54+
/* In 32-bit build, the max block size should <= INT32_MAX. */
55+
#define OUTPUT_BUFFER_MAX_BLOCK_SIZE (256*1024*1024)
56+
57+
/* Block size sequence */
58+
#define KB (1024)
59+
#define MB (1024*1024)
60+
const Py_ssize_t BUFFER_BLOCK_SIZE[] =
61+
{ 32*KB, 64*KB, 256*KB, 1*MB, 4*MB, 8*MB, 16*MB, 16*MB,
62+
32*MB, 32*MB, 32*MB, 32*MB, 64*MB, 64*MB, 128*MB, 128*MB,
63+
OUTPUT_BUFFER_MAX_BLOCK_SIZE };
64+
#undef KB
65+
#undef MB
66+
67+
/* According to the block sizes defined by BUFFER_BLOCK_SIZE, the whole
68+
allocated size growth step is:
69+
1 32 KB +32 KB
70+
2 96 KB +64 KB
71+
3 352 KB +256 KB
72+
4 1.34 MB +1 MB
73+
5 5.34 MB +4 MB
74+
6 13.34 MB +8 MB
75+
7 29.34 MB +16 MB
76+
8 45.34 MB +16 MB
77+
9 77.34 MB +32 MB
78+
10 109.34 MB +32 MB
79+
11 141.34 MB +32 MB
80+
12 173.34 MB +32 MB
81+
13 237.34 MB +64 MB
82+
14 301.34 MB +64 MB
83+
15 429.34 MB +128 MB
84+
16 557.34 MB +128 MB
85+
17 813.34 MB +256 MB
86+
18 1069.34 MB +256 MB
87+
19 1325.34 MB +256 MB
88+
20 1581.34 MB +256 MB
89+
21 1837.34 MB +256 MB
90+
22 2093.34 MB +256 MB
91+
...
92+
*/
93+
94+
/* Initialize the buffer, and grow the buffer.
95+
96+
max_length: Max length of the buffer, -1 for unlimited length.
97+
98+
On success, return allocated size (>=0)
99+
On failure, return -1
100+
*/
101+
static inline Py_ssize_t
102+
_BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer,
103+
const Py_ssize_t max_length,
104+
void **next_out)
105+
{
106+
PyObject *b;
107+
Py_ssize_t block_size;
108+
109+
// ensure .list was set to NULL
110+
assert(buffer->list == NULL);
111+
112+
// get block size
113+
if (0 <= max_length && max_length < BUFFER_BLOCK_SIZE[0]) {
114+
block_size = max_length;
115+
} else {
116+
block_size = BUFFER_BLOCK_SIZE[0];
117+
}
118+
119+
// the first block
120+
b = PyBytes_FromStringAndSize(NULL, block_size);
121+
if (b == NULL) {
122+
return -1;
123+
}
124+
125+
// create the list
126+
buffer->list = PyList_New(1);
127+
if (buffer->list == NULL) {
128+
Py_DECREF(b);
129+
return -1;
130+
}
131+
PyList_SET_ITEM(buffer->list, 0, b);
132+
133+
// set variables
134+
buffer->allocated = block_size;
135+
buffer->max_length = max_length;
136+
137+
*next_out = PyBytes_AS_STRING(b);
138+
return block_size;
139+
}
140+
141+
/* Initialize the buffer, with an initial size.
142+
143+
Check block size limit in the outer wrapper function. For example, some libs
144+
accept UINT32_MAX as the maximum block size, then init_size should <= it.
145+
146+
On success, return allocated size (>=0)
147+
On failure, return -1
148+
*/
149+
static inline Py_ssize_t
150+
_BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer,
151+
const Py_ssize_t init_size,
152+
void **next_out)
153+
{
154+
PyObject *b;
155+
156+
// ensure .list was set to NULL
157+
assert(buffer->list == NULL);
158+
159+
// the first block
160+
b = PyBytes_FromStringAndSize(NULL, init_size);
161+
if (b == NULL) {
162+
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
163+
return -1;
164+
}
165+
166+
// create the list
167+
buffer->list = PyList_New(1);
168+
if (buffer->list == NULL) {
169+
Py_DECREF(b);
170+
return -1;
171+
}
172+
PyList_SET_ITEM(buffer->list, 0, b);
173+
174+
// set variables
175+
buffer->allocated = init_size;
176+
buffer->max_length = -1;
177+
178+
*next_out = PyBytes_AS_STRING(b);
179+
return init_size;
180+
}
181+
182+
/* Grow the buffer. The avail_out must be 0, please check it before calling.
183+
184+
On success, return allocated size (>=0)
185+
On failure, return -1
186+
*/
187+
static inline Py_ssize_t
188+
_BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
189+
void **next_out,
190+
const Py_ssize_t avail_out)
191+
{
192+
PyObject *b;
193+
const Py_ssize_t list_len = Py_SIZE(buffer->list);
194+
Py_ssize_t block_size;
195+
196+
// ensure no gaps in the data
197+
if (avail_out != 0) {
198+
PyErr_SetString(PyExc_SystemError,
199+
"avail_out is non-zero in _BlocksOutputBuffer_Grow().");
200+
return -1;
201+
}
202+
203+
// get block size
204+
if (list_len < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) {
205+
block_size = BUFFER_BLOCK_SIZE[list_len];
206+
} else {
207+
block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1];
208+
}
209+
210+
// check max_length
211+
if (buffer->max_length >= 0) {
212+
// if (rest == 0), should not grow the buffer.
213+
Py_ssize_t rest = buffer->max_length - buffer->allocated;
214+
assert(rest > 0);
215+
216+
// block_size of the last block
217+
if (block_size > rest) {
218+
block_size = rest;
219+
}
220+
}
221+
222+
// check buffer->allocated overflow
223+
if (block_size > PY_SSIZE_T_MAX - buffer->allocated) {
224+
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
225+
return -1;
226+
}
227+
228+
// create the block
229+
b = PyBytes_FromStringAndSize(NULL, block_size);
230+
if (b == NULL) {
231+
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
232+
return -1;
233+
}
234+
if (PyList_Append(buffer->list, b) < 0) {
235+
Py_DECREF(b);
236+
return -1;
237+
}
238+
Py_DECREF(b);
239+
240+
// set variables
241+
buffer->allocated += block_size;
242+
243+
*next_out = PyBytes_AS_STRING(b);
244+
return block_size;
245+
}
246+
247+
/* Return the current outputted data size. */
248+
static inline Py_ssize_t
249+
_BlocksOutputBuffer_GetDataSize(_BlocksOutputBuffer *buffer,
250+
const Py_ssize_t avail_out)
251+
{
252+
return buffer->allocated - avail_out;
253+
}
254+
255+
/* Finish the buffer.
256+
257+
Return a bytes object on success
258+
Return NULL on failure
259+
*/
260+
static inline PyObject *
261+
_BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer,
262+
const Py_ssize_t avail_out)
263+
{
264+
PyObject *result, *block;
265+
const Py_ssize_t list_len = Py_SIZE(buffer->list);
266+
267+
// fast path for single block
268+
if ((list_len == 1 && avail_out == 0) ||
269+
(list_len == 2 && Py_SIZE(PyList_GET_ITEM(buffer->list, 1)) == avail_out))
270+
{
271+
block = PyList_GET_ITEM(buffer->list, 0);
272+
Py_INCREF(block);
273+
274+
Py_CLEAR(buffer->list);
275+
return block;
276+
}
277+
278+
// final bytes object
279+
result = PyBytes_FromStringAndSize(NULL, buffer->allocated - avail_out);
280+
if (result == NULL) {
281+
PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
282+
return NULL;
283+
}
284+
285+
// memory copy
286+
if (list_len > 0) {
287+
char *posi = PyBytes_AS_STRING(result);
288+
289+
// blocks except the last one
290+
Py_ssize_t i = 0;
291+
for (; i < list_len-1; i++) {
292+
block = PyList_GET_ITEM(buffer->list, i);
293+
memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block));
294+
posi += Py_SIZE(block);
295+
}
296+
// the last block
297+
block = PyList_GET_ITEM(buffer->list, i);
298+
memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block) - avail_out);
299+
} else {
300+
assert(Py_SIZE(result) == 0);
301+
}
302+
303+
Py_CLEAR(buffer->list);
304+
return result;
305+
}
306+
307+
/* Clean up the buffer when an error occurred. */
308+
static inline void
309+
_BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer)
310+
{
311+
Py_CLEAR(buffer->list);
312+
}
313+
314+
#ifdef __cplusplus
315+
}
316+
#endif
317+
#endif /* Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H */

Lib/_compression.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Internal classes used by the gzip, lzma and bz2 modules"""
22

33
import io
4-
4+
import sys
55

66
BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size
77

@@ -110,6 +110,16 @@ def read(self, size=-1):
110110
self._pos += len(data)
111111
return data
112112

113+
def readall(self):
114+
chunks = []
115+
# sys.maxsize means the max length of output buffer is unlimited,
116+
# so that the whole input buffer can be decompressed within one
117+
# .decompress() call.
118+
while data := self.read(sys.maxsize):
119+
chunks.append(data)
8CF5 120+
121+
return b"".join(chunks)
122+
113123
# Rewind the file to the beginning of the data stream.
114124
def _rewind(self):
115125
self._fp.seek(0)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Use a new output buffer management code for :mod:`bz2` / :mod:`lzma` /
2+
:mod:`zlib` modules, and add ``.readall()`` function to
3+
``_compression.DecompressReader`` class. These bring some performance
4+
improvements. Patch by Ma Lin.

0 commit comments

Comments
 (0)
0