gh-101178: refactor base64.b85encode to be memory friendly by romuald · Pull Request #112248 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-101178: refactor base64.b85encode to be memory friendly #112248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 9 additions & 26 deletions Lib/base64.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,27 +298,12 @@ def b16decode(s, casefold=False):

def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
# Helper function for a85encode and b85encode
# chars2 is now unused
if not isinstance(b, bytes_types):
b = memoryview(b).tobytes()

padding = (-len(b)) % 4
if padding:
b = b + b'\0' * padding
words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)

chunks = [b'z' if foldnuls and not word else
b'y' if foldspaces and word == 0x20202020 else
(chars2[word // 614125] +
chars2[word // 85 % 7225] +
chars[word % 85])
for word in words]

if padding and not pad:
if chunks[-1] == b'z':
chunks[-1] = chars[0] * 5
chunks[-1] = chunks[-1][:-padding]

return b''.join(chunks)
return binascii.b2a_base85(b, chars=chars, pad=pad,
foldnuls=foldnuls, foldspaces=foldspaces)

def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
"""Encode bytes-like object b using Ascii85 and return a bytes object.
Expand All @@ -337,14 +322,13 @@ def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
adobe controls whether the encoded byte sequence is framed with <~ and ~>,
which is used by the Adobe implementation.
"""
global _a85chars, _a85chars2
global _a85chars
# Delay the initialization of tables to not waste memory
# if the function is never called
if _a85chars2 is None:
if _a85chars is None:
_a85chars = [bytes((i,)) for i in range(33, 118)]
_a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]

result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)
result = _85encode(b, b''.join(_a85chars), None, pad, True, foldspaces)

if adobe:
result = _A85START + result
Expand Down Expand Up @@ -445,13 +429,12 @@ def b85encode(b, pad=False):
If pad is true, the input is padded with b'\\0' so its length is a multiple of
4 bytes before encoding.
"""
global _b85chars, _b85chars2
global _b85chars
# Delay the initialization of tables to not waste memory
# if the function is never called
if _b85chars2 is None:
if _b85chars is None:
_b85chars = [bytes((i,)) for i in _b85alphabet]
_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
return _85encode(b, _b85chars, _b85chars2, pad)
return _85encode(b, _b85alphabet, None, pad)

def b85decode(b):
"""Decode the base85-encoded bytes-like object or ASCII string b
Expand Down
1 change: 1 addition & 0 deletions Lib/test/test_base64.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ def test_b85encode(self):
b"""0123456789!@#0^&*();:<>,. []{}""":
b"""VPa!sWoBn+X=-b1ZEkOHadLBXb#`}nd3r%YLqtVJM@UIZOH55pPf$@("""
b"""Q&d$}S6EqEFflSSG&MFiI5{CeBQRbjDkv#CIy^osE+AW7dwl""",
b"paddu\xc7": b'aA9O*b;k',
b'no padding..': b'Zf_uPVPs@!Zf7no',
b'zero compression\x00\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG00000',
b'zero compression\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG0000',
Expand Down
88 changes: 88 additions & 0 deletions Modules/binascii.c
Original file line number Diff line number Diff line change
Expand Up @@ -1239,13 +1239,101 @@ binascii_b2a_qp_impl(PyObject *module, Py_buffer *data, int quotetabs,
return rv;
}

/*[clinic input]
binascii.b2a_base85
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would feel weird not to have binascii.a2b_base85, so I would suggest keeping it private for now. Ideally, base64 should have its own C accelerator module, but maybe that's overkill.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed to private

That is what delayed me initially: I had no idea how to add a module dedicated to base64, and I only found out last week that it uses the binascii one.


data: Py_buffer
chars: Py_buffer
pad: bool = False
foldnuls: bool = False
foldspaces: bool = False

Utility method used by the base64 module to encode a85/b85 data

data: bytes
chars: 85 bytes conversion table
pad: use NULL-padded input if necessary
foldnuls: replace NULL chunks by 'z'
foldspaces: replace space-only chunks by 'y'

[clinic start generated code]*/

static PyObject *
binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, Py_buffer *chars,
                         int pad, int foldnuls, int foldspaces)
/*[clinic end generated code: output=0a92b3c535580aa0 input=a2d8ae712ed5adba]*/
{
    /*
     * Encode `data` in base85 using the 85-byte alphabet `chars`.
     *
     * The input is consumed in big-endian 4-byte chunks; a short final chunk
     * is zero-padded on the right.  Each chunk becomes 5 alphabet characters,
     * except that an all-zero chunk becomes 'z' when `foldnuls` is set and a
     * chunk of four spaces (0x20202020) becomes 'y' when `foldspaces` is set.
     * Unless `pad` is set, the output is shortened by as many characters as
     * bytes of padding were added to the final chunk.
     *
     * Returns a new bytes object, or NULL with an exception set
     * (ValueError if `chars` is not exactly 85 bytes long).
     */
    if (chars->len != 85) {
        PyErr_SetString(PyExc_ValueError,
                        "chars must be exactly 85 bytes long");
        return NULL;
    }

    const size_t bin_len = data->len;

    // Every (possibly partial) 4-byte chunk yields at most 5 output bytes.
    // Refuse inputs whose worst-case output size would not fit in a
    // Py_ssize_t instead of handing an overflowed size to the allocator.
    if (bin_len > (size_t)(PY_SSIZE_T_MAX - 4) / 5 * 4) {
        return PyErr_NoMemory();
    }

    _PyBytesWriter writer;
    _PyBytesWriter_Init(&writer);

    // Allocate up to maximum encoded length, adjusted at end
    const size_t ascii_len = ((bin_len + 3) / 4) * 5;

    unsigned char *ascii_data = _PyBytesWriter_Alloc(&writer, ascii_len);
    if (ascii_data == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    const unsigned char *table = chars->buf;
    const unsigned char *bin_data = data->buf;

    // Whether the most recently encoded chunk was emitted as a folded 'z'.
    // Looking at the last output byte is not enough: the alphabet itself
    // may contain 'z', so a regular 5-character group can end with it too.
    int last_chunk_folded_nul = 0;

    size_t i, j;
    for (i = 0; i < bin_len; i += 4) {
        const size_t chunk_size = (bin_len - i >= 4) ? 4 : (bin_len - i);

        // translate chunk to 32bit integer (big-endian, zero-padded right)
        uint32_t value = 0;
        for (j = 0; j < chunk_size; j++) {
            value = (value << 8) | bin_data[i + j];
        }
        value <<= (4 - chunk_size) * 8;

        last_chunk_folded_nul = 0;
        if (foldnuls && value == 0) {
            *ascii_data++ = 'z';
            last_chunk_folded_nul = 1;
        }
        else if (foldspaces && value == 0x20202020) {
            *ascii_data++ = 'y';
        }
        else {
            // Emit the 5 base-85 digits, most significant first.
            for (j = 0; j < 5; j++) {
                ascii_data[4 - j] = table[value % 85];
                value /= 85;
            }
            ascii_data += 5;
        }
    }

    // In case `i` went over the input size, we may need to shorten the output
    const size_t overflow = (i - bin_len);

    // A folded 'z' cannot be truncated: expand it back to five "zero" digits
    // first so that dropping `overflow` characters below leaves the right
    // prefix.  The buffer was sized for 5 bytes per chunk, so this fits.
    if (overflow && !pad && last_chunk_folded_nul) {
        ascii_data--;
        memset(ascii_data, table[0], 5);
        ascii_data += 5;
    }

    if (!pad) {
        ascii_data -= overflow;
    }

    return _PyBytesWriter_Finish(&writer, ascii_data);
}

/* List of functions defined in the module */

static struct PyMethodDef binascii_module_methods[] = {
BINASCII_A2B_UU_METHODDEF
BINASCII_B2A_UU_METHODDEF
BINASCII_A2B_BASE64_METHODDEF
BINASCII_B2A_BASE64_METHODDEF
BINASCII_B2A_BASE85_METHODDEF
BINASCII_A2B_HEX_METHODDEF
BINASCII_B2A_HEX_METHODDEF
BINASCII_HEXLIFY_METHODDEF
Expand Down
111 changes: 110 additions & 1 deletion Modules/clinic/binascii.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
0