bpo-16995: add support for base32 extended hex (base32hex) by FFY00 · Pull Request #20441 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-16995: add support for base32 extended hex (base32hex) #20441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion Doc/library/base64.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ The modern interface provides:
whether a lowercase alphabet is acceptable as input. For security purposes,
the default is ``False``.

:rfc:`3548` allows for optional mapping of the digit 0 (zero) to the letter O
:rfc:`4648` allows for optional mapping of the digit 0 (zero) to the letter O
(oh), and for optional mapping of the digit 1 (one) to either the letter I (eye)
or letter L (el). The optional argument *map01* when not ``None``, specifies
which letter the digit 1 should be mapped to (when *map01* is not ``None``, the
Expand All @@ -136,6 +136,27 @@ The modern interface provides:
input.


.. function:: b32hexencode(s)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think both of these need ..versionadded:: 3.10

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right! Forgot :)


Similar to :func:`b32encode` but uses the Extended Hex Alphabet, as defined in
:rfc:`4648`.

.. versionadded:: 3.10


.. function:: b32hexdecode(s, casefold=False)

Similar to :func:`b32decode` but uses the Extended Hex Alphabet, as defined in
:rfc:`4648`.

This version does not allow the digit 0 (zero) to the letter O (oh) and digit
1 (one) to either the letter I (eye) or letter L (el) mappings; all these
characters are included in the Extended Hex Alphabet and are not
interchangeable.

.. versionadded:: 3.10


.. function:: b16encode(s)

Encode the :term:`bytes-like object` *s* using Base16 and return the
Expand Down
6 changes: 6 additions & 0 deletions Doc/whatsnew/3.10.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ New Modules
Improved Modules
================

base64
------

Add :func:`base64.b32hexencode` and :func:`base64.b32hexdecode` to support the
Base32 Encoding with Extended Hex Alphabet.

curses
------

Expand Down
86 changes: 55 additions & 31 deletions Lib/base64.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
'encode', 'decode', 'encodebytes', 'decodebytes',
# Generalized interface for other encodings
'b64encode', 'b64decode', 'b32encode', 'b32decode',
'b16encode', 'b16decode',
'b32hexencode', 'b32hexdecode', 'b16encode', 'b16decode',
# Base85 and Ascii85 encodings
'b85encode', 'b85decode', 'a85encode', 'a85decode',
# Standard Base64 encoding
Expand Down Expand Up @@ -135,19 +135,40 @@ def urlsafe_b64decode(s):


# Base32 encoding/decoding must be done in Python
# Docstring templates shared by the Base32 family of public functions.
# Each wrapper fills in {encoding} (and, for decoders, {extra_args}) with
# str.format() and assigns the result to its own __doc__.
_B32_ENCODE_DOCSTRING = '''
Encode the bytes-like object s using {encoding} and return a bytes object.
'''
_B32_DECODE_DOCSTRING = '''
Decode the {encoding} encoded bytes-like object or ASCII string s.

Optional casefold is a flag specifying whether a lowercase alphabet is
acceptable as input. For security purposes, the default is False.
{extra_args}
The result is returned as a bytes object. A binascii.Error is raised if
the input is incorrectly padded or if there are non-alphabet
characters present in the input.
'''
# Extra paragraph substituted for {extra_args} only by decoders that accept
# the map01 argument; decoders without it pass an empty string instead.
_B32_DECODE_MAP01_DOCSTRING = '''
RFC 4648 allows for optional mapping of the digit 0 (zero) to the
letter O (oh), and for optional mapping of the digit 1 (one) to
either the letter I (eye) or letter L (el). The optional argument
map01 when not None, specifies which letter the digit 1 should be
mapped to (when map01 is not None, the digit 0 is always mapped to
the letter O). For security purposes the default is None, so that
0 and 1 are not allowed in the input.
'''
_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
_b32tab2 = None
_b32rev = None
_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
_b32tab2 = {}
_b32rev = {}

def b32encode(s):
"""Encode the bytes-like object s using Base32 and return a bytes object.
"""
def _b32encode(alphabet, s):
global _b32tab2
# Delay the initialization of the table to not waste memory
# if the function is never called
if _b32tab2 is None:
b32tab = [bytes((i,)) for i in _b32alphabet]
_b32tab2 = [a + b for a in b32tab for b in b32tab]
if alphabet not in _b32tab2:
b32tab = [bytes((i,)) for i in alphabet]
_b32tab2[alphabet] = [a + b for a in b32tab for b in b32tab]
b32tab = None

if not isinstance(s, bytes_types):
Expand All @@ -158,7 +179,7 @@ def b32encode(s):
s = s + b'\0' * (5 - leftover) # Don't use += !
encoded = bytearray()
from_bytes = int.from_bytes
b32tab2 = _b32tab2
b32tab2 = _b32tab2[alphabet]
for i in range(0, len(s), 5):
c = from_bytes(s[i: i + 5], 'big')
encoded += (b32tab2[c >> 30] + # bits 1 - 10
Expand All @@ -177,29 +198,12 @@ def b32encode(s):
encoded[-1:] = b'='
return bytes(encoded)

def b32decode(s, casefold=False, map01=None):
"""Decode the Base32 encoded bytes-like object or ASCII string s.

Optional casefold is a flag specifying whether a lowercase alphabet is
acceptable as input. For security purposes, the default is False.

RFC 3548 allows for optional mapping of the digit 0 (zero) to the
letter O (oh), and for optional mapping of the digit 1 (one) to
either the letter I (eye) or letter L (el). The optional argument
map01 when not None, specifies which letter the digit 1 should be
mapped to (when map01 is not None, the digit 0 is always mapped to
the letter O). For security purposes the default is None, so that
0 and 1 are not allowed in the input.

The result is returned as a bytes object. A binascii.Error is raised if
the input is incorrectly padded or if there are non-alphabet
characters present in the input.
"""
def _b32decode(alphabet, s, casefold=False, map01=None):
global _b32rev
# Delay the initialization of the table to not waste memory
# if the function is never called
if _b32rev is None:
_b32rev = {v: k for k, v in enumerate(_b32alphabet)}
if alphabet not in _b32rev:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels mildly weird to me to hash the entire alphabet on every call, but I tried a different approach that uses sentinels (enums would also work) and found that it did not make an appreciable difference in speed, and this is more elegant — it has the nice property that you don't even need to change the code if additional base32 alphabets are added in the future — so I am in favor of keeping it like this.

Just thought I'd make a comment to explain my reasoning to future software archaeologists.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it has the nice property that you don't even need to change the code if additional base32 alphabets are added in the future

That was my goal :P

_b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)}
s = _bytes_from_decode_data(s)
if len(s) % 8:
raise binascii.Error('Incorrect padding')
Expand All @@ -220,7 +224,7 @@ def b32decode(s, casefold=False, map01=None):
padchars = l - len(s)
# Now decode the full quanta
decoded = bytearray()
b32rev = _b32rev
b32rev = _b32rev[alphabet]
for i in range(0, len(s), 8):
quanta = s[i: i + 8]
acc = 0
Expand All @@ -241,6 +245,26 @@ def b32decode(s, casefold=False, map01=None):
return bytes(decoded)


def b32encode(s):
    # Public entry point: delegate to the shared encoder with the standard
    # Base32 alphabet.  The docstring is attached from the template below.
    return _b32encode(alphabet=_b32alphabet, s=s)
b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32')

def b32decode(s, casefold=False, map01=None):
    # Public entry point: delegate to the shared decoder with the standard
    # Base32 alphabet, forwarding both optional flags unchanged.
    return _b32decode(_b32alphabet, s, casefold=casefold, map01=map01)
# Standard Base32 is the variant that documents the map01 paragraph.
b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(
    encoding='base32',
    extra_args=_B32_DECODE_MAP01_DOCSTRING,
)

def b32hexencode(s):
    # Public entry point: same encoder as b32encode, but driven by the
    # Extended Hex alphabet.
    return _b32encode(alphabet=_b32hexalphabet, s=s)
b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex')

def b32hexdecode(s, casefold=False):
    # base32hex does not have the 01 mapping, so map01 is never forwarded
    # and the shared decoder keeps its default for it.
    return _b32decode(_b32hexalphabet, s, casefold=casefold)
# No map01 paragraph for this variant: extra_args is left empty.
b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(
    encoding='base32hex',
    extra_args='',
)


# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
# lowercase. The RFC also recommends against accepting input case
# insensitively.
Expand Down
70 changes: 70 additions & 0 deletions Lib/test/test_base64.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,76 @@ def test_b32decode_error(self):
with self.assertRaises(binascii.Error):
base64.b32decode(data.decode('ascii'))

def test_b32hexencode(self):
    # Known-answer check of b32hexencode for inputs of 0 to 5 bytes, which
    # covers every padding length a single 5-byte quantum can produce.
    test_cases = [
        # to_encode, expected
        (b'', b''),
        (b'\x00', b'00======'),
        (b'a', b'C4======'),
        (b'ab', b'C5H0===='),
        (b'abc', b'C5H66==='),
        (b'abcd', b'C5H66P0='),
        (b'abcde', b'C5H66P35'),
    ]
    for to_encode, expected in test_cases:
        # Label the sub-test with the value actually being encoded so a
        # failure report names the right input.
        with self.subTest(to_encode=to_encode):
            self.assertEqual(base64.b32hexencode(to_encode), expected)

def test_b32hexencode_other_types(self):
self.check_other_types(base64.b32hexencode, b'abcd', b'C5H66P0=')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move this test and the next one into their own test functions.

self.check_encode_type_errors(base64.b32hexencode)

def test_b32hexdecode(self):
    # (encoded, decoded) reference pairs in the base32hex alphabet.
    vectors = [
        (b'', b''),
        (b'00======', b'\x00'),
        (b'C4======', b'a'),
        (b'C5H0====', b'ab'),
        (b'C5H66===', b'abc'),
        (b'C5H66P0=', b'abcd'),
        (b'C5H66P35', b'abcde'),
    ]
    # Uppercase input is decoded with casefold off and then on; the
    # lowercase spellings of the letter-bearing vectors are only tried
    # with casefold=True.
    scenarios = [(enc, dec, False) for enc, dec in vectors]
    scenarios += [(enc, dec, True) for enc, dec in vectors]
    scenarios += [(enc.lower(), dec, True) for enc, dec in vectors[2:]]
    for encoded, plain, fold in scenarios:
        with self.subTest(to_decode=encoded, casefold=fold):
            # Both bytes and ASCII str input must give the same result.
            for data in (encoded, encoded.decode('ascii')):
                self.assertEqual(base64.b32hexdecode(data, fold), plain)

def test_b32hexdecode_other_types(self):
    # Run the shared input-type helpers of this test class against the
    # base32hex decoder.
    decode = base64.b32hexdecode
    self.check_other_types(decode, b'C5H66===', b'abc')
    self.check_decode_type_errors(decode)

def test_b32hexdecode_error(self):
    # Every input collected here must be rejected with binascii.Error.
    bad_inputs = [b'abc', b'ABCDEF==', b'==ABCDEF', b'c4======']
    prefixes = (b'M', b'ME', b'MFRA', b'MFRGG', b'MFRGGZA', b'MFRGGZDF')
    for pad_len in range(17):
        padding = b'=' * pad_len
        # Runs of padding alone (an empty run is skipped).
        if pad_len:
            bad_inputs.append(padding)
        # Each prefix followed by that padding, except where the total
        # length would be exactly 8.
        bad_inputs.extend(
            prefix + padding
            for prefix in prefixes
            if len(prefix) + pad_len != 8
        )
    for data in bad_inputs:
        with self.subTest(to_decode=data):
            # The bytes form and the ASCII str form must both fail.
            for candidate in (data, data.decode('ascii')):
                with self.assertRaises(binascii.Error):
                    base64.b32hexdecode(candidate)


def test_b16encode(self):
eq = self.assertEqual
eq(base64.b16encode(b'\x01\x02\xab\xcd\xef'), b'0102ABCDEF')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add :func:`base64.b32hexencode` and :func:`base64.b32hexdecode` to support the
Base32 Encoding with Extended Hex Alphabet.
0