8000 gh-131725: Generate GNU hash table in `msgfmt.py` by StanFromIreland · Pull Request #131727 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-131725: Generate GNU hash table in msgfmt.py #131727

8000
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Binary file modified Lib/test/test_tools/msgfmt_data/fuzzy.mo
Binary file not shown.
Binary file modified Lib/test/test_tools/msgfmt_data/general.mo
Binary file not shown.
28 changes: 26 additions & 2 deletions Lib/test/test_tools/test_msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
with imports_under_tool("i18n"):
import msgfmt

with imports_under_tool("i18n"):
from msgfmt import _hashpjw


def compile_messages(po_file, mo_file):
assert_python_ok(msgfmt_py, '-o', mo_file, po_file)
Expand All @@ -44,6 +47,27 @@ def test_compilation(sel 8000 f):

self.assertDictEqual(actual._catalog, expected._catalog)

def test_hash_table(self):
# Check _hashpjw generates correct hash values
self.assertEqual(_hashpjw(b"stan"), 502398)
self.assertEqual(_hashpjw(b"foo"), 27999)

# Check hash table is generated correctly for general.po
with temp_cwd():
tmp_mo_file = "messages.mo"
compile_messages(data_dir / "general.po", tmp_mo_file)
with open(tmp_mo_file, "rb") as f:
mo_data = f.read()

header = struct.unpack("=7I", mo_data[:28])
hash_table_size, hash_table_offset = header[5:7]

hash_tab = struct.unpack(f"={hash_table_size}I",
mo_data[hash_table_offset : hash_table_offset + (hash_table_size * 4)])
Comment on lines +62 to +66
Copy link
Contributor Author
@StanFromIreland StanFromIreland Mar 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could just hardcode these sizes since we test them below. Is that preferred?


self.assertEqual(hash_tab, (1, 3, 0, 8, 9, 7, 2, 0, 4, 5, 0, 6, 0))


def test_binary_header(self):
with temp_cwd():
tmp_mo_file = 'messages.mo'
Expand All @@ -66,8 +90,8 @@ def test_binary_header(self):
self.assertEqual(num_strings, 9)
self.assertEqual(orig_table_offset, 28)
self.assertEqual(trans_table_offset, 100)
self.assertEqual(hash_table_size, 0)
self.assertEqual(hash_table_offset, 0)
self.assertEqual(hash_table_size, 13)
self.assertEqual(hash_table_offset, 172)

def test_translations(self):
with open(data_dir / 'general.mo', 'rb') as f:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:program:`msgfmt` now generates GNU hash tables.
99 changes: 85 additions & 14 deletions Tools/i18n/msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file). This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation. Currently it
does not handle plural forms but it does handle message contexts.
GNU msgfmt program. Currently it does not handle plural forms but it does
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With #131880 it generates identical mo files so there is no real reason to have this now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The next sentence about plural forms is incorrect, it could be dropped at the occasion (see #112862).

handle message contexts.

Usage: msgfmt.py [OPTIONS] filename.po

Expand Down Expand Up @@ -60,21 +60,56 @@ def add(ctxt, id, str, fuzzy):
def generate():
"Return the generated output."
global MESSAGES

def hash_insert_entry(string, i):
hash_val = _hashpjw(string)
hash_cursor = hash_val % hash_tab_size
inc = 1 + (hash_val % (hash_tab_size - 2))
while hash_table[hash_cursor]:
hash_cursor += inc
hash_cursor %= hash_tab_size
hash_table[hash_cursor] = i + 1

# From [gettext.git]/gettext-tools/src/write-mo.c:
# Each string has an associate hashing value V, computed by a fixed
# function. To locate the string we use open addressing with double
# hashing. The first index will be V % M, where M is the size of the
# hashing table. If no entry is found, iterating with a second,
# independent hashing function takes place. This second value will
# be 1 + V % (M - 2).
# The approximate number of probes will be
#
# for unsuccessful search: (1 - N / M) ^ -1
# for successful search: - (N / M) ^ -1 * ln (1 - N / M)
#
# where N is the number of keys.
#
# If we now choose M to be the next prime bigger than 4 / 3 * N,
# we get the values
# 4 and 1.85 resp.
# Because unsuccessful searches are unlikely this is a good value.
# Formulas: [Knuth, The Art of Computer Programming, Volume 3,
# 766 Sorting and Searching, 1973, Addison Wesley]
hash_tab_size = _next_prime((len(MESSAGES) * 4) // 3)
if hash_tab_size <= 2:
hash_tab_size = 3
hash_table = array.array("I", [0] * hash_tab_size)

# the keys are sorted in the .mo file
keys = sorted(MESSAGES.keys())
offsets = []
ids = strs = b''
for id in keys:
for i, id in enumerate(keys):
# For each string, we need size and file offset. Each string is NUL
# terminated; the NUL does not count into the size.
hash_insert_entry(id, i)
offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
ids += id + b'\0'
strs += MESSAGES[id] + b'\0'
output = ''
# The header is 7 32-bit unsigned integers. We don't use hash tables, so
# the keys start right after the index tables.
# translated string.
keystart = 7*4+16*len(keys)

# The header is 7 32-bit unsigned integers, and we have an index table and
# hash table.
keystart = 7*4+16*len(keys)+hash_tab_size*4
# and the values start after the keys
valuestart = keystart + len(ids)
koffsets = []
Expand All @@ -86,13 +121,15 @@ def generate():
voffsets += [l2, o2+valuestart]
offsets = koffsets + voffsets
output = struct.pack("Iiiiiii",
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
0, 0) # size and offset of hash table
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
hash_tab_size, # size of hash table
7 * 4 + 2 * (len(keys) * 8)) # offset of hash table
output += array.array("i", offsets).tobytes()
output += hash_table.tobytes()
output += ids
output += strs
return output
Expand Down Expand Up @@ -258,5 +295,39 @@ def main():
make(filename, outfile)


# Utilities for writing hash table

# Peter J. Weinberger hash function
# See: https://www.drdobbs.com/database/hashing-rehashed/184409859
def _hashpjw(strs):
hval = 0
for s in strs:
if not s:
break
hval <<= 4
hval += s
g = hval & (0xF << 28)
if g:
hval ^= g >> 24
hval ^= g
return hval


def _next_prime(start):
def is_prime(num):
divn = 3
sq = divn * divn
while sq < num and num % divn != 0:
divn += 1
sq += 4 * divn
divn += 1

return num % divn != 0

candidate = start | 1
while not is_prime(candidate):
candidate += 2
return candidate

if __name__ == '__main__':
main()
0