8000 gh-131725: Generate GNU hash table in `msgfmt.py` by StanFromIreland · Pull Request #131727 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-131725: Generate GNU hash table in msgfmt.py #131727

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Binary file modified Lib/test/test_tools/msgfmt_data/fuzzy.mo
Binary file not shown.
Binary file modified Lib/test/test_tools/msgfmt_data/general.mo
Binary file not shown.
9 changes: 6 additions & 3 deletions Lib/test/test_tools/test_msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def test_compilation(self):

self.assertDictEqual(actual._catalog, expected._catalog)

def test_hash_table(self):
pass

def test_binary_header(self):
with temp_cwd():
tmp_mo_file = 'messages.mo'
Expand All @@ -63,8 +66,8 @@ def test_binary_header(self):
self.assertEqual(num_strings, 9)
self.assertEqual(orig_table_offset, 28)
self.assertEqual(trans_table_offset, 100)
self.assertEqual(hash_table_size, 0)
self.assertEqual(hash_table_offset, 0)
self.assertEqual(hash_table_size, 13)
self.assertEqual(hash_table_offset, 172)

def test_translations(self):
with open(data_dir / 'general.mo', 'rb') as f:
Expand Down Expand Up @@ -152,7 +155,7 @@ def test_version(self):
for option in ('--version', '-V'):
res = assert_python_ok(msgfmt, option)
out = res.out.decode('utf-8').strip()
self.assertEqual('msgfmt.py 1.2', out)
self.assertEqual('msgfmt.py 1.3', out)

def test_invalid_option(self):
res = assert_python_failure(msgfmt, '--invalid-option')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:program:`msgfmt` now generates GNU hash tables.
99 changes: 84 additions & 15 deletions Tools/i18n/msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file). This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation. Currently it
does not handle plural forms but it does handle message contexts.
GNU msgfmt program. Currently it does not handle plural forms but it does
handle message contexts.

Usage: msgfmt.py [OPTIONS] filename.po

Expand Down Expand Up @@ -34,7 +34,7 @@
from email.parser import HeaderParser
import codecs

__version__ = "1.2"
__version__ = "1.3"


MESSAGES = {}
Expand All @@ -60,21 +60,56 @@ def add(ctxt, id, str, fuzzy):
def generate():
"Return the generated output."
global MESSAGES

def hash_insert_entry(string, i):
hash_val = hashpjw(string)
hash_cursor = hash_val % hash_tab_size
inc = 1 + (hash_val % (hash_tab_size - 2))
while hash_table[hash_cursor]:
hash_cursor += inc
hash_cursor %= hash_tab_size
hash_table[hash_cursor] = i + 1

# From [gettext.git]/gettext-tools/src/write-mo.c:
# Each string has an associate hashing value V, computed by a fixed
# function. To locate the string we use open addressing with double
# hashing. The first index will be V % M, where M is the size of the
# hashing table. If no entry is found, iterating with a second,
# independent hashing function takes place. This second value will
# be 1 + V % (M - 2).
# The approximate number of probes will be
#
# for unsuccessful search: (1 - N / M) ^ -1
# for successful search: - (N / M) ^ -1 * ln (1 - N / M)
#
# where N is the number of keys.
#
# If we now choose M to be the next prime bigger than 4 / 3 * N,
# we get the values
# 4 and 1.85 resp.
# Because unsuccessful searches are unlikely this is a good value.
# Formulas: [Knuth, The Art of Computer Programming, Volume 3,
# 766 Sorting and Searching, 1973, Addison Wesley]
hash_tab_size = next_prime((len(MESSAGES) * 4) // 3)
if hash_tab_size <= 2:
hash_tab_size = 3
hash_table = array.array("I", [0] * hash_tab_size)

# the keys are sorted in the .mo file
keys = sorted(MESSAGES.keys())
offsets = []
ids = strs = b''
for id in keys:
for i, id in enumerate(keys):
# For each string, we need size and file offset. Each string is NUL
# terminated; the NUL does not count into the size.
hash_insert_entry(id, i)
offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
ids += id + b'\0'
strs += MESSAGES[id] + b'\0'
output = ''
# The header is 7 32-bit unsigned integers. We don't use hash tables, so
# the keys start right after the index tables.
# translated string.
keystart = 7*4+16*len(keys)

# The header is 7 32-bit unsigned integers, and we have an index table and
# hash table.
keystart = 7*4+16*len(keys)+hash_tab_size*4
# and the values start after the keys
valuestart = keystart + len(ids)
koffsets = []
Expand All @@ -86,13 +121,15 @@ def generate():
voffsets += [l2, o2+valuestart]
offsets = koffsets + voffsets
output = struct.pack("Iiiiiii",
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
0, 0) # size and offset of hash table
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
hash_tab_size, # size of hash table
7 * 4 + 2 * (len(keys) * 8)) # offset of hash table
output += array.array("i", offsets).tobytes()
output += hash_table.tobytes()
output += ids
output += strs
return output
Expand Down Expand Up @@ -253,5 +290,37 @@ def main():
make(filename, outfile)


# Utilities for writing hash table

def hashpjw(str_param):
hval = 0
for s in str_param:
if not s:
break
hval <<= 4
hval += s
g = hval & 0xF << 28
if g:
hval ^= g >> 24
hval ^= g
return hval


def next_prime(start):
def is_prime(num):
divn = 3
sq = divn * divn
while sq < num and num % divn != 0:
divn += 1
sq += 4 * divn
divn += 1

return num % divn != 0

candidate = start | 1
while not is_prime(candidate):
candidate += 2
return candidate

if __name__ == '__main__':
main()
Loading
0