diff --git a/Lib/test/test_tools/msgfmt_data/fuzzy.mo b/Lib/test/test_tools/msgfmt_data/fuzzy.mo index 4b144831cf5f75..273edbbbd7cbd7 100644 Binary files a/Lib/test/test_tools/msgfmt_data/fuzzy.mo and b/Lib/test/test_tools/msgfmt_data/fuzzy.mo differ diff --git a/Lib/test/test_tools/msgfmt_data/general.mo b/Lib/test/test_tools/msgfmt_data/general.mo index ee905cbb3ec58d..3107f6711bdd10 100644 Binary files a/Lib/test/test_tools/msgfmt_data/general.mo and b/Lib/test/test_tools/msgfmt_data/general.mo differ diff --git a/Lib/test/test_tools/test_msgfmt.py b/Lib/test/test_tools/test_msgfmt.py index 7be606bbff606a..30872f05d6b752 100644 --- a/Lib/test/test_tools/test_msgfmt.py +++ b/Lib/test/test_tools/test_msgfmt.py @@ -21,6 +21,9 @@ with imports_under_tool("i18n"): import msgfmt +with imports_under_tool("i18n"): + from msgfmt import _hashpjw + def compile_messages(po_file, mo_file): assert_python_ok(msgfmt_py, '-o', mo_file, po_file) @@ -44,6 +47,27 @@ def test_compilation(self): self.assertDictEqual(actual._catalog, expected._catalog) + def test_hash_table(self): + # Check _hashpjw generates correct hash values + self.assertEqual(_hashpjw(b"stan"), 502398) + self.assertEqual(_hashpjw(b"foo"), 27999) + + # Check hash table is generated correctly for general.po + with temp_cwd(): + tmp_mo_file = "messages.mo" + compile_messages(data_dir / "general.po", tmp_mo_file) + with open(tmp_mo_file, "rb") as f: + mo_data = f.read() + + header = struct.unpack("=7I", mo_data[:28]) + hash_table_size, hash_table_offset = header[5:7] + + hash_tab = struct.unpack(f"={hash_table_size}I", + mo_data[hash_table_offset : hash_table_offset + (hash_table_size * 4)]) + + self.assertEqual(hash_tab, (1, 3, 0, 8, 9, 7, 2, 0, 4, 5, 0, 6, 0)) + + def test_binary_header(self): with temp_cwd(): tmp_mo_file = 'messages.mo' @@ -66,8 +90,8 @@ def test_binary_header(self): self.assertEqual(num_strings, 9) self.assertEqual(orig_table_offset, 28) self.assertEqual(trans_table_offset, 100) - 
self.assertEqual(hash_table_size, 0) - self.assertEqual(hash_table_offset, 0) + self.assertEqual(hash_table_size, 13) + self.assertEqual(hash_table_offset, 172) def test_translations(self): with open(data_dir / 'general.mo', 'rb') as f: diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst b/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst new file mode 100644 index 00000000000000..1e1647061b88c6 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst @@ -0,0 +1 @@ +:program:`msgfmt` now generates GNU hash tables. diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index cd5f1ed9f3e268..b57a7fa50870ca 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -5,8 +5,8 @@ This program converts a textual Uniforum-style message catalog (.po file) into a binary GNU catalog (.mo file). This is essentially the same function as the -GNU msgfmt program, however, it is a simpler implementation. Currently it -does not handle plural forms but it does handle message contexts. +GNU msgfmt program. Currently it does not handle plural forms but it does +handle message contexts. Usage: msgfmt.py [OPTIONS] filename.po @@ -60,21 +60,56 @@ def add(ctxt, id, str, fuzzy): def generate(): "Return the generated output." global MESSAGES + + def hash_insert_entry(string, i): + hash_val = _hashpjw(string) + hash_cursor = hash_val % hash_tab_size + inc = 1 + (hash_val % (hash_tab_size - 2)) + while hash_table[hash_cursor]: + hash_cursor += inc + hash_cursor %= hash_tab_size + hash_table[hash_cursor] = i + 1 + + # From [gettext.git]/gettext-tools/src/write-mo.c: + # Each string has an associate hashing value V, computed by a fixed + # function. To locate the string we use open addressing with double + # hashing. The first index will be V % M, where M is the size of the + # hashing table. If no entry is found, iterating with a second, + # independent hashing function takes place. 
This second value will + # be 1 + V % (M - 2). + # The approximate number of probes will be + # + # for unsuccessful search: (1 - N / M) ^ -1 + # for successful search: - (N / M) ^ -1 * ln (1 - N / M) + # + # where N is the number of keys. + # + # If we now choose M to be the next prime bigger than 4 / 3 * N, + # we get the values + # 4 and 1.85 resp. + # Because unsuccessful searches are unlikely this is a good value. + # Formulas: [Knuth, The Art of Computer Programming, Volume 3, + # Sorting and Searching, 1973, Addison Wesley] + hash_tab_size = _next_prime((len(MESSAGES) * 4) // 3) + if hash_tab_size <= 2: + hash_tab_size = 3 + hash_table = array.array("I", [0] * hash_tab_size) + # the keys are sorted in the .mo file keys = sorted(MESSAGES.keys()) offsets = [] ids = strs = b'' - for id in keys: + for i, id in enumerate(keys): # For each string, we need size and file offset. Each string is NUL # terminated; the NUL does not count into the size. + hash_insert_entry(id, i) offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) ids += id + b'\0' strs += MESSAGES[id] + b'\0' - output = '' - # The header is 7 32-bit unsigned integers. We don't use hash tables, so - # the keys start right after the index tables. - # translated string. - keystart = 7*4+16*len(keys) + + # The header is 7 32-bit unsigned integers, and we have an index table and + # hash table. 
+ keystart = 7*4+16*len(keys)+hash_tab_size*4 # and the values start after the keys valuestart = keystart + len(ids) koffsets = [] @@ -86,13 +121,15 @@ def generate(): voffsets += [l2, o2+valuestart] offsets = koffsets + voffsets output = struct.pack("Iiiiiii", - 0x950412de, # Magic - 0, # Version - len(keys), # # of entries - 7*4, # start of key index - 7*4+len(keys)*8, # start of value index - 0, 0) # size and offset of hash table + 0x950412de, # Magic + 0, # Version + len(keys), # # of entries + 7*4, # start of key index + 7*4+len(keys)*8, # start of value index + hash_tab_size, # size of hash table + 7 * 4 + 2 * (len(keys) * 8)) # offset of hash table output += array.array("i", offsets).tobytes() + output += hash_table.tobytes() output += ids output += strs return output @@ -258,5 +295,39 @@ def main(): make(filename, outfile) +# Utilities for writing hash table + +# Peter J. Weinberger hash function +# See: https://www.drdobbs.com/database/hashing-rehashed/184409859 +def _hashpjw(strs): + hval = 0 + for s in strs: + if not s: + break + hval <<= 4 + hval += s + g = hval & (0xF << 28) + if g: + hval ^= g >> 24 + hval ^= g + return hval + + +def _next_prime(start): + def is_prime(num): + divn = 3 + sq = divn * divn + while sq < num and num % divn != 0: + divn += 1 + sq += 4 * divn + divn += 1 + + return num % divn != 0 + + candidate = start | 1 + while not is_prime(candidate): + candidate += 2 + return candidate + if __name__ == '__main__': main()