@@ -146,7 +146,10 @@ def iter_substrings(s, minlen, maxlen):
             yield s[begin : begin + n]
 
 
-def compute_huffman_coding(translations, f):
+translation_requires_uint16 = {"cs", "el", "fr", "ja", "ko", "pl", "ru", "tr", "zh_Latn_pinyin"}
+
+
+def compute_huffman_coding(translation_name, translations, f):
     texts = [t[1] for t in translations]
     words = []
@@ -163,6 +166,12 @@ def compute_huffman_coding(translations, f):
     bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
+    translation_name = translation_name.split("/")[-1].split(".")[0]
+    if max_ord > 255 and translation_name not in translation_requires_uint16:
+        raise ValueError(
+            f"Translation {translation_name} expected to fit in 8 bits but required 16 bits"
+        )
+
     while len(words) < max_words:
         # Until the dictionary is filled to capacity, use a heuristic to find
         # the best "word" (2- to 11-gram) to add to it.
@@ -522,5 +531,7 @@ def output_translation_data(encoding_table, i18ns, out):
     i18ns = parse_input_headers(args.infiles)
     i18ns = sorted(i18ns)
     translations = translate(args.translation, i18ns)
-    encoding_table = compute_huffman_coding(translations, args.compression_filename)
+    encoding_table = compute_huffman_coding(
+        args.translation, translations, args.compression_filename
+    )
     output_translation_data(encoding_table, translations, args.translation_filename)
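The guard added above can be exercised on its own. The sketch below restates it with a hypothetical helper name (check_codepoint_width) and made-up locale paths; in the script itself, max_ord is computed from the translated strings and the check runs inside compute_huffman_coding, which now receives args.translation (presumably the path to the locale's .po file) so it can derive the bare locale name.

# Minimal sketch of the new width check, not part of the diff.
# "locale/ja.po" and "locale/de_DE.po" are hypothetical example paths.
translation_requires_uint16 = {"cs", "el", "fr", "ja", "ko", "pl", "ru", "tr", "zh_Latn_pinyin"}


def check_codepoint_width(translation_name, max_ord):
    # Reduce a path like "locale/de_DE.po" to the bare locale name "de_DE".
    translation_name = translation_name.split("/")[-1].split(".")[0]
    if max_ord > 255 and translation_name not in translation_requires_uint16:
        raise ValueError(
            f"Translation {translation_name} expected to fit in 8 bits but required 16 bits"
        )


check_codepoint_width("locale/ja.po", 0x3042)  # ok: ja is expected to need 16-bit codepoints
check_codepoint_width("locale/de_DE.po", 0x201C)  # raises ValueError: de_DE should stay within 8 bits

The hard failure presumably exists to catch a locale that unexpectedly drifts from 8-bit to 16-bit codepoint values, since values_type doubling to uint16_t grows the compressed translation tables.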