Compact the characters of certain translations, so they fit in uint8_t · jepler/circuitpython@d16c951 · GitHub
[go: up one dir, main page]

Skip to content
8000

Commit d16c951

Browse files
committed
Compact the characters of certain translations, so they fit in uint8_t
This saves a few hundred bytes on the affected translations, such as `el` which shrunk from 186152 to 185588 bytes (564 bytes saved).
1 parent 9916b39 commit d16c951

File tree

2 files changed

+75
-8
lines changed

2 files changed

+75
-8
lines changed

py/maketranslationdata.py

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from __future__ import print_function
1111

1212
import bisect
13+
from dataclasses import dataclass
1314
import re
1415
import sys
1516

@@ -146,7 +147,40 @@ def iter_substrings(s, minlen, maxlen):
146147
yield s[begin : begin + n]
147148

148149

149-
translation_requires_uint16 = {"cs", "el", "fr", "ja", "ko", "pl", "ru", "tr", "zh_Latn_pinyin"}
150+
translation_requires_uint16 = {"cs", "fr", "ja", "ko", "pl", "tr", "zh_Latn_pinyin"}
151+
152+
153+
def compute_unicode_offset(texts):
    """Find a shift that packs the non-Latin-1 characters of *texts* into one byte.

    The characters above U+00FF are treated as one contiguous block running
    from the lowest to the highest such code point.  If that block fits
    between the highest used code point in the 160..255 range and 255, the
    block can be stored shifted down into that gap.

    Args:
        texts: iterable of strings to examine.

    Returns:
        A tuple ``(offstart, offset)``.  A stored code ``u`` with
        ``u >= offstart`` stands for the character ``u + offset``.
        ``(0, 0)`` means no shift is applied: either no character is above
        U+00FF, or the block does not fit below 256.
    """
    code_points = {ord(ch) for text in texts for ch in text}
    # U+00FF belongs to the low range: it is a single-byte value that the
    # shifted block must never start at or below, otherwise a genuine U+00FF
    # would be indistinguishable from a shifted character.
    low = [cp for cp in code_points if 160 <= cp <= 255]
    high = [cp for cp in code_points if cp > 255]
    if not high:
        return 0, 0

    lowest_high = min(high)
    span = max(high) - lowest_high + 1

    if low:
        # Start the shifted block right after the last low character in use.
        offstart = max(low) + 1
    else:
        # Nothing in 160..255 is in use: start at 255 - span, but never
        # below 160.
        offstart = max(160, 255 - span)

    # The shifted block occupies offstart .. offstart + span - 1 and has to
    # stay within 0..255.
    if offstart + span > 256:
        return 0, 0

    return offstart, lowest_high - offstart
173+
174+
175+
@dataclass
class EncodingTable:
    """Tables and helper callables returned by compute_huffman_coding()."""

    # Symbols of the code table; written out as the C array `values` after
    # each one is passed through remove_offset().
    values: object
    # Sequence of small integers written out as the C array `lengths`.
    lengths: object
    # Multi-character dictionary words; their characters are written out,
    # concatenated, as the C array `words`.
    words: object
    # Mapping from an atom (single character or word) to its code, stored as
    # a zero-padded string of binary digits.
    canonical: object
    # Read by compress(); its construction is not visible in this diff --
    # NOTE(review): presumably splits a string into atoms, confirm.
    extractor: object
    # Callable taking one character; adds `offset` to code points at or above
    # `offstart`.
    apply_offset: object
    # Callable taking one character; subtracts `offset` from code points at or
    # above `offstart`.
    remove_offset: object
150184

151185

152186
def compute_huffman_coding(translation_name, translations, f):
@@ -156,8 +190,26 @@ def compute_huffman_coding(translation_name, translations, f):
156190
start_unused = 0x80
157191
end_unused = 0xFF
158192
max_ord = 0
193+
offstart, offset = compute_unicode_offset(texts)
194+
195+
def apply_offset(c):
    """Return *c* with `offset` added to its code point when it is >= `offstart`.

    Characters whose code point is below `offstart` come back unchanged.
    """
    code_point = ord(c)
    shifted = code_point + offset if code_point >= offstart else code_point
    return chr(shifted)
200+
201+
def remove_offset(c):
    """Return *c* with `offset` subtracted from its code point when it is >= `offstart`.

    Characters whose code point is below `offstart` come back unchanged.

    Raises:
        ValueError: if the shifted code point is not a valid argument to
            chr(); the message reports `offstart` and the shifted value.
    """
    oc = ord(c)
    if oc >= offstart:
        oc = oc - offset
    try:
        return chr(oc)
    except (ValueError, OverflowError) as e:
        # chr() rejects values outside the Unicode range; re-raise with the
        # numbers needed to diagnose which character was out of range.
        raise ValueError(f"remove_offset {offstart=} {oc=}") from e
209+
159210
for text in texts:
160211
for c in text:
212+
c = remove_offset(c)
161213
ord_c = ord(c)
162214
max_ord = max(ord_c, max_ord)
163215
if 0x80 <= ord_c < 0xFF:
@@ -276,15 +328,17 @@ def est_net_savings(s, occ):
276328
length_count[length] += 1
277329
if last_length:
278330
renumbered <<= length - last_length
279-
canonical[atom] = "{0:0{width}b}".format(renumbered, width=length)
280331
# print(f"atom={repr(atom)} code={code}", file=sys.stderr)
332+
canonical[atom] = "{0:0{width}b}".format(renumbered, width=length)
281333
if len(atom) > 1:
282334
o = words.index(atom) + 0x80
283335
s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
336+
f.write(f"// {o} {s} {counter[atom]} {canonical[atom]} {renumbered}\n")
284337
else:
285338
s = C_ESCAPES.get(atom, atom)
339+
canonical[atom] = "{0:0{width}b}".format(renumbered, width=length)
286340
o = ord(atom)
287-
f.write(f"// {o} {s} {counter[atom]} {canonical[atom]} {renumbered}\n")
341+
f.write(f"// {o} {s} {counter[atom]} {canonical[atom]} {renumbered}\n")
288342
renumbered += 1
289343
last_length = length
290344
lengths = bytearray()
@@ -306,28 +360,37 @@ def est_net_savings(s, occ):
306360

307361
f.write("typedef {} mchar_t;\n".format(values_type))
308362
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
309-
f.write("const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values)))
363+
f.write(
364+
"const mchar_t values[] = {{ {} }};\n".format(
365+
", ".join(str(ord(remove_offset(u))) for u in values)
366+
)
367+
)
310368
f.write(
311369
"#define compress_max_length_bits ({})\n".format(
312370
max_translation_encoded_length.bit_length()
313371
)
314372
)
315373
f.write(
316374
"const mchar_t words[] = {{ {} }};\n".format(
317-
", ".join(str(ord(c)) for w in words for c in w)
375+
", ".join(str(ord(remove_offset(c))) for w in words for c in w)
318376
)
319377
)
320378
f.write("const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount)))
321379
f.write("#define word_start {}\n".format(word_start))
322380
f.write("#define word_end {}\n".format(word_end))
323381
f.write("#define minlen {}\n".format(minlen))
324382
f.write("#define maxlen {}\n".format(maxlen))
383+
f.write("#define offstart {}\n".format(offstart))
384+
f.write("#define offset {}\n".format(offset))
325385

326-
return (values, lengths, words, canonical, extractor)
386+
return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
327387

328388

329389
def decompress(encoding_table, encoded, encoded_length_bits):
330-
(values, lengths, words, _, _) = encoding_table
390+
values = encoding_table.values
391+
lengths = encoding_table.lengths
392+
words = encoding_table.words
393+
331394
dec = []
332395
this_byte = 0
333396
this_bit = 7
@@ -385,7 +448,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
385448
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
386449
if not isinstance(decompressed, str):
387450
raise TypeError()
388-
(_, _, _, canonical, extractor) = encoding_table
451+
canonical = encoding_table.canonical
452+
extractor = encoding_table.extractor
389453

390454
enc = bytearray(len(decompressed) * 3)
391455
current_bit = 7

supervisor/shared/translate/translate.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ STATIC void get_word(int n, const mchar_t **pos, const mchar_t **end) {
5757
}
5858

5959
STATIC int put_utf8(char *buf, int u) {
60+
if (u >= offstart) {
61+
u += offset;
62+
}
6063
if (u <= 0x7f) {
6164
*buf = u;
6265
return 1;

0 commit comments

Comments
 (0)
0