8000 Merge pull request #8327 from jepler/translation-compression-qstr · n3o59hf/circuitpython@e0fa155 · GitHub 8000
[go: up one dir, main page]

Skip to content

Commit e0fa155

Browse files
authored
Merge pull request adafruit#8327 from jepler/translation-compression-qstr
Use qstrs to improve compression
2 parents 000d22f + 5c23e28 commit e0fa155

File tree

4 files changed

+161
-102
lines changed

4 files changed

+161
-102
lines changed

py/maketranslationdata.py

Lines changed: 107 additions & 85 deletions
410
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""
22
Process raw qstr file and output qstr data with length, hash and data bytes.
33
4-
This script works with Python 2.7, 3.3 and 3.4.
4+
This script is only regularly tested with the same version of Python used
5+
during CI, typically the latest "3.x". However, incompatibilities with any
6+
supported CPython version are unintended.
57
68
For documentation about the format of compressed translated strings, see
79
supervisor/shared/translate/translate.h
@@ -16,31 +18,16 @@
1618

1719
import collections
1820
import gettext
19-
import os.path
21+
import pathlib
2022

2123
if hasattr(sys.stdout, "reconfigure"):
2224
sys.stdout.reconfigure(encoding="utf-8")
2325
sys.stderr.reconfigure(errors="backslashreplace")
2426

25-
py = os.path.dirname(sys.argv[0])
26-
top = os.path.dirname(py)
27-
28-
sys.path.append(os.path.join(top, "tools/huffman"))
27+
sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))
2928

3029
import huffman
31-
32-
# Python 2/3 compatibility:
33-
# - iterating through bytes is different
34-
# - codepoint2name lives in a different module
35-
import platform
36-
37-
if platform.python_version_tuple()[0] == "2":
38-
bytes_cons = lambda val, enc=None: bytearray(val)
39-
from htmlentitydefs import codepoint2name
40-
elif platform.python_version_tuple()[0] == "3":
41-
bytes_cons = bytes
42-
from html.entities import codepoint2name
43-
# end compatibility code
30+
from html.entities import codepoint2name
4431

4532
codepoint2name[ord("-")] = "hyphen"
4633

@@ -182,9 +169,15 @@ class EncodingTable:
182169
extractor: object
183170
apply_offset: object
184171
remove_offset: object
172+
translation_qstr_bits: int
173+
qstrs: object
174+
qstrs_inv: object
185175

186176

187-
def compute_huffman_coding(translation_name, translations, f):
177+
def compute_huffman_coding(qstrs, translation_name, translations, f):
178+
# possible future improvement: some languages are better when consider len(k) > 2. try both?
179+
qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
180+
qstr_strs = list(qstrs.keys())
188181
texts = [t[1] for t in translations]
189182
words = []
190183

@@ -234,10 +227,12 @@ def remove_offset(c):
234227
# if "the" is in words then not only will "the" not be considered
235228
# again, neither will "there" or "wither", since they have "the"
236229
# as substrings.
237-
extractor = TextSplitter(words)
230+
extractor = TextSplitter(words + qstr_strs)
238231
counter = collections.Counter()
239232
for t in texts:
240233
for atom in extractor.iter(t):
234+
if atom in qstrs:
235+
atom = "\1"
241236
counter[atom] += 1
242237
cb = huffman.codebook(counter.items())
243238
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
@@ -304,10 +299,14 @@ def est_net_savings(s, occ):
304299
words.append(word)
305300

306301
words.sort(key=len)
307-
extractor = TextSplitter(words)
302+
extractor = TextSplitter(words + qstr_strs)
308303
counter = collections.Counter()
304+
used_qstr = 0
309305
for t in texts:
310306
for atom in extractor.iter(t):
307+
if atom in qstrs:
308+
used_qstr = max(used_qstr, qstrs[atom])
309+
atom = "\1"
311310
counter[atom] += 1
312311
cb = huffman.codebook(counter.items())
313312

@@ -322,6 +321,8 @@ def est_net_savings(s, occ):
322321
last_length = None
323322
canonical = {}
324323
for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
324+
if atom in qstr_strs:
325+
atom = "\1"
325326
values.append(atom)
326327
length = len(code)
327328
if length not in length_count:
@@ -359,6 +360,8 @@ def est_net_savings(s, occ):
359360
minlen = len(words[0])
360361
wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
361362

363+
translation_qstr_bits = used_qstr.bit_length()
364+
362365
f.write("typedef {} mchar_t;\n".format(values_type))
363366
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
364367
f.write(
@@ -383,34 +386,44 @@ def est_net_savings(s, occ):
383386
f.write("#define maxlen {}\n".format(maxlen))
384387
f.write("#define translation_offstart {}\n".format(offstart))
385388
f.write("#define translation_offset {}\n".format(offset))
386-
387-
return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
389+
f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))
390+
391+
qstrs_inv = dict((v, k) for k, v in qstrs.items())
392+
return EncodingTable(
393+
values,
394+
lengths,
395+
words,
396+
canonical,
397+
extractor,
398+
apply_offset,
399+
remove_offset,
400+
translation_qstr_bits,
401+
qstrs,
402+
qstrs_inv,
403+
)
388404

389405

390406
def decompress(encoding_table, encoded, encoded_length_bits):
407+
qstrs_inv = encoding_table.qstrs_inv
391408
values = encoding_table.values
392409
lengths = encoding_table.lengths
393410
words = encoding_table.words
394411

412+
def bititer():
413+
for byte in encoded:
414+
for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
415+
yield bool(byte & bit)
416+
417+
nextbit = bititer().__next__
418+
419+
def getnbits(n):
420+
bits = 0
421+
for i in range(n):
422+
bits = (bits << 1) | nextbit()
423+
return bits
424+
395425
dec = []
396-
this_byte = 0
397-
this_bit = 7
398-
b = encoded[this_byte]
399-
bits = 0
400-
for i in range(encoded_length_bits):
401-
bits <<= 1
402-
if 0x80 & b:
403-
bits |= 1
404-
405-
b <<= 1
406-
if this_bit == 0:
407-
this_bit = 7
408-
this_byte += 1
409-
if this_byte < len(encoded):
-
b = encoded[this_byte]
411-
else:
412-
this_bit -= 1
413-
length = bits
426+
length = getnbits(encoded_length_bits)
414427

415428
i = 0
416429
while i < length:
@@ -419,27 +432,19 @@ def decompress(encoding_table, encoded, encoded_length_bits):
419432
max_code = lengths[0]
420433
searched_length = lengths[0]
421434
while True:
422-
bits <<= 1
423-
if 0x80 & b:
424-
bits |= 1
425-
426-
b <<= 1
435+
bits = (bits << 1) | nextbit()
427436
bit_length += 1
428-
if this_bit == 0:
429-
this_bit = 7
430-
this_byte += 1
431-
if this_byte < len(encoded):
432-
b = encoded[this_byte]
433-
else:
434-
this_bit -= 1
435437
if max_code > 0 and bits < max_code:
436438
# print('{0:0{width}b}'.format(bits, width=bit_length))
437439
break
438440
max_code = (max_code << 1) + lengths[bit_length]
439441
searched_length += lengths[bit_length]
440442

441443
v = values[searched_length + bits - max_code]
442-
if v >= chr(0x80) and v < chr(0x80 + len(words)):
444+
if v == chr(1):
445+
qstr_idx = getnbits(encoding_table.translation_qstr_bits)
446+
v = qstrs_inv[qstr_idx]
447+
elif v >= chr(0x80) and v < chr(0x80 + len(words)):
443448
v = words[ord(v) - 0x80]
444449
i += len(v.encode("utf-8"))
445450
dec.append(v)
@@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
449454
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
450455
if not isinstance(decompressed, str):
451456
raise TypeError()
457+
qstrs = encoding_table.qstrs
452458
canonical = encoding_table.canonical
453459
extractor = encoding_table.extractor
454460

455-
enc = bytearray(len(decompressed) * 3)
456-
current_bit = 7
457-
current_byte = 0
458-
459-
bits = encoded_length_bits + 1
460-
for i in range(bits - 1, 0, -1):
461-
if len_translation_encoded & (1 << (i - 1)):
462-
enc[current_byte] |= 1 << current_bit
463-
if current_bit == 0:
464-
current_bit = 7
465-
current_byte += 1
466-
else:
467-
current_bit -= 1
461+
enc = 1
462+
463+
def put_bit(enc, b):
464+
return (enc << 1) | bool(b)
465+
466+
def put_bits(enc, b, n):
467+
for i in range(n - 1, -1, -1):
468+
enc = put_bit(enc, b & (1 << i))
469+
return enc
470+
471+
enc = put_bits(enc, len_translation_encoded, encoded_length_bits)
468472

469473
for atom in extractor.iter(decompressed):
470-
for b in canonical[atom]:
471-
if b == "1":
472-
enc[current_byte] |= 1 << current_bit
473-
if current_bit == 0:
474-
current_bit = 7
475-
current_byte += 1
476-
else:
477-
current_bit -= 1
474+
if atom in qstrs:
475+
can = canonical["\1"]
476+
else:
477+
can = canonical[atom]
478+
for b in can:
479+
enc = put_bit(enc, b == "1")
480+
if atom in qstrs:
481+
enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)
482+
483+
while enc.bit_length() % 8 != 1:
484+
enc = put_bit(enc, 0)
478485

479-
if current_bit != 7:
480-
current_byte += 1
481-
return enc[:current_byte]
486+
r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
487+
return r[1:]
482488

483489

484490
def qstr_escape(qst):
@@ -493,10 +499,20 @@ def esc_char(m):
493499
return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)
494500

495501

502+
def parse_qstrs(infile):
503+
r = {}
504+
rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
505+
content = infile.read()
506+
for i, mat in enumerate(rx.findall(content, re.M)):
507+
mat = eval(mat)
508+
r[mat] = i
509+
return r
510+
511+
496512
def parse_input_headers(infiles):
497513
i18ns = set()
498514

499-
# read the qstrs in from the input files
515+
# read the TRANSLATE strings in from the input files
500516
for infile in infiles:
501517
with open(infile, "rt") as f:
502518
for line in f:
@@ -516,12 +532,12 @@ def escape_bytes(qstr):
516532
return qstr
517533
else:
518534
# qstr contains non-printable codes so render entire thing as hex pairs
519-
qbytes = bytes_cons(qstr, "utf8")
535+
qbytes = bytes(qstr, "utf8")
520536
return "".join(("\\x%02x" % b) for b in qbytes)
521537

522538

523539
def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
524-
qbytes = bytes_cons(qstr, "utf8")
540+
qbytes = bytes(qstr, "utf8")
525541
qlen = len(qbytes)
526542
qhash = compute_hash(qbytes, cfg_bytes_hash)
527543
if qlen >= (1 << (8 * cfg_bytes_len)):
@@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
551567
)
552568
total_text_compressed_size += len(compressed)
553569
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
554-
assert decompressed == translation
570+
assert decompressed == translation, (decompressed, translation)
555571
for c in C_ESCAPES:
556572
decompressed = decompressed.replace(c, C_ESCAPES[c])
557573
formatted = ["{:d}".format(x) for x in compressed]
@@ -572,7 +588,7 @@ def output_translation_data(encoding_table, i18ns, out):
572588
import argparse
573589

574590
parser = argparse.ArgumentParser(
575-
description="Process QSTR definitions into headers for compilation"
591+
description="Process TRANSLATE strings into headers for compilation"
576592
)
577593
parser.add_argument(
578594
"infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@@ -590,13 +606,19 @@ def output_translation_data(encoding_table, i18ns, out):
590606
type=argparse.FileType("w", encoding="UTF-8"),
591607
help="c file for translation data",
592608
)
609+
parser.add_argument(
610+
"--qstrdefs_filename",
611+
type=argparse.FileType("r", encoding="UTF-8"),
612+
help="",
613+
)
593614

594615
args = parser.parse_args()
595616

617+
qstrs = parse_qstrs(args.qstrdefs_filename)
596618
i18ns = parse_input_headers(args.infiles)
597619
i18ns = sorted(i18ns)
598620
translations = translate(args.translation, i18ns)
599621
encoding_table = compute_huffman_coding(
600-
args.translation, translations, args.compression_filename
622+
qstrs, args.translation, translations, args.compression_filename
601623
)
602624
output_translation_data(encoding_table, translations, args.translation_filename)

py/py.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
270270
$(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
271271
$(STEPECHO) "GEN $@"
272272
$(Q)mkdir -p $(PY_BUILD)
273-
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
273+
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h
274274

275275
PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o
276276

supervisor/shared/translate/compressed_string.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,28 @@
5353
// speaking, words. They're just spans of code points that frequently
5454
// occur together. They are ordered shortest to longest.
5555
//
56+
// - If the translation uses a lot of code points or widely spaced code points,
57+
// then the huffman table entries are UTF-16 code points. But if the translation
58+
// uses only ASCII 7-bit code points plus a SMALL range of higher code points that
59+
// still fit in 8 bits, translation_offset and translation_offstart are used to
60+
// renumber the code points so that they still fit within 8 bits. (it's very beneficial
61+
// for mchar_t to be 8 bits instead of 16!)
62+
//
5663
// - dictionary entries are non-overlapping, and the _ending_ index of each
5764
// entry is stored in an array. A count of words of each length, from
5865
// minlen to maxlen, is given in the array called wlencount. From
5966
// this small array, the start and end of the N'th word can be
6067
// calculated by an efficient, small loop. (A bit of time is traded
6168
// to reduce the size of this table indicating lengths)
6269
//
70+
// - Value 1 ('\1') is used to indicate that a QSTR number follows. the
71+
// QSTR is encoded as a fixed number of bits (translation_qstr_bits), e.g.,
72+
// 10 bits if the highest core qstr is from 512 to 1023 inclusive.
73+
// (maketranslationdata uses a simple heuristic where any qstr >= 3
74+
// characters long is encoded in this way; this is simple but probably not
75+
// optimal. In fact, the rule of >= 2 characters is better for SOME languages
76+
// on SOME boards.)
77+
//
6378
// The "data" / "tail" construct is so that the struct's last member is a
6479
// "flexible array". However, the _only_ member is not permitted to be
6580
// a flexible member, so we have to declare the first byte as a separate

0 commit comments

Comments
 (0)
0