add bigram compression to makeqstrdata (save ~100 bytes on trinket m0 de_DE) by jepler · Pull Request #3370 · adafruit/circuitpython · GitHub
[go: up one dir, main page]

Skip to content

add bigram compression to makeqstrdata (save ~100 bytes on trinket m0 de_DE) #3370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions locale/cs.po
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ msgstr ""

#: py/obj.c
msgid " File \"%q\""
msgstr "  Soubor \"%q\""
msgstr " Soubor \"%q\""

#: py/obj.c
msgid " File \"%q\", line %d"
msgstr "  Soubor \"%q\", řádek %d"
msgstr " Soubor \"%q\", řádek %d"

#: main.c
msgid " output:\n"
msgstr " výstup:\n"
msgstr " výstup:\n"

#: py/objstr.c
#, c-format
Expand Down
6 changes: 3 additions & 3 deletions locale/pl.po
Original file line number Diff line number Diff line change
Expand Up @@ -1971,7 +1971,7 @@ msgstr "wartość kalibracji poza zakresem +/-127"

#: py/emitinlinethumb.c
msgid "can only have up to 4 parameters to Thumb assembly"
msgstr "asembler Thumb może przyjąć do 4 parameterów"
msgstr "asembler Thumb może przyjąć do 4 parameterów"

#: py/emitinlinextensa.c
msgid "can only have up to 4 parameters to Xtensa assembly"
Expand Down Expand Up @@ -3562,7 +3562,7 @@ msgstr ""
#~ msgstr "Nie udało się odkryć serwisów"

#~ msgid "Failed to get local address"
#~ msgstr "Nie udało się uzyskać lokalnego adresu"
#~ msgstr "Nie udało się uzyskać lokalnego adresu"

#~ msgid "Failed to get softdevice state"
#~ msgstr "Nie udało się odczytać stanu softdevice"
Expand Down Expand Up @@ -3610,7 +3610,7 @@ msgstr ""
#~ msgstr "Nie udało się zapisać gatts, błąd 0x%04x"

#~ msgid "Flash erase failed"
#~ msgstr "Nie udało się skasować flash"
#~ msgstr "Nie udało się skasować flash"

#~ msgid "Flash erase failed to start, err 0x%04x"
#~ msgstr "Nie udało się rozpocząć kasowania flash, błąd 0x%04x"
Expand Down
41 changes: 37 additions & 4 deletions py/makeqstrdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,30 @@ def translate(translation_file, i18ns):
translations.append((original, translation))
return translations

def frequent_ngrams(corpus, sz, n):
    """Return the ``n`` most common substrings of length ``sz`` in ``corpus``.

    Args:
        corpus: The string to scan.
        sz: Width of the sliding window (2 for bigrams).
        n: Maximum number of entries to return.

    Returns:
        A list of ``(ngram, count)`` tuples as produced by
        ``collections.Counter.most_common(n)``. The list is empty when
        ``corpus`` is shorter than ``sz``.
    """
    # A string of length L has L - sz + 1 windows of width sz. The "+ 1"
    # keeps the final window, the one ending on the last character, which
    # would otherwise never be counted.
    windows = (corpus[i:i + sz] for i in range(len(corpus) - sz + 1))
    return collections.Counter(windows).most_common(n)

def encode_ngrams(translation, ngrams):
    """Replace every occurrence of each n-gram with a one-character placeholder.

    The n-gram at index ``i`` is replaced by ``chr(base + i)``, where
    ``base`` is 0x80 when there are at most 32 n-grams and 0xe000
    otherwise. Replacements are applied one n-gram at a time, in list
    order, each on the result of the previous one.

    Args:
        translation: The string to encode.
        ngrams: Sequence of substrings to substitute.

    Returns:
        The string with all n-gram occurrences substituted.
    """
    base = 0x80 if len(ngrams) <= 32 else 0xe000
    encoded = translation
    for codepoint, gram in enumerate(ngrams, base):
        encoded = encoded.replace(gram, chr(codepoint))
    return encoded

def decode_ngrams(compressed, ngrams):
    """Expand one-character n-gram placeholders back into their n-grams.

    This reverses the substitution ``chr(start + i) -> ngrams[i]``, where
    ``start`` is 0x80 when there are at most 32 n-grams and 0xe000
    otherwise.

    Args:
        compressed: The string containing placeholder characters.
        ngrams: The n-gram table; a placeholder's offset from ``start``
            is its index into this sequence.

    Returns:
        The string with each placeholder replaced by its n-gram. Any
        character outside the placeholder range is copied unchanged.
    """
    if len(ngrams) > 32:
        start = 0xe000
    else:
        start = 0x80
    # The inclusive upper bound follows the actual table size, so a
    # character that merely falls in the nominal placeholder block but has
    # no table entry is passed through instead of raising IndexError.
    end = start + len(ngrams) - 1
    return "".join(
        ngrams[ord(c) - start] if start <= ord(c) <= end else c
        for c in compressed
    )

def compute_huffman_coding(translations, qstrs, compression_filename):
all_strings = [x[1] for x in translations]
all_strings_concat = "".join(all_strings)
ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
counts = collections.Counter(all_strings_concat)
cb = huffman.codebook(counts.items())
values = []
Expand All @@ -125,21 +146,31 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
last_l = l
lengths = bytearray()
print("// length count", length_count)
print("// bigrams", ngrams)
for i in range(1, max(length_count) + 2):
lengths.append(length_count.get(i, 0))
print("// values", values, "lengths", len(lengths), lengths)
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
ngramdata = [ord(ni) for i in ngrams for ni in i]
print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
print("//", values, lengths)
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
with open(compression_filename, "w") as f:
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
return values, lengths
f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
if len(ngrams) > 32:
bigram_start = 0xe000
else:
bigram_start = 0x80
bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
f.write("#define bigram_start {}\n".format(bigram_start))
f.write("#define bigram_end {}\n".format(bigram_end))
return values, lengths, ngrams

def decompress(encoding_table, encoded, encoded_length_bits):
values, lengths = encoding_table
values, lengths, ngrams = encoding_table
dec = []
this_byte = 0
this_bit = 7
Expand Down Expand Up @@ -187,14 +218,16 @@ def decompress(encoding_table, encoded, encoded_length_bits):
searched_length += lengths[bit_length]

v = values[searched_length + bits - max_code]
v = decode_ngrams(v, ngrams)
i += len(v.encode('utf-8'))
dec.append(v)
return ''.join(dec)

def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
if not isinstance(decompressed, str):
raise TypeError()
values, lengths = encoding_table
values, lengths, ngrams = encoding_table
decompressed = encode_ngrams(decompressed, ngrams)
enc = bytearray(len(decompressed) * 3)
#print(decompressed)
#print(lengths)
Expand Down
14 changes: 11 additions & 3 deletions supervisor/shared/translate.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "genhdr/compression.generated.h"
#endif

#include "py/misc.h"
#include "supervisor/serial.h"

void serial_write_compressed(const compressed_string_t* compressed) {
Expand All @@ -46,13 +47,20 @@ STATIC int put_utf8(char *buf, int u) {
if(u <= 0x7f) {
*buf = u;
return 1;
} else if(bigram_start <= u && u <= bigram_end) {
int n = (u - 0x80) * 2;
// (Note that at present, entries in the bigrams table are
// guaranteed not to represent bigrams themselves, so this adds
// at most 1 level of recursive call.)
int ret = put_utf8(buf, bigrams[n]);
return ret + put_utf8(buf + ret, bigrams[n+1]);
} else if(u <= 0x07ff) {
*buf++ = 0b11000000 | (u >> 6);
*buf = 0b10000000 | (u & 0b00111111);
return 2;
} else { // u <= 0xffff)
*buf++ = 0b11000000 | (u >> 12);
*buf = 0b10000000 | ((u >> 6) & 0b00111111);
} else { // u <= 0xffff
*buf++ = 0b11100000 | (u >> 12);
*buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
*buf = 0b10000000 | (u & 0b00111111);
return 3;
}
Expand Down
0