1
1
"""
2
2
Process raw qstr file and output qstr data with length, hash and data bytes.
3
3
4
- This script works with Python 2.7, 3.3 and 3.4.
4
+ This script is only regularly tested with the same version of Python used
5
+ during CI, typically the latest "3.x". However, incompatibilities with any
6
+ supported CPython version are unintended.
5
7
6
8
For documentation about the format of compressed translated strings, see
7
9
supervisor/shared/translate/translate.h
16
18
17
19
import collections
18
20
import gettext
19
- import os . path
21
+ import pathlib
20
22
21
23
if hasattr (sys .stdout , "reconfigure" ):
22
24
sys .stdout .reconfigure (encoding = "utf-8" )
23
25
sys .stderr .reconfigure (errors = "backslashreplace" )
24
26
25
- py = os .path .dirname (sys .argv [0 ])
26
- top = os .path .dirname (py )
27
-
28
- sys .path .append (os .path .join (top , "tools/huffman" ))
27
+ sys .path .append (str (pathlib .Path (__file__ ).parent .parent / "tools/huffman" ))
29
28
30
29
import huffman
31
-
32
- # Python 2/3 compatibility:
33
- # - iterating through bytes is different
34
- # - codepoint2name lives in a different module
35
- import platform
36
-
37
- if platform .python_version_tuple ()[0 ] == "2" :
38
- bytes_cons = lambda val , enc = None : bytearray (val )
39
- from htmlentitydefs import codepoint2name
40
- elif platform .python_version_tuple ()[0 ] == "3" :
41
- bytes_cons = bytes
42
- from html .entities import codepoint2name
43
- # end compatibility code
30
+ from html .entities import codepoint2name
44
31
45
32
codepoint2name [ord ("-" )] = "hyphen"
46
33
@@ -182,9 +169,15 @@ class EncodingTable:
182
169
extractor : object
183
170
apply_offset : object
184
171
remove_offset : object
172
+ translation_qstr_bits : int
173
+ qstrs : object
174
+ qstrs_inv : object
185
175
186
176
187
- def compute_huffman_coding (translation_name , translations , f ):
177
+ def compute_huffman_coding (qstrs , translation_name , translations , f ):
178
+ # possible future improvement: some languages are better when considering len(k) > 2. Try both?
179
+ qstrs = dict ((k , v ) for k , v in qstrs .items () if len (k ) > 3 )
180
+ qstr_strs = list (qstrs .keys ())
188
181
texts = [t [1 ] for t in translations ]
189
182
words = []
190
183
@@ -234,10 +227,12 @@ def remove_offset(c):
234
227
# if "the" is in words then not only will "the" not be considered
235
228
# again, neither will "there" or "wither", since they have "the"
236
229
# as substrings.
237
- extractor = TextSplitter (words )
230
+ extractor = TextSplitter (words + qstr_strs )
238
231
counter = collections .Counter ()
239
232
for t in texts :
240
233
for atom in extractor .iter (t ):
234
+ if atom in qstrs :
235
+ atom = "\1 "
241
236
counter [atom ] += 1
242
237
cb = huffman .codebook (counter .items ())
243
238
lengths = sorted (dict ((v , len (cb [k ])) for k , v in counter .items ()).items ())
@@ -304,10 +299,14 @@ def est_net_savings(s, occ):
304
299
words .append (word )
305
300
306
301
words .sort (key = len )
307
- extractor = TextSplitter (words )
302
+ extractor = TextSplitter (words + qstr_strs )
308
303
counter = collections .Counter ()
304
+ used_qstr = 0
309
305
for t in texts :
310
306
for atom in extractor .iter (t ):
307
+ if atom in qstrs :
308
+ used_qstr = max (used_qstr , qstrs [atom ])
309
+ atom = "\1 "
311
310
counter [atom ] += 1
312
311
cb = huffman .codebook (counter .items ())
313
312
@@ -322,6 +321,8 @@ def est_net_savings(s, occ):
322
321
last_length = None
323
322
canonical = {}
324
323
for atom , code in sorted (cb .items (), key = lambda x : (len (x [1 ]), x [0 ])):
324
+ if atom in qstr_strs :
325
+ atom = "\1 "
325
326
values .append (atom )
326
327
length = len (code )
327
328
if length not in length_count :
@@ -359,6 +360,8 @@ def est_net_savings(s, occ):
359
360
minlen = len (words [0 ])
360
361
wlencount = [len ([None for w in words if len (w ) == l ]) for l in range (minlen , maxlen + 1 )]
361
362
363
+ translation_qstr_bits = used_qstr .bit_length ()
364
+
362
365
f .write ("typedef {} mchar_t;\n " .format (values_type ))
363
366
f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
364
367
f .write (
@@ -383,34 +386,44 @@ def est_net_savings(s, occ):
383
386
f .write ("#define maxlen {}\n " .format (maxlen ))
384
387
f .write ("#define translation_offstart {}\n " .format (offstart ))
385
388
f .write ("#define translation_offset {}\n " .format (offset ))
386
-
387
- return EncodingTable (values , lengths , words , canonical , extractor , apply_offset , remove_offset )
389
+ f .write ("#define translation_qstr_bits {}\n " .format (translation_qstr_bits ))
390
+
391
+ qstrs_inv = dict ((v , k ) for k , v in qstrs .items ())
392
+ return EncodingTable (
393
+ values ,
394
+ lengths ,
395
+ words ,
396
+ canonical ,
397
+ extractor ,
398
+ apply_offset ,
399
+ remove_offset ,
400
+ translation_qstr_bits ,
401
+ qstrs ,
402
+ qstrs_inv ,
403
+ )
388
404
389
405
390
406
def decompress (encoding_table , encoded , encoded_length_bits ):
407
+ qstrs_inv = encoding_table .qstrs_inv
391
408
values = encoding_table .values
392
409
lengths = encoding_table .lengths
393
410
words = encoding_table .words
394
411
412
+ def bititer ():
413
+ for byte in encoded :
414
+ for bit in (0x80 , 0x40 , 0x20 , 0x10 , 0x8 , 0x4 , 0x2 , 0x1 ):
415
+ yield bool (byte & bit )
416
+
417
+ nextbit = bititer ().__next__
418
+
419
+ def getnbits (n ):
420
+ bits = 0
421
+ for i in range (n ):
422
+ bits = (bits << 1 ) | nextbit ()
423
+ return bits
424
+
395
425
dec = []
396
- this_byte = 0
397
- this_bit = 7
398
- b = encoded [this_byte ]
399
- bits = 0
400
- for i in range (encoded_length_bits ):
401
- bits <<= 1
402
- if 0x80 & b :
403
- bits |= 1
404
-
405
- b <<= 1
406
- if this_bit == 0 :
407
- this_bit = 7
408
- this_byte += 1
409
- if this_byte < len (encoded ):
410
- b = encoded [this_byte ]
411
- else :
412
- this_bit -= 1
413
- length = bits
426
+ length = getnbits (encoded_length_bits )
414
427
415
428
i = 0
416
429
while i < length :
@@ -419,27 +432,19 @@ def decompress(encoding_table, encoded, encoded_length_bits):
419
432
max_code = lengths [0 ]
420
433
searched_length = lengths [0 ]
421
434
while True :
422
- bits <<= 1
423
- if 0x80 & b :
424
- bits |= 1
425
-
426
- b <<= 1
435
+ bits = (bits << 1 ) | nextbit ()
427
436
bit_length += 1
428
- if this_bit == 0 :
429
- this_bit = 7
430
- this_byte += 1
431
- if this_byte < len (encoded ):
432
- b = encoded [this_byte ]
433
- else :
434
- this_bit -= 1
435
437
if max_code > 0 and bits < max_code :
436
438
# print('{0:0{width}b}'.format(bits, width=bit_length))
437
439
break
438
440
max_code = (max_code << 1 ) + lengths [bit_length ]
439
441
searched_length += lengths [bit_length ]
440
442
441
443
v = values [searched_length + bits - max_code ]
442
- if v >= chr (0x80 ) and v < chr (0x80 + len (words )):
444
+ if v == chr (1 ):
445
+ qstr_idx = getnbits (encoding_table .translation_qstr_bits )
446
+ v = qstrs_inv [qstr_idx ]
447
+ elif v >= chr (0x80 ) and v < chr (0x80 + len (words )):
443
448
v = words [ord (v ) - 0x80 ]
444
449
i += len (v .encode ("utf-8" ))
445
450
dec .append (v )
@@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
449
454
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
    """Encode ``decompressed`` into a packed, MSB-first bit stream.

    The stream is laid out as:

    1. ``len_translation_encoded`` as an unsigned field that is
       ``encoded_length_bits`` wide;
    2. for every atom yielded by ``encoding_table.extractor.iter(decompressed)``,
       the bit string stored for it in ``encoding_table.canonical``.  An atom
       that is a key of ``encoding_table.qstrs`` is instead written as the
       code of the ``chr(1)`` marker followed by the atom's value from
       ``qstrs`` in ``encoding_table.translation_qstr_bits`` bits;
    3. zero bits up to the next byte boundary.

    Args:
        encoding_table: object providing ``qstrs``, ``canonical``,
            ``extractor`` and ``translation_qstr_bits``.
        decompressed: the text to encode; must be a ``str``.
        encoded_length_bits: width in bits of the leading length field.
        len_translation_encoded: value stored in the leading length field.

    Returns:
        The encoded stream as ``bytes``.

    Raises:
        TypeError: if ``decompressed`` is not a ``str``.
        KeyError: if an atom has no entry in ``encoding_table.canonical``.
    """
    if not isinstance(decompressed, str):
        raise TypeError(
            "decompressed must be str, not {}".format(type(decompressed).__name__)
        )

    qstrs = encoding_table.qstrs
    canonical = encoding_table.canonical

    def put_bit(acc, bit):
        # Append a single bit to the low end of the accumulator.
        return (acc << 1) | bool(bit)

    def put_bits(acc, value, width):
        # Append the low ``width`` bits of ``value``, most significant first.
        for shift in range(width - 1, -1, -1):
            acc = put_bit(acc, value & (1 << shift))
        return acc

    # The accumulator starts at 1 so that leading zero bits of the stream are
    # not lost; this sentinel becomes the first output byte and is dropped
    # before returning.
    enc = put_bits(1, len_translation_encoded, encoded_length_bits)

    for atom in encoding_table.extractor.iter(decompressed):
        is_qstr = atom in qstrs
        for b in canonical["\1" if is_qstr else atom]:
            enc = put_bit(enc, b == "1")
        if is_qstr:
            enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)

    # Pad with zeros until the payload (everything after the sentinel bit) is
    # a whole number of bytes.
    while enc.bit_length() % 8 != 1:
        enc = put_bit(enc, 0)

    packed = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
    return packed[1:]
482
488
483
489
484
490
def qstr_escape (qst ):
@@ -493,10 +499,20 @@ def esc_char(m):
493
499
return re .sub (r"[^A-Za-z0-9_]" , esc_char , qst )
494
500
495
501
502
def parse_qstrs(infile):
    """Collect the string of every ``QDEF(...)`` entry in ``infile``.

    Args:
        infile: an open text file object; it is read in full with ``read()``.

    Returns:
        A dict mapping each decoded qstr string to the zero-based position of
        its ``QDEF`` entry among all entries matched in the file.  If the same
        string appears more than once, the last position wins.

    Raises:
        SyntaxError or ValueError: if a matched C string literal is not also
            a valid Python string literal.
    """
    # Imported here so the function does not depend on the module's import block.
    import ast

    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
    content = infile.read()

    r = {}
    # Pattern.findall()'s second positional argument is a start offset, not a
    # flags value, so nothing is passed there: the whole file must be scanned.
    for i, cstr in enumerate(rx.findall(content)):
        # The captured text is a double-quoted literal; decode its escapes
        # without executing arbitrary code.
        r[ast.literal_eval(cstr)] = i
    return r
510
+
511
+
496
512
def parse_input_headers (infiles ):
497
513
i18ns = set ()
498
514
499
- # read the qstrs in from the input files
515
+ # read the TRANSLATE strings in from the input files
500
516
for infile in infiles :
501
517
with open (infile , "rt" ) as f :
502
518
for line in f :
@@ -516,12 +532,12 @@ def escape_bytes(qstr):
516
532
return qstr
517
533
else :
518
534
# qstr contains non-printable codes so render entire thing as hex pairs
519
- qbytes = bytes_cons (qstr , "utf8" )
535
+ qbytes = bytes (qstr , "utf8" )
520
536
return "" .join (("\\ x%02x" % b ) for b in qbytes )
521
537
522
538
523
539
def make_bytes (cfg_bytes_len , cfg_bytes_hash , qstr ):
524
- qbytes = bytes_cons (qstr , "utf8" )
540
+ qbytes = bytes (qstr , "utf8" )
525
541
qlen = len (qbytes )
526
542
qhash = compute_hash (qbytes , cfg_bytes_hash )
527
543
if qlen >= (1 << (8 * cfg_bytes_len )):
@@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
551
567
)
552
568
total_text_compressed_size += len (compressed )
553
569
decompressed = decompress (encoding_table , compressed , encoded_length_bits )
554
- assert decompressed == translation
570
+ assert decompressed == translation , ( decompressed , translation )
555
571
for c in C_ESCAPES :
556
572
decompressed = decompressed .replace (c , C_ESCAPES [c ])
557
573
formatted = ["{:d}" .format (x ) for x in compressed ]
@@ -572,7 +588,7 @@ def output_translation_data(encoding_table, i18ns, out):
572
588
import argparse
573
589
574
590
parser = argparse .ArgumentParser (
575
- description = "Process QSTR definitions into headers for compilation"
591
+ description = "Process TRANSLATE strings into headers for compilation"
576
592
)
577
593
parser .add_argument (
578
594
"infiles" , metavar = "N" , type = str , nargs = "+" , help = "an integer for the accumulator"
@@ -590,13 +606,19 @@ def output_translation_data(encoding_table, i18ns, out):
590
606
type = argparse .FileType ("w" , encoding = "UTF-8" ),
591
607
help = "c file for translation data" ,
592
608
)
609
+ parser .add_argument (
610
+ "--qstrdefs_filename" ,
611
+ type = argparse .FileType ("r" , encoding = "UTF-8" ),
612
+ help = "" ,
613
+ )
593
614
594
615
args = parser .parse_args ()
595
616
617
+ qstrs = parse_qstrs (args .qstrdefs_filename )
596
618
i18ns = parse_input_headers (args .infiles )
597
619
i18ns = sorted (i18ns )
598
620
translations = translate (args .translation , i18ns )
599
621
encoding_table = compute_huffman_coding (
600
- args .translation , translations , args .compression_filename
622
+ qstrs , args .translation , translations , args .compression_filename
601
623
)
602
624
output_translation_data (encoding_table , translations , args .translation_filename )
0 commit comments