2020# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121import collections
2222import urllib .request
23+ from itertools import batched
2324
2425UNICODE_VERSION = "15.1.0"
2526UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
@@ -354,20 +355,26 @@ def is_first_and_last(first, last):
354355 return False
355356 return first [1 :- 8 ] == last [1 :- 7 ]
356357
def gen_mph_data(name, d, kv_type, kv_callback, kv_row_width, salt_row_width=13):
    """Write the minimal-perfect-hash tables for ``d`` as two Rust constants.

    Emits ``<NAME>_SALT`` (a ``&[u16]`` slice) followed by ``<NAME>_KV``
    (a ``&[kv_type]`` slice), where ``<NAME>`` is ``name`` upper-cased.
    Output goes to ``out``, which is not a parameter of this function: it is
    resolved from the enclosing module scope at call time.

    Args:
        name: Base name of the generated constants.
        d: Table handed to ``minimal_perfect_hash``; the ``(salt, keys)``
            pair it returns is what gets written.
        kv_type: Rust element type spelled into the ``_KV`` declaration.
        kv_callback: Called with each key; returns the Rust literal text
            for that entry.
        kv_row_width: Number of ``_KV`` entries per output line.
        salt_row_width: Number of ``_SALT`` entries per output line
            (defaults to the previously hard-coded 13).

    Raises:
        ValueError: From ``batched`` when a row width is smaller than 1.
    """
    (salt, keys) = minimal_perfect_hash(d)
    const_prefix = name.upper()

    # NOTE(review): the reviewed rendering collapses runs of spaces, so the
    # width of this indent is an assumption: three spaces here plus the one
    # separator space in front of every entry gives a four-column Rust
    # indent. Confirm against the checked-in tables.rs before merging.
    row_indent = "   "

    def write_rows(cells, width):
        # One write per output line: indent, then " <cell>," per entry.
        for row in batched(cells, width):
            out.write(row_indent + "".join(f" {cell}," for cell in row) + "\n")

    out.write(f"\npub(crate) const {const_prefix}_SALT: &[u16] = &[\n")
    write_rows((f"0x{s:03X}" for s in salt), salt_row_width)
    out.write("];\n")

    out.write(f"pub(crate) const {const_prefix}_KV: &[{kv_type}] = &[\n")
    write_rows((kv_callback(k) for k in keys), kv_row_width)
    out.write("];\n")
367374
def gen_combining_class(combining_classes, out):
    """Generate the ``canonical_combining_class`` hash tables.

    Each entry is a single ``u32`` literal: the key shifted left by eight
    bits, OR-ed with the integer value stored for that key in
    ``combining_classes``, printed as at least seven upper-case hex digits.
    Entries are laid out eight per row.

    ``out`` is accepted but not forwarded; ``gen_mph_data`` is called
    without it.
    """
    def packed_entry(k):
        packed = int(combining_classes[k]) | (k << 8)
        return f"0x{packed:07X}"

    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
                 packed_entry, 8)
371378
372379def gen_composition_table (canon_comp , out ):
373380 table = {}
@@ -376,7 +383,7 @@ def gen_composition_table(canon_comp, out):
376383 table [(c1 << 16 ) | c2 ] = c3
377384 (salt , keys ) = minimal_perfect_hash (table )
378385 gen_mph_data ('COMPOSITION_TABLE' , table , '(u32, char)' ,
379- lambda k : "(0x%s , '\\ u{%s} ')" % ( hexify ( k ), hexify ( table [ k ])) )
386+ lambda k : f "(0x{ k :08X } , '\\ u{{ { table [ k ]:06X } }} ')", 1 )
380387
381388 out .write ("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n " )
382389 out .write (" match (c1, c2) {\n " )
@@ -403,7 +410,7 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_de
403410 assert offset < 65536
404411 out .write ("];\n " )
405412 gen_mph_data (name + '_decomposed' , table , "(u32, (u16, u16))" ,
406- lambda k : "(0x{:x }, ({}, {}))" . format ( k , offsets [k ], len (table [k ])) )
413+ lambda k : f "(0x{ k :05X } , (0x { offsets [k ]:03X } , 0x { len (table [k ]):X } ))" , 1 )
407414
408415def gen_qc_match (prop_table , out ):
409416 out .write (" match c {\n " )
@@ -421,7 +428,7 @@ def gen_qc_match(prop_table, out):
421428 out .write (" }\n " )
422429
423430def gen_nfc_qc (prop_tables , out ):
424- out .write ("#[inline]\n " )
431+ out .write ("\n #[inline]\n " )
425432 out .write ("#[allow(ellipsis_inclusive_range_patterns)]\n " )
426433 out .write ("pub fn qc_nfc(c: char) -> IsNormalized {\n " )
427434 gen_qc_match (prop_tables ['NFC_QC' ], out )
@@ -450,7 +457,7 @@ def gen_nfkd_qc(prop_tables, out):
450457
def gen_combining_mark(general_category_mark, out):
    """Generate the ``combining_mark`` hash tables.

    Every key is written as a ``u32`` literal with at least five upper-case
    hex digits, ten entries per row.

    ``out`` is accepted but not forwarded; ``gen_mph_data`` is called
    without it.
    """
    # f-string rather than str.format, matching the other table formatters.
    gen_mph_data('combining_mark', general_category_mark, 'u32',
                 lambda k: f"0x{k:05X}", 10)
454461
455462def gen_public_assigned (general_category_public_assigned , out ):
456463 # This could be done as a hash but the table is somewhat small.
@@ -464,17 +471,16 @@ def gen_public_assigned(general_category_public_assigned, out):
464471 out .write (" " )
465472 start = False
466473 else :
467- out .write (" | " )
474+ out .write ("\n | " )
468475 if first == last :
469- out .write ("'\\ u{%s}'\n " % hexify (first ))
476+ out .write ("'\\ u{%s}'" % hexify (first ))
470477 else :
471- out .write ("'\\ u{%s}'..='\\ u{%s}'\n " % (hexify (first ), hexify (last )))
472- out .write (" => true,\n " )
478+ out .write ("'\\ u{%s}'..='\\ u{%s}'" % (hexify (first ), hexify (last )))
479+ out .write (" => true,\n " )
473480
474481 out .write (" _ => false,\n " )
475482 out .write (" }\n " )
476483 out .write ("}\n " )
477- out .write ("\n " )
478484
479485def gen_stream_safe (leading , trailing , out ):
480486 # This could be done as a hash but the table is very small.
@@ -488,10 +494,9 @@ def gen_stream_safe(leading, trailing, out):
488494 out .write (" _ => 0,\n " )
489495 out .write (" }\n " )
490496 out .write ("}\n " )
491- out .write ("\n " )
492497
493498 gen_mph_data ('trailing_nonstarters' , trailing , 'u32' ,
494- lambda k : "0x{:X}" . format ( int (trailing [k ]) | (k << 8 )) )
499+ lambda k : f "0x{ int (trailing [k ]) | (k << 8 ):07X } " , 8 )
495500
496501def gen_tests (tests , out ):
497502 out .write ("""#[derive(Debug)]
@@ -579,43 +584,33 @@ def minimal_perfect_hash(d):
579584 data = UnicodeData ()
580585 with open ("tables.rs" , "w" , newline = "\n " ) as out :
581586 out .write (PREAMBLE )
582- out .write ("#![cfg_attr(rustfmt, rustfmt::skip)]\n " )
583587 out .write ("use crate::quick_check::IsNormalized;\n " )
584588 out .write ("use crate::quick_check::IsNormalized::*;\n " )
585589 out .write ("\n " )
586590
587591 version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
588592 out .write ("#[allow(unused)]\n " )
589- out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n \n " % version )
593+ out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n " % version )
590594
591595 gen_combining_class (data .combining_classes , out )
592- out .write ("\n " )
593596
594597 gen_composition_table (data .canon_comp , out )
595- out .write ("\n " )
596598
597599 gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , data .cjk_compat_variants_fully_decomp , out )
598600
599601 gen_combining_mark (data .general_category_mark , out )
600- out .write ("\n " )
601602
602603 gen_public_assigned (data .general_category_public_assigned , out )
603- out .write ("\n " )
604604
605605 gen_nfc_qc (data .norm_props , out )
606- out .write ("\n " )
607606
608607 gen_nfkc_qc (data .norm_props , out )
609- out .write ("\n " )
610608
611609 gen_nfd_qc (data .norm_props , out )
612- out .write ("\n " )
613610
614611 gen_nfkd_qc (data .norm_props , out )
615- out .write ("\n " )
616612
617613 gen_stream_safe (data .ss_leading , data .ss_trailing , out )
618- out .write ("\n " )
619614
620615 with open ("normalization_tests.rs" , "w" , newline = "\n " ) as out :
621616 out .write (PREAMBLE )
0 commit comments