@@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
    `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
    with fetch_open("EastAsianWidth.txt") as eaw:
        # matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
-        single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
+        single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
        # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..."
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
        # map between width category code and condensed width
        width_codes = {
            **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
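A quick way to see the effect of relaxing `\s+` to `\s*` around the field separator: the standalone sketch below uses a made-up data line following the "1F336;N # ..." shape quoted in the comment above, and shows that only the new pattern accepts entries with no whitespace around the semicolon.

import re

old_single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
new_single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")

line = "1F336;N  # So  HOT PEPPER"  # hypothetical data line with no spaces around ";"
assert old_single.match(line) is None          # old pattern required padding around ";"
assert new_single.match(line).group(2) == "N"  # new pattern still captures the width code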
@@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]":
    # canonically equivalent sequences have the same width.
    with fetch_open("DerivedCoreProperties.txt") as properties:
        single = re.compile(
-            r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
+            r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
        )
        multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
        )

        for line in properties.readlines():
@@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]":
    #
    # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
    with fetch_open("HangulSyllableType.txt") as categories:
-        single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
+        single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")

        for line in categories.readlines():
            raw_data = None  # (low, high)
@@ -396,14 +396,14 @@ def make_tables(
    return tables


-def load_variation_sequences() -> "list[int]":
+def load_emoji_presentation_sequences() -> "list[int]":
    """Outputs a list of character ranges, corresponding to all the valid characters for starting
    an emoji presentation sequence."""

    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Match all emoji presentation sequences
        # (one codepoint followed by U+FE0F, and labeled "emoji style")
-        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
+        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")
        codepoints = []
        for line in sequences.readlines():
            if match := sequence.match(line):
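For illustration, here is the pattern above run against a made-up line in the emoji-variation-sequences.txt style (the exact field layout of the real file is assumed here):

import re

sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")

line = "231A FE0F ; emoji style; # (1.1) WATCH"  # hypothetical sample line
match = sequence.match(line)
assert match is not None
assert int(match.group(1), 16) == 0x231A  # base codepoint of the presentation sequence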
@@ -412,55 +412,114 @@ def load_variation_sequences() -> "list[int]":
    return codepoints


-def make_variation_sequence_table(
+def load_text_presentation_sequences() -> "list[int]":
+    """Outputs a list of character ranges, corresponding to all the valid characters
+    whose widths change with a text presentation sequence."""
+
+    text_presentation_seq_codepoints = set()
+    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
+        # Match all text presentation sequences
+        # (one codepoint followed by U+FE0E, and labeled "text style")
+        sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style")
+        for line in sequences.readlines():
+            if match := sequence.match(line):
+                cp = int(match.group(1), 16)
+                text_presentation_seq_codepoints.add(cp)
+
+    default_emoji_codepoints = set()
+    with fetch_open("emoji/emoji-data.txt") as emoji_data:
+        single = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
+        )
+
+        for line in emoji_data.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                default_emoji_codepoints.add(cp)
+
+    codepoints = []
+    for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints):
+        # "Enclosed Ideographic Supplement" block;
+        # wide even in text presentation
+        if not cp in range(0x1F200, 0x1F300):
+            codepoints.append(cp)
+
+    codepoints.sort()
+    return codepoints
+
+
+def make_presentation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
-) -> "tuple[list[int], list[list[int]]]":
-    """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
-    (Characters that are always wide may be excluded.)
+    spurious_false: "set[EffectiveWidth]",
+    spurious_true: "set[EffectiveWidth]",
+) -> "tuple[list[tuple[int, int]], list[list[int]]]":
+    """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
    """

    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> 10].add(cp & 0x3FF)

-    # We don't strictly need to keep track of characters that are always wide,
-    # because being in an emoji variation seq won't affect their width.
-    # So store their info only when it wouldn't inflate the size of the tables.
    for k in list(prefixes_dict.keys()):
        if all(
            map(
-                lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE,
+                lambda cp: width_map[(k << 10) | cp] in spurious_false,
                prefixes_dict[k],
            )
        ):
            del prefixes_dict[k]

-    indexes = list(prefixes_dict.keys())
+    msbs: "list[int]" = list(prefixes_dict.keys())

-    # Similarly, we can spuriously return `true` for always-wide characters
-    # even if not part of a presentation seq; this saves an additional lookup,
-    # so we should do it where there is no size cost.
    for cp, width in enumerate(width_map):
-        if width == EffectiveWidth.WIDE and (cp >> 10) in indexes:
+        if width in spurious_true and (cp >> 10) in msbs:
            prefixes_dict[cp >> 10].add(cp & 0x3FF)

-    leaves = []
+    leaves: "list[list[int]]" = []
    for cps in prefixes_dict.values():
        leaf = [0] * 128
        for cp in cps:
            idx_in_leaf, bit_shift = divmod(cp, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift
        leaves.append(leaf)
+
+    indexes = [(msb, index) for (index, msb) in enumerate(msbs)]
+
+    # Cull duplicate leaves
+    i = 0
+    while i < len(leaves):
+        first_idx = leaves.index(leaves[i])
+        if first_idx == i:
+            i += 1
+        else:
+            for j in range(0, len(indexes)):
+                if indexes[j][1] == i:
+                    indexes[j] = (indexes[j][0], first_idx)
+                elif indexes[j][1] > i:
+                    indexes[j] = (indexes[j][0], indexes[j][1] - 1)
+
+            leaves.pop(i)
+
    return (indexes, leaves)


def emit_module(
    out_name: str,
    unicode_version: "tuple[int, int, int]",
    tables: "list[Table]",
-    variation_table: "tuple[list[int], list[list[int]]]",
+    emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
+    text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
):
    """Outputs a Rust module to `out_name` using table data from `tables`.
    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
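To see what the duplicate-leaf culling in `make_presentation_sequence_table` does, here is a tiny standalone walkthrough with made-up leaves; the remapping loop is copied from the function above.

# Three prefixes whose leaves are A, B, A: after culling only [A, B] remain, and
# the index entry that pointed at the duplicate is redirected to the first copy.
leaves = [[0x01] + [0] * 127, [0x02] + [0] * 127, [0x01] + [0] * 127]
indexes = [(0x10, 0), (0x11, 1), (0x12, 2)]

i = 0
while i < len(leaves):
    first_idx = leaves.index(leaves[i])
    if first_idx == i:
        i += 1
    else:
        for j in range(0, len(indexes)):
            if indexes[j][1] == i:
                indexes[j] = (indexes[j][0], first_idx)
            elif indexes[j][1] > i:
                indexes[j] = (indexes[j][0], indexes[j][1] - 1)
        leaves.pop(i)

assert len(leaves) == 2
assert indexes == [(0x10, 0), (0x11, 1), (0x12, 0)]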
@@ -537,7 +596,8 @@ def emit_module(
"""
        )

-        variation_idx, variation_leaves = variation_table
+        emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table
+        text_presentation_idx, text_presentation_leaves = text_presentation_table

        module.write(
            """
@@ -555,7 +615,7 @@ def emit_module(
"""
        )

-        for i, msbs in enumerate(variation_idx):
+        for msbs, i in emoji_presentation_idx:
            module.write(f"            {msbs} => {i},\n")

        module.write(
@@ -571,6 +631,39 @@ def emit_module(
"""
        )

+        module.write(
+            """
+    /// Returns `true` iff `c` has default emoji presentation, but forms a [text presentation sequence]
+    /// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
+    /// when followed by `'\\u{FE0E}'`, and is not ideographic.
+    /// Such sequences are considered to have width 1.
+    ///
+    /// This may spuriously return `true` for characters of narrow or ambiguous width.
+    #[inline]
+    pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
+        let cp: u32 = c.into();
+        // First level of lookup uses all but 10 LSB
+        let top_bits = cp >> 10;
+        let idx_of_leaf: usize = match top_bits {
+"""
+        )
+
+        for msbs, i in text_presentation_idx:
+            module.write(f"            {msbs} => {i},\n")
+
+        module.write(
+            """            _ => return false,
+        };
+        // Extract the 3-9th (0-indexed) least significant bits of `cp`,
+        // and use them to index into `leaf_row`.
+        let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
+        let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
+        // Use the 3 LSB of `cp` to index into `leaf_byte`.
+        ((leaf_byte >> (cp & 7)) & 1) == 1
+    }
+"""
+        )
+
        module.write(
            """
    /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
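For intuition, the lookup performed by the generated Rust above can be modeled in a few lines of Python. This sketch (names are local to the example) consumes an `(indexes, leaves)` pair as produced by `make_presentation_sequence_table` and mirrors the same bit arithmetic.

def starts_presentation_seq(cp: int, indexes, leaves) -> bool:
    top_bits = cp >> 10                   # first level: all but the 10 LSB
    for msb, leaf_idx in indexes:         # the emitted Rust compiles this into a `match`
        if msb == top_bits:
            leaf = leaves[leaf_idx]       # one 128-byte leaf = 1024-bit bitmap
            idx_within_leaf = (cp >> 3) & 0x7F
            return ((leaf[idx_within_leaf] >> (cp & 7)) & 1) == 1
    return False

# The bit read here is exactly the one set when the leaves were built:
# divmod(cp & 0x3FF, 8) == ((cp >> 3) & 0x7F, cp & 7) for any codepoint.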
@@ -626,12 +719,32 @@ def emit_module(
            f"""
    #[repr(align(128))]
    struct Align128<T>(T);
-    /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
-    /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
-    static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
+    /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
+    /// to get whether it can start an emoji presentation sequence.
+    static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
+"""
+        )
+        for leaf in emoji_presentation_leaves:
+            module.write("    [\n")
+            for row in batched(leaf, 14):
+                module.write("       ")
+                for entry in row:
+                    module.write(f" 0x{entry:02X},")
+                module.write("\n")
+            module.write("    ],\n")
+
+        module.write("    ]);\n")
+
+        # text table
+
+        module.write(
+            f"""
+    /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
+    /// to get whether it can start a text presentation sequence.
+    static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(text_presentation_leaves)}]> = Align128([
"""
        )
-        for leaf in variation_leaves:
+        for leaf in text_presentation_leaves:
            module.write("    [\n")
            for row in batched(leaf, 14):
                module.write("       ")
@@ -650,21 +763,7 @@ def main(module_path: str):
    lookup table for character width, and write a Rust module utilizing that table to
    `module_filename`.

-    We obey the following rules, in decreasing order of importance:
-
-    - Emoji presentation sequences are double-width.
-    - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
-    - Hangul jamo medial vowels & final consonants are zero-width.
-    - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
-    - Control characters are zero-width.
-    - `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to `Grapheme_Extend` chracters,
-      are zero-width.
-    - Codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
-    - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
-    - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
-      of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
-
-    These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
+    See `lib.rs` for documentation of the exact width rules.
    """
    version = load_unicode_version()
    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
@@ -682,8 +781,18 @@ def main(module_path: str):

    tables = make_tables(TABLE_CFGS, enumerate(width_map))

-    emoji_variations = load_variation_sequences()
-    variation_table = make_variation_sequence_table(emoji_variations, width_map)
+    emoji_presentations = load_emoji_presentation_sequences()
+    emoji_presentation_table = make_presentation_sequence_table(
+        emoji_presentations, width_map, {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE}
+    )
+
+    text_presentations = load_text_presentation_sequences()
+    text_presentation_table = make_presentation_sequence_table(
+        text_presentations,
+        width_map,
+        set(),
+        {EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
+    )

    # Download normalization test file for use by tests
    fetch_open("NormalizationTest.txt", "../tests/")
@@ -694,16 +803,23 @@ def main(module_path: str):
        size_bytes = len(table.to_bytes())
        print(f"Table {i} size: {size_bytes} bytes")
        total_size += size_bytes
-    emoji_index_size = len(variation_table[0]) * 4
-    print(f"Emoji presentation index size: {emoji_index_size} bytes")
-    total_size += emoji_index_size
-    emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
-    print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
-    total_size += emoji_leaves_size
+
+    for s, table in [
+        ("Emoji", emoji_presentation_table),
+        ("Text", text_presentation_table),
+    ]:
+        index_size = len(table[0]) * 4
+        print(f"{s} presentation index size: {index_size} bytes")
+        total_size += index_size
+        leaves_size = len(table[1]) * len(table[1][0])
+        print(f"{s} presentation leaves size: {leaves_size} bytes")
+        total_size += leaves_size
    print("------------------------")
    print(f"Total size: {total_size} bytes")

-    emit_module(module_path, version, tables, variation_table)
+    emit_module(
+        module_path, version, tables, emoji_presentation_table, text_presentation_table
+    )
    print(f'Wrote to "{module_path}"')

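For reference, the size accounting printed here counts each index entry as 4 bytes and each distinct leaf as a 128-byte (1024-bit) bitmap; a minimal model of that arithmetic (function name illustrative only):

def presentation_table_size(table: "tuple[list[tuple[int, int]], list[list[int]]]") -> int:
    indexes, leaves = table
    return len(indexes) * 4 + len(leaves) * 128  # 4 bytes per index entry, 128 per leaf

# e.g. 10 index entries and 6 distinct leaves account for 10 * 4 + 6 * 128 = 808 bytes.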