8000 Support text presentation sequences · unicode-rs/unicode-width@2e2d3bb · GitHub
[go: up one dir, main page]

Skip to content

Commit 2e2d3bb

Browse files
Support text presentation sequences
1 parent 74c8394 commit 2e2d3bb

File tree

4 files changed

+320
-81
lines changed

4 files changed

+320
-81
lines changed

scripts/unicode.py

Lines changed: 169 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
123123
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
124124
with fetch_open("EastAsianWidth.txt") as eaw:
125125
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
126-
single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
126+
single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
127127
# matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..."
128-
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
128+
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
129129
# map between width category code and condensed width
130130
width_codes = {
131131
**{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
@@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]":
189189
# canonically equivalent sequences have the same width.
190190
with fetch_open("DerivedCoreProperties.txt") as properties:
191191
single = re.compile(
192-
r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
192+
r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
193193
)
194194
multiple = re.compile(
195-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
196196
)
197197

198198
for line in properties.readlines():
@@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]":
225225
#
226226
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
227227
with fetch_open("HangulSyllableType.txt") as categories:
228-
single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
229-
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
228+
single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
229+
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")
230230

231231
for line in categories.readlines():
232232
raw_data = None # (low, high)
@@ -396,14 +396,14 @@ def make_tables(
396396
return tables
397397

398398

399-
def load_variation_sequences() -> "list[int]":
399+
def load_emoji_presentation_sequences() -> "list[int]":
400400
"""Outputs a list of character ranges, corresponding to all the valid characters for starting
401401
an emoji presentation sequence."""
402402

403403
with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
404404
# Match all emoji presentation sequences
405405
# (one codepoint followed by U+FE0F, and labeled "emoji style")
406-
sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
406+
sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")
407407
codepoints = []
408408
for line in sequences.readlines():
409409
if match := sequence.match(line):
@@ -412,55 +412,114 @@ def load_variation_sequences() -> "list[int]":
412412
return codepoints
413413

414414

415-
def make_variation_sequence_table(
415+
def load_text_presentation_sequences() -> "list[int]":
    """Return the codepoints whose width changes in a text presentation sequence.

    A codepoint is included when all of the following hold:

    - `emoji/emoji-variation-sequences.txt` lists it followed by U+FE0E
      and labeled "text style";
    - `emoji/emoji-data.txt` assigns it the `Emoji_Presentation` property;
    - it lies outside `0x1F200..0x1F300` (the "Enclosed Ideographic
      Supplement" block), whose characters stay wide even in text presentation.

    Returns:
        The matching codepoints as a list sorted in ascending order.
    """

    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Match all text presentation sequences
        # (one codepoint followed by U+FE0E, and labeled "text style")
        sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style")
        text_presentation_seq_codepoints = {
            int(match.group(1), 16)
            for line in sequences.readlines()
            if (match := sequence.match(line))
        }

    default_emoji_codepoints = set()
    with fetch_open("emoji/emoji-data.txt") as emoji_data:
        # Matches both a single codepoint ("XXXX ; Emoji_Presentation ...")
        # and a range ("XXXX..YYYY ; Emoji_Presentation ..."); group 2 is
        # `None` for the single-codepoint form.
        entry = re.compile(
            r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Emoji_Presentation\s+"
        )
        for line in emoji_data.readlines():
            match = entry.match(line)
            if match is None:
                continue
            low = int(match.group(1), 16)
            high = int(match.group(2) or match.group(1), 16)
            default_emoji_codepoints.update(range(low, high + 1))

    # Drop the "Enclosed Ideographic Supplement" block:
    # wide even in text presentation.
    return sorted(
        cp
        for cp in text_presentation_seq_codepoints & default_emoji_codepoints
        if not 0x1F200 <= cp < 0x1F300
    )
458+
459+
460+
def make_presentation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
    spurious_false: "set[EffectiveWidth]",
    spurious_true: "set[EffectiveWidth]",
) -> "tuple[list[tuple[int, int]], list[list[int]]]":
    """Generates 2-level lookup table for whether a codepoint might start a presentation sequence.
    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.

    Args:
        seqs: Codepoints that start a presentation sequence.
        width_map: Width of every codepoint, indexed by codepoint.
        spurious_false: Widths for which the table is allowed to answer
            "no" even though the codepoint is in `seqs`. A 1024-codepoint
            block is dropped entirely when every `seqs` member inside it
            has one of these widths.
        spurious_true: Widths for which the table is allowed to answer
            "yes" even though the codepoint is not in `seqs`. Every such
            codepoint that falls inside a retained block gets its bit set.

    Returns:
        A pair `(indexes, leaves)`. `indexes` is a list of
        `(msb, leaf_index)` tuples, where `msb` is a codepoint shifted
        right by 10 and `leaf_index` is a position in `leaves`. `leaves`
        is a list of 128-entry bitmaps (one byte value per entry);
        identical bitmaps are stored once and shared by several `indexes`
        entries.
    """

    # Group the 10 low bits of every sequence starter under its high bits.
    blocks = defaultdict(set)
    for cp in seqs:
        blocks[cp >> 10].add(cp & 0x3FF)

    # Drop blocks whose members may all be (spuriously) reported as absent.
    kept = {
        msb: low_bits
        for msb, low_bits in blocks.items()
        if not all(width_map[(msb << 10) | low] in spurious_false for low in low_bits)
    }

    # Add the codepoints that may be (spuriously) reported as present, but
    # only inside blocks that exist anyway, so no new leaves are created.
    for cp, width in enumerate(width_map):
        if width in spurious_true:
            low_bits = kept.get(cp >> 10)
            if low_bits is not None:
                low_bits.add(cp & 0x3FF)

    indexes: "list[tuple[int, int]]" = []
    leaves: "list[list[int]]" = []
    # Maps a bitmap's contents to its position in `leaves`, so that
    # duplicate bitmaps are emitted once, in first-occurrence order.
    leaf_positions = {}
    for msb, low_bits in kept.items():
        leaf = [0] * 128
        for low in low_bits:
            idx_in_leaf, bit_shift = divmod(low, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift

        key = bytes(leaf)
        position = leaf_positions.get(key)
        if position is None:
            position = len(leaves)
            leaf_positions[key] = position
            leaves.append(leaf)
        indexes.append((msb, position))

    return (indexes, leaves)
457515

458516

459517
def emit_module(
460518
out_name: str,
461519
unicode_version: "tuple[int, int, int]",
462520
tables: "list[Table]",
463-
variation_table: "tuple[list[int], list[list[int]]]",
521+
emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
522+
text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
464523
):
465524
"""Outputs a Rust module to `out_name` using table data from `tables`.
466525
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -537,7 +596,8 @@ def emit_module(
537596
"""
538597
)
539598

540-
variation_idx, variation_leaves = variation_table
599+
emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table
600+
text_presentation_idx, text_presentation_leaves = text_presentation_table
541601

542602
module.write(
543603
"""
@@ -555,7 +615,7 @@ def emit_module(
555615
"""
556616
)
557617

558-
for i, msbs in enumerate(variation_idx):
618+
for msbs, i in emoji_presentation_idx:
559619
module.write(f" {msbs} => {i},\n")
560620

561621
module.write(
@@ -571,6 +631,39 @@ def emit_module(
571631
"""
572632
)
573633

634+
module.write(
635+
"""
636+
/// Returns `true` iff `c` has default emoji presentation, but forms a [text presentation sequence]
637+
/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
638+
/// when followed by `'\\u{FE0E}'`, and is not ideographic.
639+
/// Such sequences are considered to have width 1.
640+
///
641+
/// This may spuriously return `true` for characters of narrow or ambiguous width.
642+
#[inline]
643+
pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
644+
let cp: u32 = c.into();
645+
// First level of lookup uses all but 10 LSB
646+
let top_bits = cp >> 10;
647+
let idx_of_leaf: usize = match top_bits {
648+
"""
649+
)
650+
651+
for msbs, i in text_presentation_idx:
652+
module.write(f" {msbs} => {i},\n")
653+
654+
module.write(
655+
""" _ => return false,
656+
};
657+
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
658+
// and use them to index into `leaf_row`.
659+
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
660+
let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
661+
// Use the 3 LSB of `cp` to index into `leaf_byte`.
662+
((leaf_byte >> (cp & 7)) & 1) == 1
663+
}
664+
"""
665+
)
666+
574667
module.write(
575668
"""
576669
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -626,12 +719,32 @@ def emit_module(
626719
f"""
627720
#[repr(align(128))]
628721
struct Align128<T>(T);
629-
/// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
630-
/// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
631-
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
722+
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
723+
/// to get whether it can start an emoji presentation sequence.
724+
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
725+
"""
726+
)
727+
for leaf in emoji_presentation_leaves:
728+
module.write(" [\n")
729+
for row in batched(leaf, 14):
730+
module.write(" ")
731+
for entry in row:
732+
module.write(f" 0x{entry:02X},")
733+
module.write("\n")
734+
module.write(" ],\n")
735+
736+
module.write(" ]);\n")
737+
738+
# text table
739+
740+
module.write(
741+
f"""
742+
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
743+
/// to get whether it can start a text presentation sequence.
744+
static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(text_presentation_leaves)}]> = Align128([
632745
"""
633746
)
634-
for leaf in variation_leaves:
747+
for leaf in text_presentation_leaves:
635748
module.write(" [\n")
636749
for row in batched(leaf, 14):
637750
module.write(" ")
@@ -650,21 +763,7 @@ def main(module_path: str):
650763
lookup table for character width, and write a Rust module utilizing that table to
651764
`module_filename`.
652765
653-
We obey the following rules, in decreasing order of importance:
654-
655-
- Emoji presentation sequences are double-width.
656-
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
657-
- Hangul jamo medial vowels & final consonants are zero-width.
658-
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
659-
- Control characters are zero-width.
660-
- `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to `Grapheme_Extend` chracters,
661-
are zero-width.
662-
- Codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
663-
- Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
664-
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
665-
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
666-
667-
These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
766+
See `lib.rs` for documentation of the exact width rules.
668767
"""
669768
version = load_unicode_version()
670769
print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
@@ -682,8 +781,18 @@ def main(module_path: str):
682781

683782
tables = make_tables(TABLE_CFGS, enumerate(width_map))
684783

685-
emoji_variations = load_variation_sequences()
686-
variation_table = make_variation_sequence_table(emoji_variations, width_map)
784+
emoji_presentations = load_emoji_presentation_sequences()
785+
emoji_presentation_table = make_presentation_sequence_table(
786+
emoji_presentations, width_map, {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE}
787+
)
788+
789+
text_presentations = load_text_presentation_sequences()
790+
text_presentation_table = make_presentation_sequence_table(
791+
text_presentations,
792+
width_map,
793+
set(),
794+
{EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
795+
)
687796

688797
# Download normalization test file for use by tests
689798
fetch_open("NormalizationTest.txt", "../tests/")
@@ -694,16 +803,23 @@ def main(module_path: str):
694803
size_bytes = len(table.to_bytes())
695804
print(f"Table {i} size: {size_bytes} bytes")
696805
total_size += size_bytes
697-
emoji_index_size = len(variation_table[0]) * 4
698-
print(f"Emoji presentation index size: {emoji_index_size} bytes")
699-
total_size += emoji_index_size
700-
emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
701-
print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
702-
total_size += emoji_leaves_size
806+
807+
for s, table in [
808+
("Emoji", emoji_presentation_table),
809+
("Text", text_presentation_table),
810+
]:
811+
index_size = len(table[0]) * 4
812+
print(f"{s} presentation index size: {index_size} bytes")
813+
total_size += index_size
814+
leaves_size = len(table[1]) * len(table[1][0])
815+
print(f"{s} presentation leaves size: {leaves_size} bytes")
816+
total_size += leaves_size
703817
print("------------------------")
704818
print(f" Total size: {total_size} bytes")
705819

706-
emit_module(module_path, version, tables, variation_table)
820+
emit_module(
821+
module_path, version, tables, emoji_presentation_table, text_presentation_table
822+
)
707823
print(f'Wrote to "{module_path}"')
708824

709825

0 commit comments

Comments
0 (0)
0