unicode-rs · Manishearth · Jun 17, 2024 · Jun 17, 2024
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -20,6 +20,7 @@
 # - ReadMe.txt
 # - Scripts.txt
 # - UnicodeData.txt
+# - auxiliary/GraphemeBreakProperty.txt
 # - emoji/emoji-data.txt
 # - emoji/emoji-variation-sequences.txt
 # - extracted/DerivedGeneralCategory.txt
@@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]:
     zw_map[0x0891] = True
     zw_map[0x08E2] = True
 
+    # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
+    gcb_prepend = set()
+    load_property(
+        "auxiliary/GraphemeBreakProperty.txt",
+        "Prepend",
+        lambda cp: gcb_prepend.add(cp),
+    )
+    load_property(
+        "PropList.txt",
+        "Prepended_Concatenation_Mark",
+        lambda cp: gcb_prepend.remove(cp),
+    )
+    for cp in gcb_prepend:
+        zw_map[cp] = True
+
     # HANGUL CHOSEONG FILLER
     # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
     # zero width. However, the expected usage is to combine it with vowel or trailing jamo

diff --git a/src/lib.rs b/src/lib.rs
@@ -73,7 +73,7 @@
 //!      - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
 //!      - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
 //!      - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
-//!        `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}'  | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}'  | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
+//!        `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
 //!        have width 0.
 //!      - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
 //!        followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
@@ -113,6 +113,8 @@
 //!         - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
 //!         - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
 //!         - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
+//!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D)
+//!         with the [`Grapheme_Cluster_Break=Prepend`] property that are not also [`Prepended_Concatenation_Mark`]s.
 //!       - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
 //!    5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //!       with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
@@ -132,6 +134,7 @@
 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
 //! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
 //! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
+//! [`Grapheme_Cluster_Break=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
 //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
 //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
 //! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862

diff --git a/src/tables.rs b/src/tables.rs
@@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
-        0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00,
-        0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15,
-        0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
         0x55, 0x55,
     ],
     [
-        0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14,
+        0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04,
         0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
-        0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
@@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45,
-        0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
         0x55, 0x55,
     ],
     [
-        0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
+        0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
         0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
@@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
 /// Sorted list of codepoint ranges (inclusive)
 /// that are zero-width but not `Joining_Type=Transparent`
 /// FIXME: can we get better compression?
-static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
+static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [
     ([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]),
     ([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]),
     ([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]),
@@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
     ([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]),
     ([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]),
     ([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]),
+    ([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]),
     ([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]),
     ([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]),
     ([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]),
@@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
     ([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]),
     ([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]),
     ([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]),
+    ([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]),
     ([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]),
     ([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]),
     ([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]),
     ([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]),
     ([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]),
     ([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]),
+    ([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]),
+    ([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]),
+    ([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]),
+    ([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]),
+    ([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]),
+    ([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]),
     ([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]),
     ([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]),
     ([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]),

diff --git a/tests/tests.rs b/tests/tests.rs
@@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() {
     }
 }
 
+#[test]
+fn test_gcb_prepend() {
+    assert_width!("ൎഉ", 1, 1);
+    assert_width!("\u{11A89}", 0, 0);
+}
+
 #[test]
 fn test_interlinear_annotation_chars() {
     assert_width!('\u{FFF9}', Some(1), Some(1));