Unicode 16: Initial support · unicode-rs/unicode-width@6ab41d7 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6ab41d7

Browse files
Unicode 16: Initial support
Includes Kirat Rai normalization behavior.
1 parent 82d7136 commit 6ab41d7

File tree

3 files changed

+1116
-121
lines changed

3 files changed

+1116
-121
lines changed

scripts/unicode.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from itertools import batched
4444
from typing import Callable, Iterable
4545

46-
UNICODE_VERSION = "15.1.0"
46+
UNICODE_VERSION = "16.0.0"
4747
"""The version of the Unicode data files to download."""
4848

4949
NUM_CODEPOINTS = 0x110000
@@ -264,6 +264,12 @@ class WidthState(enum.IntEnum):
264264
TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
265265
"(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`"
266266

267+
# Kirat Rai
268+
KIRAT_RAI_VOWEL_SIGN_E = 0b0000_0000_0010_0000
269+
"\\u16D67 (\\u16D67 \\u16D67)+ and canonical equivalents"
270+
KIRAT_RAI_VOWEL_SIGN_AI = 0b0000_0000_0010_0001
271+
"(\\u16D68)+ and canonical equivalents"
272+
267273
# VARIATION SELECTORS
268274

269275
# Text presentation sequences (not CJK)
@@ -639,6 +645,8 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
639645
([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
640646
([0xFE0F], WidthState.VARIATION_SELECTOR_16),
641647
([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I),
648+
([0x16D67], WidthState.KIRAT_RAI_VOWEL_SIGN_E),
649+
([0x16D68], WidthState.KIRAT_RAI_VOWEL_SIGN_AI),
642650
(emoji_presentation, WidthState.EMOJI_PRESENTATION),
643651
(emoji_modifiers, WidthState.EMOJI_MODIFIER),
644652
(regional_indicators, WidthState.REGIONAL_INDICATOR),
@@ -1496,6 +1504,22 @@ def lookup_fns(
14961504
return (0, WidthInfo::EMOJI_PRESENTATION)
14971505
}}
14981506
1507+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D63}}') => {{
1508+
return (0, WidthInfo::DEFAULT);
1509+
}}
1510+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D67}}') => {{
1511+
return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
1512+
}}
1513+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D68}}') => {{
1514+
return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
1515+
}}
1516+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D69}}') => {{
1517+
return (0, WidthInfo::DEFAULT);
1518+
}}
1519+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\u{{16D63}}') => {{
1520+
return (0, WidthInfo::DEFAULT);
1521+
}}
1522+
14991523
// Fallback
15001524
_ => {{}}
15011525
}}

src/lib.rs

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
//! - Script-specific ligatures:
6666
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
6767
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
68-
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G23126)s
68+
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G23126)s
6969
//! will not affect the width.
7070
//! - **[Arabic]**: A character sequence consisting of one character with [`Joining_Group`]`=Lam`,
7171
//! followed by any number of characters with [`Joining_Type`]`=Transparent`, followed by one character
@@ -75,6 +75,7 @@
7575
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
7676
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
7777
//! have width 0.
78+
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `\u{16D68}`, `\u{16D69}`, or `\u{16D6A}` has total width 1.
7879
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
7980
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
8081
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
@@ -130,18 +131,18 @@
130131
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
131132
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
132133
//!
133-
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50313
134-
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
134+
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G50313
135+
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40095
135136
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
136137
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
137-
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
138+
//! [`General_Category`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-4/#G124142
138139
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
139-
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
140-
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
140+
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G52443
141+
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G45593
141142
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
142-
//! [`Joining_Type`]: http://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
143+
//! [`Joining_Type`]: http://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G50009
143144
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
144-
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
145+
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G37908
145146
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
146147
//!
147148
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
@@ -150,7 +151,7 @@
150151
//!
151152
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
152153
//!
153-
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
154+
//! [combining marks]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
154155
//!
155156
//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence
156157
//! [Emoji modifier sequences]: https://www.unicode.org/reports/tr51/#def_emoji_modifier_sequence
@@ -159,13 +160,14 @@
159160
//!
160161
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
161162
//!
162-
//! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
163-
//! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
164-
//! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
165-
//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
166-
//! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
167-
//! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
168-
//! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184
163+
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480
164+
//! [Buginese]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-17/#G26743
165+
//! [Hebrew]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G6528
166+
//! [Khmer]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-16/#G64642
167+
//! [Kirat Rai]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-13/#G746409
168+
//! [Lisu]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-18/#G44587
169+
//! [Old Turkic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-14/#G41975
170+
//! [Tifinagh]: http://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-19/#G43184
169171
//!
170172
//!
171173
//! ## Canonical equivalence

0 commit comments

Comments
 (0)
0