Fix `is_public_assigned` to include Hangul Syllable and other ranges. · unicode-rs/unicode-normalization@aa980cf · GitHub
[go: up one dir, main page]

Skip to content

Commit aa980cf

Browse files
committed
Fix is_public_assigned to include Hangul Syllable and other ranges.
Hangul Syllables and several other ranges are defined in UnicodeData.txt as just their first and last values:

```
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
```

Teach the unicode.py script how to recognize these, so that it correctly classifies them as assigned ranges, for the `is_public_assigned` predicate.
1 parent 424b947 commit aa980cf

File tree

3 files changed

+70
-23
lines changed

3 files changed

+70
-23
lines changed

scripts/unicode.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,13 @@ def _load_unicode_data(self):
102102

103103
assigned_start = 0;
104104
prev_char_int = -1;
105+
prev_name = "";
105106

106107
for line in self._fetch("UnicodeData.txt").splitlines():
107108
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
108109
pieces = line.split(';')
109110
assert len(pieces) == 15
110-
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
111+
char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
111112
char_int = int(char, 16)
112113

113114
name = pieces[1].strip()
@@ -126,10 +127,11 @@ def _load_unicode_data(self):
126127

127128
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
128129
if category not in ['Co', 'Cs']:
129-
if char_int != prev_char_int + 1:
130+
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
130131
self.general_category_public_assigned.append((assigned_start, prev_char_int))
131132
assigned_start = char_int
132133
prev_char_int = char_int
134+
prev_name = name;
133135

134136
self.general_category_public_assigned.append((assigned_start, prev_char_int))
135137

@@ -343,6 +345,15 @@ def _compute_stream_safe_tables(self):
343345

344346
hexify = lambda c: '{:04X}'.format(c)
345347

348+
# Test whether `first` and `last` are corresponding "<..., First>" and
349+
# "<..., Last>" markers.
350+
def is_first_and_last(first, last):
351+
if not first.startswith('<') or not first.endswith(', First>'):
352+
return False
353+
if not last.startswith('<') or not last.endswith(', Last>'):
354+
return False
355+
return first[1:-8] == last[1:-7]
356+
346357
def gen_mph_data(name, d, kv_type, kv_callback):
347358
(salt, keys) = minimal_perfect_hash(d)
348359
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())

src/tables.rs

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22051,9 +22051,7 @@ pub fn is_public_assigned(c: char) -> bool {
2205122051
| '\u{3131}'..='\u{318E}'
2205222052
| '\u{3190}'..='\u{31E3}'
2205322053
| '\u{31F0}'..='\u{321E}'
22054-
| '\u{3220}'..='\u{3400}'
22055-
| '\u{4DBF}'..='\u{4E00}'
22056-
| '\u{9FFC}'
22054+
| '\u{3220}'..='\u{9FFC}'
2205722055
| '\u{A000}'..='\u{A48C}'
2205822056
| '\u{A490}'..='\u{A4C6}'
2205922057
| '\u{A4D0}'..='\u{A62B}'
@@ -22083,8 +22081,7 @@ pub fn is_public_assigned(c: char) -> bool {
2208322081
| '\u{AB30}'..='\u{AB6B}'
2208422082
| '\u{AB70}'..='\u{ABED}'
2208522083
| '\u{ABF0}'..='\u{ABF9}'
22086-
| '\u{AC00}'
22087-
| '\u{D7A3}'
22084+
| '\u{AC00}'..='\u{D7A3}'
2208822085
| '\u{D7B0}'..='\u{D7C6}'
2208922086
| '\u{D7CB}'..='\u{D7FB}'
2209022087
| '\u{F900}'..='\u{FA6D}'
@@ -22305,11 +22302,9 @@ pub fn is_public_assigned(c: char) -> bool {
2230522302
| '\u{16F8F}'..='\u{16F9F}'
2230622303
| '\u{16FE0}'..='\u{16FE4}'
2230722304
| '\u{16FF0}'..='\u{16FF1}'
22308-
| '\u{17000}'
22309-
| '\u{187F7}'
22305+
| '\u{17000}'..='\u{187F7}'
2231022306
| '\u{18800}'..='\u{18CD5}'
22311-
| '\u{18D00}'
22312-
| '\u{18D08}'
22307+
| '\u{18D00}'..='\u{18D08}'
2231322308
| '\u{1B000}'..='\u{1B11E}'
2231422309
| '\u{1B150}'..='\u{1B152}'
2231522310
| '\u{1B164}'..='\u{1B167}'
@@ -22439,19 +22434,13 @@ pub fn is_public_assigned(c: char) -> bool {
2243922434
| '\u{1FB00}'..='\u{1FB92}'
2244022435
| '\u{1FB94}'..='\u{1FBCA}'
2244122436
| '\u{1FBF0}'..='\u{1FBF9}'
22442-
| '\u{20000}'
22443-
| '\u{2A6DD}'
22444-
| '\u{2A700}'
22445-
| '\u{2B734}'
22446-
| '\u{2B740}'
22447-
| '\u{2B81D}'
22448-
| '\u{2B820}'
22449-
| '\u{2CEA1}'
22450-
| '\u{2CEB0}'
22451-
| '\u{2EBE0}'
22437+
| '\u{20000}'..='\u{2A6DD}'
22438+
| '\u{2A700}'..='\u{2B734}'
22439+
| '\u{2B740}'..='\u{2B81D}'
22440+
| '\u{2B820}'..='\u{2CEA1}'
22441+
| '\u{2CEB0}'..='\u{2EBE0}'
2245222442
| '\u{2F800}'..='\u{2FA1D}'
22453-
| '\u{30000}'
22454-
| '\u{3134A}'
22443+
| '\u{30000}'..='\u{3134A}'
2245522444
| '\u{E0001}'
2245622445
| '\u{E0020}'..='\u{E007F}'
2245722446
| '\u{E0100}'..='\u{E01EF}'

tests/public_assigned.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,4 +71,51 @@ fn test_public_assigned() {
7171
assert!(!is_public_assigned('\u{fffff}'));
7272
assert!(!is_public_assigned('\u{10fffe}'));
7373
assert!(!is_public_assigned('\u{10ffff}'));
74+
75+
// Several ranges are defined by "<..., First>" and "<..., Last>" pairs in
76+
// UnicodeData.txt:
77+
78+
// CJK Ideograph Extension A
79+
assert!(is_public_assigned('\u{3400}'));
80+
assert!(is_public_assigned('\u{4dbf}'));
81+
82+
// CJK Ideograph
83+
assert!(is_public_assigned('\u{4e00}'));
84+
assert!(is_public_assigned('\u{9ffc}'));
85+
86+
// Hangul Syllable
87+
assert!(is_public_assigned('\u{ac00}'));
88+
assert!(is_public_assigned('\u{d7a3}'));
89+
90+
// Tangut Ideograph
91+
assert!(is_public_assigned('\u{17000}'));
92+
assert!(is_public_assigned('\u{187f7}'));
93+
94+
// Tangut Ideograph Supplement
95+
assert!(is_public_assigned('\u{18d00}'));
96+
assert!(is_public_assigned('\u{18d08}'));
97+
98+
// CJK Ideograph Extension B
99+
assert!(is_public_assigned('\u{20000}'));
100+
assert!(is_public_assigned('\u{2a6dd}'));
101+
102+
// CJK Ideograph Extension C
103+
assert!(is_public_assigned('\u{2a700}'));
104+
assert!(is_public_assigned('\u{2b734}'));
105+
106+
// CJK Ideograph Extension D
107+
assert!(is_public_assigned('\u{2b740}'));
108+
assert!(is_public_assigned('\u{2b81d}'));
109+
110+
// CJK Ideograph Extension E
111+
assert!(is_public_assigned('\u{2b820}'));
112+
assert!(is_public_assigned('\u{2cea1}'));
113+
114+
// CJK Ideograph Extension F
115+
assert!(is_public_assigned('\u{2ceb0}'));
116+
assert!(is_public_assigned('\u{2ebe0}'));
117+
118+
// CJK Ideograph Extension G
119+
assert!(is_public_assigned('\u{30000}'));
120+
assert!(is_public_assigned('\u{3134a}'));
74121
}

0 commit comments

Comments
 (0)
0