8000 Rename `svar` to `cjk_compat_variants`. · unicode-rs/unicode-normalization@fea4f13 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit fea4f13

Browse files
committed
Rename svar to cjk_compat_variants.
1 parent 5aca91b commit fea4f13

File tree

8 files changed

+134
-128
lines changed

8 files changed

+134
-128
lines changed

scripts/unicode.py

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ def __init__(self):
7272
self.canon_comp = self._compute_canonical_comp()
7373
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
7474

75-
self.svar_decomp = {}
76-
self.svar_fully_decomp = {}
77-
self._load_standardized_variants()
75+
self.cjk_compat_variants_decomp = {}
76+
self.cjk_compat_variants_fully_decomp = {}
77+
self._load_cjk_compat_ideograph_variants()
7878

7979
def stats(name, table):
8080
count = sum(len(v) for v in table.values())
@@ -83,10 +83,10 @@ def stats(name, table):
8383
print("Decomposition table stats:")
8484
stats("Canonical decomp", self.canon_decomp)
8585
stats("Compatible decomp", self.compat_decomp)
86-
stats("Standardized Variants", self.svar_decomp)
86+
stats("CJK Compat Variants", self.cjk_compat_variants_decomp)
8787
stats("Canonical fully decomp", self.canon_fully_decomp)
8888
stats("Compatible fully decomp", self.compat_fully_decomp)
89-
stats("Standardized Variants", self.svar_fully_decomp)
89+
stats("CJK Compat Variants", self.cjk_compat_variants_fully_decomp)
9090

9191
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
9292

@@ -122,38 +122,41 @@ def _load_unicode_data(self):
122122
if category == 'M' or 'M' in expanded_categories.get(category, []):
123123
self.general_category_mark.append(char_int)
124124

125-
def _load_standardized_variants(self):
125+
def _load_cjk_compat_ideograph_variants(self):
126126
for line in self._fetch("StandardizedVariants.txt").splitlines():
127127
strip_comments = line.split('#', 1)[0].strip()
128128
if not strip_comments:
129129
continue
130130

131-
pieces = strip_comments.split(';')
132-
assert len(pieces) == 3
133-
134-
variation_sequence, description, differences = pieces[0], pieces[1].strip(), pieces[2]
131+
variation_sequence, description, differences = strip_comments.split(';')
132+
description = description.strip()
135133

136134
# Don't use variations that only apply in particular shaping environments.
137135
if differences:
138136
continue
139137

140138
# Look for entries where the description field is a codepoint name.
141-
if description in self.name_to_char_int:
142-
char_int = self.name_to_char_int[description]
143-
144-
assert not char_int in self.combining_classes, "Unexpected: standardized variant with a combining class"
145-
assert not char_int in self.compat_decomp, "Unexpected: standardized variant and compatibility decomposition"
146-
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: standardized variant and non-singleton canonical decomposition"
147-
# If we ever need to handle Hangul here, we'll need to handle it separately.
148-
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
149-
150-
standardized_variant_parts = [int(c, 16) for c in variation_sequence.split()]
151-
for c in standardized_variant_parts:
152-
#assert not never_composes(c) TODO: Re-enable this once #67 lands.
153-
assert not c in self.canon_decomp, "Unexpected: standardized variant is unnormalized (canon)"
154-
assert not c in self.compat_decomp, "Unexpected: standardized variant is unnormalized (compat)"
155-
self.svar_decomp[char_int] = standardized_variant_parts
156-
self.svar_fully_decomp[char_int] = standardized_variant_parts
139+
if description not in self.name_to_char_int:
140+
continue
141+
142+
# Only consider the CJK Compatibility Ideographs.
143+
if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
144+
continue
145+
146+
char_int = self.name_to_char_int[description]
147+
148+
assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
149+
assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
150+
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
151+
# If we ever need to handle Hangul here, we'll need to handle it separately.
152+
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
153+
154+
cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
155+
for c in cjk_compat_variant_parts:
156+
assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
157+
assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
158+
self.cjk_compat_variants_decomp[char_int] = cjk_compat_variant_parts
159+
self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
157160

158161
def _load_norm_props(self):
159162
props = collections.defaultdict(list)
@@ -364,8 +367,8 @@ def gen_composition_table(canon_comp, out):
364367
out.write(" }\n")
365368
out.write("}\n")
366369

367-
def gen_decomposition_tables(canon_decomp, compat_decomp, svar_decomp, out):
368-
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (svar_decomp, 'svar')]
370+
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
371+
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
369372
for table, name in tables:
370373
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
371374
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -535,7 +538,7 @@ def minimal_perfect_hash(d):
535538
gen_composition_table(data.canon_comp, out)
536539
out.write("\n")
537540

538-
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.svar_fully_decomp, out)
541+
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
539542

540543
gen_combining_mark(data.general_category_mark, out)
541544
out.write("\n")

src/lib.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ mod test;
8686
/// Methods for composing and decomposing characters.
8787
pub mod char {
8888
pub use crate::normalize::{
89-
compose, decompose_canonical, decompose_compatible, decompose_svar,
89+
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
9090
};
9191

9292
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
@@ -112,17 +112,17 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
112112
/// (compatibility decomposition followed by canonical composition).
113113
fn nfkc(self) -> Recompositions<I>;
114114

115-
/// A transformation which replaces codepoints with normal forms using
116-
/// Standardized Variation Sequences. This is not part of the canonical
117-
/// or compatibility decomposition algorithms, but performing it before
118-
/// those algorithms produces normalized output which better preserves
119-
/// the intent of the original text.
115+
/// A transformation which replaces CJK Compatibility Ideograph codepoints
116+
/// with normal forms using Standardized Variation Sequences. This is not
117+
/// part of the canonical or compatibility decomposition algorithms, but
118+
/// performing it before those algorithms produces normalized output which
119+
/// better preserves the intent of the original text.
120120
///
121121
/// Note that many systems today ignore variation selectors, so these
122122
/// may not immediately help text display as intended, but they at
123123
/// least preserve the information in a standardized form, giving
124124
/// implementations the option to recognize them.
125-
fn svar(self) -> Replacements<I>;
125+
fn cjk_compat_variants(self) -> Replacements<I>;
126126

127127
/// An Iterator over the string with Conjoining Grapheme Joiner characters
128128
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
@@ -151,8 +151,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
151151
}
152152

153153
#[inline]
154-
fn svar(self) -> Replacements<Chars<'a>> {
155-
replace::new_svar(self.chars())
154+
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
155+
replace::new_cjk_compat_variants(self.chars())
156156
}
157157

158158
#[inline]
@@ -183,8 +183,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
183183
}
184184

185185
#[inline]
186-
fn svar(self) -> Replacements<I> {
187-
replace::new_svar(self)
186+
fn cjk_compat_variants(self) -> Replacements<I> {
187+
replace::new_cjk_compat_variants(self)
188188
}
189189

190190
#[inline]

src/lookups.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,11 @@ pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]>
6464
)
6565
}
6666

67-
pub(crate) fn svar_fully_decomposed(c: char) -> Option<&'static [char]> {
67+
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
6868
mph_lookup(
6969
c.into(),
70-
SVAR_DECOMPOSED_SALT,
71-
SVAR_DECOMPOSED_KV,
70+
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
71+
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
7272
pair_lookup_fk,
7373
pair_lookup_fv_opt,
7474
None,

src/normalize.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010

1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use crate::lookups::{
13-
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
14-
svar_fully_decomposed,
13+
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
14+
compatibility_fully_decomposed, composition_table,
1515
};
1616

1717
use core::{char, ops::FnMut};
@@ -47,7 +47,7 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
4747
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
4848
/// for more information.
4949
#[inline]
50-
pub fn decompose_svar<F>(c: char, mut emit_char: F)
50+
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
5151
where
5252
F: FnMut(char),
5353
{
@@ -59,7 +59,7 @@ where
5959

6060
// Don't perform decomposition for Hangul
6161

62-
if let Some(decomposed) = svar_fully_decomposed(c) {
62+
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
6363
for &d in decomposed {
6464
emit_char(d);
6565
}

src/replace.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub struct Replacements<I> {
2020
}
2121

2222
#[inline]
23-
pub fn new_svar<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
23+
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
2424
Replacements { iter, buffer: None }
2525
}
2626

@@ -37,7 +37,7 @@ impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
3737
Some(ch) => {
3838
// At this time, the longest replacement sequence has length 2.
3939
let mut buffer = TinyVec::<[char; 2]>::new();
40-
super::char::decompose_svar(ch, |d| buffer.push(d));
40+
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
4141
self.buffer = buffer.get(1).copied();
4242
Some(buffer[0])
4343
}

src/tables.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15161,7 +15161,7 @@ pub(crate) const COMPATIBILITY_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
1516115161
(0x2106, &['\u{0063}', '\u{002F}', '\u{0075}']),
1516215162
];
1516315163

15164-
pub(crate) const SVAR_DECOMPOSED_SALT: &[u16] = &[
15164+
pub(crate) const CJK_COMPAT_VARIANTS_DECOMPOSED_SALT: &[u16] = &[
1516515165
0x5,
1516615166
0x0,
1516715167
0x0,
@@ -16165,7 +16165,7 @@ pub(crate) const SVAR_DECOMPOSED_SALT: &[u16] = &[
1616516165
0x5,
1616616166
0x1,
1616716167
];
16168-
pub(crate) const SVAR_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
16168+
pub(crate) const CJK_COMPAT_VARIANTS_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
1616916169
(0xfa08, &['\u{884C}', '\u{FE00}']),
1617016170
(0x2f825, &['\u{52C7}', '\u{FE01}']),
1617116171
(0x2f838, &['\u{20B63}', '\u{FE00}']),

tests/cjk_compat_variants.rs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//! Test the standard variation sequence replacements.
2+
3+
use unicode_normalization::UnicodeNormalization;
4+
5+
#[test]
6+
fn test_cjk_compat_variants() {
7+
// These codepoints have singleton decompositions in the canonical
8+
// decomposition, and can use standardized variations.
9+
let s = "\u{2f999}\u{2f8a6}";
10+
11+
// These codepoints have canonical decompositions.
12+
let mut nfd_iter = s.chars().nfd();
13+
assert_eq!(nfd_iter.next(), Some('\u{831d}'));
14+
assert_eq!(nfd_iter.next(), Some('\u{6148}'));
15+
assert_eq!(nfd_iter.next(), None);
16+
17+
let mut nfkd_iter = s.chars().nfkd();
18+
assert_eq!(nfkd_iter.next(), Some('\u{831d}'));
19+
assert_eq!(nfkd_iter.next(), Some('\u{6148}'));
20+
assert_eq!(nfkd_iter.next(), None);
21+
22+
let mut nfc_iter = s.chars().nfc();
23+
assert_eq!(nfc_iter.next(), Some('\u{831d}'));
24+
assert_eq!(nfc_iter.next(), Some('\u{6148}'));
25+
assert_eq!(nfc_iter.next(), None);
26+
27+
let mut nfkc_iter = s.chars().nfkc();
28+
assert_eq!(nfkc_iter.next(), Some('\u{831d}'));
29+
assert_eq!(nfkc_iter.next(), Some('\u{6148}'));
30+
assert_eq!(nfkc_iter.next(), None);
31+
32+
// However they also have standardized variants.
33+
let mut var_iter = s.chars().cjk_compat_variants();
34+
assert_eq!(var_iter.next(), Some('\u{831d}'));
35+
assert_eq!(var_iter.next(), Some('\u{fe00}'));
36+
assert_eq!(var_iter.next(), Some('\u{6148}'));
37+
assert_eq!(var_iter.next(), Some('\u{fe00}'));
38+
assert_eq!(var_iter.next(), None);
39+
40+
// The standardized variants are normalization-stable.
41+
let mut var_nfc_iter = s.chars().cjk_compat_variants().nfc();
42+
assert_eq!(var_nfc_iter.next(), Some('\u{831d}'));
43+
assert_eq!(var_nfc_iter.next(), Some('\u{fe00}'));
44+
assert_eq!(var_nfc_iter.next(), Some('\u{6148}'));
45+
assert_eq!(var_nfc_iter.next(), Some('\u{fe00}'));
46+
assert_eq!(var_nfc_iter.next(), None);
47+
48+
let mut var_nfd_iter = s.chars().cjk_compat_variants().nfd();
49+
assert_eq!(var_nfd_iter.next(), Some('\u{831d}'));
50+
assert_eq!(var_nfd_iter.next(), Some('\u{fe00}'));
51+
assert_eq!(var_nfd_iter.next(), Some('\u{6148}'));
52+
assert_eq!(var_nfd_iter.next(), Some('\u{fe00}'));
53+
assert_eq!(var_nfd_iter.next(), None);
54+
55+
let mut var_nfkc_iter = s.chars().cjk_compat_variants().nfkc();
56+
assert_eq!(var_nfkc_iter.next(), Some('\u{831d}'));
57+
assert_eq!(var_nfkc_iter.next(), Some('\u{fe00}'));
58+
assert_eq!(var_nfkc_iter.next(), Some('\u{6148}'));
59+
assert_eq!(var_nfkc_iter.next(), Some('\u{fe00}'));
60+
assert_eq!(var_nfkc_iter.next(), None);
61+
62+
let mut var_nfkd_iter = s.chars().cjk_compat_variants().nfkd();
63+
assert_eq!(var_nfkd_iter.next(), Some('\u{831d}'));
64+
assert_eq!(var_nfkd_iter.next(), Some('\u{fe00}'));
65+
assert_eq!(var_nfkd_iter.next(), Some('\u{6148}'));
66+
assert_eq!(var_nfkd_iter.next(), Some('\u{fe00}'));
67+
assert_eq!(var_nfkd_iter.next(), None);
68+
}
69+
70+
/// `cjk_compat_variants` shouldn't decompose Hangul.
71+
#[test]
72+
fn test_cjk_compat_variants_with_hangul() {
73+
assert_eq!(
74+
"중국어 (홍콩)"
75+
.chars()
76+
.cjk_compat_variants()
77+
.collect::<String>(),
78+
"중국어 (홍콩)"
79+
);
80+
}

0 commit comments

Comments (0)