8000 Rename `svar` to `cjk_compat_variants`. · emilio/unicode-normalization@fea4f13 · GitHub
[go: up one dir, main page]

Skip to content

Commit fea4f13

Browse files
committed
Rename svar to cjk_compat_variants.
1 parent 5aca91b commit fea4f13

File tree

8 files changed

+134
-128
lines changed

8 files changed

+134
-128
lines changed

scripts/unicode.py

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ def __init__(self):
7272
self.canon_comp = self._compute_canonical_comp()
7373
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
7474

75-
self.svar_decomp = {}
76-
self.svar_fully_decomp = {}
77-
self._load_standardized_variants()
75+
self.cjk_compat_variants_decomp = {}
76+
self.cjk_compat_variants_fully_decomp = {}
77+
self._load_cjk_compat_ideograph_variants()
7878

7979
def stats(name, table):
8080
count = sum(len(v) for v in table.values())
@@ -83,10 +83,10 @@ def stats(name, table):
8383
print("Decomposition table stats:")
8484
stats("Canonical decomp", self.canon_decomp)
8585
stats("Compatible decomp", self.compat_decomp)
86-
stats("Standardized Variants", self.svar_decomp)
86+
stats("CJK Compat Variants", self.cjk_compat_variants_decomp)
8787
stats("Canonical fully decomp", self.canon_fully_decomp)
8888
stats("Compatible fully decomp", self.compat_fully_decomp)
89-
stats("Standardized Variants", self.svar_fully_decomp)
89+
stats("CJK Compat Variants", self.cjk_compat_variants_fully_decomp)
9090

9191
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
9292

@@ -122,38 +122,41 @@ def _load_unicode_data(self):
122122
if category == 'M' or 'M' in expanded_categories.get(category, []):
123123
self.general_category_mark.append(char_int)
124124

125-
def _load_standardized_variants(self):
125+
def _load_cjk_compat_ideograph_variants(self):
126126
for line in self._fetch("StandardizedVariants.txt").splitlines():
127127
strip_comments = line.split('#', 1)[0].strip()
128128
if not strip_comments:
129129
continue
130130

131-
pieces = strip_comments.split(';')
132-
assert len(pieces) == 3
133-
134-
variation_sequence, description, differences = pieces[0], pieces[1].strip(), pieces[2]
131+
variation_sequence, description, differences = strip_comments.split(';')
132+
description = description.strip()
135133

136134
# Don't use variations that only apply in particular shaping environments.
137135
if differences:
138136
continue
139137

140138
# Look for entries where the description field is a codepoint name.
141-
if description in self.name_to_char_int:
142-
char_int = self.name_to_char_int[description]
143-
144-
assert not char_int in self.combining_classes, "Unexpected: standardized variant with a combining class"
145-
assert not char_int in self.compat_decomp, "Unexpected: standardized variant and compatibility decomposition"
146-
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: standardized variant and non-singleton canonical decomposition"
147-
# If we ever need to handle Hangul here, we'll need to handle it separately.
148-
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
149-
150-
standardized_variant_parts = [int(c, 16) for c in variation_sequence.split()]
151-
for c in standardized_variant_parts:
152-
#assert not never_composes(c) TODO: Re-enable this once #67 lands.
153-
assert not c in self.canon_decomp, "Unexpected: standardized variant is unnormalized (canon)"
154-
assert not c in self.compat_decomp, "Unexpected: standardized variant is unnormalized (compat)"
155-
self.svar_decomp[char_int] = standardized_variant_parts
156-
self.svar_fully_decomp[char_int] = standardized_variant_parts
139+
if description not in self.name_to_char_int:
140+
continue
141+
142+
# Only consider the CJK Compatibility Ideographs.
143+
if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
144+
continue
145+
146+
char_int = self.name_to_char_int[description]
147+
148+
assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
149+
assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
150+
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
151+
# If we ever need to handle Hangul here, we'll need to handle it separately.
152+
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
153+
154+
cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
155+
for c in cjk_compat_variant_parts:
156+
assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
157+
assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
158+
self.cjk_compat_variants_decomp[char_int] = cjk_compat_variant_parts
159+
self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
157160

158161
def _load_norm_props(self):
159162
props = collections.defaultdict(list)
@@ -364,8 +367,8 @@ def gen_composition_table(canon_comp, out):
364367
out.write(" }\n")
365368
out.write("}\n")
366369

367-
def gen_decomposition_tables(canon_decomp, compat_decomp, svar_decomp, out):
368-
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (svar_decomp, 'svar')]
370+
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
371+
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
369372
for table, name in tables:
370373
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
371374
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -535,7 +538,7 @@ def minimal_perfect_hash(d):
535538
gen_composition_table(data.canon_comp, out)
536539
out.write("\n")
537540

538-
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.svar_fully_decomp, out)
541+
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
539542

540543
gen_combining_mark(data.general_category_mark, out)
541544
out.write("\n")

src/lib.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ mod test;
8686
/// Methods for composing and decomposing characters.
8787
pub mod char {
8888
pub use crate::normalize::{
89-
compose, decompose_canonical, decompose_compatible, decompose_svar,
89+
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
9090
};
9191

9292
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
@@ -112,17 +112,17 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
112112
/// (compatibility decomposition followed by canonical composition).
113113
fn nfkc(self) -> Recompositions<I>;
114114

115-
/// A transformation which replaces codepoints with normal forms using
116-
/// Standardized Variation Sequences. This is not part of the canonical
117-
/// or compatibility decomposition algorithms, but performing it before
118-
/// those algorithms produces normalized output which better preserves
119-
/// the intent of the original text.
115+
/// A transformation which replaces CJK Compatibility Ideograph codepoints
116+
/// with normal forms using Standardized Variation Sequences. This is not
117+
/// part of the canonical or compatibility decomposition algorithms, but
118+
/// performing it before those algorithms produces normalized output which
119+
/// better preserves the intent of the original text.
120120
///
121121
/// Note that many systems today ignore variation selectors, so these
122122
/// may not immediately help text display as intended, but they at
123123
/// least preserve the information in a standardized form, giving
124124
/// implementations the option to recognize them.
125-
fn svar(self) -> Replacements<I>;
125+
fn cjk_compat_variants(self) -> Replacements<I>;
126126

127127
/// An Iterator over the string with Conjoining Grapheme Joiner characters
128128
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
@@ -151,8 +151,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
151151
}
152152

153153
#[inline]
154-
fn svar(self) -> Replacements<Chars<'a>> {
155-
replace::new_svar(self.chars())
154+
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
155+
replace::new_cjk_compat_variants(self.chars())
156156
}
157157

158158
#[inline]
@@ -183,8 +183,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
183183
}
184184

185185
#[inline]
186-
fn svar(self) -> Replacements<I> {
187-
replace::new_svar(self)
186+
fn cjk_compat_variants(self) -> Replacements<I> {
187+
replace::new_cjk_compat_variants(self)
188188
}
189189

190190
#[inline]

src/lookups.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,11 @@ pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]>
6464
)
6565
}
6666

67-
pub(crate) fn svar_fully_decomposed(c: char) -> Option<&'static [char]> {
67+
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
6868
mph_lookup(
6969
c.into(),
70-
SVAR_DECOMPOSED_SALT,
71-
SVAR_DECOMPOSED_KV,
70+
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
71+
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
7272
pair_lookup_fk,
7373
pair_lookup_fv_opt,
7474
None,

src/normalize.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010

1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use crate::lookups::{
13-
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
14-
svar_fully_decomposed,
13+
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
14+
compatibility_fully_decomposed, composition_table,
1515
};
1616

1717
use core::{char, ops::FnMut};
@@ -47,7 +47,7 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
4747
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
4848
/// for more information.
4949
#[inline]
50-
pub fn decompose_svar<F>(c: char, mut emit_char: F)
50+
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
5151
where
5252
F: FnMut(char),
5353
{
@@ -59,7 +59,7 @@ where
5959

6060
// Don't perform decomposition for Hangul
6161

62-
if let Some(decomposed) = svar_fully_decomposed(c) {
62+
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
6363
for &d in decomposed {
6464
emit_char(d);
6565
}

src/replace.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub struct Replacements<I> {
2020
}
2121

2222
#[inline]
23-
pub fn new_svar<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
23+
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
2424
Replacements { iter, buffer: None }
2525
}
2626

@@ -37,7 +37,7 @@ impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
3737
Some(ch) => {
3838
// At this time, the longest replacement sequence has length 2.
3939
let mut buffer = TinyVec::<[char; 2]>::new();
40-
super::char::decompose_svar(ch, |d| buffer.push(d));
40+
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
4141
self.buffer = buffer.get(1).copied();
4242
Some(buffer[0])
4343
}

src/tables.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15161,7 +15161,7 @@ pub(crate) const COMPATIBILITY_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
1516115161
(0x2106, &['\u{0063}', '\u{002F}', '\u{0075}']),
1516215162
];
1516315163

15164-
pub(crate) const SVAR_DECOMPOSED_SALT: &[u16] = &[
15164+
pub(crate) const CJK_COMPAT_VARIANTS_DECOMPOSED_SALT: &[u16] = &[
1516515165
0x5,
1516615166
0x0,
1516715167
0x0,
@@ -16165,7 +16165,7 @@ pub(crate) const SVAR_DECOMPOSED_SALT: &[u16] = &[
1616516165
0x5,
1616616166
0x1,
1616716167
];
16168-
pub(crate) const SVAR_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
16168+
pub(crate) const CJK_COMPAT_VARIANTS_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
1616916169
(0xfa08, &['\u{884C}', '\u{FE00}']),
1617016170
(0x2f825, &['\u{52C7}', '\u{FE01}']),
1617116171
(0x2f838, &['\u{20B63}', '\u{FE00}']),

tests/cjk_compat_variants.rs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//! Test the standardized variation sequence replacements for CJK compatibility ideographs.
2+
3+
use unicode_normalization::UnicodeNormalization;
4+
5+
#[test]
6+
fn test_cjk_compat_variants() {
7+
// These codepoints have singleton decompositions in the canonical
9+
let s = "\u{2f999}\u{2f8a6}";
10+
11+
// These codepoints have canonical decompositions.
12+
let mut nfd_iter = s.chars().nfd();
13+
assert_eq!(nfd_iter.next(), Some('\u{831d}'));
14+
assert_eq!(nfd_iter.next(), Some('\u{6148}'));
15+
assert_eq!(nfd_iter.next(), None);
16+
17+
let mut nfkd_iter = s.chars().nfkd();
18+
assert_eq!(nfkd_iter.next(), Some('\u{831d}'));
19+
assert_eq!(nfkd_iter.next(), Some('\u{6148}'));
20+
assert_eq!(nfkd_iter.next(), None);
21+
22+
let mut nfc_iter = s.chars().nfc();
23+
assert_eq!(nfc_iter.next(), Some('\u{831d}'));
24+
assert_eq!(nfc_iter.next(), Some('\u{6148}'));
25+
assert_eq!(nfc_iter.next(), None);
26+
27+
let mut nfkc_iter = s.chars().nfkc();
28+
assert_eq!(nfkc_iter.next(), Some('\u{831d}'));
29+
assert_eq!(nfkc_iter.next(), Some('\u{6148}'));
30+
assert_eq!(nfkc_iter.next(), None);
31+
32+
// However they also have standardized variants.
33+
let mut var_iter = s.chars().cjk_compat_variants();
34+
assert_eq!(var_iter.next(), Some('\u{831d}'));
35+
assert_eq!(var_iter.next(), Some('\u{fe00}'));
36+
assert_eq!(var_iter.next(), Some('\u{6148}'));
37+
assert_eq!(var_iter.next(), Some('\u{fe00}'));
38+
assert_eq!(var_iter.next(), None);
39+
40+
// The standardized variants are normalization-stable.
41+
let mut var_nfc_iter = s.chars().cjk_compat_variants().nfc();
42+
assert_eq!(var_nfc_iter.next(), Some('\u{831d}'));
43+
assert_eq!(var_nfc_iter.next(), Some('\u{fe00}'));
44+
assert_eq!(var_nfc_iter.next(), Some('\u{6148}'));
45+
assert_eq!(var_nfc_iter.next(), Some('\u{fe00}'));
46+
assert_eq!(var_nfc_iter.next(), None);
47+
48+
let mut var_nfd_iter = s.chars().cjk_compat_variants().nfd();
49+
assert_eq!(var_nfd_iter.next(), Some('\u{831d}'));
50+
assert_eq!(var_nfd_iter.next(), Some('\u{fe00}'));
51+
assert_eq!(var_nfd_iter.next(), Some('\u{6148}'));
52+
assert_eq!(var_nfd_iter.next(), Some('\u{fe00}'));
53+
assert_eq!(var_nfd_iter.next(), None);
54+
55+
let mut var_nfkc_iter = s.chars().cjk_compat_variants().nfkc();
56+
assert_eq!(var_nfkc_iter.next(), Some('\u{831d}'));
57+
assert_eq!(var_nfkc_iter.next(), Some('\u{fe00}'));
58+
assert_eq!(var_nfkc_iter.next(), Some('\u{6148}'));
59+
assert_eq!(var_nfkc_iter.next(), Some('\u{fe00}'));
60+
assert_eq!(var_nfkc_iter.next(), None);
61+
62+
let mut var_nfkd_iter = s.chars().cjk_compat_variants().nfkd();
63+
assert_eq!(var_nfkd_iter.next(), Some('\u{831d}'));
64+
assert_eq!(var_nfkd_iter.next(), Some('\u{fe00}'));
65+
assert_eq!(var_nfkd_iter.next(), Some('\u{6148}'));
66+
assert_eq!(var_nfkd_iter.next(), Some('\u{fe00}'));
67+
assert_eq!(var_nfkd_iter.next(), None);
68+
}
69+
70+
/// `cjk_compat_variants` shouldn't decompose Hangul.
71+
#[test]
72+
fn test_cjk_compat_variants_with_hangul() {
73+
assert_eq!(
74+
"중국어 (홍콩)"
75+
.chars()
76+
.cjk_compat_variants()
77+
.collect::<String>(),
78+
"중국어 (홍콩)"
79+
);
80+
}

0 commit comments

Comments
 (0)
0