8000 Rename `svar` to `cjk_compat_variants`. · emilio/unicode-normalization@fea4f13 · GitHub
[go: up one dir, main page]

Skip to content

Commit fea4f13

Browse files
committed
Rename svar to cjk_compat_variants.
1 parent 5aca91b commit fea4f13

File tree

8 files changed

+134
-128
lines changed

8 files changed

+134
-128
lines changed

scripts/unicode.py

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ def __init__(self):
7272
self.canon_comp = self._compute_canonical_comp()
7373
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
7474

75-
self.svar_decomp = {}
76-
self.svar_fully_decomp = {}
77-
self._load_standardized_variants()
75+
self.cjk_compat_variants_decomp = {}
76+
self.cjk_compat_variants_fully_decomp = {}
77+
self._load_cjk_compat_ideograph_variants()
7878

7979
def stats(name, table):
8080
count = sum(len(v) for v in table.values())
@@ -83,10 +83,10 @@ def stats(name, table):
8383
print("Decomposition table stats:")
8484
stats("Canonical decomp", self.canon_decomp)
8585
stats("Compatible decomp", self.compat_decomp)
86-
stats("Standardized Variants", self.svar_decomp)
86+
stats("CJK Compat Variants", self.cjk_compat_variants_decomp)
8787
stats("Canonical fully decomp", self.canon_fully_decomp)
8888
stats("Compatible fully decomp", self.compat_fully_decomp)
89-
stats("Standardized Variants", self.svar_fully_decomp)
89+
stats("CJK Compat Variants", self.cjk_compat_variants_fully_decomp)
9090

9191
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
9292

@@ -122,38 +122,41 @@ def _load_unicode_data(self):
122122
if category == 'M' or 'M' in expanded_categories.get(category, []):
123123
self.general_category_mark.append(char_int)
124124

125-
def _load_standardized_variants(self):
125+
def _load_cjk_compat_ideograph_variants(self):
126126
for line in self._fetch("StandardizedVariants.txt").splitlines():
127127
strip_comments = line.split('#', 1)[0].strip()
128128
if not strip_comments:
129129
continue
130130

131-
pieces = strip_comments.split(';')
132-
assert len(pieces) == 3
133-
134-
variation_sequence, description, differences = pieces[0], pieces[1].strip(), pieces[2]
131+
variation_sequence, description, differences = strip_comments.split(';')
132+
description = description.strip()
135133

136134
# Don't use variations that only apply in particular shaping environments.
137135
if differences:
138136
continue
139137

140138
# Look for entries where the description field is a codepoint name.
141-
if description in self.name_to_char_int:
142-
char_int = self.name_to_char_int[description]
143-
144-
assert not char_int in self.combining_classes, "Unexpected: standardized variant with a combining class"
145-
assert not char_int in self.compat_decomp, "Unexpected: standardized variant and compatibility decomposition"
146-
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: standardized variant and non-singleton canonical decomposition"
147-
# If we ever need to handle Hangul here, we'll need to handle it separately.
148-
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
149-
150-
standardized_variant_parts = [int(c, 16) for c in variation_sequence.split()]
151-
for c in standardized_variant_parts:
152-
#assert not never_composes(c) TODO: Re-enable this once #67 lands.
153-
assert not c in self.canon_decomp, "Unexpected: standardized variant is unnormalized (canon)"
154-
assert not c in self.compat_decomp, "Unexpected: standardized variant is unnormalized (compat)"
155-
self.svar_decomp[char_int] = standardized_variant_parts
156-
self.svar_fully_decomp[char_int] = standardized_variant_parts
139+
if description not in self.name_to_char_int:
140+
continue
141+
142+
# Only consider the CJK Compatibility Ideographs.
143+
if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
144+
continue
145+
146+
char_int = self.name_to_char_int[description]
147+
148+
assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
149+
assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
150+
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
151+
# If we ever need to handle Hangul here, we'll need to handle it separately.
152+
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
153+
154+
cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
155+
for c in cjk_compat_variant_parts:
156+
assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
157+
assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
158+
self.cjk_compat_variants_decomp[char_int] = cjk_compat_variant_parts
159+
self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
157160

158161
def _load_norm_props(self):
159162
props = collections.defaultdict(list)
@@ -364,8 +367,8 @@ def gen_composition_table(canon_comp, out):
364367
out.write(" }\n")
365368
out.write("}\n")
366369

367-
def gen_decomposition_tables(canon_decomp, compat_decomp, svar_decomp, out):
368-
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (svar_decomp, 'svar')]
370+
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
371+
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
369372
for table, name in tables:
370373
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
371374
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -535,7 +538,7 @@ def minimal_perfect_hash(d):
535538
gen_composition_table(data.canon_comp, out)
536539
out.write("\n")
537540

538-
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.svar_fully_decomp, out)
541+
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
539542

540543
gen_combining_mark(data.general_category_mark, out)
541544
out.write("\n")

src/lib.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ mod test;
8686
/// Methods for composing and decomposing characters.
8787
pub mod char {
8888
pub use crate::normalize::{
89-
compose, decompose_canonical, decompose_compatible, decompose_svar,
89+
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
9090
};
9191

9292
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
@@ -112,17 +112,17 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
112112
/// (compatibility decomposition followed by canonical composition).
113113
fn nfkc(self) -> Recompositions<I>;
114114

115-
/// A transformation which replaces codepoints with normal forms using
116-
/// Standardized Variation Sequences. This is not part of the canonical
117-
/// or compatibility decomposition algorithms, but performing it before
118-
/// those algorithms produces normalized output which better preserves
119-
/// the intent of the original text.
115+
/// A transformation which replaces CJK Compatibility Ideograph codepoints
116+
/// with normal forms using Standardized Variation Sequences. This is not
117+
/// part of the canonical or compatibility decomposition algorithms, but
118+
/// performing it before those algorithms produces normalized output which
119+
/// better preserves the intent of the original text.
120120
///
121121
/// Note that many systems today ignore variation selectors, so these
122122
/// may not immediately help text display as intended, but they at
123123
/// least preserve the information in a standardized form, giving
124124
/// implementations the option to recognize them.
125-
fn svar(self) -> Replacements<I>;
125+
fn cjk_compat_variants(self) -> Replacements<I>;
126126

127127
/// An Iterator over the string with Conjoining Grapheme Joiner characters
128128
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
@@ -151,8 +151,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
151151
}
152152

153153
#[inline]
154-
fn svar(self) -> Replacements<Chars<'a>> {
155-
replace::new_svar(self.chars())
154+
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
155+
replace::new_cjk_compat_variants(self.chars())
156156
}
157157

158158
#[inline]
@@ -183,8 +183,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
183183
}
184184

185185
#[inline]
186-
fn svar(self) -> Replacements<I> {
187-
replace::new_svar(self)
186+
fn cjk_compat_variants(self) -> Replacements<I> {
187+
replace::new_cjk_compat_variants(self)
188188
}
189189

190190
#[inline]

src/lookups.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,11 @@ pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]>
6464
)
6565
}
6666

67-
pub(crate) fn svar_fully_decomposed(c: char) -> Option<&'static [char]> {
67+
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
6868
mph_lookup(
6969
c.into(),
70-
SVAR_DECOMPOSED_SALT,
71-
SVAR_DECOMPOSED_KV,
70+
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
71+
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
7272
pair_lookup_fk,
7373
pair_lookup_fv_opt,
7474
None,

src/normalize.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010

1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use crate::lookups::{
13-
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
14-
svar_fully_decomposed,
13+
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
14+
compatibility_fully_decomposed, composition_table,
1515
};
1616

1717
use core::{char, ops::FnMut};
@@ -47,7 +47,7 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
4747
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
4848
/// for more information.
4949
#[inline]
50-
pub fn decompose_svar<F>(c: char, mut emit_char: F)
50+
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
5151
where
5252
F: FnMut(char),
5353
{
@@ -59,7 +59,7 @@ where
5959

6060
// Don't perform decomposition for Hangul
6161

62-
if let Some(decomposed) = svar_fully_decomposed(c) {
62+
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
6363
for &d in decomposed {
6464
emit_char(d);
6565
}

src/replace.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub struct Replacements<I> {
2020
}
2121

2222
#[inline]
23-
pub fn new_svar<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
23+
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
2424
Replacements { iter, buffer: None }
2525
}
2626

@@ -37,7 +37,7 @@ impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
3737
Some(ch) => {
3838
// At this time, the longest replacement sequence has length 2.
3939
let mut buffer = TinyVec::<[char; 2]>::new();
40-
super::char::decompose_svar(ch, |d| buffer.push(d));
40+
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
4141
self.buffer = buffer.get(1).copied();
4242
Some(buffer[0])
4343
}

src/tables.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15161,7 +15161,7 @@ pub(crate) const COMPATIBILITY_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
1516115161
(0x2106, &['\u{0063}', '\u{002F}', '\u{0075}']),
1516215162
];
1516315163

15164-
pub(crate) const SVAR_DECOMPOSED_SALT: &[u16] = &[
15164+
pub(crate) const CJK_COMPAT_VARIANTS_DECOMPOSED_SALT: &[u16] = &[
1516515165
0x5,
1516615166
0x0,
1516715167
0x0,
@@ -16165,7 +16165,7 @@ pub(crate) const SVAR_DECOMPOSED_SALT: &[u16] = &[
1616516165
0x5,
1616616166
0x1,
1616716167
];
16168-
pub(crate) const SVAR_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
16168+
pub(crate) const CJK_COMPAT_VARIANTS_DECOMPOSED_KV: &[(u32, &'static [char])] = &[
1616916169
(0xfa08, &['\u{884C}', '\u{FE00}']),
1617016170
(0x2f825, &['\u{52C7}', '\u{FE01}']),
1617116171
(0x2f838, &['\u{20B63}', '\u{FE00}']),

tests/cjk_compat_variants.rs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//! Test the standardized variation sequence replacements for CJK compatibility ideographs.
2+
3+
use unicode_normalization::UnicodeNormalization;
4+
5+
#[test]
6+
fn test_cjk_compat_variants() {
7+
// These codepoints have singleton decompositions in the canonical
9+
let s = "\u{2f999}\u{2f8a6}";
10+
11+
// These codepoints have canonical decompositions.
12+
let mut nfd_iter = s.chars().nfd();
13+
assert_eq!(nfd_iter.next(), Some('\u{831d}'));
14+
assert_eq!(nfd_iter.next(), Some('\u{6148}'));
15+
assert_eq!(nfd_iter.next(), None);
16+
17+
let mut nfkd_iter = s.chars().nfkd();
18+
assert_eq!(nfkd_iter.next(), Some('\u{831d}'));
19+
assert_eq!(nfkd_iter.next(), Some('\u{6148}'));
20+
assert_eq!(nfkd_iter.next(), None);
21+
22+
let mut nfc_iter = s.chars().nfc();
23+
assert_eq!(nfc_iter.next(), Some('\u{831d}'));
24+
assert_eq!(nfc_iter.next(), Some('\u{6148}'));
25+
assert_eq!(nfc_iter.next(), None);
26+
27+
let mut nfkc_iter = s.chars().nfkc();
28+
assert_eq!(nfkc_iter.next(), Some('\u{831d}'));
29+
assert_eq!(nfkc_iter.next(), Some('\u{6148}'));
30+
assert_eq!(nfkc_iter.next(), None);
31+
32+
// However they also have standardized variants.
33+
let mut var_iter = s.chars().cjk_compat_variants();
34+
assert_eq!(var_iter.next(), Some('\u{831d}'));
35+
assert_eq!(var_iter.next(), Some('\u{fe00}'));
36+
assert_eq!(var_iter.next(), Some('\u{6148}'));
37+
assert_eq!(var_iter.next(), Some('\u{fe00}'));
38+
assert_eq!(var_iter.next(), None);
39+
40+
// The standardized variants are normalization-stable.
41+
let mut var_nfc_iter = s.chars().cjk_compat_variants().nfc();
42+
assert_eq!(var_nfc_iter.next(), Some('\u{831d}'));
43+
assert_eq!(var_nfc_iter.next(), Some('\u{fe00}'));
44+
assert_eq!(var_nfc_iter.next(), Some('\u{6148}'));
45+
assert_eq!(var_nfc_iter.next(), Some('\u{fe00}'));
46+
assert_eq!(var_nfc_iter.next(), None);
47+
48+
let mut var_nfd_iter = s.chars().cjk_compat_variants().nfd();
49+
assert_eq!(var_nfd_iter.next(), Some('\u{831d}'));
50+
assert_eq!(var_nfd_iter.next(), Some('\u{fe00}'));
51+
assert_eq!(var_nfd_iter.next(), Some('\u{6148}'));
52+
assert_eq!(var_nfd_iter.next(), Some('\u{fe00}'));
53+
assert_eq!(var_nfd_iter.next(), None);
54+
55+
let mut var_nfkc_iter = s.chars().cjk_compat_variants().nfkc();
56+
assert_eq!(var_nfkc_iter.next(), Some('\u{831d}'));
57+
assert_eq!(var_nfkc_iter.next(), Some('\u{fe00}'));
58+
assert_eq!(var_nfkc_iter.next(), Some('\u{6148}'));
59+
assert_eq!(var_nfkc_iter.next(), Some('\u{fe00}'));
60+
assert_eq!(var_nfkc_iter.next(), None);
61+
62+
let mut var_nfkd_iter = s.chars().cjk_compat_variants().nfkd();
63+
assert_eq!(var_nfkd_iter.next(), Some('\u{831d}'));
64+
assert_eq!(var_nfkd_iter.next(), Some('\u{fe00}'));
65+
assert_eq!(var_nfkd_iter.next(), Some('\u{6148}'));
66+
assert_eq!(var_nfkd_iter.next(), Some('\u{fe00}'));
67+
assert_eq!(var_nfkd_iter.next(), None);
68+
}
69+
70+
/// `cjk_compat_variants` shouldn't decompose Hangul.
71+
#[test]
72+
fn test_cjk_compat_variants_with_hangul() {
73+
assert_eq!(
74+
"중국어 (홍콩)"
75+
.chars()
76+
.cjk_compat_variants()
77+
.collect::<String>(),
78+
"중국어 (홍콩)"
79+
);
80+
}

0 commit comments

Comments
 (0)
0