Add new normalization algorithms using Standardized Variants · unicode-rs/unicode-normalization@d1ad2ac · GitHub

Commit d1ad2ac
Add new normalization algorithms using Standardized Variants
The standard normalization algorithm decomposes CJK compatibility ideographs into codepoints that are nominally equivalent but traditionally look different, and this is one of the main reasons normalization is considered destructive in practice. [Unicode 6.3] introduced a solution for this, by providing [standardized variation sequences] for these codepoints.

For example, U+2F8A6 "CJK COMPATIBILITY IDEOGRAPH-2F8A6" canonically decomposes to U+6148, which has a different appearance. In Unicode 6.3 and later, the standardized variation sequences in the StandardizedVariants.txt file include the following:

> 6148 FE00; CJK COMPATIBILITY IDEOGRAPH-2F8A6;

which says that "CJK COMPATIBILITY IDEOGRAPH-2F8A6" corresponds to U+6148 U+FE00, where U+FE00 is "VARIATION SELECTOR-1". U+6148 and U+FE00 are both normalized codepoints, so we can transform text containing U+2F8A6 into normal form without losing information about the distinct appearance.

At this time, many popular implementations ignore these variation selectors; however, this technique at least preserves the information in a standardized way, so implementations could use it if they chose.

This PR adds "ext" versions of the `nfd`, `nfc`, `nfkd`, and `nfkc` iterators, which perform the standard algorithms extended with this technique. They don't match the standard decompositions, and don't guarantee stability, but they do produce appropriately normalized output.

I used the generic term "ext" to reflect that other extensions could theoretically be added in the future. The standard decomposition tables are limited by their stability requirements, but these "ext" versions are free to adopt new useful rules.

I'm not an expert in any of these topics, so please correct me if I'm mistaken in any of this. Also, I'm open to ideas about how to best present this functionality in the API.

[Unicode 6.3]: https://www.unicode.org/versions/Unicode6.3.0/#Summary
[standardized variation sequences]: http://unicode.org/faq/vs.html
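To make the intended behavior concrete, here's a minimal usage sketch of the new iterators; the expected outputs simply restate the U+2F8A6 mapping above, assuming a crate built with this commit:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    let input = "\u{2F8A6}"; // CJK COMPATIBILITY IDEOGRAPH-2F8A6

    // Standard NFD applies the singleton canonical decomposition,
    // losing the appearance distinction: U+2F8A6 becomes plain U+6148.
    let standard: String = input.nfd().collect();
    assert_eq!(standard, "\u{6148}");

    // The "ext" variant emits the standardized variation sequence
    // instead, preserving the distinction in normalized form.
    let extended: String = input.nfd_ext().collect();
    assert_eq!(extended, "\u{6148}\u{FE00}");
}
```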
1 parent 2f400a9 commit d1ad2ac

File tree

8 files changed: 2354 additions, 10 deletions

scripts/unicode.py

53 additions & 9 deletions

@@ -14,9 +14,10 @@
 # - DerivedNormalizationProps.txt
 # - NormalizationTest.txt
 # - UnicodeData.txt
+# - StandardizedVariants.txt
 #
 # Since this should not require frequent updates, we just store this
-# out-of-line and check the unicode.rs file into git.
+# out-of-line and check the tables.rs and normalization_tests.rs files into git.
 import collections
 import urllib.request

@@ -57,6 +58,11 @@
     'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
 }

+# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
+# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
+S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
+S_COUNT = L_COUNT * V_COUNT * T_COUNT
+
 class UnicodeData(object):
     def __init__(self):
         self._load_unicode_data()

@@ -66,14 +72,20 @@ def __init__(self):
         self.canon_comp = self._compute_canonical_comp()
         self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

+        self.ext_decomp = {}
+        self.ext_fully_decomp = {}
+        self._load_standardized_variants()
+
         def stats(name, table):
             count = sum(len(v) for v in table.values())
             print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

         print("Decomposition table stats:")
         stats("Canonical decomp", self.canon_decomp)
+        stats("Canonical decomp with extensions", self.ext_decomp)
         stats("Compatible decomp", self.compat_decomp)
         stats("Canonical fully decomp", self.canon_fully_decomp)
+        stats("Canonical fully decomp with extensions", self.ext_fully_decomp)
         stats("Compatible fully decomp", self.compat_fully_decomp)

         self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

@@ -83,6 +95,7 @@ def _fetch(self, filename):
         return resp.read().decode('utf-8')

     def _load_unicode_data(self):
+        self.name_to_char_int = {}
         self.combining_classes = {}
         self.compat_decomp = {}
         self.canon_decomp = {}

@@ -95,6 +108,9 @@ def _load_unicode_data(self):
             char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
             char_int = int(char, 16)

+            name = pieces[1].strip()
+            self.name_to_char_int[name] = char_int
+
             if cc != '0':
                 self.combining_classes[char_int] = cc

@@ -106,6 +122,39 @@ def _load_unicode_data(self):
             if category == 'M' or 'M' in expanded_categories.get(category, []):
                 self.general_category_mark.append(char_int)

+    def _load_standardized_variants(self):
+        for line in self._fetch("StandardizedVariants.txt").splitlines():
+            strip_comments = line.split('#', 1)[0].strip()
+            if not strip_comments:
+                continue
+
+            pieces = strip_comments.split(';')
+            assert len(pieces) == 3
+
+            variation_sequence, description, differences = pieces[0], pieces[1].strip(), pieces[2]
+
+            # Don't use variations that only apply in particular shaping environments.
+            if differences:
+                continue
+
+            # Look for entries where the description field is a codepoint name.
+            if description in self.name_to_char_int:
+                char_int = self.name_to_char_int[description]
+
+                assert not char_int in self.combining_classes, "Unexpected: standardized variant with a combining class"
+                assert not char_int in self.compat_decomp, "Unexpected: standardized variant and compatibility decomposition"
+                assert len(self.canon_decomp[char_int]) == 1, "Unexpected: standardized variant and non-singleton canonical decomposition"
+                # If we ever need to handle Hangul here, we'll need to handle it separately.
+                assert not (S_BASE <= char_int < S_BASE + S_COUNT)
+
+                standardized_variant_parts = [int(c, 16) for c in variation_sequence.split()]
+                for c in standardized_variant_parts:
+                    #assert not never_composes(c) TODO: Re-enable this once #67 lands.
+                    assert not c in self.canon_decomp, "Unexpected: standardized variant is unnormalized (canon)"
+                    assert not c in self.compat_decomp, "Unexpected: standardized variant is unnormalized (compat)"
+                self.ext_decomp[char_int] = standardized_variant_parts
+                self.ext_fully_decomp[char_int] = standardized_variant_parts
+
     def _load_norm_props(self):
         props = collections.defaultdict(list)

@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
         The upshot is that decomposition code is very simple and easy to inline
         at mild code size cost.
         """
-        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
-        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
-        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
-        S_COUNT = L_COUNT * V_COUNT * T_COUNT
-
         def _decompose(char_int, compatible):
             # 7-bit ASCII never decomposes
             if char_int <= 0x7f:

@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
     out.write(" }\n")
     out.write("}\n")

-def gen_decomposition_tables(canon_decomp, compat_decomp, out):
-    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
+def gen_decomposition_tables(canon_decomp, ext_decomp, compat_decomp, out):
+    tables = [(canon_decomp, 'canonical'), (ext_decomp, 'ext'), (compat_decomp, 'compatibility')]
     for table, name in tables:
         gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
             lambda k: "(0x{:x}, &[{}])".format(k,

@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
     gen_composition_table(data.canon_comp, out)
     out.write("\n")

-    gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)
+    gen_decomposition_tables(data.canon_fully_decomp, data.ext_fully_decomp, data.compat_fully_decomp, out)

     gen_combining_mark(data.general_category_mark, out)
     out.write("\n")
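For reference, a hedged sketch of the kind of entry the `gen_mph_data` call above would emit for the new 'ext' table in tables.rs, following the `"(0x{:x}, &[{}])"` format string; the exact declaration shape is an assumption here, and the real generated file contains the full mapping set plus hash salts:

```rust
// Hypothetical excerpt of the generated "ext" decomposition table; real
// contents and ordering are determined by the minimal-perfect-hash generator.
pub(crate) const EXT_DECOMPOSED_KV: &[(u32, &'static [char])] =
    &[(0x2f8a6, &['\u{6148}', '\u{fe00}'])];
```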

src/decompose.rs

28 additions & 0 deletions

@@ -16,6 +16,8 @@ use tinyvec::TinyVec;
 enum DecompositionType {
     Canonical,
     Compatible,
+    CanonicalExt,
+    CompatibleExt,
 }

 /// External iterator for a string decomposition's characters.

@@ -56,6 +58,26 @@ pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
     }
 }

+#[inline]
+pub fn new_canonical_ext<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
+    Decompositions {
+        kind: self::DecompositionType::CanonicalExt,
+        iter: iter.fuse(),
+        buffer: TinyVec::new(),
+        ready: 0..0,
+    }
+}
+
+#[inline]
+pub fn new_compatible_ext<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
+    Decompositions {
+        kind: self::DecompositionType::CompatibleExt,
+        iter: iter.fuse(),
+        buffer: TinyVec::new(),
+        ready: 0..0,
+    }
+}
+
 impl<I> Decompositions<I> {
     #[inline]
     fn push_back(&mut self, ch: char) {

@@ -113,6 +135,12 @@ impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
             (Some(ch), &DecompositionType::Compatible) => {
                 super::char::decompose_compatible(ch, |d| self.push_back(d));
             }
+            (Some(ch), &DecompositionType::CanonicalExt) => {
+                super::char::decompose_canonical_ext(ch, |d| self.push_back(d));
+            }
+            (Some(ch), &DecompositionType::CompatibleExt) => {
+                super::char::decompose_compatible_ext(ch, |d| self.push_back(d));
+            }
             (None, _) => {
                 if self.buffer.is_empty() {
                     return None;

src/lib.rs

80 additions & 1 deletion

@@ -83,7 +83,10 @@ mod test;

 /// Methods for composing and decomposing characters.
 pub mod char {
-    pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
+    pub use crate::normalize::{
+        compose, decompose_canonical, decompose_canonical_ext, decompose_compatible,
+        decompose_compatible_ext,
+    };

     pub use crate::lookups::{canonical_combining_class, is_combining_mark};
 }

@@ -108,6 +111,42 @@
     /// (compatibility decomposition followed by canonical composition).
     fn nfkc(self) -> Recompositions<I>;

+    /// Similar to `nfd`, but with extensions which differ from the standard
+    /// decomposition algorithm and which don't have a stability guarantee,
+    /// but which still produce valid NFD and provide better results:
+    ///  - Standardized Variation Sequences are used to avoid losing
+    ///    information when normalizing "CJK Compatibility Ideographs"
+    ///    codepoints. Note that many systems today ignore variation
+    ///    selectors, but the information is at least preserved in a
+    ///    standardized form.
+    ///
+    /// Additional extensions may be added in future versions.
+    ///
+    /// If you need to match the standard `toNFD` algorithm exactly, or you
+    /// need a stability guarantee, use `nfd` instead.
+    fn nfd_ext(self) -> Decompositions<I>;
+
+    /// Similar to `nfkd`, and the result is valid NFKD, but with the same
+    /// extensions as `nfd_ext`.
+    ///
+    /// If you need to match the standard `toNFKD` algorithm exactly, or you
+    /// need a stability guarantee, use `nfkd` instead.
+    fn nfkd_ext(self) -> Decompositions<I>;
+
+    /// Similar to `nfc`, and the result is valid NFC, but with the same
+    /// extensions as `nfd_ext`.
+    ///
+    /// If you need to match the standard `toNFC` algorithm exactly, or you
+    /// need a stability guarantee, use `nfc` instead.
+    fn nfc_ext(self) -> Recompositions<I>;
+
+    /// Similar to `nfkc`, and the result is valid NFKC, but with the same
+    /// extensions as `nfd_ext`.
+    ///
+    /// If you need to match the standard `toNFKC` algorithm exactly, or you
+    /// need a stability guarantee, use `nfkc` instead.
+    fn nfkc_ext(self) -> Recompositions<I>;
+
     /// An Iterator over the string with Conjoining Grapheme Joiner characters
     /// inserted according to the Stream-Safe Text Process (UAX15-D4)
     fn stream_safe(self) -> StreamSafe<I>;

@@ -134,6 +173,26 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
         recompose::new_compatible(self.chars())
     }

+    #[inline]
+    fn nfd_ext(self) -> Decompositions<Chars<'a>> {
+        decompose::new_canonical_ext(self.chars())
+    }
+
+    #[inline]
+    fn nfkd_ext(self) -> Decompositions<Chars<'a>> {
+        decompose::new_compatible_ext(self.chars())
+    }
+
+    #[inline]
+    fn nfc_ext(self) -> Recompositions<Chars<'a>> {
+        recompose::new_canonical_ext(self.chars())
+    }
+
+    #[inline]
+    fn nfkc_ext(self) -> Recompositions<Chars<'a>> {
+        recompose::new_compatible_ext(self.chars())
+    }
+
     #[inline]
     fn stream_safe(self) -> StreamSafe<Chars<'a>> {
         StreamSafe::new(self.chars())

@@ -161,6 +220,26 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
         recompose::new_compatible(self)
     }

+    #[inline]
+    fn nfd_ext(self) -> Decompositions<I> {
+        decompose::new_canonical_ext(self)
+    }
+
+    #[inline]
+    fn nfkd_ext(self) -> Decompositions<I> {
+        decompose::new_compatible_ext(self)
+    }
+
+    #[inline]
+    fn nfc_ext(self) -> Recompositions<I> {
+        recompose::new_canonical_ext(self)
+    }
+
+    #[inline]
+    fn nfkc_ext(self) -> Recompositions<I> {
+        recompose::new_compatible_ext(self)
+    }
+
     #[inline]
     fn stream_safe(self) -> StreamSafe<I> {
         StreamSafe::new(self)
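Since the trait is implemented both for `&str` and for any `char` iterator, the new methods drop into ordinary iterator pipelines. A small sketch of `nfc_ext`; the expected output assumes the U+2F8A6 mapping from the commit message, and nothing recomposes with a variation selector, so the sequence survives composition:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    let input = "a\u{0301}\u{2F8A6}"; // 'a' + COMBINING ACUTE ACCENT, then U+2F8A6

    // Going through chars() exercises the Iterator<Item = char> impl.
    let composed: String = input.chars().nfc_ext().collect();

    // 'a' + U+0301 recomposes to U+00E1 exactly as with plain nfc();
    // U+2F8A6 becomes its variation sequence U+6148 U+FE00.
    assert_eq!(composed, "\u{00E1}\u{6148}\u{FE00}");
}
```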

src/lookups.rs

11 additions & 0 deletions

@@ -53,6 +53,17 @@ pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
     )
 }

+pub(crate) fn ext_fully_decomposed(c: char) -> Option<&'static [char]> {
+    mph_lookup(
+        c.into(),
+        EXT_DECOMPOSED_SALT,
+        EXT_DECOMPOSED_KV,
+        pair_lookup_fk,
+        pair_lookup_fv_opt,
+        None,
+    )
+}
+
 pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
     mph_lookup(
         c.into(),

src/normalize.rs

32 additions & 0 deletions

@@ -11,6 +11,7 @@
 //! Functions for computing canonical and compatible decompositions for Unicode characters.
 use crate::lookups::{
     canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
+    ext_fully_decomposed,
 };

 use core::{char, ops::FnMut};

@@ -36,6 +37,37 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
     decompose(c, decompose_char, emit_char)
 }

+/// Compute "extended" canonical Unicode decomposition for character.
+///
+/// This is `decompose_canonical` plus extensions, which currently consist of:
+///  - [Standardized Variation Sequences] are used instead of the standard canonical
+///    decompositions for CJK codepoints with singleton canonical decompositions, to
+///    avoid losing information. See the
+///    [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
+///    "Other Enhancements" section of the
+///    [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
+///    for more information.
+#[inline]
+pub fn decompose_canonical_ext<F>(c: char, emit_char: F)
+where
+    F: FnMut(char),
+{
+    let decompose_char = |c| ext_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
+    decompose(c, decompose_char, emit_char)
+}
+
+/// Compute "extended" compatible Unicode decomposition for character.
+///
+/// This is `decompose_compatible` plus the same extensions as `decompose_canonical_ext`.
+#[inline]
+pub fn decompose_compatible_ext<F: FnMut(char)>(c: char, emit_char: F) {
+    let decompose_char = |c| {
+        ext_fully_decomposed(c)
+            .or_else(|| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)))
+    };
+    decompose(c, decompose_char, emit_char)
+}
+
 #[inline]
 fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
 where
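And a usage sketch for the character-level functions exported from the `char` module, whose callback signature mirrors the existing `decompose_canonical`; the expected output restates the StandardizedVariants.txt entry quoted in the commit message:

```rust
use unicode_normalization::char::decompose_canonical_ext;

fn main() {
    // Collect the extended canonical decomposition through the
    // FnMut(char) callback.
    let mut out = String::new();
    decompose_canonical_ext('\u{2F8A6}', |c| out.push(c));

    // Expected: U+6148 followed by U+FE00 (VARIATION SELECTOR-1).
    assert_eq!(out, "\u{6148}\u{FE00}");
}
```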

src/recompose.rs

22 additions & 0 deletions

@@ -51,6 +51,28 @@ pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
     }
 }

+#[inline]
+pub fn new_canonical_ext<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
+    Recompositions {
+        iter: super::decompose::new_canonical_ext(iter),
+        state: self::RecompositionState::Composing,
+        buffer: TinyVec::new(),
+        composee: None,
+        last_ccc: None,
+    }
+}
+
+#[inline]
+pub fn new_compatible_ext<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
+    Recompositions {
+        iter: super::decompose::new_compatible_ext(iter),
+        state: self::RecompositionState::Composing,
+        buffer: TinyVec::new(),
+        composee: None,
+        last_ccc: None,
+    }
+}
+
 impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
     type Item = char;
