Add new normalization algorithms using Standardized Variants · unicode-rs/unicode-normalization@d1ad2ac · GitHub

Commit d1ad2ac
Add new normalization algorithms using Standardized Variants
The standard normalization algorithm decomposes CJK compatibility ideographs into codepoints that are nominally equivalent but traditionally look different, and this is one of the main reasons normalization is considered destructive in practice. [Unicode 6.3] introduced a solution for this, by providing [standardized variation sequences] for these codepoints.

For example, U+2F8A6 "CJK COMPATIBILITY IDEOGRAPH-2F8A6" canonically decomposes to U+6148, which has a different appearance. In Unicode 6.3 and later, the standardized variation sequences in the StandardizedVariants.txt file include the following:

> 6148 FE00; CJK COMPATIBILITY IDEOGRAPH-2F8A6;

which says that "CJK COMPATIBILITY IDEOGRAPH-2F8A6" corresponds to U+6148 U+FE00, where U+FE00 is "VARIATION SELECTOR-1". U+6148 and U+FE00 are both normalized codepoints, so we can transform text containing U+2F8A6 into normal form without losing information about the distinct appearance.

At this time, many popular implementations ignore these variation selectors; however, this technique at least preserves the information in a standardized way, so implementations could use it if they chose.

This PR adds "ext" versions of the `nfd`, `nfc`, `nfkd`, and `nfkc` iterators, which perform the standard algorithms extended with this technique. They don't match the standard decompositions, and don't guarantee stability, but they do produce appropriately normalized output.

I used the generic term "ext" to reflect that other extensions could theoretically be added in the future. The standard decomposition tables are limited by their stability requirements, but these "ext" versions are free to adopt new useful rules.

I'm not an expert in any of these topics, so please correct me if I'm mistaken in any of this. Also, I'm open to ideas about how to best present this functionality in the API.

[Unicode 6.3]: https://www.unicode.org/versions/Unicode6.3.0/#Summary
[standardized variation sequences]: http://unicode.org/faq/vs.html
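To make the intended behavior concrete, here's a minimal usage sketch of the new iterators; the expected outputs simply restate the U+2F8A6 mapping above, assuming a crate built with this commit:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    let input = "\u{2F8A6}"; // CJK COMPATIBILITY IDEOGRAPH-2F8A6

    // Standard NFD applies the singleton canonical decomposition,
    // losing the appearance distinction: U+2F8A6 becomes plain U+6148.
    let standard: String = input.nfd().collect();
    assert_eq!(standard, "\u{6148}");

    // The "ext" variant emits the standardized variation sequence
    // instead, preserving the distinction in normalized form.
    let extended: String = input.nfd_ext().collect();
    assert_eq!(extended, "\u{6148}\u{FE00}");
}
```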
1 parent 2f400a9 commit d1ad2ac

File tree

8 files changed: 2354 additions, 10 deletions

scripts/unicode.py

53 additions & 9 deletions

@@ -14,9 +14,10 @@
 # - DerivedNormalizationProps.txt
 # - NormalizationTest.txt
 # - UnicodeData.txt
+# - StandardizedVariants.txt
 #
 # Since this should not require frequent updates, we just store this
-# out-of-line and check the unicode.rs file into git.
+# out-of-line and check the tables.rs and normalization_tests.rs files into git.
 import collections
 import urllib.request

@@ -57,6 +58,11 @@
     'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
 }

+# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
+# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
+S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
+S_COUNT = L_COUNT * V_COUNT * T_COUNT
+
 class UnicodeData(object):
     def __init__(self):
         self._load_unicode_data()

@@ -66,14 +72,20 @@ def __init__(self):
         self.canon_comp = self._compute_canonical_comp()
         self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

+        self.ext_decomp = {}
+        self.ext_fully_decomp = {}
+        self._load_standardized_variants()
+
         def stats(name, table):
             count = sum(len(v) for v in table.values())
             print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

         print("Decomposition table stats:")
         stats("Canonical decomp", self.canon_decomp)
+        stats("Canonical decomp with extensions", self.ext_decomp)
         stats("Compatible decomp", self.compat_decomp)
         stats("Canonical fully decomp", self.canon_fully_decomp)
+        stats("Canonical fully decomp with extensions", self.ext_fully_decomp)
         stats("Compatible fully decomp", self.compat_fully_decomp)

         self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

@@ -83,6 +95,7 @@ def _fetch(self, filename):
         return resp.read().decode('utf-8')

     def _load_unicode_data(self):
+        self.name_to_char_int = {}
         self.combining_classes = {}
         self.compat_decomp = {}
         self.canon_decomp = {}

@@ -95,6 +108,9 @@ def _load_unicode_data(self):
             char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
             char_int = int(char, 16)

+            name = pieces[1].strip()
+            self.name_to_char_int[name] = char_int
+
             if cc != '0':
                 self.combining_classes[char_int] = cc

@@ -106,6 +122,39 @@ def _load_unicode_data(self):
             if category == 'M' or 'M' in expanded_categories.get(category, []):
                 self.general_category_mark.append(char_int)

+    def _load_standardized_variants(self):
+        for line in self._fetch("StandardizedVariants.txt").splitlines():
+            strip_comments = line.split('#', 1)[0].strip()
+            if not strip_comments:
+                continue
+
+            pieces = strip_comments.split(';')
+            assert len(pieces) == 3
+
+            variation_sequence, description, differences = pieces[0], pieces[1].strip(), pieces[2]
+
+            # Don't use variations that only apply in particular shaping environments.
+            if differences:
+                continue
+
+            # Look for entries where the description field is a codepoint name.
+            if description in self.name_to_char_int:
+                char_int = self.name_to_char_int[description]
+
+                assert not char_int in self.combining_classes, "Unexpected: standardized variant with a combining class"
+                assert not char_int in self.compat_decomp, "Unexpected: standardized variant and compatibility decomposition"
+                assert len(self.canon_decomp[char_int]) == 1, "Unexpected: standardized variant and non-singleton canonical decomposition"
+                # If we ever need to handle Hangul here, we'll need to handle it separately.
+                assert not (S_BASE <= char_int < S_BASE + S_COUNT)
+
+                standardized_variant_parts = [int(c, 16) for c in variation_sequence.split()]
+                for c in standardized_variant_parts:
+                    #assert not never_composes(c) TODO: Re-enable this once #67 lands.
+                    assert not c in self.canon_decomp, "Unexpected: standardized variant is unnormalized (canon)"
+                    assert not c in self.compat_decomp, "Unexpected: standardized variant is unnormalized (compat)"
+                self.ext_decomp[char_int] = standardized_variant_parts
+                self.ext_fully_decomp[char_int] = standardized_variant_parts
+
     def _load_norm_props(self):
         props = collections.defaultdict(list)

@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
         The upshot is that decomposition code is very simple and easy to inline
         at mild code size cost.
         """
-        # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
-        # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
-        S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
-        S_COUNT = L_COUNT * V_COUNT * T_COUNT
-
         def _decompose(char_int, compatible):
             # 7-bit ASCII never decomposes
             if char_int <= 0x7f:

@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
     out.write(" }\n")
     out.write("}\n")

-def gen_decomposition_tables(canon_decomp, compat_decomp, out):
-    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
+def gen_decomposition_tables(canon_decomp, ext_decomp, compat_decomp, out):
+    tables = [(canon_decomp, 'canonical'), (ext_decomp, 'ext'), (compat_decomp, 'compatibility')]
     for table, name in tables:
         gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
             lambda k: "(0x{:x}, &[{}])".format(k,

@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
     gen_composition_table(data.canon_comp, out)
     out.write("\n")

-    gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)
+    gen_decomposition_tables(data.canon_fully_decomp, data.ext_fully_decomp, data.compat_fully_decomp, out)

     gen_combining_mark(data.general_category_mark, out)
     out.write("\n")
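For reference, a hedged sketch of the kind of entry the `gen_mph_data` call above would emit for the new 'ext' table in tables.rs, following the `"(0x{:x}, &[{}])"` format string; the exact declaration shape is an assumption here, and the real generated file contains the full mapping set plus hash salts:

```rust
// Hypothetical excerpt of the generated "ext" decomposition table; real
// contents and ordering are determined by the minimal-perfect-hash generator.
pub(crate) const EXT_DECOMPOSED_KV: &[(u32, &'static [char])] =
    &[(0x2f8a6, &['\u{6148}', '\u{fe00}'])];
```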

src/decompose.rs

28 additions & 0 deletions

@@ -16,6 +16,8 @@ use tinyvec::TinyVec;
 enum DecompositionType {
     Canonical,
     Compatible,
+    CanonicalExt,
+    CompatibleExt,
 }

 /// External iterator for a string decomposition's characters.

@@ -56,6 +58,26 @@ pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
     }
 }

+#[inline]
+pub fn new_canonical_ext<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
+    Decompositions {
+        kind: self::DecompositionType::CanonicalExt,
+        iter: iter.fuse(),
+        buffer: TinyVec::new(),
+        ready: 0..0,
+    }
+}
+
+#[inline]
+pub fn new_compatible_ext<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
+    Decompositions {
+        kind: self::DecompositionType::CompatibleExt,
+        iter: iter.fuse(),
+        buffer: TinyVec::new(),
+        ready: 0..0,
+    }
+}
+
 impl<I> Decompositions<I> {
     #[inline]
     fn push_back(&mut self, ch: char) {

@@ -113,6 +135,12 @@ impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
             (Some(ch), &DecompositionType::Compatible) => {
                 super::char::decompose_compatible(ch, |d| self.push_back(d));
             }
+            (Some(ch), &DecompositionType::CanonicalExt) => {
+                super::char::decompose_canonical_ext(ch, |d| self.push_back(d));
+            }
+            (Some(ch), &DecompositionType::CompatibleExt) => {
+                super::char::decompose_compatible_ext(ch, |d| self.push_back(d));
+            }
             (None, _) => {
                 if self.buffer.is_empty() {
                     return None;

src/lib.rs

80 additions & 1 deletion

@@ -83,7 +83,10 @@ mod test;

 /// Methods for composing and decomposing characters.
 pub mod char {
-    pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
+    pub use crate::normalize::{
+        compose, decompose_canonical, decompose_canonical_ext, decompose_compatible,
+        decompose_compatible_ext,
+    };

     pub use crate::lookups::{canonical_combining_class, is_combining_mark};
 }

@@ -108,6 +111,42 @@
     /// (compatibility decomposition followed by canonical composition).
     fn nfkc(self) -> Recompositions<I>;

+    /// Similar to `nfd`, but with extensions which differ from the standard
+    /// decomposition algorithm and which don't have a stability guarantee,
+    /// but which still produce valid NFD and provide better results:
+    ///  - Standardized Variation Sequences are used to avoid losing
+    ///    information when normalizing "CJK Compatibility Ideographs"
+    ///    codepoints. Note that many systems today ignore variation
+    ///    selectors, but the information is at least preserved in a
+    ///    standardized form.
+    ///
+    /// Additional extensions may be added in future versions.
+    ///
+    /// If you need to match the standard `toNFD` algorithm exactly, or you
+    /// need a stability guarantee, use `nfd` instead.
+    fn nfd_ext(self) -> Decompositions<I>;
+
+    /// Similar to `nfkd`, and the result is valid NFKD, but with the same
+    /// extensions as `nfd_ext`.
+    ///
+    /// If you need to match the standard `toNFKD` algorithm exactly, or you
+    /// need a stability guarantee, use `nfkd` instead.
+    fn nfkd_ext(self) -> Decompositions<I>;
+
+    /// Similar to `nfc`, and the result is valid NFC, but with the same
+    /// extensions as `nfd_ext`.
+    ///
+    /// If you need to match the standard `toNFC` algorithm exactly, or you
+    /// need a stability guarantee, use `nfc` instead.
+    fn nfc_ext(self) -> Recompositions<I>;
+
+    /// Similar to `nfkc`, and the result is valid NFKC, but with the same
+    /// extensions as `nfd_ext`.
+    ///
+    /// If you need to match the standard `toNFKC` algorithm exactly, or you
+    /// need a stability guarantee, use `nfkc` instead.
+    fn nfkc_ext(self) -> Recompositions<I>;
+
     /// An Iterator over the string with Conjoining Grapheme Joiner characters
     /// inserted according to the Stream-Safe Text Process (UAX15-D4)
     fn stream_safe(self) -> StreamSafe<I>;

@@ -134,6 +173,26 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
         recompose::new_compatible(self.chars())
     }

+    #[inline]
+    fn nfd_ext(self) -> Decompositions<Chars<'a>> {
+        decompose::new_canonical_ext(self.chars())
+    }
+
+    #[inline]
+    fn nfkd_ext(self) -> Decompositions<Chars<'a>> {
+        decompose::new_compatible_ext(self.chars())
+    }
+
+    #[inline]
+    fn nfc_ext(self) -> Recompositions<Chars<'a>> {
+        recompose::new_canonical_ext(self.chars())
+    }
+
+    #[inline]
+    fn nfkc_ext(self) -> Recompositions<Chars<'a>> {
+        recompose::new_compatible_ext(self.chars())
+    }
+
     #[inline]
     fn stream_safe(self) -> StreamSafe<Chars<'a>> {
         StreamSafe::new(self.chars())

@@ -161,6 +220,26 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
         recompose::new_compatible(self)
     }

+    #[inline]
+    fn nfd_ext(self) -> Decompositions<I> {
+        decompose::new_canonical_ext(self)
+    }
+
+    #[inline]
+    fn nfkd_ext(self) -> Decompositions<I> {
+        decompose::new_compatible_ext(self)
+    }
+
+    #[inline]
+    fn nfc_ext(self) -> Recompositions<I> {
+        recompose::new_canonical_ext(self)
+    }
+
+    #[inline]
+    fn nfkc_ext(self) -> Recompositions<I> {
+        recompose::new_compatible_ext(self)
+    }
+
     #[inline]
     fn stream_safe(self) -> StreamSafe<I> {
         StreamSafe::new(self)
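Since the trait is implemented both for `&str` and for any `char` iterator, the new methods drop into ordinary iterator pipelines. A small sketch of `nfc_ext`; the expected output assumes the U+2F8A6 mapping from the commit message, and nothing recomposes with a variation selector, so the sequence survives composition:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    let input = "a\u{0301}\u{2F8A6}"; // 'a' + COMBINING ACUTE ACCENT, then U+2F8A6

    // Going through chars() exercises the Iterator<Item = char> impl.
    let composed: String = input.chars().nfc_ext().collect();

    // 'a' + U+0301 recomposes to U+00E1 exactly as with plain nfc();
    // U+2F8A6 becomes its variation sequence U+6148 U+FE00.
    assert_eq!(composed, "\u{00E1}\u{6148}\u{FE00}");
}
```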

src/lookups.rs

11 additions & 0 deletions

@@ -53,6 +53,17 @@ pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
     )
 }

+pub(crate) fn ext_fully_decomposed(c: char) -> Option<&'static [char]> {
+    mph_lookup(
+        c.into(),
+        EXT_DECOMPOSED_SALT,
+        EXT_DECOMPOSED_KV,
+        pair_lookup_fk,
+        pair_lookup_fv_opt,
+        None,
+    )
+}
+
 pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
     mph_lookup(
         c.into(),

src/normalize.rs

32 additions & 0 deletions

@@ -11,6 +11,7 @@
 //! Functions for computing canonical and compatible decompositions for Unicode characters.
 use crate::lookups::{
     canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
+    ext_fully_decomposed,
 };

 use core::{char, ops::FnMut};

@@ -36,6 +37,37 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
     decompose(c, decompose_char, emit_char)
 }

+/// Compute "extended" canonical Unicode decomposition for character.
+///
+/// This is `decompose_canonical` plus extensions, which currently consist of:
+///  - [Standardized Variation Sequences] are used instead of the standard canonical
+///    decompositions for CJK codepoints with singleton canonical decompositions, to
+///    avoid losing information. See the
+///    [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
+///    "Other Enhancements" section of the
+///    [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
+///    for more information.
+#[inline]
+pub fn decompose_canonical_ext<F>(c: char, emit_char: F)
+where
+    F: FnMut(char),
+{
+    let decompose_char = |c| ext_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
+    decompose(c, decompose_char, emit_char)
+}
+
+/// Compute "extended" compatible Unicode decomposition for character.
+///
+/// This is `decompose_compatible` plus the same extensions as `decompose_canonical_ext`.
+#[inline]
+pub fn decompose_compatible_ext<F: FnMut(char)>(c: char, emit_char: F) {
+    let decompose_char = |c| {
+        ext_fully_decomposed(c)
+            .or_else(|| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)))
+    };
+    decompose(c, decompose_char, emit_char)
+}
+
 #[inline]
 fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
 where
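And a usage sketch for the character-level functions exported from the `char` module, whose callback signature mirrors the existing `decompose_canonical`; the expected output restates the StandardizedVariants.txt entry quoted in the commit message:

```rust
use unicode_normalization::char::decompose_canonical_ext;

fn main() {
    // Collect the extended canonical decomposition through the
    // FnMut(char) callback.
    let mut out = String::new();
    decompose_canonical_ext('\u{2F8A6}', |c| out.push(c));

    // Expected: U+6148 followed by U+FE00 (VARIATION SELECTOR-1).
    assert_eq!(out, "\u{6148}\u{FE00}");
}
```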

src/recompose.rs

22 additions & 0 deletions

@@ -51,6 +51,28 @@ pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
     }
 }

+#[inline]
+pub fn new_canonical_ext<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
+    Recompositions {
+        iter: super::decompose::new_canonical_ext(iter),
+        state: self::RecompositionState::Composing,
+        buffer: TinyVec::new(),
+        composee: None,
+        last_ccc: None,
+    }
+}
+
+#[inline]
+pub fn new_compatible_ext<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
+    Recompositions {
+        iter: super::decompose::new_compatible_ext(iter),
+        state: self::RecompositionState::Composing,
+        buffer: TinyVec::new(),
+        composee: None,
+        last_ccc: None,
+    }
+}
+
 impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
     type Item = char;
