8000 Switch to a more explicit API. · emilio/unicode-normalization@d0c3706 · GitHub
[go: up one dir, main page]

Skip to content

Commit d0c3706

Browse files
committed
Switch to a more explicit API.
Switch to a dedicated `svar()` iterator function, which just does standardized variation sequences, rather than framing this functionality as an open-ended "extended" version of the standard normalization algorithms. This makes for a more factored API, gives users more control over exactly what transformations are done, and has less impact on users that don't need this new functionality.
1 parent d1ad2ac commit d0c3706

File tree

10 files changed

+3006
-3114
lines changed

10 files changed

+3006
-3114
lines changed

scripts/unicode.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ def __init__(self):
7272
self.canon_comp = self._compute_canonical_comp()
7373
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
7474

75-
self.ext_decomp = {}
76-
self.ext_fully_decomp = {}
75+
self.svar_decomp = {}
76+
self.svar_fully_decomp = {}
7777
self._load_standardized_variants()
7878

7979
def stats(name, table):
@@ -82,11 +82,11 @@ def stats(name, table):
8282

8383
print("Decomposition table stats:")
8484
stats("Canonical decomp", self.canon_decomp)
85-
stats("Canonical decomp with extensions", self.ext_decomp)
8685
stats("Compatible decomp", self.compat_decomp)
86+
stats("Standardized Variants", self.svar_decomp)
8787
stats("Canonical fully decomp", self.canon_fully_decomp)
88-
stats("Canonical fully decomp with extensions", self.ext_fully_decomp)
8988
stats("Compatible fully decomp", self.compat_fully_decomp)
89+
stats("Standardized Variants", self.svar_fully_decomp)
9090

9191
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
9292

@@ -152,8 +152,8 @@ def _load_standardized_variants(self):
152152
#assert not never_composes(c) TODO: Re-enable this once #67 lands.
153153
assert not c in self.canon_decomp, "Unexpected: standardized variant is unnormalized (canon)"
154154
assert not c in self.compat_decomp, "Unexpected: standardized variant is unnormalized (compat)"
155-
self.ext_decomp[char_int] = standardized_variant_parts
156-
self.ext_fully_decomp[char_int] = standardized_variant_parts
155+
self.svar_decomp[char_int] = standardized_variant_parts
156+
self.svar_fully_decomp[char_int] = standardized_variant_parts
157157

158158
def _load_norm_props(self):
159159
props = collections.defaultdict(list)
@@ -364,8 +364,8 @@ def gen_composition_table(canon_comp, out):
364364
out.write(" }\n")
365365
out.write("}\n")
366366

367-
def gen_decomposition_tables(canon_decomp, ext_decomp, compat_decomp, out):
368-
tables = [(canon_decomp, 'canonical'), (ext_decomp, 'ext'), (compat_decomp, 'compatibility')]
367+
def gen_decomposition_tables(canon_decomp, compat_decomp, svar_decomp, out):
368+
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (svar_decomp, 'svar')]
369369
for table, name in tables:
370370
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
371371
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -535,7 +535,7 @@ def minimal_perfect_hash(d):
535535
gen_composition_table(data.canon_comp, out)
536536
out.write("\n")
537537

538-
gen_decomposition_tables(data.canon_fully_decomp, data.ext_fully_decomp, data.compat_fully_decomp, out)
538+
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.svar_fully_decomp, out)
539539

540540
gen_combining_mark(data.general_category_mark, out)
541541
out.write("\n")

src/decompose.rs

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ use tinyvec::TinyVec;
1616
enum DecompositionType {
1717
Canonical,
1818
Compatible,
19-
CanonicalExt,
20-
CompatibleExt,
2119
}
2220

2321
/// External iterator for a string decomposition's characters.
@@ -58,26 +56,6 @@ pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
5856
}
5957
}
6058

61-
#[inline]
62-
pub fn new_canonical_ext<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
63-
Decompositions {
64-
kind: self::DecompositionType::CanonicalExt,
65-
iter: iter.fuse(),
66-
buffer: TinyVec::new(),
67-
ready: 0..0,
68-
}
69-
}
70-
71-
#[inline]
72-
pub fn new_compatible_ext<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
73-
Decompositions {
74-
kind: self::DecompositionType::CompatibleExt,
75-
iter: iter.fuse(),
76-
buffer: TinyVec::new(),
77-
ready: 0..0,
78-
}
79-
}
80-
8159
impl<I> Decompositions<I> {
8260
#[inline]
8361
fn push_back(&mut self, ch: char) {
@@ -135,12 +113,6 @@ impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
135113
(Some(ch), &DecompositionType::Compatible) => {
136114
super::char::decompose_compatible(ch, |d| self.push_back(d));
137115
}
138-
(Some(ch), &DecompositionType::CanonicalExt) => {
139-
super::char::decompose_canonical_ext(ch, |d| self.push_back(d));
140-
}
141-
(Some(ch), &DecompositionType::CompatibleExt) => {
142-
super::char::decompose_compatible_ext(ch, |d| self.push_back(d));
143-
}
144116
(None, _) => {
145117
if self.buffer.is_empty() {
146118
return None;

src/lib.rs

Lines changed: 16 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ pub use crate::quick_check::{
5959
IsNormalized,
6060
};
6161
pub use crate::recompose::Recompositions;
62+
pub use crate::replace::Replacements;
6263
pub use crate::stream_safe::StreamSafe;
6364
pub use crate::tables::UNICODE_VERSION;
6465
use core::str::Chars;
@@ -71,6 +72,7 @@ mod normalize;
7172
mod perfect_hash;
7273
mod quick_check;
7374
mod recompose;
75+
mod replace;
7476
mod stream_safe;
7577

7678
#[rustfmt::skip]
@@ -84,8 +86,7 @@ mod test;
8486
/// Methods for composing and decomposing characters.
8587
pub mod char {
8688
pub use crate::normalize::{
87-
compose, decompose_canonical, decompose_canonical_ext, decompose_compatible,
88-
decompose_compatible_ext,
89+
compose, decompose_canonical, decompose_compatible, decompose_svar,
8990
};
9091

9192
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
@@ -111,41 +112,16 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
111112
/// (compatibility decomposition followed by canonical composition).
112113
fn nfkc(self) -> Recompositions<I>;
113114

114-
/// Similar to `nfd`, but with extensions which differ from the standard
115-
/// decomposition algorithm and which don't have a stability guarantee,
116-
/// but which still produce valid NFD and provide better results:
117-
/// - Standardized Variation Sequences are used to avoid losing
118-
/// information when normalizing "CJK Compatibility Ideographs"
119-
/// codepoints. Note that many systems today ignore variation
120-
/// selectors, but the information is at least preserved in a
121-
/// standardized form.
115+
/// A non-standard transformation which replaces select codepoints with
116+
/// normal forms using Standardized Variation Sequences. These are
117+
/// different than the standard decompositions, but they better preserve
118+
/// the intent of the original text.
122119
///
123-
/// Additional extensions may be added in future versions.
124-
///
125-
/// If you need to match the standard `toNFD` algorithm exactly, or you
126-
/// need a stability guarantee, use `nfd` instead.
127-
fn nfd_ext(self) -> Decompositions<I>;
128-
129-
/// Similar to `nfkd`, and the result is valid NFKD, but with the same
130-
/// extensions as `nfd`.
131-
///
132-
/// If you need to match the standard `toNFKD` algorithm exactly, or you
133-
/// need a stability guarantee, use `nfd` instead.
134-
fn nfkd_ext(self) -> Decompositions<I>;
135-
136-
/// Similar to `nfc`, and the result is valid NFC, but with the same
137-
/// extensions as `nfd`.
138-
///
139-
/// If you need to match the standard `toNFC` algorithm exactly, or you
140-
/// need a stability guarantee, use `nfc` instead.
141-
fn nfc_ext(self) -> Recompositions<I>;
142-
143-
/// Similar to `nfkc`, and the result is valid NFKC, but with the same
144-
/// extensions as `nfd`.
145-
///
146-
/// If you need to match the standard `toNFKC` algorithm exactly, or you
147-
/// need a stability guarantee, use `nfd` instead.
148-
fn nfkc_ext(self) -> Recompositions<I>;
120+
/// Note that many systems today ignore variation selectors, so these
121+
/// may not immediately help text display as intended, but they at
122+
/// least preserve the information in a standardized form, giving
123+
/// implementations the option to recognize them.
124+
fn svar(self) -> Replacements<I>;
149125

150126
/// An Iterator over the string with Conjoining Grapheme Joiner characters
151127
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
@@ -174,23 +150,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
174150
}
175151

176152
#[inline]
177-
fn nfd_ext(self) -> Decompositions<Chars<'a>> {
178-
decompose::new_canonical_ext(self.chars())
179-
}
180-
181-
#[inline]
182-
fn nfkd_ext(self) -> Decompositions<Chars<'a>> {
183-
decompose::new_compatible_ext(self.chars())
184-
}
185-
186-
#[inline]
187-
fn nfc_ext(self) -> Recompositions<Chars<'a>> {
188-
recompose::new_canonical_ext(self.chars())
189-
}
190-
191-
#[inline]
192-
fn nfkc_ext(self) -> Recompositions<Chars<'a>> {
193-
recompose::new_compatible_ext(self.chars())
153+
fn svar(self) -> Replacements<Chars<'a>> {
154+
replace::new_svar(self.chars())
194155
}
195156

196157
#[inline]
@@ -221,23 +182,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
221182
}
222183

223184
#[inline]
224-
fn nfd_ext(self) -> Decompositions<I> {
225-
decompose::new_canonical_ext(self)
226-
}
227-
228-
#[inline]
229-
fn nfkd_ext(self) -> Decompositions<I> {
230-
decompose::new_compatible_ext(self)
231-
}
232-
233-
#[inline]
234-
fn nfc_ext(self) -> Recompositions<I> {
235-
recompose::new_canonical_ext(self)
236-
}
237-
238-
#[inline]
239-
fn nfkc_ext(self) -> Recompositions<I> {
240-
recompose::new_compatible_ext(self)
185+
fn svar(self) -> Replacements<I> {
186+
replace::new_svar(self)
241187
}
242188

243189
#[inline]

src/lookups.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,22 +53,22 @@ pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
5353
)
5454
}
5555

56-
pub(crate) fn ext_fully_decomposed(c: char) -> Option<&'static [char]> {
56+
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
5757
mph_lookup(
5858
c.into(),
59-
EXT_DECOMPOSED_SALT,
60-
EXT_DECOMPOSED_KV,
59+
COMPATIBILITY_DECOMPOSED_SALT,
60+
COMPATIBILITY_DECOMPOSED_KV,
6161
pair_lookup_fk,
6262
pair_lookup_fv_opt,
6363
None,
6464
)
6565
}
6666

67-
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
67+
pub(crate) fn svar_fully_decomposed(c: char) -> Option<&'static [char]> {
6868
mph_lookup(
6969
c.into(),
70-
COMPATIBILITY_DECOMPOSED_SALT,
71-
COMPATIBILITY_DECOMPOSED_KV,
70+
SVAR_DECOMPOSED_SALT,
71+
SVAR_DECOMPOSED_KV,
7272
pair_lookup_fk,
7373
pair_lookup_fv_opt,
7474
None,

src/normalize.rs

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use crate::lookups::{
1313
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
14-
ext_fully_decomposed,
14+
svar_fully_decomposed,
1515
};
1616

1717
use core::{char, ops::FnMut};
@@ -37,35 +37,21 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
3737
decompose(c, decompose_char, emit_char)
3838
}
3939

40-
/// Compute "extended" canonical Unicode decomposition for character.
40+
/// Compute standard-variation decomposition for character.
4141
///
42-
/// This is `decompose_canonical` plus extensions, which currently consist of:
43-
/// - [Standardized Variation Sequences] are used instead of the standard canonical
44-
/// decompositions for CJK codepoints with singleton canonical decompositions, to
45-
/// avoid losing information. See the
46-
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
47-
/// "Other Enhancements" section of the
48-
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
49-
/// for more information.
42+
/// [Standardized Variation Sequences] are used instead of the standard canonical
43+
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
44+
/// to avoid losing information. See the
45+
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
46+
/// "Other Enhancements" section of the
47+
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
48+
/// for more information.
5049
#[inline]
51-
pub fn decompose_canonical_ext<F>(c: char, emit_char: F)
50+
pub fn decompose_svar<F>(c: char, emit_char: F)
5251
where
5352
F: FnMut(char),
5453
{
55-
let decompose_char = |c| ext_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
56-
decompose(c, decompose_char, emit_char)
57-
}
58-
59-
/// Compute "extended" compatible Unicode decomposition for character.
60-
///
61-
/// This is `decompose_compatible` plus the same extensions as `decompose_canonical_ext`.
62-
#[inline]
63-
pub fn decompose_compatible_ext<F: FnMut(char)>(c: char, emit_char: F) {
64-
let decompose_char = |c| {
65-
ext_fully_decomposed(c)
66-
.or_else(|| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)))
67-
};
68-
decompose(c, decompose_char, emit_char)
54+
decompose(c, svar_fully_decomposed, emit_char)
6955
}
7056

7157
#[inline]

src/recompose.rs

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -51,28 +51,6 @@ pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
5151
}
5252
}
5353

54-
#[inline]
55-
pub fn new_canonical_ext<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
56-
Recompositions {
57-
iter: super::decompose::new_canonical_ext(iter),
58-
state: self::RecompositionState::Composing,
59-
buffer: TinyVec::new(),
60-
composee: None,
61-
last_ccc: None,
62-
}
63-
}
64-
65-
#[inline]
66-
pub fn new_compatible_ext<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
67-
Recompositions {
68-
iter: super::decompose::new_compatible_ext(iter),
69-
state: self::RecompositionState::Composing,
70-
buffer: TinyVec::new(),
71-
composee: None,
72-
last_ccc: None,
73-
}
74-
}
75-
7654
impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
7755
type Item = char;
7856

0 commit comments

Comments
 (0)
0