Add APIs to normalize arbitrary `char` iterators rather than just `str`. · Florob/unicode-normalization@4f6a6ca · GitHub
[go: up one dir, main page]

Skip to content

Commit 4f6a6ca

Browse files
committed
Add APIs to normalize arbitrary char iterators rather than just str.
This enables not allocating memory for intermediate results in algorithms like this part of Unicode’s [*canonical caseless matching*](http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G34145):

```rust
nfkd(default_case_fold(nfkd(default_case_fold(nfd(input)))))
```
1 parent db78f43 commit 4f6a6ca

File tree

6 files changed

+164
-58
lines changed

6 files changed

+164
-58
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ fn main() {
1414
assert_eq!(compose('A','\u{30a}'), Some('Å'));
1515

1616
let s = "ÅΩ";
17-
let c = UnicodeNormalization::nfc_chars(s).collect::<String>();
17+
let c = UnicodeNormalization::nfc(s).collect::<String>();
1818
assert_eq!(c, "ÅΩ");
1919
}
2020
```

src/decompose.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
use std::str::Chars;
1211

1312
// Helper functions used for Unicode normalization
1413
fn canonical_sort(comb: &mut [(char, u8)]) {
@@ -35,34 +34,34 @@ enum DecompositionType {
3534

3635
/// External iterator for a string decomposition's characters.
3736
#[derive(Clone)]
38-
pub struct Decompositions<'a> {
37+
pub struct Decompositions<I> {
3938
kind: DecompositionType,
40-
iter: Chars<'a>,
39+
iter: I,
4140
buffer: Vec<(char, u8)>,
4241
sorted: bool
4342
}
4443

4544
#[inline]
46-
pub fn new_canonical<'a>(s: &'a str) -> Decompositions<'a> {
45+
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
4746
Decompositions {
48-
iter: s.chars(),
47+
iter: iter,
4948
buffer: Vec::new(),
5049
sorted: false,
5150
kind: self::DecompositionType::Canonical,
5251
}
5352
}
5453

5554
#[inline]
56-
pub fn new_compatible<'a>(s: &'a str) -> Decompositions<'a> {
55+
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
5756
Decompositions {
58-
iter: s.chars(),
57+
iter: iter,
5958
buffer: Vec::new(),
6059
sorted: false,
6160
kind: self::DecompositionType::Compatible,
6261
}
6362
}
6463

65-
impl<'a> Iterator for Decompositions<'a> {
64+
impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
6665
type Item = char;
6766

6867
#[inline]

src/lib.rs

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
//! assert_eq!(compose('A','\u{30a}'), Some('Å'));
2323
//!
2424
//! let s = "ÅΩ";
25-
//! let c = UnicodeNormalization::nfc_chars(s).collect::<String>();
25+
//! let c = UnicodeNormalization::nfc(s).collect::<String>();
2626
//! assert_eq!(c, "ÅΩ");
2727
//! }
2828
//! ```
@@ -61,54 +61,77 @@ pub mod char {
6161
pub use tables::normalization::canonical_combining_class;
6262
}
6363

64-
/// Methods for applying composition and decomposition to strings.
64+
/// Methods for applying composition and decomposition to strings and char iterators.
6565
pub mod str {
6666
pub use super::decompose::Decompositions;
6767
pub use super::recompose::Recompositions;
68+
use std::str::Chars;
6869

6970
/// Methods for iterating over strings while applying Unicode normalizations
70-
/// as described in
71+
/// as described in
7172
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
72-
pub trait UnicodeNormalization {
73+
pub trait UnicodeNormalization<I: Iterator<Item=char>> {
7374
/// Returns an iterator over the string in Unicode Normalization Form D
7475
/// (canonical decomposition).
7576
#[inline]
76-
fn nfd_chars(&self) -> Decompositions;
77+
fn nfd(self) -> Decompositions<I>;
7778

7879
/// Returns an iterator over the string in Unicode Normalization Form KD
7980
/// (compatibility decomposition).
8081
#[inline]
81-
fn nfkd_chars(&self) -> Decompositions;
82+
fn nfkd(self) -> Decompositions<I>;
8283

8384
/// An Iterator over the string in Unicode Normalization Form C
8485
/// (canonical decomposition followed by canonical composition).
8586
#[inline]
86-
fn nfc_chars(&self) -> Recompositions;
87+
fn nfc(self) -> Recompositions<I>;
8788

8889
/// An Iterator over the string in Unicode Normalization Form KC
8990
/// (compatibility decomposition followed by canonical composition).
9091
#[inline]
91-
fn nfkc_chars(&self) -> Recompositions;
92+
fn nfkc(self) -> Recompositions<I>;
9293
}
9394

94-
impl UnicodeNormalization for str {
95+
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
9596
#[inline]
96-
fn nfd_chars(&self) -> Decompositions {
97+
fn nfd(self) -> Decompositions<Chars<'a>> {
98+
super::decompose::new_canonical(self.chars())
99+
}
100+
101+
#[inline]
102+
fn nfkd(self) -> Decompositions<Chars<'a>> {
103+
super::decompose::new_compatible(self.chars())
104+
}
105+
106+
#[inline]
107+
fn nfc(self) -> Recompositions<Chars<'a>> {
108+
super::recompose::new_canonical(self.chars())
109+
}
110+
111+
#[inline]
112+
fn nfkc(self) -> Recompositions<Chars<'a>> {
113+
super::recompose::new_compatible(self.chars())
114+
}
115+
}
116+
117+
impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
118+
#[inline]
119+
fn nfd(self) -> Decompositions<I> {
97120
super::decompose::new_canonical(self)
98121
}
99122

100123
#[inline]
101-
fn nfkd_chars(&self) -> Decompositions {
124+
fn nfkd(self) -> Decompositions<I> {
102125
super::decompose::new_compatible(self)
103126
}
104127

105128
#[inline]
106-
fn nfc_chars(&self) -> Recompositions {
129+
fn nfc(self) -> Recompositions<I> {
107130
super::recompose::new_canonical(self)
108131
}
109132

110133
#[inline]
111-
fn nfkc_chars(&self) -> Recompositions {
134+
fn nfkc(self) -> Recompositions<I> {
112135
super::recompose::new_compatible(self)
113136
}
114137
}

src/recompose.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// except according to those terms.
1010

1111
use std::collections::VecDeque;
12-
use super::str::{Decompositions, UnicodeNormalization};
12+
use super::str::Decompositions;
1313

1414
#[derive(Clone)]
1515
enum RecompositionState {
@@ -20,18 +20,18 @@ enum RecompositionState {
2020

2121
/// External iterator for a string recomposition's characters.
2222
#[derive(Clone)]
23-
pub struct Recompositions<'a> {
24-
iter: Decompositions<'a>,
23+
pub struct Recompositions<I> {
24+
iter: Decompositions<I>,
2525
state: RecompositionState,
2626
buffer: VecDeque<char>,
2727
composee: Option<char>,
2828
last_ccc: Option<u8>
2929
}
3030

3131
#[inline]
32-
pub fn new_canonical<'a>(s: &'a str) -> Recompositions<'a> {
32+
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
3333
Recompositions {
34-
iter: UnicodeNormalization::nfd_chars(s),
34+
iter: super::decompose::new_canonical(iter),
3535
state: self::RecompositionState::Composing,
3636
buffer: VecDeque::new(),
3737
composee: None,
@@ -40,17 +40,17 @@ pub fn new_canonical<'a>(s: &'a str) -> Recompositions<'a> {
4040
}
4141

4242
#[inline]
43-
pub fn new_compatible<'a>(s: &'a str) -> Recompositions<'a> {
43+
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
4444
Recompositions {
45-
iter: UnicodeNormalization::nfkd_chars(s),
45+
iter: super::decompose::new_compatible(iter),
4646
state: self::RecompositionState::Composing,
4747
buffer: VecDeque::new(),
4848
composee: None,
4949
last_ccc: None,
5050
}
5151
}
5252

53-
impl<'a> Iterator for Recompositions<'a> {
53+
impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
5454
type Item = char;
5555

5656
#[inline]

src/str.rs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//! Methods for applying composition and decomposition to strings and char iterators.
2+
3+
pub use super::decompose::Decompositions;
4+
pub use super::recompose::Recompositions;
5+
6+
7+
/// Methods for iterating over strings while applying Unicode normalizations
8+
/// as described in
9+
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
10+
pub trait UnicodeNormalization<I: Iterator<Item=char>> {
11+
/// Returns an iterator over the string in Unicode Normalization Form D
12+
/// (canonical decomposition).
13+
#[inline]
14+
fn nfd_chars(self) -> Decompositions<I>;
15+
16+
/// Returns an iterator over the string in Unicode Normalization Form KD
17+
/// (compatibility decomposition).
18+
#[inline]
19+
fn nfkd_chars(self) -> Decompositions<I>;
20+
21+
/// An Iterator over the string in Unicode Normalization Form C
22+
/// (canonical decomposition followed by canonical composition).
23+
#[inline]
24+
fn nfc_chars(self) -> Recompositions<I>;
25+
26+
/// An Iterator over the string in Unicode Normalization Form KC
27+
/// (compatibility decomposition followed by canonical composition).
28+
#[inline]
29+
fn nfkc_chars(self) -> Recompositions<I>;
30+
}
31+
32+
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
33+
#[inline]
34+
fn nfd_chars(self) -> Decompositions<Chars<'a>> {
35+
super::decompose::new_canonical(self.chars())
36+
}
37+
38+
#[inline]
39+
fn nfkd_chars(self) -> Decompositions<Chars<'a>> {
40+
super::decompose::new_compatible(self.chars())
41+
}
42+
43+
#[inline]
44+
fn nfc_chars(self) -> Recompositions<Chars<'a>> {
45+
super::recompose::new_canonical(self.chars())
46+
}
47+
48+
#[inline]
49+
fn nfkc_chars(self) -> Recompositions<Chars<'a>> {
50+
super::recompose::new_compatible(self.chars())
51+
}
52+
}
53+
54+
impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
55+
#[inline]
56+
fn nfd_chars(self) -> Decompositions<I> {
57+
super::decompose::new_canonical(self)
58+
}
59+
60+
#[inline]
61+
fn nfkd_chars(self) -> Decompositions<I> {
62+
super::decompose::new_compatible(self)
63+
}
64+
65+
#[inline]
66+
fn nfc_chars(self) -> Recompositions<I> {
67+
super::recompose::new_canonical(self)
68+
}
69+
70+
#[inline]
71+
fn nfkc_chars(self) -> Recompositions<I> {
72+
super::recompose::new_compatible(self)
73+
}
74+
}

0 commit comments

Comments
 (0)
0