10000 Simplify decomposition · djudd/unicode-normalization@0e821ff · GitHub
[go: up one dir, main page]

Skip to content

Commit 0e821ff

Browse files
committed
Simplify decomposition
- Eliminate custom sorter - Only sort the "pending" suffix of the buffer
1 parent e97357d commit 0e821ff

File tree

1 file changed

+60
-79
lines changed

1 file changed

+60
-79
lines changed

src/decompose.rs

Lines changed: 60 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ - 10000 7,26 +7,8 @@
77
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
10-
1110
use std::fmt::{self, Write};
1211

13-
// Helper functions used for Unicode normalization
14-
fn canonical_sort(comb: &mut [(char, u8)]) {
15-
let len = comb.len();
16-
for i in 0..len {
17-
let mut swapped = false;
18-
for j in 1..len-i {
19-
let class_a = comb[j-1].1;
20-
let class_b = comb[j].1;
21-
if class_a != 0 && class_b != 0 && class_a > class_b {
22-
comb.swap(j-1, j);
23-
swapped = true;
24-
}
25-
}
26-
if !swapped { break; }
27-
}
28-
}
29-
3012
#[derive(Clone)]
3113
enum DecompositionType {
3214
Canonical,
@@ -38,95 +20,94 @@ enum DecompositionType {
3820
pub struct Decompositions<I> {
3921
kind: DecompositionType,
4022
iter: I,
41-
buffer: Vec<(char, u8)>,
42-
sorted: bool
23+
done: bool,
24+
25+
// This buffer stores pairs of (canonical combining class, character),
26+
// pushed onto the end in text order.
27+
//
28+
// It's split into two contiguous regions by the `ready` offset. The first
29+
// `ready` pairs are sorted and ready to emit on demand. The "pending"
30+
// suffix afterwards still needs more characters for us to be able to sort
31+
// in canonical order and is not safe to emit.
32+
buffer: Vec<(u8, char)>,
33+
ready: usize,
4334
}
4435

4536
#[inline]
4637
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
4738
Decompositions {
39+
kind: self::DecompositionType::Canonical,
4840
iter: iter,
41+
done: false,
4942
buffer: Vec::new(),
50-
sorted: false,
51-
kind: self::DecompositionType::Canonical,
43+
ready: 0,
5244
}
5345
}
5446

5547
#[inline]
5648
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
5749
Decompositions {
50+
kind: self::DecompositionType::Compatible,
5851
iter: iter,
52+
done: false,
5953
buffer: Vec::new(),
60-
sorted: false,
61-
kind: self::DecompositionType::Compatible,
54+
ready: 0,
6255
}
6356
}
6457

65-
impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
66-
type Item = char;
67-
58+
impl<I> Decompositions<I> {
6859
#[inline]
69-
fn next(&mut self) -> Option<char> {
70-
use self::DecompositionType::*;
71-
72-
match self.buffer.first() {
73-
Some(&(c, 0)) => {
74-
self.sorted = false;
75-
self.buffer.remove(0);
76-
return Some(c);
77-
}
78-
Some(&(c, _)) if self.sorted => {
79-
self.buffer.remove(0);
80-
return Some(c);
81-
}
82-
_ => self.sorted = false
60+
fn push_back(&mut self, ch: char) {
61+
let class = super::char::canonical_combining_class(ch);
62+
if class == 0 {
63+
self.sort_pending();
8364
}
65+
self.buffer.push((class, ch));
66+
}
8467

85-
if !self.sorted {
86-
for ch in self.iter.by_ref() {
87-
let buffer = &mut self.buffer;
88-
let sorted = &mut self.sorted;
89-
{
90-
let callback = |d| {
91-
let class =
92-
super::char::canonical_combining_class(d);
93-
if class == 0 && !*sorted {
94-
canonical_sort(buffer);
95-
*sorted = true;
96-
}
97-
buffer.push((d, class));
98-
};
99-
match self.kind {
100-
Canonical => {
101-
super::char::decompose_canonical(ch, callback)
102-
}
103-
Compatible => {
104-
super::char::decompose_compatible(ch, callback)
105-
}
106-
}
107-
}
108-
if *sorted {
109-
break
110-
}
111-
}
68+
#[inline]
69+
fn sort_pending(&mut self) {
70+
if self.ready == 0 && self.buffer.is_empty() {
71+
return;
11272
}
11373

114-
if !self.sorted {
115-
canonical_sort(&mut self.buffer);
116-
self.sorted = true;
117-
}
74+
// NB: `sort_by_key` is stable, so it will preserve the original text's
75+
// order within a combining class.
76+
self.buffer[self.ready..].sort_by_key(|k| k.0);
77+
self.ready = self.buffer.len();
78+
}
11879

119-
if self.buffer.is_empty() {
80+
#[inline]
81+
fn pop_front(&mut self) -> Option<char> {
82+
if self.ready == 0 {
12083
None
12184
} else {
122-
match self.buffer.remove(0) {
123-
(c, 0) => {
124-
self.sorted = false;
125-
Some(c)
126-
}
127-
(c, _) => Some(c),
85+
self.ready -= 1;
86+
Some(self.buffer.remove(0).1)
87+
}
88+
}
89+
}
90+
91+
impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
92+
type Item = char;
93+
94+
#[inline]
95+
fn next(&mut self) -> Option<char> {
96+
while self.ready == 0 && !self.done {
97+
match (self.iter.next(), &self.kind) {
98+
(Some(ch), &DecompositionType::Canonical) => {
99+
super::char::decompose_canonical(ch, |d| self.push_back(d));
100+
},
101+
(Some(ch), &DecompositionType::Compatible) => {
102+
super::char::decompose_compatible(ch, |d| self.push_back(d));
103+
},
104+
(None, _) => {
105+
self.sort_pending();
106+
self.done = true;
107+
},
128108
}
129109
}
110+
self.pop_front()
130111
}
131112

132113
fn size_hint(&self) -> (usize, Option<usize>) {

0 commit comments

Comments
 (0)
0