Make the decompose iterator avoid buffering elements past a starter. · emilio/unicode-normalization@a94c6e1 · GitHub
[go: up one dir, main page]

Skip to content

Commit a94c6e1

Browse files
committed
Make the decompose iterator avoid buffering elements past a starter.
Once the decompose iterator sees a starter, it should immediately start returning characters from the preceding sequence. If the input happens to be stream-safe, it should never get more than MAX_NONSTARTERS plus boundary values ahead of its inner iterator.
1 parent 5967481 commit a94c6e1

File tree

3 files changed

+60
-3
lines changed

3 files changed

+60
-3
lines changed

fuzz/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,11 @@ path = "fuzz_targets/unicode-normalization.rs"
1717
test = false
1818
doc = false
1919

20+
[[bin]]
21+
name = "streaming"
22+
path = "fuzz_targets/streaming.rs"
23+
test = false
24+
doc = false
25+
2026
# Work around https://github.com/rust-lang/cargo/issues/8338
2127
[workspace]

fuzz/fuzz_targets/streaming.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
//! Test that the NFC iterator doesn't run needlessly further ahead of its
2+
//! underlying iterator.
3+
//!
4+
//! The NFC iterator is wrapped around the NFD iterator, and it buffers
5+
//! up combining characters so that it can sort them once it knows it has
6+
//! seen the complete sequence. At that point, it should drain its own
7+
//! buffer before consuming more characters from its inner iterator.
8+
//! This fuzz target defines a custom iterator which records how many
9+
//! times it's called so we can detect if NFC called it too many times.
10+
11+
#![no_main]
12+
13+
#[macro_use]
14+
extern crate libfuzzer_sys;
15+
16+
use std::str::Chars;
17+
use std::cell::RefCell;
18+
use std::rc::Rc;
19+
use unicode_normalization::UnicodeNormalization;
20+
21+
const MAX_NONSTARTERS: u32 = 30;
22+
23+
#[derive(Debug)]
24+
struct Counter<'a> {
25+
iter: Chars<'a>,
26+
value: Rc<RefCell<u32>>,
27+
}
28+
29+
impl<'a> Iterator for Counter<'a> {
30+
type Item = char;
31+
32+
fn next(&mut self) -> Option<char> {
33+
*self.value.borrow_mut() += 1;
34+
self.iter.next()
35+
}
36+
}
37+
38+
fuzz_target!(|input: String| {
39+
let stream_safe = input.chars().stream_safe().collect::<String>();
40+
41+
let mut value = Rc::new(RefCell::new(0));
42+
let counter = Counter { iter: stream_safe.chars(), value: Rc::clone(&mut value) };
43+
for _ in counter.nfc() {
44+
// Plus 2: one for the starter at the beginning of a sequence, and
45+
// one for a starter that begins the following sequence.
46+
assert!(*value.borrow() <= MAX_NONSTARTERS + 2);
47+
*value.borrow_mut() = 0;
48+
}
49+
});

src/decompose.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,18 @@ impl<I> Decompositions<I> {
6363

6464
if class == 0 {
6565
self.sort_pending();
66+
self.buffer.push((class, ch));
67+
self.ready.end = self.buffer.len();
68+
} else {
69+
self.buffer.push((class, ch));
6670
}
67-
68-
self.buffer.push((class, ch));
6971
}
7072

7173
#[inline]
7274
fn sort_pending(&mut self) {
7375
// NB: `sort_by_key` is stable, so it will preserve the original text's
7476
// order within a combining class.
7577
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
76-
self.ready.end = self.buffer.len();
7778
}
7879

7980
#[inline]
@@ -117,6 +118,7 @@ impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
117118
return None;
118119
} else {
119120
self.sort_pending();
121+
self.ready.end = self.buffer.len();
120122

121123
// This implementation means that we can call `next`
122124
// on an exhausted iterator; the last outer `next` call

0 commit comments

Comments
 (0)
0