Merge branch 'master' into patch-1 · unicode-rs/unicode-normalization@b726540 · GitHub


Commit b726540

Merge branch 'master' into patch-1
2 parents 0772fc9 + 9b20974 commit b726540


7 files changed: +124 -18 lines changed


fuzz/Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -23,5 +23,11 @@ path = "fuzz_targets/streaming.rs"
 test = false
 doc = false
 
+[[bin]]
+name = "process"
+path = "fuzz_targets/process.rs"
+test = false
+doc = false
+
 # Work around https://github.com/rust-lang/cargo/issues/8338
 [workspace]

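The [[bin]] entry added above registers a new `process` fuzz target alongside the existing `streaming` one. Assuming the usual cargo-fuzz workflow for this directory, it can presumably be invoked with `cargo fuzz run process`.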
fuzz/fuzz_targets/process.rs

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+// The fuzzing harness fuzz-tests some of the
+// unicode string normalization processing
+
+#![no_main]
+
+#[macro_use]
+extern crate libfuzzer_sys;
+extern crate unicode_normalization;
+
+use unicode_normalization::{
+    char::{
+        canonical_combining_class, compose, decompose_canonical, decompose_compatible,
+        is_combining_mark,
+    },
+    UnicodeNormalization,
+};
+
+fuzz_target!(|data: (u8, String)| {
+    let (function_index, string_data) = data;
+
+    // Create an iterator for characters
+    let mut chars = string_data.chars();
+
+    // Randomly fuzz a target function
+    match function_index % 10 {
+        0 => {
+            // Fuzz compose with two distinct characters
+            if let (Some(c1), Some(c2)) = (chars.next(), chars.next()) {
+                let _ = compose(c1, c2);
+            }
+        }
+        1 => {
+            // Fuzz canonical_combining_class
+            if let Some(c) = chars.next() {
+                let _ = canonical_combining_class(c);
+            }
+        }
+        2 => {
+            // Fuzz is_combining_mark
+            if let Some(c) = chars.next() {
+                let _ = is_combining_mark(c);
+            }
+        }
+        3 => {
+            // Fuzz NFC
+            let _ = string_data.nfc().collect::<String>();
+        }
+        4 => {
+            // Fuzz NFKD
+            let _ = string_data.nfkd().collect::<String>();
+        }
+        5 => {
+            // Fuzz NFD
+            let _ = string_data.nfd().collect::<String>();
+        }
+        6 => {
+            // Fuzz NFKC
+            let _ = string_data.nfkc().collect::<String>();
+        }
+        7 => {
+            // Fuzz stream_safe
+            let _ = string_data.stream_safe().collect::<String>();
+        }
+        8 => {
+            // Fuzz decompose_canonical
+            if let Some(c) = chars.next() {
+                decompose_canonical(c, |_| {});
+            }
+        }
+        9 => {
+            // Fuzz decompose_compatible
+            if let Some(c) = chars.next() {
+                decompose_compatible(c, |_| {});
+            }
+        }
+        _ => {}
+    }
+});

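For reference, the entry points this harness exercises are ordinary public functions and iterator adapters of the crate. The following standalone sketch (not part of the commit) calls a few of them directly on fixed inputs:

// Standalone sketch (not part of this commit): calling the fuzzed APIs directly.
use unicode_normalization::char::{canonical_combining_class, compose, is_combining_mark};
use unicode_normalization::UnicodeNormalization;

fn main() {
    // Iterator adapters exercised by arms 3-7 of the harness.
    let s = "e\u{0301}"; // 'e' followed by U+0301 COMBINING ACUTE ACCENT
    let nfc: String = s.nfc().collect();          // composes to "é"
    let nfkd: String = s.nfkd().collect();        // stays decomposed
    let safe: String = s.stream_safe().collect(); // UAX15-D4 stream-safe form
    println!("{} {} {}", nfc, nfkd, safe);

    // Single-character queries exercised by arms 0-2.
    assert_eq!(compose('e', '\u{0301}'), Some('é'));
    assert!(is_combining_mark('\u{0301}'));
    assert_eq!(canonical_combining_class('\u{0301}'), 230);
}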
src/decompose.rs

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ impl<I: Iterator<Item = char>> Decompositions<I> {
 
     /// Create a new decomposition iterator for compatability decompositions (NFkD)
     ///
-    /// Note that this iterator can also be obtained by directly calling [`.nfd()`](crate::UnicodeNormalization::nfd)
+    /// Note that this iterator can also be obtained by directly calling [`.nfkd()`](crate::UnicodeNormalization::nfkd)
     /// on the iterator.
     #[inline]
    pub fn new_compatible(iter: I) -> Decompositions<I> {

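The corrected link matters because `.nfd()` and `.nfkd()` are not interchangeable. A small sketch (not part of the commit) showing an input where they differ:

// Sketch: U+FB01 (LATIN SMALL LIGATURE FI) has only a compatibility
// decomposition, so NFD leaves it alone while NFKD expands it.
use unicode_normalization::UnicodeNormalization;

fn main() {
    let s = "\u{FB01}";
    let nfd: String = s.nfd().collect();
    let nfkd: String = s.nfkd().collect();
    assert_eq!(nfd, "\u{FB01}"); // unchanged under canonical decomposition
    assert_eq!(nfkd, "fi");      // compatibility decomposition to 'f' + 'i'
}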
src/lib.rs

Lines changed: 11 additions & 6 deletions
@@ -113,8 +113,8 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// (compatibility decomposition followed by canonical composition).
     fn nfkc(self) -> Recompositions<I>;
 
-    /// A transformation which replaces CJK Compatibility Ideograph codepoints
-    /// with normal forms using Standardized Variation Sequences. This is not
+    /// A transformation which replaces [CJK Compatibility Ideograph] codepoints
+    /// with normal forms using [Standardized Variation Sequences]. This is not
     /// part of the canonical or compatibility decomposition algorithms, but
     /// performing it before those algorithms produces normalized output which
     /// better preserves the intent of the original text.
@@ -123,10 +123,15 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// may not immediately help text display as intended, but they at
     /// least preserve the information in a standardized form, giving
     /// implementations the option to recognize them.
+    ///
+    /// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
+    /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
     fn cjk_compat_variants(self) -> Replacements<I>;
 
     /// An Iterator over the string with Conjoining Grapheme Joiner characters
-    /// inserted according to the Stream-Safe Text Process (UAX15-D4)
+    /// inserted according to the Stream-Safe Text Process ([UAX15-D4]).
+    ///
+    /// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
     fn stream_safe(self) -> StreamSafe<I>;
 }
 
@@ -153,7 +158,7 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
 
     #[inline]
     fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
-        replace::new_cjk_compat_variants(self.chars())
+        Replacements::new_cjk_compat_variants(self.chars())
     }
 
     #[inline]
@@ -185,7 +190,7 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
 
     #[inline]
     fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
-        replace::new_cjk_compat_variants(Some(self).into_iter())
+        Replacements::new_cjk_compat_variants(Some(self).into_iter())
    }
 
     #[inline]
@@ -217,7 +222,7 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
 
     #[inline]
     fn cjk_compat_variants(self) -> Replacements<I> {
-        replace::new_cjk_compat_variants(self)
+        Replacements::new_cjk_compat_variants(self)
     }
 
     #[inline]

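For illustration, a short sketch (not part of the commit; behavior as described by the doc comment above) of the `cjk_compat_variants` adapter; `stream_safe` is sketched after the src/stream_safe.rs diff below. The choice of U+FA9E as input is an assumption made purely for demonstration:

// Sketch: cjk_compat_variants rewrites CJK Compatibility Ideograph codepoints
// as standardized variation sequences (base ideograph + variation selector),
// so a later NFC/NFD pass does not collapse the distinction.
use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+FA9E sits in the CJK Compatibility Ideographs block and has a
    // singleton canonical decomposition (illustrative input, see lead-in).
    let s = "\u{FA9E}";
    let replaced: String = s.cjk_compat_variants().collect();
    println!("{} -> {} ({} chars)", s, replaced, replaced.chars().count());
}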
src/normalize.rs

Lines changed: 6 additions & 5 deletions
@@ -41,11 +41,12 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
 ///
 /// [Standardized Variation Sequences] are used instead of the standard canonical
 /// decompositions, notably for CJK codepoints with singleton canonical decompositions,
-/// to avoid losing information. See the
-/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
-/// "Other Enhancements" section of the
-/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
-/// for more information.
+/// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the
+/// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information.
+///
+/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
+/// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html
+/// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary
 #[inline]
 pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
 where

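For context, `decompose_cjk_compat_variants` follows the same callback style as `decompose_canonical` and `decompose_compatible`: each output character is passed to `emit_char` in turn. A brief sketch (not part of the commit, and assuming the function is re-exported from the `char` module like its neighbors):

// Sketch: the emit_char callback receives each output character in turn.
use unicode_normalization::char::{decompose_canonical, decompose_cjk_compat_variants};

fn main() {
    // Canonical decomposition of 'é' yields 'e' followed by U+0301.
    let mut out = String::new();
    decompose_canonical('é', |c| out.push(c));
    assert_eq!(out, "e\u{0301}");

    // For a CJK Compatibility Ideograph, the variant decomposition emits a
    // standardized variation sequence rather than the lossy singleton
    // canonical decomposition (U+FA9E is an assumed illustrative input).
    let mut variant = String::new();
    decompose_cjk_compat_variants('\u{FA9E}', |c| variant.push(c));
    println!("{:?}", variant);
}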
src/replace.rs

Lines changed: 12 additions & 3 deletions
@@ -22,9 +22,18 @@ pub struct Replacements<I> {
     buffer: Option<char>,
 }
 
-#[inline]
-pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
-    Replacements { iter, buffer: None }
+impl<I: Iterator<Item = char>> Replacements<I> {
+    /// Create a new iterator that replaces [CJK Compatibility Ideograph] codepoints with normal forms using [Standardized Variation Sequences].
+    ///
+    /// Note that this iterator can also be obtained by directly calling [`.cjk_compat_variants()`] on the iterator.
+    ///
+    /// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
+    /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
+    /// [`.cjk_compat_variants()`]: crate::UnicodeNormalization::cjk_compat_variants
+    #[inline]
+    pub fn new_cjk_compat_variants(iter: I) -> Replacements<I> {
+        Replacements { iter, buffer: None }
+    }
 }
 
 impl<I: Iterator<Item = char>> Iterator for Replacements<I> {

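A minimal sketch (not part of the commit) of the equivalence the new doc comment describes, assuming `Replacements` is re-exported from the crate root as its use in the trait signatures suggests:

// Sketch: constructing the iterator directly vs. via the trait adapter.
use unicode_normalization::{Replacements, UnicodeNormalization};

fn main() {
    let s = "\u{FA9E}abc";
    let via_constructor: String = Replacements::new_cjk_compat_variants(s.chars()).collect();
    let via_adapter: String = s.cjk_compat_variants().collect();
    assert_eq!(via_constructor, via_adapter);
}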
src/stream_safe.rs

Lines changed: 10 additions & 3 deletions
@@ -10,17 +10,24 @@ use crate::tables::stream_safe_leading_nonstarters;
 pub(crate) const MAX_NONSTARTERS: usize = 30;
 const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
 
-/// UAX15-D4: This iterator keeps track of how many non-starters there have been
+/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
 /// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
 /// (U+034F) if the count exceeds 30.
+///
+/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
 pub struct StreamSafe<I> {
     iter: I,
     nonstarter_count: usize,
     buffer: Option<char>,
 }
 
-impl<I> StreamSafe<I> {
-    pub(crate) fn new(iter: I) -> Self {
+impl<I: Iterator<Item = char>> StreamSafe<I> {
+    /// Create a new stream safe iterator.
+    ///
+    /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
+    /// on the iterator.
+    #[inline]
+    pub fn new(iter: I) -> Self {
         Self {
             iter,
             nonstarter_count: 0,

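Likewise, with `StreamSafe::new` made public here, the iterator can be built directly or through the `.stream_safe()` adapter. A short sketch (not part of the commit, assuming `StreamSafe` is re-exported from the crate root):

// Sketch: StreamSafe::new(iter) and .stream_safe() produce identical output,
// inserting U+034F COMBINING GRAPHEME JOINER once more than 30 non-starters
// occur in a row.
use unicode_normalization::{StreamSafe, UnicodeNormalization};

fn main() {
    let input: String = std::iter::repeat('\u{0301}').take(40).collect();
    let direct: String = StreamSafe::new(input.chars()).collect();
    let adapted: String = input.stream_safe().collect();
    assert_eq!(direct, adapted);
    assert!(direct.contains('\u{034F}')); // CGJ inserted after 30 non-starters
}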
0 commit comments
