Expose all iterator constructors, add hyperlinks to Unicode glossary/technical reports by ShE3py · Pull Request #106 · unicode-rs/unicode-normalization · GitHub
[go: up one dir, main page]

Skip to content

Expose all iterator constructors, add hyperlinks to Unicode glossary/technical reports #106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
<div class="hide-sm hide-md">
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/decompose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ impl<I: Iterator<Item = char>> Decompositions<I> {

/// Create a new decomposition iterator for compatibility decompositions (NFKD)
///
/// Note that this iterator can also be obtained by directly calling [`.nfd()`](crate::UnicodeNormalization::nfd)
/// Note that this iterator can also be obtained by directly calling [`.nfkd()`](crate::UnicodeNormalization::nfkd)
/// on the iterator.
#[inline]
pub fn new_compatible(iter: I) -> Decompositions<I> {
Expand Down
17 changes: 11 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// (compatibility decomposition followed by canonical composition).
fn nfkc(self) -> Recompositions<I>;

/// A transformation which replaces CJK Compatibility Ideograph codepoints
/// with normal forms using Standardized Variation Sequences. This is not
/// A transformation which replaces [CJK Compatibility Ideograph] codepoints
/// with normal forms using [Standardized Variation Sequences]. This is not
/// part of the canonical or compatibility decomposition algorithms, but
/// performing it before those algorithms produces normalized output which
/// better preserves the intent of the original text.
Expand All @@ -123,10 +123,15 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// may not immediately help text display as intended, but they at
/// least preserve the information in a standardized form, giving
/// implementations the option to recognize them.
///
/// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
fn cjk_compat_variants(self) -> Replacements<I>;

/// An Iterator over the string with Conjoining Grapheme Joiner characters
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
/// inserted according to the Stream-Safe Text Process ([UAX15-D4]).
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
fn stream_safe(self) -> StreamSafe<I>;
}

Expand All @@ -153,7 +158,7 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {

#[inline]
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
replace::new_cjk_compat_variants(self.chars())
Replacements::new_cjk_compat_variants(self.chars())
}

#[inline]
Expand Down Expand Up @@ -185,7 +190,7 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {

#[inline]
fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
replace::new_cjk_compat_variants(Some(self).into_iter())
Replacements::new_cjk_compat_variants(Some(self).into_iter())
}

#[inline]
Expand Down Expand Up @@ -217,7 +222,7 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {

#[inline]
fn cjk_compat_variants(self) -> Replacements<I> {
replace::new_cjk_compat_variants(self)
Replacements::new_cjk_compat_variants(self)
}

#[inline]
Expand Down
11 changes: 6 additions & 5 deletions src/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
/// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the
/// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information.
///
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
/// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html
/// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
Expand Down
15 changes: 12 additions & 3 deletions src/replace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,18 @@ pub struct Replacements<I> {
buffer: Option<char>,
}

#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
impl<I: Iterator<Item = char>> Replacements<I> {
/// Create a new iterator that replaces [CJK Compatibility Ideograph] codepoints with normal forms using [Standardized Variation Sequences].
///
/// Note that this iterator can also be obtained by directly calling [`.cjk_compat_variants()`] on the iterator.
///
/// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
/// [`.cjk_compat_variants()`]: crate::UnicodeNormalization::cjk_compat_variants
#[inline]
pub fn new_cjk_compat_variants(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
}
}

impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
Expand Down
13 changes: 10 additions & 3 deletions src/stream_safe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,24 @@ use crate::tables::stream_safe_leading_nonstarters;
pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
pub struct StreamSafe<I> {
iter: I,
nonstarter_count: usize,
buffer: Option<char>,
}

impl<I> StreamSafe<I> {
pub(crate) fn new(iter: I) -> Self {
impl<I: Iterator<Item = char>> StreamSafe<I> {
/// Create a new stream safe iterator.
///
/// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
/// on the iterator.
#[inline]
pub fn new(iter: I) -> Self {
Self {
iter,
nonstarter_count: 0,
Expand Down
Loading
0