8000 working build · froydnj/unicode-normalization@462dd30 · GitHub
[go: up one dir, main page]

Skip to content

Commit 462dd30

Browse files
committed
working build
1 parent 933f988 commit 462dd30

File tree

5 files changed

+630
-2
lines changed
  • 5 files changed

    +630
    -2
    lines changed

    src/decompose.rs

    Lines changed: 136 additions & 0 deletions
    Original file line numberDiff line numberDiff line change
    @@ -0,0 +1,136 @@
    1+
    // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
    2+
    // file at the top-level directory of this distribution and at
    3+
    // http://rust-lang.org/COPYRIGHT.
    4+
    //
    5+
    // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
    6+
    // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
    7+
    // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
    8+
    // option. This file may not be copied, modified, or distributed
    9+
    // except according to those terms.
    10+
    11+
    use std::str::Chars;
    12+
    13+
    // Helper functions used for Unicode normalization
    14+
    /// Reorders `comb` in place so that every run of adjacent entries with a
    /// non-zero combining class (the `u8` of each pair) is in ascending class
    /// order.
    ///
    /// Entries whose class is 0 are never moved and nothing is moved across
    /// them. Entries with equal classes keep their relative order, because only
    /// strictly out-of-order neighbours are exchanged.
    fn canonical_sort(comb: &mut [(char, u8)]) {
        // Exclusive upper bound of the part that may still be out of order;
        // each full pass settles the last element of that part.
        let mut unsorted_end = comb.len();
        loop {
            let mut changed = false;
            for right in 1..unsorted_end {
                let left = right - 1;
                let (lhs, rhs) = (comb[left].1, comb[right].1);
                // A class of 0 on either side blocks the exchange.
                let both_combining = lhs != 0 && rhs != 0;
                if both_combining && lhs > rhs {
                    comb.swap(left, right);
                    changed = true;
                }
            }
            // A pass without any exchange means the slice is in final order.
            if !changed {
                break;
            }
            unsorted_end -= 1;
        }
    }
    29+
    30+
    /// Selects which per-character decomposition function a `Decompositions`
    /// iterator calls.
    #[derive(Clone)]
    enum DecompositionType {
        /// Use `decompose_canonical`.
        Canonical,
        /// Use `decompose_compatible`.
        Compatible,
    }
    35+
    36+
    /// External iterator for a string decomposition's characters.
    37+
    #[derive(Clone)]
    38+
    pub struct Decompositions<'a> {
    39+
    kind: DecompositionType,
    40+
    iter: Chars<'a>,
    41+
    buffer: Vec<(char, u8)>,
    42+
    sorted: bool
    43+
    }
    44+
    45+
    #[inline]
    46+
    pub fn new_canonical<'a>(s: &'a str) -> Decompositions<'a> {
    47+
    Decompositions {
    48+
    iter: s.chars(),
    49+
    buffer: Vec::new(),
    50+
    sorted: false,
    51+
    kind: self::DecompositionType::Canonical,
    52+
    }
    53+
    }
    54+
    55+
    #[inline]
    56+
    pub fn new_compatible<'a>(s: &'a str) -> Decompositions<'a> {
    57+
    Decompositions {
    58+
    iter: s.chars(),
    59+
    buffer: Vec::new(),
    60+
    sorted: false,
    61+
    kind: self::DecompositionType::Compatible,
    62+
    }
    63+
    }
    64+
    65+
    impl<'a> Iterator for Decompositions<'a> {
        type Item = char;

        /// Yields the next character of the decomposed string.
        ///
        /// Decomposed characters are collected in `self.buffer` together with
        /// their canonical combining class, put in order with `canonical_sort`,
        /// and then handed out from the front of the buffer one at a time.
        /// Returns `None` once both the input and the buffer are exhausted.
        #[inline]
        fn next(&mut self) -> Option<char> {
            use self::DecompositionType::*;

            // Fast path: hand out the front of the buffer when that is
            // already known to be safe.
            match self.buffer.first() {
                // A class-0 character is yielded immediately; whatever follows
                // it in the buffer is treated as not yet sorted.
                Some(&(c, 0)) => {
                    self.sorted = false;
                    self.buffer.remove(0);
                    return Some(c);
                }
                // A non-zero-class character is yielded only if the buffer has
                // already been sorted.
                Some(&(c, _)) if self.sorted => {
                    self.buffer.remove(0);
                    return Some(c);
                }
                // Buffer is empty, or its front is an unsorted non-zero-class
                // character: more input has to be read first.
                _ => self.sorted = false
            }

            if !self.sorted {
                for ch in self.iter.by_ref() {
                    // Borrow the two fields separately so the closure below
                    // does not need to capture `self` as a whole.
                    let buffer = &mut self.buffer;
                    let sorted = &mut self.sorted;
                    {
                        // Receives characters of `ch`'s decomposition
                        // (presumably one call per character, in order —
                        // the decompose functions are defined elsewhere).
                        let callback = |d| {
                            let class =
                                super::char::canonical_combining_class(d);
                            // The first class-0 character seen ends the
                            // pending run: sort what was buffered before it.
                            if class == 0 && !*sorted {
                                canonical_sort(buffer);
                                *sorted = true;
                            }
                            buffer.push((d, class));
                        };
                        match self.kind {
                            Canonical => {
                                super::char::decompose_canonical(ch, callback)
                            }
                            Compatible => {
                                super::char::decompose_compatible(ch, callback)
                            }
                        }
                    }
                    // Stop reading input as soon as the buffer has been sorted.
                    if *sorted {
                        break
                    }
                }
            }

            // Input ran out without a class-0 character triggering the sort:
            // sort whatever is left.
            if !self.sorted {
                canonical_sort(&mut self.buffer);
                self.sorted = true;
            }

            if self.buffer.is_empty() {
                None
            } else {
                match self.buffer.remove(0) {
                    // Same rule as the fast path: yielding a class-0 character
                    // clears the sorted flag.
                    (c, 0) => {
                        self.sorted = false;
                        Some(c)
                    }
                    (c, _) => Some(c),
                }
            }
        }

        /// Reports the lower bound of the underlying `Chars` iterator and no
        /// upper bound. Characters already waiting in `self.buffer` are not
        /// counted.
        fn size_hint(&self) -> (usize, Option<usize>) {
            let (lower, _) = self.iter.size_hint();
            (lower, None)
        }
    }

    src/lib.rs

    Lines changed: 110 additions & 2 deletions
    Original file line numberDiff line numberDiff line change
    @@ -1,3 +1,111 @@
    1-
    #[test]
    2-
    fn it_works() {
    1+
    // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
    2+
    // file at the top-level directory of this distribution and at
    3+
    // http://rust-lang.org/COPYRIGHT.
    4+
    //
    5+
    // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
    6+
    // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
    7+
    // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
    8+
    // option. This file may not be copied, modified, or distributed
    9+
    // except according to those terms.
    10+
    11+
    //! Unicode character composition and decomposition utilities
    12+
    //! as described in
    13+
    //! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
    14+
    //!
    15+
    //! ```rust
    16+
    //! extern crate unicode_normalization;
    17+
    //!
    18+
    //! use unicode_normalization::char::compose;
    19+
    //! use unicode_normalization::str::UnicodeNormalization;
    20+
    //!
    21+
    //! fn main() {
    22+
    //! assert_eq!(compose('A','\u{30a}'), Some('Å'));
    23+
    //!
    24+
    //! let s = "ÅΩ";
    25+
    //! let c = UnicodeNormalization::nfc_chars(s).collect::<String>();
    26+
    //! assert_eq!(c, "ÅΩ");
    27+
    //! }
    28+
    //! ```
    29+
    //!
    30+
    //! # crates.io
    31+
    //!
    32+
    //! You can use this package in your project by adding the following
    33+
    //! to your `Cargo.toml`:
    34+
    //!
    35+
    //! ```toml
    36+
    //! [dependencies]
    37+
    //! unicode-normalization = "0.0.1"
    38+
    //! ```
    39+
    40+
    #![deny(missing_docs, unsafe_code)]
    41+
    42+
    pub use tables::UNICODE_VERSION;
    43+
    44+
    mod decompose;
    45+
    mod normalize;
    46+
    mod recompose;
    47+
    mod tables;
    48+
    49+
    #[cfg(test)]
    50+
    mod test;
    51+
    52+
    /// Methods for composing and decomposing characters.
    pub mod char {
        // Re-exported from the private `normalize` module so that callers can
        // reach them as `char::decompose_canonical`, `char::decompose_compatible`
        // and `char::compose`.
        pub use normalize::{decompose_canonical, decompose_compatible, compose};

        /// Look up the canonical combining class of a character.
        pub use tables::normalization::canonical_combining_class;
    }
    59+
    60+
    /// Methods for applying composition and decomposition to strings.
    61+
    pub mod str {
    62+
    pub use super::decompose::Decompositions;
    63+
    pub use super::recompose::Recompositions;
    64+
    65+
    /// Methods for iterating over strings while applying Unicode normalizations
    66+
    /// as described in
    67+
    /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
    68+
    pub trait UnicodeNormalization {
    69+
    /// Returns an iterator over the string in Unicode Normalization Form D
    70+
    /// (canonical decomposition).
    71+
    #[inline]
    72+
    fn nfd_chars(&self) -> Decompositions;
    73+
    74+
    /// Returns an iterator over the string in Unicode Normalization Form KD
    75+
    /// (compatibility decomposition).
    76+
    #[inline]
    77+
    fn nfkd_chars(&self) -> Decompositions;
    78+
    79+
    /// An Iterator over the string in Unicode Normalization Form C
    80+
    /// (canonical decomposition followed by canonical composition).
    81+
    #[inline]
    82+
    fn nfc_chars(&self) -> Recompositions;
    83+
    84+
    /// An Iterator over the string in Unicode Normalization Form KC
    85+
    /// (compatibility decomposition followed by canonical composition).
    86+
    #[inline]
    87+
    fn nfkc_chars(&self) -> Recompositions;
    88+
    }
    89+
    90+
    impl UnicodeNormalization for str {
    91+
    #[inline]
    92+
    fn nfd_chars(&self) -> Decompositions {
    93+
    super::decompose::new_canonical(self)
    94+
    }
    95+
    96+
    #[inline]
    97+
    fn nfkd_chars(&self) -> Decompositions {
    98+
    super::decompose::new_compatible(self)
    99+
    }
    100+
    101+
    #[inline]
    102+
    fn nfc_chars(&self) -> Recompositions {
    103+
    super::recompose::new_canonical(self)
    104+
    }
    105+
    106+
    #[inline]
    107+
    fn nfkc_chars(&self) -> Recompositions {
    108+
    super::recompose::new_compatible(self)
    109+
    }
    110+
    }
    3111
    }

    0 commit comments

    Comments
     (0)
    0