working build · Florob/unicode-normalization@462dd30 · GitHub
[go: up one dir, main page]

Skip to content

Commit 462dd30

Browse files
committed
working build
1 parent 933f988 commit 462dd30

File tree

5 files changed

+630
-2
lines changed

5 files changed

+630
-2
lines changed

src/decompose.rs

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use std::str::Chars;
12+
13+
// Helper functions used for Unicode normalization
14+
/// Reorders `comb` so that, within every maximal run of entries whose
/// combining class is non-zero, the entries are in ascending class order.
///
/// Each element is a `(character, canonical combining class)` pair. Entries
/// with class `0` never move and are never crossed by another entry, so they
/// split the slice into independently sorted runs. Entries with equal classes
/// keep their relative order (the sort is stable).
///
/// Empty and single-element slices are left untouched.
fn canonical_sort(comb: &mut [(char, u8)]) {
    // Class-0 entries are the separators; `split_mut` hands back only the
    // runs between them, so they stay exactly where they are.
    for run in comb.split_mut(|&(_, class)| class == 0) {
        // `sort_by` is stable, so equal classes keep their original order,
        // and it avoids the quadratic cost of a bubble sort on long runs.
        run.sort_by(|a, b| a.1.cmp(&b.1));
    }
}
29+
30+
/// Selects which decomposition mapping the iterator applies to each
/// input character.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DecompositionType {
    /// Use the canonical decomposition.
    Canonical,
    /// Use the compatibility decomposition.
    Compatible
}

/// External iterator for a string decomposition's characters.
#[derive(Clone, Debug)]
pub struct Decompositions<'a> {
    /// Which decomposition is applied to each source character.
    kind: DecompositionType,
    /// Remaining, not yet decomposed, characters of the input string.
    iter: Chars<'a>,
    /// Decomposed characters waiting to be yielded, each paired with its
    /// canonical combining class.
    buffer: Vec<(char, u8)>,
    /// Whether `buffer` has been put into canonical order and may be
    /// drained from the front.
    sorted: bool
}
44+
45+
#[inline]
46+
pub fn new_canonical<'a>(s: &'a str) -> Decompositions<'a> {
47+
Decompositions {
48+
iter: s.chars(),
49+
buffer: Vec::new(),
50+
sorted: false,
51+
kind: self::DecompositionType::Canonical,
52+
}
53+
}
54+
55+
#[inline]
56+
pub fn new_compatible<'a>(s: &'a str) -> Decompositions<'a> {
57+
Decompositions {
58+
iter: s.chars(),
59+
buffer: Vec::new(),
60+
sorted: false,
61+
kind: self::DecompositionType::Compatible,
62+
}
63+
}
64+
65+
impl<'a> Iterator for Decompositions<'a> {
66+
type Item = char;
67+
68+
#[inline]
69+
fn next(&mut self) -> Option<char> {
70+
use self::DecompositionType::*;
71+
72+
match self.buffer.first() {
73+
Some(&(c, 0)) => {
74+
self.sorted = false;
75+
self.buffer.remove(0);
76+
return Some(c);
77+
}
78+
Some(&(c, _)) if self.sorted => {
79+
self.buffer.remove(0);
80+
return Some(c);
81+
}
82+
_ => self.sorted = false
83+
}
84+
85+
if !self.sorted {
86+
for ch in self.iter.by_ref() {
87+
let buffer = &mut self.buffer;
88 9E88 +
let sorted = &mut self.sorted;
89+
{
90+
let callback = |d| {
91+
let class =
92+
super::char::canonical_combining_class(d);
93+
if class == 0 && !*sorted {
94+
canonical_sort(buffer);
95+
*sorted = true;
96+
}
97+
buffer.push((d, class));
98+
};
99+
match self.kind {
100+
Canonical => {
101+
super::char::decompose_canonical(ch, callback)
102+
}
103+
Compatible => {
104+
super::char::decompose_compatible(ch, callback)
105+
}
106+
}
107+
}
108+
if *sorted {
109+
break
110+
}
111+
}
112+
}
113+
114+
if !self.sorted {
115+
canonical_sort(&mut self.buffer);
116+
self.sorted = true;
117+
}
118+
119+
if self.buffer.is_empty() {
120+
None
121+
} else {
122+
match self.buffer.remove(0) {
123+
(c, 0) => {
124+
self.sorted = false;
125+
Some(c)
126+
}
127+
(c, _) => Some(c),
128+
}
129+
}
130+
}
131+
132+
fn size_hint(&self) -> (usize, Option<usize>) {
133+
let (lower, _) = self.iter.size_hint();
134+
(lower, None)
135+
}
136+
}

src/lib.rs

Lines changed: 110 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,111 @@
1-
#[test]
2-
fn it_works() {
1+
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
//! Unicode character composition and decomposition utilities
12+
//! as described in
13+
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
14+
//!
15+
//! ```rust
16+
//! extern crate unicode_normalization;
17+
//!
18+
//! use unicode_normalization::char::compose;
19+
//! use unicode_normalization::str::UnicodeNormalization;
20+
//!
21+
//! fn main() {
22+
//! assert_eq!(compose('A','\u{30a}'), Some('Å'));
23+
//!
24+
//! let s = "ÅΩ";
25+
//! let c = UnicodeNormalization::nfc_chars(s).collect::<String>();
26+
//! assert_eq!(c, "ÅΩ");
27+
//! }
28+
//! ```
29+
//!
30+
//! # crates.io
31+
//!
32+
//! You can use this package in your project by adding the following
33+
//! to your `Cargo.toml`:
34+
//!
35+
//! ```toml
36+
//! [dependencies]
37+
//! unicode-normalization = "0.0.1"
38+
//! ```
39+
40+
#![deny(missing_docs, unsafe_code)]
41+
42+
pub use tables::UNICODE_VERSION;
43+
44+
mod decompose;
45+
mod normalize;
46+
mod recompose;
47+
mod tables;
48+
49+
#[cfg(test)]
50+
mod test;
51+
52+
/// Methods for composing and decomposing characters.
53+
pub mod char {
54+
pub use normalize::{decompose_canonical, decompose_compatible, compose};
55+
56+
/// Look up the canonical combining class of a character.
57+
pub use tables::normalization::canonical_combining_class;
58+
}
59+
60+
/// Methods for applying composition and decomposition to strings.
61+
pub mod str {
62+
pub use super::decompose::Decompositions;
63+
pub use super::recompose::Recompositions;
64+
65+
/// Methods for iterating over strings while applying Unicode normalizations
66+
/// as described in
67+
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
68+
pub trait UnicodeNormalization {
69+
/// Returns an iterator over the string in Unicode Normalization Form D
70+
/// (canonical decomposition).
71+
#[inline]
72+
fn nfd_chars(&self) -> Decompositions;
73+
74+
/// Returns an iterator over the string in Unicode Normalization Form KD
75+
/// (compatibility decomposition).
76+
#[inline]
77+
fn nfkd_chars(&self) -> Decompositions;
78+
79+
/// An Iterator over the string in Unicode Normalization Form C
80+
/// (canonical decomposition followed by canonical composition).
81+
#[inline]
82+
fn nfc_chars(&self) -> Recompositions;
83+
84+
/// An Iterator over the string in Unicode Normalization Form KC
85+
/// (compatibility decomposition followed by canonical composition).
86+
#[inline]
87+
fn nfkc_chars(&self) -> Recompositions;
88+
}
89+
90+
impl UnicodeNormalization for str {
91+
#[inline]
92+
fn nfd_chars(&self) -> Decompositions {
93+
super::decompose::new_canonical(self)
94+
}
95+
96+
#[inline]
97+
fn nfkd_chars(&self) -> Decompositions {
98+
super::decompose::new_compatible(self)
99+
}
100+
101+
#[inline]
102+
fn nfc_chars(&self) -> Recompositions {
103+
super::recompose::new_canonical(self)
104+
}
105+
106+
#[inline]
107+
fn nfkc_chars(&self) -> Recompositions {
108+
super::recompose::new_compatible(self)
109+
}
110+
}
3111
}

0 commit comments

Comments
 (0)
0