Use fixed-size arrays instead of static slices for case folding. · unicode-rs/rust-caseless@a803b1e · GitHub
[go: up one dir, main page]

Skip to content

Commit a803b1e

Browse files
committed
Use fixed-size arrays instead of static slices for case folding.
1 parent 6b1d53e commit a803b1e

File tree

3 files changed

+60
-51
lines changed

3 files changed

+60
-51
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
[package]
22

33
name = "caseless"
4-
version = "0.0.1"
4+
version = "0.1.0"
55
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
66
description = "Unicode caseless matching"
7+
repository = "https://github.com/SimonSapin/rust-caseless"
78
license = "MIT"
89

910
build = "src/build.rs"

src/build.rs

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
extern crate regex;
22

3+
use std::char;
34
use std::env;
45
use std::fs::File;
56
use std::io::Write;
67
use std::path::Path;
7-
use std::char;
88
use regex::Regex;
99

10+
// Case folding a single code point can give up to this many code points.
11+
const MAX_FOLDED_CODE_POINTS: usize = 3;
12+
1013
fn main() {
1114
let mut lines = include_str!("../CaseFolding.txt").lines();
1215
let first_line = lines.next().unwrap();
@@ -21,19 +24,25 @@ fn main() {
2124
};
2225

2326
w!("pub const UNICODE_VERSION: &'static str = \"{}\";\n", unicode_version);
24-
w!("const CASE_FOLDING_TABLE: &'static [(char, &'static [char])] = &[\n");
27+
w!("const CASE_FOLDING_TABLE: &'static [(char, [char; 3])] = &[\n");
2528

2629
// Entry with C (common case folding) or F (full case folding) status
2730
let c_or_f_entry = Regex::new(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);").unwrap();
2831

2932
for line in lines {
3033
if let Some(captures) = c_or_f_entry.captures(line) {
3134
let from = captures.at(1).unwrap();
32-
let mut to = captures.at(2).unwrap().split(' ');
35+
let to = captures.at(2).unwrap().split(' ').map(hex_to_escaped).collect::<Vec<_>>();
36+
assert!(to.len() <= MAX_FOLDED_CODE_POINTS);
37+
let blanks = MAX_FOLDED_CODE_POINTS - to.len();
38+
let mut to = to.into_iter();
3339
let first_to = to.next().unwrap();
34-
w!(" ('{}', &['{}'", hex_to_escaped(from), hex_to_escaped(first_to));
40+
w!(" ('{}', ['{}'", hex_to_escaped(from), first_to);
3541
for c in to {
36-
w!(", '{}'", hex_to_escaped(c));
42+
w!(", '{}'", c);
43+
}
44+
for _ in 0..blanks {
45+
w!(", '\\0'");
3746
}
3847
w!("]),\n");
3948
}
@@ -43,5 +52,7 @@ fn main() {
4352

4453

4554
fn hex_to_escaped(hex: &str) -> String {
46-
char::from_u32(u32::from_str_radix(hex, 16).unwrap()).unwrap().escape_default().collect()
55+
let c = u32::from_str_radix(hex, 16).unwrap();
56+
assert!(c != 0);
57+
char::from_u32(c).unwrap().escape_default().collect()
4758
}

src/lib.rs

Lines changed: 41 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,49 +4,6 @@ extern crate unicode_normalization;
44

55
include!(concat!(env!("OUT_DIR"), "/case_folding_data.rs"));
66

7-
pub fn default_case_fold_char(c: char) -> CaseFoldingResult {
8-
match CASE_FOLDING_TABLE.binary_search_by(|&(x, _)| x.cmp(&c)) {
9-
Err(_) => CaseFoldingResult::Unchanged,
10-
Ok(i) => CaseFoldingResult::ReplacedWith(CASE_FOLDING_TABLE[i].1),
11-
}
12-
}
13-
14-
#[derive(Copy, Clone)]
15-
pub enum CaseFoldingResult {
16-
/// A `char` case folds to itself
17-
Unchanged,
18-
/// A `char` case folds to a sequence of one (most common),
19-
/// two, or three `char`s.
20-
ReplacedWith(&'static [char]),
21-
}
22-
23-
pub struct CaseFold<I> {
24-
chars: I,
25-
queue: &'static [char],
26-
}
27-
28-
impl<I> Iterator for CaseFold<I> where I: Iterator<Item = char> {
29-
type Item = char;
30-
31-
fn next(&mut self) -> Option<char> {
32-
if let Some(&c) = self.queue.first() {
33-
self.queue = &self.queue[1..];
34-
return Some(c);
35-
}
36-
self.chars.next().map(|c| match default_case_fold_char(c) {
37-
CaseFoldingResult::Unchanged => c,
38-
CaseFoldingResult::ReplacedWith(replacement) => {
39-
self.queue = &replacement[1..];
40-
replacement[0]
41-
}
42-
})
43-
}
44-
45-
fn size_hint(&self) -> (usize, Option<usize>) {
46-
let (low, high) = self.chars.size_hint();
47-
(low, high.and_then(|h| h.checked_mul(3)))
48-
}
49-
}
507

518
pub trait Caseless {
529
fn default_case_fold(self) -> CaseFold<Self>;
@@ -59,7 +16,7 @@ impl<I: Iterator<Item=char>> Caseless for I {
5916
fn default_case_fold(self) -> CaseFold<I> {
6017
CaseFold {
6118
chars: self,
62-
queue: &[],
19+
queue: ['\0', '\0'],
6320
}
6421
}
6522

@@ -115,3 +72,43 @@ fn iter_eq<L: Iterator, R: Iterator>(mut a: L, mut b: R) -> bool where L::Item:
11572
}
11673
}
11774
}
75+
76+
pub struct CaseFold<I> {
77+
chars: I,
78+
queue: [char; 2],
79+
}
80+
81+
impl<I> Iterator for CaseFold<I> where I: Iterator<Item = char> {
82+
type Item = char;
83+
84+
fn next(&mut self) -> Option<char> {
85+
let c = self.queue[0];
86+
if c != '\0' {
87+
self.queue[0] = self.queue[1];
88+
return Some(c)
89+
}
90+
self.chars.next().map(|c| {
91+
match CASE_FOLDING_TABLE.binary_search_by(|&(x, _)| x.cmp(&c)) {
92+
Err(_) => c,
93+
Ok(i) => {
94+
let folded = CASE_FOLDING_TABLE[i].1;
95+
self.queue = [folded[1], folded[2]];
96+
folded[0]
97+
}
98+
}
99+
})
100+
}
101+
102+
fn size_hint(&self) -> (usize, Option<usize>) {
103+
let queue_len = if self.queue[0] == '\0' {
104+
0
105+
} else if self.queue[1] == '\0' {
106+
1
107+
} else {
108+
2
109+
};
110+
let (low, high) = self.chars.size_hint();
111+
(low.saturating_add(queue_len),
112+
high.and_then(|h| h.checked_mul(3)).and_then(|h| h.checked_add(queue_len)))
113+
}
114+
}

0 commit comments

Comments
 (0)
0