add Unicode norms test suite; bump to ver 0.0.3 · Florob/unicode-normalization@7a49d25 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7a49d25

Browse files
committed
add Unicode norms test suite; bump to ver 0.0.3
1 parent 0a336e5 commit 7a49d25

File tree

8 files changed

+22944
-4
lines changed

8 files changed

+22944
-4
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
target
22
Cargo.lock
33
scripts/tmp
4+
*.pyc

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "unicode-normalization"
4-
version = "0.0.2"
4+
version = "0.0.3"
55
authors = ["kwantam <kwantam@gmail.com>"]
66

77
homepage = "https://github.com/unicode-rs/unicode-normalization"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,5 @@ to your `Cargo.toml`:
2626

2727
```toml
2828
[dependencies]
29-
unicode-normalization = "0.0.2"
29+
unicode-normalization = "0.0.3"
3030
```

scripts/unicode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
// option. This file may not be copied, modified, or distributed
3131
// except according to those terms.
3232
33-
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
33+
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3434
3535
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3636
'''

scripts/unicode_gen_normtests.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python
2+
#
3+
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
# This script uses the following Unicode tables:
14+
# - NormalizationTest.txt
15+
#
16+
# Since this should not require frequent updates, we just store this
17+
# out-of-line and check the testdata.rs file into git.
18+
19+
import unicode, re, os, fileinput
20+
21+
def load_test_data(f):
    """Fetch and parse the Unicode normalization conformance test file.

    `f` is passed to `unicode.fetch`; the file named by its basename is then
    read from the current directory. A data line consists of five
    semicolon-terminated columns of space-separated hexadecimal code points,
    followed by whitespace and a `#` comment.

    Returns a list with one entry per accepted line. Each entry is a list of
    five lists of integer code points (one inner list per column). A line is
    dropped entirely when any of its columns contains a surrogate code point
    (presumably because surrogates cannot be written as Rust `\\u{...}`
    escapes -- confirm against `unicode.is_surrogate`).
    """
    outls = []
    # raw string: `\s` must reach the regex engine, not the string parser
    testRe = re.compile(r"^(.*?);(.*?);(.*?);(.*?);(.*?);\s+#.*$")

    unicode.fetch(f)
    try:
        for line in fileinput.input(os.path.basename(f)):
            # Lines keep their trailing newline, so a blank line is never
            # zero-length; test the stripped text instead.
            # Comment and header lines start with # and @ respectively.
            if not line.strip() or line[0:1] in ('#', '@'):
                continue

            m = testRe.match(line)
            if not m:
                # parenthesised single argument prints identically on
                # Python 2 and Python 3
                print("error: no match on line where test was expected: %s" % line)
                continue

            # one list of code points per column (regex groups 1..5)
            groups = [[int(ch, 16) for ch in m.group(i).split()]
                      for i in range(1, 6)]

            # a surrogate anywhere on the line disqualifies the whole test
            if any(unicode.is_surrogate(cp) for group in groups for cp in group):
                continue
            outls.append(groups)
    finally:
        # release the file even if a malformed code point raises mid-way
        fileinput.close()

    return outls
57+
58+
def showfun(gs):
    r"""Render one test entry as a Rust tuple of string literals.

    `gs` is an iterable of groups, each an iterable of integer code points.
    Every group becomes a double-quoted literal built from `\u{...}` escapes
    (lower-case hex, no padding); the literals are joined with commas and
    wrapped in parentheses.

    Example: [[0x41], [0x42, 0x300]] -> ("\u{41}","\u{42}\u{300}")
    An empty `gs` yields "()" and an empty group yields an empty literal "".
    """
    literals = ['"%s"' % ''.join("\\u{%x}" % ch for ch in g) for g in gs]
    return '(%s)' % ','.join(literals)
72+
73+
if __name__ == "__main__":
    # Load the test vectors first, so that a fetch or parse failure happens
    # before testdata.rs is opened (and truncated) for writing.
    tests = load_test_data("NormalizationTest.txt")

    # Rust type of the emitted table: a static slice of five-string tuples,
    # one string per column of the test file.
    table_type = "&'static [(&'static str, &'static str, &'static str, &'static str, &'static str)]"

    # Text written between the shared preamble and the table itself.
    header = (
        "\n",
        " // official Unicode test data\n",
        " // http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n",
    )

    with open("testdata.rs", "w") as out:
        out.write(unicode.preamble)
        for text in header:
            out.write(text)
        unicode.emit_table(out, "TEST_NORM", tests, table_type, True, showfun)

src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
//!
3535
//! ```toml
3636
//! [dependencies]
37-
//! unicode-normalization = "0.0.2"
37+
//! unicode-normalization = "0.0.3"
3838
//! ```
3939
4040
#![deny(missing_docs, unsafe_code)]
@@ -48,6 +48,8 @@ mod tables;
4848

4949
#[cfg(test)]
5050
mod test;
51+
#[cfg(test)]
52+
mod testdata;
5153

5254
/// Methods for composing and decomposing characters.
5355
pub mod char {

src/test.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,67 @@ fn test_nfkc_chars() {
8787
t!("\u{ac1c}", "\u{ac1c}");
8888
t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
8989
}
90+
91+
#[test]
fn test_official() {
    use testdata::TEST_NORM;

    // Applies `UnicodeNormalization::$form` to every string in `$inputs`,
    // collecting all results before anything is asserted, and then checks
    // the i-th result against the i-th entry of `$expected`.
    macro_rules! check_form {
        ($form: ident, $inputs: expr, $expected: expr) => {{
            let expected = $expected;
            let results: Vec<String> = $inputs
                .iter()
                .map(|&input| UnicodeNormalization::$form(input).collect::<String>())
                .collect();
            for (&want, got) in expected.iter().zip(results.iter()) {
                assert_eq!(want, &got[..]);
            }
        }}
    }

    // Every entry carries the five columns c1..c5 of one test-file line.
    for &(c1, c2, c3, c4, c5) in TEST_NORM {
        let columns = [c1, c2, c3, c4, c5];

        // these invariants come from the CONFORMANCE section of
        // http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt

        // NFC:  c2 == NFC(c1) == NFC(c2) == NFC(c3);  c4 == NFC(c4) == NFC(c5)
        check_form!(nfc_chars, columns, [c2, c2, c2, c4, c4]);

        // NFD:  c3 == NFD(c1) == NFD(c2) == NFD(c3);  c5 == NFD(c4) == NFD(c5)
        check_form!(nfd_chars, columns, [c3, c3, c3, c5, c5]);

        // NFKC: c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
        check_form!(nfkc_chars, columns, [c4, c4, c4, c4, c4]);

        // NFKD: c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
        check_form!(nfkd_chars, columns, [c5, c5, c5, c5, c5]);
    }
}

0 commit comments

Comments
 (0)
0