add tablegen script and tables.rs · Florob/unicode-normalization@933f988

Commit 933f988

add tablegen script and tables.rs
1 parent 3f237be commit 933f988

File tree

2 files changed: +3047 -0 lines changed

scripts/unicode.py

Lines changed: 327 additions & 0 deletions
@@ -0,0 +1,327 @@
#!/usr/bin/env python
#
# Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated tables.rs file into git.

import fileinput, re, os, sys

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)

def fetch(f):
    if not os.path.exists(os.path.basename(f)):
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
        exit(1)

def is_surrogate(n):
    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]

def load_unicode_data(f):
    fetch(f)
    combines = {}
    canon_decomp = {}
    compat_decomp = {}

    udict = {}
    range_start = -1
    for line in fileinput.input(f):
        data = line.split(';')
        if len(data) != 15:
            continue
        cp = int(data[0], 16)
        if is_surrogate(cp):
            continue
        if range_start >= 0:
            for i in xrange(range_start, cp):
                udict[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = cp
            continue
        udict[cp] = data

    for code in udict:
        [code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase] = udict[code]

        # store decomposition, if given
        if decomp != "":
            if decomp.startswith('<'):
                seq = []
                for i in decomp.split()[1:]:
                    seq.append(int(i, 16))
                compat_decomp[code] = seq
            else:
                seq = []
                for i in decomp.split():
                    seq.append(int(i, 16))
                canon_decomp[code] = seq

        # record combining class, if any
        if combine != "0":
            if combine not in combines:
                combines[combine] = []
            combines[combine].append(code)

    combines = to_combines(group_cats(combines))

    return (canon_decomp, compat_decomp, combines)
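
# For reference: each UnicodeData.txt record is 15 semicolon-separated fields,
# which is what the split/unpack above relies on. U+00C0, for example, carries
# its canonical decomposition "0041 0300" in field 5 (zero-based):
#
#   00C0;LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;;;;N;LATIN CAPITAL LETTER A WITH GRAVE;;;00E0;
#
# Compatibility decompositions are flagged by a leading tag in angle brackets,
# e.g. "<fraction> 0031 2044 0032" for U+00BD VULGAR FRACTION ONE HALF, which
# is why the branch above keys on decomp.startswith('<').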

def group_cats(cats):
    cats_out = {}
    for cat in cats:
        cats_out[cat] = group_cat(cats[cat])
    return cats_out

def group_cat(cat):
    cat_out = []
    letters = sorted(set(cat))
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out
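
# Worked example (illustrative call, not executed by the script):
# group_cat([0x300, 0x301, 0x302, 0x30a]) folds the three consecutive
# codepoints into one range and returns [(0x300, 0x302), (0x30a, 0x30a)].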

def to_combines(combs):
    combs_out = []
    for comb in combs:
        for (lo, hi) in combs[comb]:
            combs_out.append((lo, hi, comb))
    combs_out.sort(key=lambda comb: comb[0])
    return combs_out
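
# Worked example (illustrative): to_combines({'230': [(0x300, 0x302)],
# '220': [(0x316, 0x319)]}) flattens the per-class range lists into one list
# sorted by range start: [(0x300, 0x302, '230'), (0x316, 0x319, '220')].
# Note the classes stay strings here; they are printed verbatim as u8 values
# in the emitted table.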

def format_table_content(f, content, indent):
    line = " "*indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            f.write(line + ",\n")
            line = " "*indent + chunk
    f.write(line)
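
# Illustrative call: format_table_content(f, "'a','b','c'", 8) writes the
# single line "        'a', 'b', 'c'" (8-space indent, no trailing newline);
# longer comma-joined data is wrapped so each emitted line stays under 98
# columns.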

def load_properties(f, interestingprops):
    fetch(f)
    props = {}
    re1 = re.compile("^([0-9A-F]+) +; (\w+)")
    re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        d_lo = 0
        d_hi = 0
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3)
            else:
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        if prop not in props:
            props[prop] = []
        props[prop].append((d_lo, d_hi))
    return props
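
# The two regexes above cover the single-codepoint and range forms found in
# DerivedNormalizationProps.txt; illustrative lines (alignment approximate):
#
#   2126          ; Full_Composition_Exclusion # L&       OHM SIGN
#   0958..095F    ; Full_Composition_Exclusion # Lo   [8] DEVANAGARI LETTER QA..DEVANAGARI LETTER YYA
#
# Anything after the property name, including the trailing '#' comment, is
# simply not captured because the regexes only match the start of the line.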

def escape_char(c):
    return "'\\u{%x}'" % c

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
    data = ""
    first = True
    for dat in t_data:
        if not first:
            data += ","
        first = False
        data += pfun(dat)
    format_table_content(f, data, 8)
    f.write("\n    ];\n\n")
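
# Illustrative output for a hypothetical call emit_table(f, "example_table",
# [(0x41, 0x5a)]) with the defaults above (the 4-space indent is part of the
# emitted strings, since the tables land inside `pub mod normalization`):
#
#     pub const example_table: &'static [(char, char)] = &[
#         ('\u{41}', '\u{5a}')
#     ];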

def emit_norm_module(f, canon, compat, combine, norm_props):
    canon_keys = canon.keys()
    canon_keys.sort()

    compat_keys = compat.keys()
    compat_keys.sort()

    canon_comp = {}
    comp_exclusions = norm_props["Full_Composition_Exclusion"]
    for char in canon_keys:
        if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions):
            continue
        decomp = canon[char]
        if len(decomp) == 2:
            if not canon_comp.has_key(decomp[0]):
                canon_comp[decomp[0]] = []
            canon_comp[decomp[0]].append( (decomp[1], char) )
    canon_comp_keys = canon_comp.keys()
    canon_comp_keys.sort()
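
    # At this point canon_comp maps each starter to the (second char, composed
    # char) pairs it participates in. For example, U+00C0 decomposes
    # canonically to 0041 0300 and is not composition-excluded, so
    # canon_comp[0x41] contains the pair (0x300, 0xc0).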

    f.write("pub mod normalization {\n")

    def mkdata_fun(table):
        def f(char):
            data = "(%s,&[" % escape_char(char)
            first = True
            for d in table[char]:
                if not first:
                    data += ","
                first = False
                data += escape_char(d)
            data += "])"
            return data
        return f

    f.write("    // Canonical decompositions\n")
    emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
        pfun=mkdata_fun(canon))

    f.write("    // Compatibility decompositions\n")
    emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
        pfun=mkdata_fun(compat))

    def comp_pfun(char):
        data = "(%s,&[" % escape_char(char)
        canon_comp[char].sort(lambda x, y: x[0] - y[0])
        first = True
        for pair in canon_comp[char]:
            if not first:
                data += ","
            first = False
            data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
        data += "])"
        return data

    f.write("    // Canonical compositions\n")
    emit_table(f, "composition_table", canon_comp_keys,
        "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)

    f.write("""
    fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
        use std::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, result) = r[idx];
                result
            }
            Err(_) => 0
        }
    }\n
""")
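
    # The generated helper falls back to 0 for codepoints not covered by any
    # range, which matches the Unicode default Canonical_Combining_Class
    # (ccc = 0, Not_Reordered).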

    emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
        pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))

    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
        + "        bsearch_range_value_table(c, combining_class_table)\n"
        + "    }\n")

    f.write("""
}

""")

if __name__ == "__main__":
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        fetch("ReadMe.txt")
        with open("ReadMe.txt") as readme:
            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
            unicode_version = re.search(pattern, readme.read()).groups()
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that the unicode parts of `CharExt` and `UnicodeStrPrelude` traits are based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % unicode_version)
        (canon_decomp, compat_decomp, combines) = load_unicode_data("UnicodeData.txt")
        norm_props = load_properties("DerivedNormalizationProps.txt",
                                     ["Full_Composition_Exclusion"])

        # normalizations and conversions module
        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
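
Usage sketch (not part of the commit; inferred from the script above): running
`python unicode.py` from the scripts/ directory downloads ReadMe.txt,
UnicodeData.txt and DerivedNormalizationProps.txt via curl when they are not
already present, scrapes the Unicode version out of ReadMe.txt, and writes the
generated tables to tables.rs in the current working directory.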

0 commit comments
