8000 Add unicode.py · unicode-rs/unicode-script@2345a63 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2345a63

Browse files
committed
Add unicode.py
1 parent a564826 commit 2345a63

File tree

1 file changed

+368
-0
lines changed

1 file changed

+368
-0
lines changed

scripts/unicode.py

Lines changed: 368 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,368 @@
1+
#!/usr/bin/env python
2+
#
3+
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
# This script uses the following Unicode tables:
# - PropertyValueAliases.txt
# - Scripts.txt
# - ScriptExtensions.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated tables.rs file into git.
22+
23+
import fileinput, re, os, sys
24+
25+
# Header written verbatim at the start of the generated Rust file: the license
# notice, a "generated code, do not edit" warning, and an inner `allow`
# attribute silencing a few lints.
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

# Unicode release as (major, minor, patch). It selects which data files are
# downloaded and is also emitted into the generated file as UNICODE_VERSION.
UNICODE_VERSION = (12, 0, 0)

# Dotted "major.minor.patch" form of the release, used in the UCD download URL.
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
43+
44+
def escape_char(c):
    """Render the integer code point ``c`` as a Rust ``char`` literal.

    The literal uses Rust's braced unicode escape with lowercase hex digits
    and no zero padding.
    """
    rust_literal = "'\\u{%x}'"
    return rust_literal % c
46+
47+
def fetch(f):
    """Download the Unicode data file ``f`` into the current directory if absent.

    ``f`` is a path relative to the UCD directory (or, when it contains
    "emoji", the emoji directory) on unicode.org.  The file is stored under
    its base name; nothing is downloaded when a file with that base name
    already exists.

    Exits the process with status 1 if the file is still missing afterwards.
    """
    # Imported here so this function does not rely on the file-level imports.
    import subprocess

    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        if "emoji" in f:
            # The emoji directory is versioned by major.minor only.
            url = ("https://www.unicode.org/Public/emoji/%s.%s/%s"
                   % (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
        else:
            url = ("https://www.unicode.org/Public/%s/ucd/%s"
                   % (UNICODE_VERSION_NUMBER, f))
        try:
            # An argument list avoids shell parsing of the URL.  --fail keeps
            # curl from saving an HTTP error page as if it were the data file.
            subprocess.call(["curl", "--fail", "-O", url])
        except OSError as err:
            # curl missing or not executable: report it and fall through to
            # the existence check below, which terminates the script.
            sys.stderr.write("could not run curl: %s\n" % err)

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
59+
60+
def group_cats(cats):
    """Return a new dict mapping every key of ``cats`` to ``group_cat(cats[key])``.

    NOTE(review): ``group_cat`` is not defined in the visible part of this
    file, so calling this with a non-empty mapping presumably fails -- confirm
    whether this helper is still needed.
    """
    return {name: group_cat(cats[name]) for name in cats}
65+
66+
def aliases():
    """
    Read the Script (``sc``) lines of PropertyValueAliases.txt.

    Returns a pair ``(longforms, shortforms)``: ``longforms`` maps the first
    alias on each line (the shorthand) to the second one (the longhand), and
    ``shortforms`` is the reverse mapping.
    """
    fetch("PropertyValueAliases.txt")
    alias_re = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    longforms, shortforms = {}, {}
    for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")):
        found = alias_re.match(line)
        if not found:
            # not a Script alias line
            continue
        short = found.group(1).strip()
        long_name = found.group(2).strip()
        # every alias is expected to occur only once in the file
        assert(short not in longforms)
        assert(long_name not in shortforms)
        longforms[short] = long_name
        shortforms[long_name] = short

    return (longforms, shortforms)
87+
88+
def format_table_content(f, content, indent):
    """Write ``content`` to ``f`` as wrapped, indented, comma-separated text.

    ``content`` is split on "," and the pieces are re-joined with ", ".
    A piece stays on the current line while the line length plus the piece
    length is below 98 (the separator is not counted); otherwise the current
    line is written followed by ",\\n" and the piece starts a new line.
    Every line starts with ``indent`` spaces and the last line is written
    without a trailing newline.
    """
    margin = " " * indent
    current = margin
    seen_item = False
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # The piece does not fit: flush the line and start the next one.
            f.write(current + ",\n")
            current = margin + piece
            continue
        current += (", " + piece) if seen_item else piece
        seen_item = True
    f.write(current)
102+
103+
# Implementation from unicode-segmentation
104+
def load_properties(f, interestingprops):
    """Parse the Unicode data file ``f`` into ``{property: [(lo, hi), ...]}``.

    The file is fetched first if it is not present locally.  Each matching
    line contributes one ``(lo, hi)`` pair of integer code points; a line
    naming a single code point yields ``lo == hi``.  When ``interestingprops``
    is non-empty, properties not listed in it are skipped.  Lines matching
    neither pattern are ignored.
    """
    fetch(f)
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    single_re = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    props = {}
    for line in fileinput.input(os.path.basename(f)):
        single = single_re.match(line)
        if single:
            lo_hex = hi_hex = single.group(1)
            prop = single.group(2).strip()
        else:
            ranged = range_re.match(line)
            if not ranged:
                continue
            lo_hex, hi_hex = ranged.group(1), ranged.group(2)
            prop = ranged.group(3).strip()
        if interestingprops and prop not in interestingprops:
            continue
        props.setdefault(prop, []).append((int(lo_hex, 16), int(hi_hex, 16)))

    return props
138+
139+
# Implementation from unicode-segmentation
140+
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust slice item called ``name`` holding ``t_data`` to ``f``.

    ``t_type`` is the Rust type written after the name.  ``pfun`` turns one
    element of ``t_data`` into its Rust source text.  The item is declared
    with ``const`` (or ``let`` when ``is_const`` is false), prefixed with
    ``pub`` when ``is_pub`` is true.  The entries are laid out by
    ``format_table_content`` with an indent of 8.
    """
    keyword = "const" if is_const else "let"
    if is_pub:
        keyword = "pub " + keyword
    f.write(" %s %s: %s = &[\n" % (keyword, name, t_type))
    rendered = ",".join([pfun(item) for item in t_data])
    format_table_content(f, rendered, 8)
    f.write("\n ];\n\n")
157+
158+
def emit_search(f):
    """Write the Rust lookup helpers to ``f``.

    The emitted text defines a generic binary search over a table of
    ``(lo, hi, value)`` ranges plus the ``get_script`` and
    ``get_script_extension`` wrappers.  Those wrappers refer to the
    ``SCRIPTS`` and ``SCRIPT_EXTENSIONS`` tables, which this function does
    not write itself.
    """
    f.write("""
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
Some(cat)
}
Err(_) => None
}
}

#[inline]
pub fn get_script(c: char) -> Option<Script> {
bsearch_range_value_table(c, SCRIPTS)
}

#[inline]
pub fn get_script_extension(c: char) -> Option<ScriptExtension> {
bsearch_range_value_table(c, SCRIPT_EXTENSIONS)
}
""")
185+
186+
def emit_enums(f, script_list, extension_list, longforms):
    """
    Emit the Script and ScriptExtension enums as well as any related utility functions

    ``script_list`` holds the script names used as ``Script`` variants and
    ``longforms`` maps each of them to the name used in doc comments and in
    ``inner_full_name``.  ``extension_list`` holds lists of script names; each
    list becomes one ``ScriptExtension`` variant named by concatenating its
    elements.  Everything is written to ``f`` in order, so the sequence of
    writes below is the layout of the generated Rust code.
    """
    # `Script` enum: one variant per script, documented with its long name.
    f.write("""
use core::convert::TryFrom;
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
#[non_exhaustive]
/// A value of the Script property
pub enum Script {
""")
    for script in script_list:
        f.write(" /// %s\n %s,\n" % (longforms[script], script))
    # `ScriptExtension` enum: `Single` plus one variant per entry of extension_list.
    f.write("""}
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
#[non_exhaustive]
/// A value for the Script_Extension property
///
/// Script_Extension is one or more Script
/// This is essentially an optimized version of Vec<Script>,
/// optimized by script sets actually present in Unicode.
pub enum ScriptExtension {
/// A single script
Single(Script),
""")
    for ext in extension_list:
        longform = ", ".join([longforms[s] for s in ext])
        f.write(" /// %s\n %s,\n" % (longform, "".join(ext)))
    # Conversions between the two enums, then the start of `inner_full_name`.
    f.write("""}

impl From<Script> for ScriptExtension {
fn from(script: Script) -> Self {
ScriptExtension::Single(script)
}
}

impl TryFrom<ScriptExtension> for Script {
type Error = ();
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
match ext {
ScriptExtension::Single(s) => Ok(s),
_ => Err(())
}
}
}

impl Script {
pub(crate) fn inner_full_name(self) -> &'static str {
match self {
""")
    # One match arm per script, returning its long name.
    for script in script_list:
        f.write(" Script::%s => \"%s\",\n" % (script, longforms[script]))
    # Close `impl Script`, then start `inner_scripts` (only built with the "with_std" feature).
    f.write(""" }
}
}

impl ScriptExtension {
#[inline]
#[cfg(feature = "with_std")]
pub(crate) fn inner_scripts(self) -> Vec<Script> {
match self {
ScriptExtension::Single(s) => vec![s],
""")
    # One arm per extension, listing the scripts it contains.
    for ext in extension_list:
        scripts = ", ".join(["Script::%s" % s for s in ext])
        f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts))
    f.write(""" _ => unreachable!()
}
}

#[inline]
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
match self {
ScriptExtension::Single(s) => s == other,
""")
    # One arm per extension: true when `other` equals any of its scripts.
    for ext in extension_list:
        scripts = " || ".join(["other == Script::%s" % s for s in ext])
        f.write(" %s => %s,\n" % (extension_name(ext), scripts))
    # `inner_intersects`: Zyyy and Zinh intersect with everything; a single
    # script intersects with whatever contains it.
    f.write(""" }
}

#[inline]
pub(crate) fn inner_intersects(self, other: Self) -> bool {
match (self, other) {
(ScriptExtension::Single(Script::Zyyy), _) |
(ScriptExtension::Single(Script::Zinh), _) |
(_, ScriptExtension::Single(Script::Zyyy)) |
(_, ScriptExtension::Single(Script::Zinh)) => true,
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) => o.inner_contains_script(s),
""")
    # One `true` arm per pair reported by compute_intersections; every other
    # combination falls through to `false`.
    intersections = compute_intersections(extension_list)
    for (e1, e2) in intersections:
        f.write(" (%s, %s) => true,\n" % (extension_name(e1), extension_name(e2)))
    f.write(""" _ => false,
}
}
}
""")
284+
285+
286+
# We currently do NOT have an optimized method to compute
287+
# the actual intersection between two script extensions, we
288+
# only check if they *do* intersect
289+
#
290+
# To add such a method we'd need to do an extra pass where we compute any
291+
# new ScriptExtension enums we'll need from the intersections. It doesn't
292+
# seem worth it for now
293+
def compute_intersections(extension_list):
    """
    Compute which ordered pairs of script extensions share at least one script.

    ``extension_list`` is a list of script-name lists.  The result is a list
    of ``(e1, e2)`` tuples of those lists, in iteration order.  Both
    ``(a, b)`` and ``(b, a)`` are returned for an intersecting pair, which is
    fine for the caller.

    Every non-empty extension is also paired with itself: a set of scripts
    always intersects itself, and without the ``(e, e)`` pair a lookup for two
    identical extensions would find no matching entry.
    """
    script_sets = [(ext, set(ext)) for ext in extension_list]
    return [(e1, e2)
            for (e1, s1) in script_sets
            for (e2, s2) in script_sets
            if not s1.isdisjoint(s2)]
308+
309+
def extension_name(ext):
    """Return the Rust expression naming the ScriptExtension for ``ext``.

    A one-element list maps to the ``Single`` variant wrapping that script;
    any other list maps to the variant named by concatenating its elements.
    """
    if len(ext) != 1:
        return "ScriptExtension::%s" % "".join(ext)
    return "ScriptExtension::Single(Script::%s)" % ext[0]
315+
316+
317+
318+
319+
if __name__ == "__main__":
    # Generate tables.rs in the current directory from the Unicode data files
    # (downloaded on demand by the loaders).
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)
        # The doc comment names this crate (unicode-script); it previously
        # carried the name of the crate this script was adapted from.
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-script is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % UNICODE_VERSION)

        (longforms, shortforms) = aliases()

        # Scripts.txt is keyed by long script names; the generated code uses
        # the short aliases as enum variant names.
        scripts = load_properties("Scripts.txt", [])

        script_table = []
        script_list = []

        for script in scripts:
            script_list.append(shortforms[script])
            script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
        # The generated lookup is a binary search, so order ranges by start.
        script_table.sort(key=lambda w: w[0])

        extensions = load_properties("ScriptExtensions.txt", [])
        extension_table = []
        extension_list = []

        for ext in extensions:
            # split() tolerates repeated spaces, which would otherwise
            # produce empty script names.
            split = ext.split()
            split.sort()
            output_ext = [ext]
            if len(split) > 1:
                # The same set spelled in a different order must not yield a
                # second, identically named enum variant.
                if split not in extension_list:
                    extension_list.append(split)
                output_ext = split
            extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
        extension_table.sort(key=lambda w: w[0])

        emit_enums(rf, script_list, extension_list, longforms)
        emit_search(rf)

        emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",
                   is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
        emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]",
                   is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))

0 commit comments

Comments
 (0)
0