|
| 1 | +#!/usr/bin/env python |
| 2 | +# |
| 3 | +# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT |
| 4 | +# file at the top-level directory of this distribution and at |
| 5 | +# http://rust-lang.org/COPYRIGHT. |
| 6 | +# |
| 7 | +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 9 | +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 10 | +# option. This file may not be copied, modified, or distributed |
| 11 | +# except according to those terms. |
| 12 | + |
# This script uses the following Unicode tables:
# - PropertyValueAliases.txt
# - ScriptExtensions.txt
# - Scripts.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated tables.rs file into git.
| 22 | + |
import fileinput
import os
import re
import subprocess
import sys
| 24 | + |
# Header written verbatim at the top of the generated Rust file.
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

# Unicode version as a (major, minor, update) tuple; used to build the
# download URLs and emitted into the generated file.
UNICODE_VERSION = (12, 0, 0)

# Dotted form of UNICODE_VERSION, e.g. "12.0.0".
UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)
| 43 | + |
def escape_char(c):
    """Return the Rust char literal for the integer code point `c`.

    The code point is rendered in lowercase hexadecimal inside a braced
    Unicode escape, wrapped in single quotes.
    """
    literal = "'\\u{%x}'" % c
    return literal
| 46 | + |
def fetch(f):
    """Make sure the Unicode data file `f` is present in the current directory.

    `f` is a path relative to the Unicode data directory on unicode.org
    (it may contain subdirectories); the file is stored locally under its
    basename. If the local copy is missing it is downloaded with curl.
    Paths containing "emoji" come from the emoji directory for the
    major.minor version, everything else from the UCD directory for the
    full version number.

    Writes a message to stderr and exits with status 1 if the file is still
    missing afterwards.
    """
    local_name = os.path.basename(f)

    if not os.path.exists(local_name):
        if "emoji" in f:
            url = ("https://www.unicode.org/Public/emoji/%s.%s/%s"
                   % (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
        else:
            url = ("https://www.unicode.org/Public/%s/ucd/%s"
                   % (UNICODE_VERSION_NUMBER, f))
        try:
            # Argument list instead of a shell string: nothing in `f` is
            # interpreted by a shell. -f makes curl fail on HTTP errors
            # rather than saving the server's error page as the data file.
            subprocess.call(["curl", "-f", "-O", url])
        except OSError as err:
            # curl could not be started at all; the existence check below
            # still reports the missing file and exits.
            sys.stderr.write("could not run curl: %s\n" % err)

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
| 59 | + |
def group_cats(cats):
    """Return a new dict mapping each key of `cats` to `group_cat` of its value.

    NOTE(review): `group_cat` is not defined in the visible part of this
    script, so calling this with a non-empty mapping looks like it would
    raise NameError -- confirm whether this helper is still needed.
    """
    return {cat: group_cat(cats[cat]) for cat in cats}
| 65 | + |
def aliases():
    """
    Fetch the shorthand aliases for each longhand Script name

    Reads the `sc` (Script) rows of PropertyValueAliases.txt, downloading
    the file first if it is not present locally.

    Returns a pair `(longforms, shortforms)`:
    - `longforms` maps each short alias to its long name
    - `shortforms` maps each long name back to its short alias

    Raises ValueError if a short alias or a long name occurs twice.
    """
    filename = "PropertyValueAliases.txt"
    fetch(filename)

    longforms = {}
    shortforms = {}
    # Only the first two value fields of an `sc` row are used; any further
    # aliases on the same row are ignored.
    sc_row = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")

    with open(filename) as handle:
        for line in handle:
            m = sc_row.match(line)
            if not m:
                continue
            short, long_name = m.group(1), m.group(2)
            # Explicit checks rather than `assert`, so the validation still
            # runs when Python is started with -O.
            if short in longforms:
                raise ValueError("duplicate short script alias: %s" % short)
            if long_name in shortforms:
                raise ValueError("duplicate long script name: %s" % long_name)
            longforms[short] = long_name
            shortforms[long_name] = short

    return (longforms, shortforms)
| 87 | + |
def format_table_content(f, content, indent):
    """Write the comma-separated `content` to `f`, wrapped and indented.

    `content` is split on "," and the pieces are re-joined with ", " on
    lines that each start with `indent` spaces. A piece is moved to a new
    line when appending it would bring the current line to 98 characters
    or more (the ", " separator itself is not counted). The final line is
    written without a trailing newline.
    """
    pad = " " * indent
    current = pad
    at_line_start = True
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # Flush the full line and start the next one with this piece.
            f.write(current + ",\n")
            current = pad + piece
            continue
        current += piece if at_line_start else ", " + piece
        at_line_start = False
    f.write(current)
| 102 | + |
| 103 | +# Implementation from unicode-segmentation |
def load_properties(f, interestingprops):
    """Parse the Unicode property file `f` into per-property range lists.

    The file is fetched first if it is not present locally. Returns a dict
    mapping each property value to a list of inclusive `(lo, hi)` code point
    pairs, in file order. When `interestingprops` is non-empty, only the
    property values it contains are kept.
    """
    fetch(f)
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    single_re = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    table = {}
    for row in fileinput.input(os.path.basename(f)):
        hit = single_re.match(row)
        if hit:
            # A single code point is stored as a one-element range.
            first, last, name = hit.group(1), hit.group(1), hit.group(2)
        else:
            hit = range_re.match(row)
            if not hit:
                # Comment, blank or otherwise unrecognised line.
                continue
            first, last, name = hit.groups()

        name = name.strip()
        if interestingprops and name not in interestingprops:
            continue
        table.setdefault(name, []).append((int(first, 16), int(last, 16)))

    return table
| 138 | + |
| 139 | +# Implementation from unicode-segmentation |
def emit_table(f, name, t_data, t_type="&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust slice definition called `name` to `f`.

    `t_data` is an iterable of entries; each one is rendered with `pfun`
    and the results are laid out by `format_table_content` with an indent
    of 8. `t_type` is the Rust type written after the name. `is_const`
    selects `const` versus `let`, and `is_pub` prefixes the item with `pub`.
    """
    keyword = "const" if is_const else "let"
    qualifier = "pub " + keyword if is_pub else keyword
    f.write(" %s %s: %s = &[\n" % (qualifier, name, t_type))
    rendered = ",".join(pfun(entry) for entry in t_data)
    format_table_content(f, rendered, 8)
    f.write("\n ];\n\n")
| 157 | + |
def emit_search(f):
    """Write the fixed Rust lookup helpers to `f`.

    The emitted text defines a generic binary search over
    `(char, char, T)` range tables plus the `get_script` and
    `get_script_extension` wrappers around the `SCRIPTS` and
    `SCRIPT_EXTENSIONS` tables.
    """
    rust_source = """
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
 use core::cmp::Ordering::{Equal, Less, Greater};
 match r.binary_search_by(|&(lo, hi, _)| {
 if lo <= c && c <= hi { Equal }
 else if hi < c { Less }
 else { Greater }
 }) {
 Ok(idx) => {
 let (_, _, cat) = r[idx];
 Some(cat)
 }
 Err(_) => None
 }
}

#[inline]
pub fn get_script(c: char) -> Option<Script> {
 bsearch_range_value_table(c, SCRIPTS)
}

#[inline]
pub fn get_script_extension(c: char) -> Option<ScriptExtension> {
 bsearch_range_value_table(c, SCRIPT_EXTENSIONS)
}
"""
    f.write(rust_source)
| 185 | + |
def emit_enums(f, script_list, extension_list, longforms):
    """
    Write the Rust `Script` and `ScriptExtension` enums and their helpers to `f`.

    `script_list` holds the short script names that become `Script`
    variants; `longforms` maps each short name to the long name used in doc
    comments and in `inner_full_name`. `extension_list` holds lists of short
    script names; each list becomes one `ScriptExtension` variant whose name
    is the concatenation of its members. Also emitted are the
    `From`/`TryFrom` conversions and the `inner_scripts`,
    `inner_contains_script` and `inner_intersects` methods.
    """
    emit = f.write

    # --- enum Script ---
    emit("""
use core::convert::TryFrom;
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
#[non_exhaustive]
/// A value of the Script property
pub enum Script {
""")
    for short in script_list:
        emit(" /// %s\n %s,\n" % (longforms[short], short))

    # --- enum ScriptExtension ---
    emit("""}
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
#[non_exhaustive]
/// A value for the Script_Extension property
///
/// Script_Extension is one or more Script
/// This is essentially an optimized version of Vec<Script>,
/// optimized by script sets actually present in Unicode.
pub enum ScriptExtension {
 /// A single script
 Single(Script),
""")
    for members in extension_list:
        doc_names = ", ".join([longforms[s] for s in members])
        variant = "".join(members)
        emit(" /// %s\n %s,\n" % (doc_names, variant))

    # --- conversions and Script::inner_full_name ---
    emit("""}

impl From<Script> for ScriptExtension {
 fn from(script: Script) -> Self {
 ScriptExtension::Single(script)
 }
}

impl TryFrom<ScriptExtension> for Script {
 type Error = ();
 fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
 match ext {
 ScriptExtension::Single(s) => Ok(s),
 _ => Err(())
 }
 }
}

impl Script {
 pub(crate) fn inner_full_name(self) -> &'static str {
 match self {
""")
    for short in script_list:
        emit(" Script::%s => \"%s\",\n" % (short, longforms[short]))

    # --- ScriptExtension::inner_scripts ---
    emit(""" }
 }
}

impl ScriptExtension {
 #[inline]
 #[cfg(feature = "with_std")]
 pub(crate) fn inner_scripts(self) -> Vec<Script> {
 match self {
 ScriptExtension::Single(s) => vec![s],
""")
    for members in extension_list:
        as_vec = ", ".join(["Script::%s" % s for s in members])
        emit(" %s => vec![%s],\n" % (extension_name(members), as_vec))

    # --- ScriptExtension::inner_contains_script ---
    emit(""" _ => unreachable!()
 }
 }

 #[inline]
 pub(crate) fn inner_contains_script(self, other: Script) -> bool {
 match self {
 ScriptExtension::Single(s) => s == other,
""")
    for members in extension_list:
        any_of = " || ".join(["other == Script::%s" % s for s in members])
        emit(" %s => %s,\n" % (extension_name(members), any_of))

    # --- ScriptExtension::inner_intersects ---
    emit(""" }
 }

 #[inline]
 pub(crate) fn inner_intersects(self, other: Self) -> bool {
 match (self, other) {
 (ScriptExtension::Single(Script::Zyyy), _) |
 (ScriptExtension::Single(Script::Zinh), _) |
 (_, ScriptExtension::Single(Script::Zyyy)) |
 (_, ScriptExtension::Single(Script::Zinh)) => true,
 (ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) => o.inner_contains_script(s),
""")
    # One `=> true` arm per ordered pair of multi-script extensions that
    # share at least one script; everything else hits the `_ => false` arm.
    for (left, right) in compute_intersections(extension_list):
        emit(" (%s, %s) => true,\n" % (extension_name(left), extension_name(right)))
    emit(""" _ => false,
 }
 }
}
""")
| 284 | + |
| 285 | + |
| 286 | +# We currently do NOT have an optimized method to compute |
| 287 | +# the actual intersection between two script extensions, we |
| 288 | +# only check if they *do* intersect |
| 289 | +# |
| 290 | +# To add such a method we'd need to do an extra pass where we compute any |
| 291 | +# new ScriptExtension enums we'll need from the intersections. It doesn't |
| 292 | +# seem worth it for now |
def compute_intersections(extension_list):
    """
    Return every ordered pair of extensions that share at least one script.

    Each element of `extension_list` is a sequence of script names. An
    element is never paired with one equal to it, and each intersecting
    pair is reported in both orders, i.e. as (a, b) and as (b, a).
    """
    tagged = [(ext, set(ext)) for ext in extension_list]
    return [
        (left, right)
        for (left, left_set) in tagged
        for (right, right_set) in tagged
        if left != right and not left_set.isdisjoint(right_set)
    ]
| 308 | + |
def extension_name(ext):
    """Get the rust source for a given ScriptExtension

    `ext` is a sequence of script names. A single-element sequence maps to
    the `Single(...)` variant wrapping that script; any other length maps to
    the variant named by concatenating all elements.
    """
    if len(ext) != 1:
        return "ScriptExtension::%s" % "".join(ext)
    (only,) = ext
    return "ScriptExtension::Single(Script::%s)" % only
| 315 | + |
| 316 | + |
| 317 | + |
| 318 | + |
def main():
    """Regenerate `tables.rs` in the current directory.

    Loads the script aliases, Scripts.txt and ScriptExtensions.txt
    (downloading them if missing), then writes the preamble, the
    UNICODE_VERSION constant, the enums, the search helpers and the two
    lookup tables.
    """
    # Load every input before opening the output file: the loaders exit the
    # process when a download fails, and doing this first means such a
    # failure cannot leave a truncated tables.rs behind.
    (longforms, shortforms) = aliases()

    # Scripts.txt uses long script names; the tables use the short aliases.
    scripts = load_properties("Scripts.txt", [])
    script_table = []
    script_list = []
    for script in scripts:
        short = shortforms[script]
        script_list.append(short)
        script_table.extend([(lo, hi, short) for (lo, hi) in scripts[script]])
    # Sorted by range start, as required by the emitted binary search.
    script_table.sort(key=lambda w: w[0])

    extensions = load_properties("ScriptExtensions.txt", [])
    extension_table = []
    extension_list = []
    for ext in extensions:
        split = sorted(ext.split(" "))
        output_ext = [ext]
        if len(split) > 1:
            # Two property strings listing the same scripts in a different
            # order sort to the same list; record it once so the enum does
            # not get a duplicate variant.
            if split not in extension_list:
                extension_list.append(split)
            output_ext = split
        extension_table.extend([(lo, hi, output_ext) for (lo, hi) in extensions[ext]])
    extension_table.sort(key=lambda w: w[0])

    # Mode "w" truncates any existing file, so no explicit removal is needed.
    with open("tables.rs", "w") as rf:
        # write the file's preamble
        rf.write(preamble)
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that these generated tables are based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % UNICODE_VERSION)

        emit_enums(rf, script_list, extension_list, longforms)
        emit_search(rf)

        emit_table(rf, "SCRIPTS", script_table, t_type="&'static [(char, char, Script)]",
                   is_pub=False,
                   pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
        emit_table(rf, "SCRIPT_EXTENSIONS", extension_table,
                   t_type="&'static [(char, char, ScriptExtension)]",
                   is_pub=False,
                   pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))


if __name__ == "__main__":
    main()
0 commit comments