Move to using bit sets for ScriptExtension · crlf0710/unicode-script@91a8e06 · GitHub
[go: up one dir, main page]

Skip to content

Commit 91a8e06

Browse files
committed
Move to using bit sets for ScriptExtension
1 parent 1057462 commit 91a8e06

File tree

5 files changed

+1954
-2433
lines changed

5 files changed

+1954
-2433
lines changed

.github/workflows/tests.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,24 @@ on: [push]
44

55
jobs:
66
build:
7-
87
runs-on: ubuntu-latest
9-
8+
strategy:
9+
matrix:
10+
rust:
11+
- beta
12+
- nightly
1013
steps:
1114
- uses: actions/checkout@v1
1215
- uses: actions-rs/toolchain@v1
1316
with:
1417
profile: minimal
15-
toolchain: beta
18+
toolchain: ${{ matrix.rust }}
1619
override: true
1720
components: rustfmt
1821
- name: Build
1922
run: cargo build --verbose
2023
- name: Run tests
2124
run: cargo test
25+
- name: Run benchmarks
26+
run: cargo bench --features bench
27+
if: startsWith(matrix.rust, 'nightly')

Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "unicode-script"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
authors = ["Manish Goregaokar <manishsmail@gmail.com>"]
55
edition = "2018"
66

@@ -20,9 +20,8 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
2020
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2121

2222
[features]
23-
with_std = []
24-
default_features = ["with_std"]
2523
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']
24+
bench = []
2625

2726
[dependencies]
2827
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }

scripts/unicode.py

Lines changed: 58 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3636
3737
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
38+
39+
use super::ScriptExtension;
3840
'''
3941

4042
UNICODE_VERSION = (12, 0, 0)
@@ -183,44 +185,69 @@ def emit_search(f):
183185
}
184186
""")
185187

186-
def emit_enums(f, script_list, extension_list, longforms, intersections):
188+
def emit_enums(f, script_list, extension_list, longforms):
187189
"""
188190
Emit the Script and ScriptExtension enums as well as any related utility functions
189191
"""
192+
190193
f.write("""
191194
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
192195
#[non_exhaustive]
193196
#[allow(non_camel_case_types)]
197+
#[repr(u8)]
194198
/// A value of the `Script` property
195199
pub enum Script {
196200
/// Unknown script
197-
Unknown,
201+
Unknown = 0xFF,
202+
/// Zyyy
203+
Common = 0xFE,
204+
/// Zinh,
205+
Inherited = 0xFD,
198206
""")
199-
for script in script_list:
200-
f.write(" /// %s\n %s,\n" % (script, longforms[script]))
201-
f.write("""}
202-
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
203-
#[non_exhaustive]
204-
/// A value for the `Script_Extension` property
205-
///
206-
/// [`ScriptExtension`] is one or more [`Script`]
207-
///
208-
/// This is essentially an optimized version of `Vec<Script>`,
209-
/// optimized by script sets and intersections actually present in Unicode.
210-
pub enum ScriptExtension {
211-
/// A single script
212-
Single(Script),
207+
for (i, script) in enumerate(script_list):
208+
f.write(" /// %s\n %s = %s,\n" % (script, longforms[script], i))
209+
f.write("}\n")
210+
f.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list))
211+
f.write("""
212+
213+
pub mod script_extensions {
214+
use crate::ScriptExtension;
215+
pub const COMMON: ScriptExtension = ScriptExtension::new_common();
216+
pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
217+
pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
213218
""")
219+
for (i, script) in enumerate(script_list):
220+
first = 0
221+
second = 0
222+
third = 0
223+
# need to replace L because `hex()` will spit out an L suffix for larger numbers
224+
if i < 64:
225+
first = hex(1 << i).replace("L", "")
226+
elif i < 128:
227+
second = hex(1 << (i - 64)).replace("L", "")
228+
else:
229+
third = hex(1 << (i - 128)).replace("L", "")
230+
f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" %
231+
(longforms[script], longforms[script].upper(), first, second, third))
232+
if script != longforms[script]:
233+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" %
234+
(longforms[script], script.upper(), longforms[script].upper()))
214235
for ext in extension_list:
215236
longform = ", ".join([longforms[s] for s in ext])
216-
f.write(" /// %s\n %s,\n" % (longform, "".join(ext)))
237+
name = "_".join([s.upper() for s in ext])
238+
expr = ext[0].upper()
239+
for e in ext[1:]:
240+
expr = "%s.union(%s)" % (expr, e.upper())
241+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform, name, expr))
217242
f.write("""}
218243
219244
impl Script {
220245
#[inline]
221246
pub(crate) fn inner_full_name(self) -> &'static str {
222247
match self {
223248
Script::Unknown => "Unknown",
249+
Script::Common => "Common",
250+
Script::Inherited => "Inherited",
224251
""")
225252
for script in script_list:
226253
f.write(" Script::%s => \"%s\",\n" % (longforms[script], longforms[script]))
@@ -231,119 +258,29 @@ def emit_enums(f, script_list, extension_list, longforms, intersections):
231258
pub(crate) fn inner_short_name(self) -> &'static str {
232259
match self {
233260
Script::Unknown => "",
261+
Script::Common => "Zyyy",
262+
Script::Inherited => "Zinh",
234263
""")
235264
for script in script_list:
236265
f.write(" Script::%s => \"%s\",\n" % (longforms[script], script))
237266
f.write(""" }
238267
}
239-
}
240-
241-
impl ScriptExtension {
242-
#[inline]
243-
#[cfg(feature = "with_std")]
244-
pub(crate) fn inner_scripts(self) -> Vec<Script> {
245-
match self {
246-
ScriptExtension::Single(s) => vec![s],
247-
""")
248-
for ext in extension_list:
249-
scripts = ", ".join(["Script::%s" % longforms[s] for s in ext])
250-
f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts))
251-
f.write(""" _ => unreachable!()
252-
}
253-
}
254-
255-
#[inline]
256-
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
257-
match self {
258-
ScriptExtension::Single(s) => s == other,
259-
""")
260-
for ext in extension_list:
261-
scripts = " || ".join(["other == Script::%s" % longforms[s] for s in ext])
262-
f.write(" %s => %s,\n" % (extension_name(ext), scripts))
263-
f.write(""" }
264-
}
265268
266269
#[inline]
267-
pub(crate) fn inner_intersect(self, other: Self) -> Self {
268-
match (self, other) {
269-
(ScriptExtension::Single(Script::Unknown), _) |
270-
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
271-
(a, b) if a == b => a,
272-
(ScriptExtension::Single(Script::Common), a) |
273-
(ScriptExtension::Single(Script::Inherited), a) |
274-
(a, ScriptExtension::Single(Script::Common)) |
275-
(a, ScriptExtension::Single(Script::Inherited)) => a,
276-
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
270+
pub(crate) fn for_integer(value: u8) -> Self {
271+
match value {
277272
""")
278-
for (e1, e2, i) in intersections:
279-
f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms)))
280-
f.write(""" _ => ScriptExtension::Single(Script::Unknown),
273+
for (i, script) in enumerate(script_list):
274+
f.write(" %s => Script::%s,\n" % (i, longforms[script]))
275+
f.write(""" _ => unreachable!(),
281276
}
282277
}
283278
}
284279
""")
285280

286-
287-
def compute_intersections_elements(extension_list):
288-
"""
289-
Compute all intersections between the script extensions.
290-
This will add new elements to extension_list, be sure to call it first!
291-
"""
292-
293-
# This is the only third-level intersection
294-
# It's easier to hardcode things here rather than
295-
# do the below calculation in a loop
296-
extension_list.append(['Deva', 'Knda', 'Tirh'])
297-
intersections = []
298-
# Some intersections will not exist in extension_list and we'll need to add them
299-
new_elements = []
300-
sets = [(e, set(e)) for e in extension_list]
301-
for (e1, s1) in sets:
302-
for (e2, s2) in sets:
303-
if e1 == e2:
304-
continue
305-
intersection = s1.intersection(s2)
306-
if len(intersection) > 0:
307-
intersection = [i for i in intersection]
308-
intersection.sort()
309-
if len(intersection) > 1 and intersection not in extension_list and intersection not in new_elements:
310-
new_elements.append(intersection)
311-
if (e1, e2, intersection) not in intersections:
312-
intersections.append((e1, e2, intersection))
313-
extension_list.extend(new_elements)
314-
315-
# We now go through the newly added second-level extension values and calculate their intersections
316-
# with the original set and each other
317-
new_sets = [(e, set(e)) for e in new_elements]
318-
sets = [(e, set(e)) for e in extension_list]
319-
for (e1, s1) in new_sets:
320-
for (e2, s2) in sets:
321-
if e1 == e2:
322-
continue
323-
intersection = s1.intersection(s2)
324-
if len(intersection) > 0:
325-
intersection = [i for i in intersection]
326-
intersection.sort()
327-
if len(intersection) > 1 and intersection not in extension_list:
328-
raise "Found new third-level intersection, please hardcode it"
329-
# The previous routine would automatically get both versions
330-
# of an intersection because it would iterate each pair in both orders,
331-
# but here we're working on an asymmetric pair, so we insert both in order to not
332-
# miss anything
333-
if (e1, e2, intersection) not in intersections:
334-
intersections.append((e1, e2, intersection))
335-
if (e2, e1, intersection) not in intersections:
336-
intersections.append((e2, e1, intersection))
337-
338-
intersections.sort()
339-
return intersections
340-
341-
def extension_name(ext, longforms={}):
281+
def extension_name(ext):
342282
"""Get the rust source for a given ScriptExtension"""
343-
if len(ext) == 1:
344-
return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]]
345-
else:
346-
return "ScriptExtension::%s" % "".join(ext)
283+
return "script_extensions::%s" % "_".join([e.upper() for e in ext])
347284

348285

349286

@@ -370,8 +307,10 @@ def extension_name(ext, longforms={}):
370307
script_list = []
371308

372309
for script in scripts:
373-
script_list.append(shortforms[script])
310+
if script not in ["Common", "Unknown", "Inherited"]:
311+
script_list.append(shortforms[script])
374312
script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
313+
script_list.sort()
375314
script_table.sort(key=lambda w: w[0])
376315

377316

@@ -389,14 +328,13 @@ def extension_name(ext, longforms={}):
389328
extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
390329
extension_table.sort(key=lambda w: w[0])
391330

392-
intersections = compute_intersections_elements(extension_list)
393331

394-
emit_enums(rf, script_list, extension_list, longforms, intersections)
332+
emit_enums(rf, script_list, extension_list, longforms)
395333
emit_search(rf)
396334

397335
emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",
398336
is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), longforms[x[2]]))
399337
emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]",
400-
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2], longforms)))
338+
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))
401339

402340
# emit_table(rf, "FOObar", properties)

0 commit comments

Comments
 (0)
0