Merge pull request #3 from unicode-rs/bits · crlf0710/unicode-script@4a8cb11 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4a8cb11

Browse files
authored
Merge pull request unicode-rs#3 from unicode-rs/bits
Move to using bit sets for ScriptExtension
2 parents cbfd7bd + 91a8e06 commit 4a8cb11

File tree

5 files changed

+1974
-2466
lines changed

5 files changed

+1974
-2466
lines changed

.github/workflows/tests.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,24 @@ on: [push]
44

55
jobs:
66
build:
7-
87
runs-on: ubuntu-latest
9-
8+
strategy:
9+
matrix:
10+
rust:
11+
- beta
12+
- nightly
1013
steps:
1114
- uses: actions/checkout@v1
1215
- uses: actions-rs/toolchain@v1
1316
with:
1417
profile: minimal
15-
toolchain: beta
18+
toolchain: ${{ matrix.rust }}
1619
override: true
1720
components: rustfmt
1821
- name: Build
1922
run: cargo build --verbose
2023
- name: Run tests
2124
run: cargo test
25+
- name: Run benchmarks
26+
run: cargo bench --features bench
27+
if: startsWith(matrix.rust, 'nightly')

Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "unicode-script"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
authors = ["Manish Goregaokar <manishsmail@gmail.com>"]
55
edition = "2018"
66

@@ -20,9 +20,8 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
2020
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2121

2222
[features]
23-
with_std = []
24-
default_features = ["with_std"]
2523
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']
24+
bench = []
2625

2726
[dependencies]
2827
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }

scripts/unicode.py

Lines changed: 61 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3636
3737
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
38+
39+
use super::ScriptExtension;
3840
'''
3941

4042
UNICODE_VERSION = (12, 0, 0)
@@ -183,182 +185,102 @@ def emit_search(f):
183185
}
184186
""")
185187

186-
def emit_enums(f, script_list, extension_list, longforms, intersections):
188+
def emit_enums(f, script_list, extension_list, longforms):
187189
"""
188190
Emit the Script and ScriptExtension enums as well as any related utility functions
189191
"""
192+
190193
f.write("""
191-
use core::convert::TryFrom;
192194
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
193195
#[non_exhaustive]
194196
#[allow(non_camel_case_types)]
195-
/// A value of the Script property
197+
#[repr(u8)]
198+
/// A value of the `Script` property
196199
pub enum Script {
197200
/// Unknown script
198-
Unknown,
201+
Unknown = 0xFF,
202+
/// Zyyy
203+
Common = 0xFE,
204+
/// Zinh,
205+
Inherited = 0xFD,
199206
""")
200-
for script in script_list:
201-
f.write(" /// %s\n %s,\n" % (script, longforms[script]))
202-
f.write("""}
203-
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
204-
#[non_exhaustive]
205-
/// A value for the Script_Extension property
206-
///
207-
/// Script_Extension is one or more Script
208-
///
209-
/// This is essentially an optimized version of Vec<Script>,
210-
/// optimized by script sets and intersections actually present in Unicode.
211-
pub enum ScriptExtension {
212-
/// A single script
213-
Single(Script),
207+
for (i, script) in enumerate(script_list):
208+
f.write(" /// %s\n %s = %s,\n" % (script, longforms[script], i))
209+
f.write("}\n")
210+
f.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list))
211+
f.write("""
212+
213+
pub mod script_extensions {
214+
use crate::ScriptExtension;
215+
pub const COMMON: ScriptExtension = ScriptExtension::new_common();
216+
pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
217+
pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
214218
""")
219+
for (i, script) in enumerate(script_list):
220+
first = 0
221+
second = 0
222+
third = 0
223+
# need to replace L because `hex()` will spit out an L suffix for larger numbers
224+
if i < 64:
225+
first = hex(1 << i).replace("L", "")
226+
elif i < 128:
227+
second = hex(1 << (i - 64)).replace("L", "")
228+
else:
229+
third = hex(1 << (i - 128)).replace("L", "")
230+
f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" %
231+
(longforms[script], longforms[script].upper(), first, second, third))
232+
if script != longforms[script]:
233+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" %
234+
(longforms[script], script.upper(), longforms[script].upper()))
215235
for ext in extension_list:
216236
longform = ", ".join([longforms[s] for s in ext])
217-
f.write(" /// %s\n %s,\n" % (longform, "".join(ext)))
237+
name = "_".join([s.upper() for s in ext])
238+
expr = ext[0].upper()
239+
for e in ext[1:]:
240+
expr = "%s.union(%s)" % (expr, e.upper())
241+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform, name, expr))
218242
f.write("""}
219243
220-
impl From<Script> for ScriptExtension {
221-
fn from(script: Script) -> Self {
222-
ScriptExtension::Single(script)
223-
}
224-
}
225-
226-
impl TryFrom<ScriptExtension> for Script {
227-
type Error = ();
228-
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
229-
match ext {
230-
ScriptExtension::Single(s) => Ok(s),
231-
_ => Err(())
232-
}
233-
}
234-
}
235-
236244
impl Script {
245+
#[inline]
237246
pub(crate) fn inner_full_name(self) -> &'static str {
238247
match self {
239248
Script::Unknown => "Unknown",
249+
Script::Common => "Common",
250+
Script::Inherited => "Inherited",
240251
""")
241252
for script in script_list:
242253
f.write(" Script::%s => \"%s\",\n" % (longforms[script], longforms[script]))
243254
f.write(""" }
244255
}
245256
257+
#[inline]
246258
pub(crate) fn inner_short_name(self) -> &'static str {
247259
match self {
248260
Script::Unknown => "",
261+
Script::Common => "Zyyy",
262+
Script::Inherited => "Zinh",
249263
""")
250264
for script in script_list:
251265
f.write(" Script::%s => \"%s\",\n" % (longforms[script], script))
252266
f.write(""" }
253267
}
254-
}
255268
256-
impl ScriptExtension {
257269
#[inline]
258-
#[cfg(feature = "with_std")]
259-
pub(crate) fn inner_scripts(self) -> Vec<Script> {
260-
match self {
261-
ScriptExtension::Single(s) => vec![s],
270+
pub(crate) fn for_integer(value: u8) -> Self {
271+
match value {
262272
""")
263-
for ext in extension_list:
264-
scripts = ", ".join(["Script::%s" % longforms[s] for s in ext])
265-
f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts))
266-
f.write(""" _ => unreachable!()
267-
}
268-
}
269-
270-
#[inline]
271-
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
272-
match self {
273-
ScriptExtension::Single(s) => s == other,
274-
""")
275-
for ext in extension_list:
276-
scripts = " || ".join(["other == Script::%s" % longforms[s] for s in ext])
277-
f.write(" %s => %s,\n" % (extension_name(ext), scripts))
278-
f.write(""" }
279-
}
280-
281-
#[inline]
282-
pub(crate) fn inner_intersect(self, other: Self) -> Self {
283-
match (self, other) {
284-
(ScriptExtension::Single(Script::Unknown), _) |
285-
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
286-
(a, b) if a == b => a,
287-
(ScriptExtension::Single(Script::Common), a) |
288-
(ScriptExtension::Single(Script::Inherited), a) |
289-
(a, ScriptExtension::Single(Script::Common)) |
290-
(a, ScriptExtension::Single(Script::Inherited)) => a,
291-
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
292-
""")
293-
for (e1, e2, i) in intersections:
294-
f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms)))
295-
f.write(""" _ => ScriptExtension::Single(Script::Unknown),
273+
for (i, script) in enumerate(script_list):
274+
f.write(" %s => Script::%s,\n" % (i, longforms[script]))
275+
f.write(""" _ => unreachable!(),
296276
}
297277
}
298278
}
299279
""")
300280

301-
302-
def compute_intersections_elements(extension_list):
303-
"""
304-
Compute all intersections between the script extensions.
305-
This will add new elements to extension_list, be sure to call it first!
306-
"""
307-
308-
# This is the only third-level intersection
309-
# It's easier to hardcode things here rather than
310-
# do the below calculation in a loop
311-
extension_list.append(['Deva', 'Knda', 'Tirh'])
312-
intersections = []
313-
# Some intersections will not exist in extension_list and we'll need to add them
314-
new_elements = []
315-
sets = [(e, set(e)) for e in extension_list]
316-
for (e1, s1) in sets:
317-
for (e2, s2) in sets:
318-
if e1 == e2:
319-
continue
320-
intersection = s1.intersection(s2)
321-
if len(intersection) > 0:
322-
intersection = [i for i in intersection]
323-
intersection.sort()
324-
if len(intersection) > 1 and intersection not in extension_list and intersection not in new_elements:
325-
new_elements.append(intersection)
326-
if (e1, e2, intersection) not in intersections:
327-
intersections.append((e1, e2, intersection))
328-
extension_list.extend(new_elements)
329-
330-
# We now go through the newly added second-level extension values and calculate their intersections
331-
# with the original set and each other
332-
new_sets = [(e, set(e)) for e in new_elements]
333-
sets = [(e, set(e)) for e in extension_list]
334-
for (e1, s1) in new_sets:
335-
for (e2, s2) in sets:
336-
if e1 == e2:
337-
continue
338-
intersection = s1.intersection(s2)
339-
if len(intersection) > 0:
340-
intersection = [i for i in intersection]
341-
intersection.sort()
342-
if len(intersection) > 1 and intersection not in extension_list:
343-
raise "Found new third-level intersection, please hardcode it"
344-
# The previous routine would automatically get both versions
345-
# of an intersection because it would iterate each pair in both orders,
346-
# but here we're working on an asymmetric pair, so we insert both in order to not
347-
# miss anything
348-
if (e1, e2, intersection) not in intersections:
349-
intersections.append((e1, e2, intersection))
350-
if (e2, e1, intersection) not in intersections:
351-
intersections.append((e2, e1, intersection))
352-
353-
intersections.sort()
354-
return intersections
355-
356-
def extension_name(ext, longforms={}):
281+
def extension_name(ext):
357282
"""Get the rust source for a given ScriptExtension"""
358-
if len(ext) == 1:
359-
return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]]
360-
else:
361-
return "ScriptExtension::%s" % "".join(ext)
283+
return "script_extensions::%s" % "_".join([e.upper() for e in ext])
362284

363285

364286

@@ -385,8 +307,10 @@ def extension_name(ext, longforms={}):
385307
script_list = []
386308

387309
for script in scripts:
388-
script_list.append(shortforms[script])
310+
if script not in ["Common", "Unknown", "Inherited"]:
311+
script_list.append(shortforms[script])
389312
script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
313+
script_list.sort()
390314
script_table.sort(key=lambda w: w[0])
391315

392316

@@ -404,14 +328,13 @@ def extension_name(ext, longforms={}):
404328
extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
405329
extension_table.sort(key=lambda w: w[0])
406330

407-
intersections = compute_intersections_elements(extension_list)
408331

409-
emit_enums(rf, script_list, extension_list, longforms, intersections)
332+
emit_enums(rf, script_list, extension_list, longforms)
410333
emit_search(rf)
411334

412335
emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",
413336
is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), longforms[x[2]]))
414337
emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]",
415-
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2], longforms)))
338+
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))
416339

417340
# emit_table(rf, "FOObar", properties)

0 commit comments

Comments
 (0)
0