8000 Merge pull request #112 from indutny/feature/lookup-and-search · unicode-rs/unicode-segmentation@e94d2fe · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit e94d2fe

Browse files
authored
Merge pull request #112 from indutny/feature/lookup-and-search
Improve table search speed through lookups
2 parents 243af2c + b4c9ce1 commit e94d2fe

File tree

2 files changed

+387
-21
lines changed

2 files changed

+387
-21
lines changed

scripts/unicode.py

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
274274
pub enum %sCat {
275275
""" % (name, Name, Name))
276276

277+
# We don't want the lookup table to be too large so choose a reasonable
278+
# cutoff. 0x20000 is selected because most of the range table entries are
279+
# within the interval of [0x0, 0x20000]
280+
lookup_value_cutoff = 0x20000
281+
282+
# Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
283+
lookup_table_len = 0x400
284+
285+
lookup_interval = round(lookup_value_cutoff / lookup_table_len)
286+
287+
# Lookup table is a mapping from `character code / lookup_interval` to
288+
# the index in the range table that covers the `character code`.
289+
lookup_table = [0] * lookup_table_len
290+
j = 0
291+
for i in range(0, lookup_table_len):
292+
lookup_from = i * lookup_interval
293+
while j < len(break_table):
294+
(_, entry_to, _) = break_table[j]
295+
if entry_to >= lookup_from:
296+
break
297+
j += 1
298+
lookup_table[i] = j
299+
277300
break_cats.append("Any")
278301
break_cats.sort()
279302
for cat in break_cats:
280303
f.write((" %sC_" % Name[0]) + cat + ",\n")
281304
f.write(""" }
282305
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
306+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
284307
use core::cmp::Ordering::{Equal, Less, Greater};
285308
match r.binary_search_by(|&(lo, hi, _)| {
286309
if lo <= c && c <= hi { Equal }
@@ -293,19 +316,48 @@ def emit_break_module(f, break_table, break_cats, name):
293316
}
294317
Err(idx) => {
295318
(
296-
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
297-
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
319+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
320+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
298321
%sC_Any,
299322
)
300323
}
301324
}
302325
}
303326
304327
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
305-
bsearch_range_value_table(c, %s_cat_table)
328+
// Perform a quick O(1) lookup in a precomputed table to determine
329+
// the slice of the range table to search in.
330+
let lookup_interval = 0x%x;
331+
let idx = (c as u32 / lookup_interval) as usize;
332+
let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
333+
// If the `idx` is outside of the precomputed table - use the slice
334+
// starting from the last covered index in the precomputed table and
335+
// ending with the length of the range table.
336+
%d..%d,
337+
|r| (r[0] as usize)..((r[1] + 1) as usize)
338+
);
339+
340+
// Compute pessimistic default lower and upper bounds on the category.
341+
// If character doesn't map to any range and there is no adjacent range
342+
// in the table slice - these bounds has to apply.
343+
let lower = idx as u32 * lookup_interval;
344+
let upper = lower + lookup_interval - 1;
345+
bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
306346
}
307347
308-
""" % (Name, Name, Name[0], name, Name, name))
348+
""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))
349+
350+
351+
if len(break_table) <= 0xff:
352+
lookup_type = "u8"
353+
elif len(break_table) <= 0xffff:
354+
lookup_type = "u16"
355+
else:
356+
lookup_type = "u32"
357+
358+
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
359+
pfun=lambda x: "%d" % x,
360+
is_pub=False, is_const=True)
309361

310362
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
311363
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),

0 commit comments

Comments
 (0)
0