8000 Improve table search speed through lookups · unicode-rs/unicode-segmentation@0b2442c · GitHub
[go: up one dir, main page]

Skip to content

Commit 0b2442c

Browse files
committed
Improve table search speed through lookups
Prior to this change table search would have to do a binary search over about 1000 entries which resulted in around 10 memory loads on average. In this commit we reduce the search space by doing a pre-lookup in a generated table to get a smaller (often zero-length) slice of the full sorted range list. On average this gives us just one entry of the range list to perform binary search on, which reduces the average number of memory loads to 2.
1 parent 07e6155 commit 0b2442c

File tree

2 files changed

+314
-16
lines changed

2 files changed

+314
-16
lines changed

scripts/unicode.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -274,13 +274,29 @@ def emit_break_module(f, break_table, break_cats, name):
274274
pub enum %sCat {
275275
""" % (name, Name, Name))
276276

277+
max_lookup_value = 0x20000
278+
lookup_range = 0x400
279+
lookup_interval = round(max_lookup_value / lookup_range)
280+
281+
lookup_table = [0] * lookup_range
282+
j = 0
283+
for i in range(0, lookup_range):
284+
lookup_from = i * lookup_interval
285+
lookup_to = i * lookup_interval
286+
while j < len(break_table):
287+
(_, entry_to, _) = break_table[j]
288+
if entry_to >= lookup_from:
289+
break
290+
j += 1
291+
lookup_table[i] = j
292+
277293
break_cats.append("Any")
278294
break_cats.sort()
279295
for cat in break_cats:
280296
f.write((" %sC_" % Name[0]) + cat + ",\n")
281297
f.write(""" }
282298
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
299+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], min: u32) -> (u32, u32, %sCat) {
284300
use core::cmp::Ordering::{Equal, Less, Greater};
285301
match r.binary_search_by(|&(lo, hi, _)| {
286302
if lo <= c && c <= hi { Equal }
@@ -293,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name):
293309
}
294310
Err(idx) => {
295311
(
296-
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
312+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { min },
297313
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
298314
%sC_Any,
299315
)
@@ -302,10 +318,26 @@ def emit_break_module(f, break_table, break_cats, name):
302318
}
303319
304320
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
305-
bsearch_range_value_table(c, %s_cat_table)
321+
let idx = c as usize / 0x%x;
322+
let r = %s_cat_lookup.get(idx..(idx + 2)).map_or(
323+
%d..%d,
324+
|r| (r[0] as usize)..((r[1] + 1) as usize)
325+
);
326+
bsearch_range_value_table(c, &%s_cat_table[r], idx as u32 * 0x%x)
306327
}
307328
308-
""" % (Name, Name, Name[0], name, Name, name))
329+
""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name, lookup_interval))
330+
331+
if len(break_table) <= 0xff:
332+
lookup_type = "u8"
333+
elif len(break_table) <= 0xffff:
334+
lookup_type = "u16"
335+
else:
336+
lookup_type = "u32"
337+
338+
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
339+
pfun=lambda x: "%d" % x,
340+
is_pub=False, is_const=True)
309341

310342
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
311343
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),

0 commit comments

Comments
 (0)
0