8000 Optimization for grapheme iteration. · YohDeadfall/unicode-segmentation@98fd158 · GitHub
[go: up one dir, main page]

Skip to content

Commit 98fd158

Browse files
committed
Optimization for grapheme iteration.
Typical text involves a lot of chars one-after-another that are in a similar scalar value range and in the same category. This commit takes advantage of that to speed up grapheme iteration by caching the range and category of the last char, and always checking against that cache first before doing a binary search on the whole grapheme category table. This results in significant speed ups on many texts, in some cases up to 2x faster.
1 parent ac54e2d commit 98fd158

File tree

5 files changed

+89
-46
lines changed

5 files changed

+89
-46
lines changed

scripts/unicode.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -280,22 +280,28 @@ def emit_break_module(f, break_table, break_cats, name):
280280
f.write((" %sC_" % Name[0]) + cat + ",\n")
281281
f.write(""" }
282282
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
283+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
284284
use core::cmp::Ordering::{Equal, Less, Greater};
285285
match r.binary_search_by(|&(lo, hi, _)| {
286286
if lo <= c && c <= hi { Equal }
287287
else if hi < c { Less }
288288
else { Greater }
289289
}) {
290290
Ok(idx) => {
291-
let (_, _, cat) = r[idx];
292-
cat
291+
let (lower, upper, cat) = r[idx];
292+
(lower as u32, upper as u32, cat)
293+
}
294+
Err(idx) => {
295+
(
296+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
297+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
298+
%sC_Any,
299+
)
293300
}
294-
Err(_) => %sC_Any
295301
}
296302
}
297303
298-
pub fn %s_category(c: char) -> %sCat {
304+
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
299305
bsearch_range_value_table(c, %s_cat_table)
300306
}
301307

src/grapheme.rs

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
178178
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
179179
// to needing more input.
180180
resuming: bool,
181+
// Cached grapheme category and associated scalar value range.
182+
grapheme_cat_cache: (u32, u32, GraphemeCat),
181183
}
182184

183185
/// An error return indicating that not enough content was available in the
@@ -276,9 +278,20 @@ impl GraphemeCursor {
276278
pre_context_offset: None,
277279
ris_count: None,
278280
resuming: false,
281+
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
279282
}
280283
}
281284

285+
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
286+
use tables::grapheme as gr;
287+
// If this char isn't within the cached range, update the cache to the
288+
// range that includes it.
289+
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
290+
self.grapheme_cat_cache = gr::grapheme_category(ch);
291+
}
292+
self.grapheme_cat_cache.2
293+
}
294+
282295
// Not sure I'm gonna keep this, the advantage over new() seems thin.
283296

284297
/// Set the cursor to a new location in the same string.
@@ -349,7 +362,7 @@ impl GraphemeCursor {
349362
self.pre_context_offset = None;
350363
if self.is_extended && chunk_start + chunk.len() == self.offset {
351364
let ch = chunk.chars().rev().next().unwrap();
352-
if gr::grapheme_category(ch) == gr::GC_Prepend {
365+
if self.grapheme_category(ch) == gr::GC_Prepend {
353366
self.decide(false); // GB9b
354367
return;
355368
}
@@ -359,7 +372,7 @@ impl GraphemeCursor {
359372
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
360373
_ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
361374
let ch = chunk.chars().rev().next().unwrap();
362-
self.cat_before = Some(gr::grapheme_category(ch));
375+
self.cat_before = Some(self.grapheme_category(ch));
363376
},
364377
}
365378
}
@@ -393,7 +406,7 @@ impl GraphemeCursor {
393406
use tables::grapheme as gr;
394407
let mut ris_count = self.ris_count.unwrap_or(0);
395408
for ch in chunk.chars().rev() {
396-
if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
409+
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
397410
self.ris_count = Some(ris_count);
398411
self.decide((ris_count % 2) == 0);
399412
return;
@@ -413,13 +426,13 @@ impl GraphemeCursor {
413426
use tables::grapheme as gr;
414427
let mut iter = chunk.chars().rev();
415428
if let Some(ch) = iter.next() {
416-
if gr::grapheme_category(ch) != gr::GC_ZWJ {
429+
if self.grapheme_category(ch) != gr::GC_ZWJ {
417430
self.decide(true);
418431
return;
419432
}
420433
}
421434
for ch in iter {
422-
match gr::grapheme_category(ch) {
435+
match self.grapheme_category(ch) {
423436
gr::GC_Extend => (),
424437
gr::GC_Extended_Pictographic => {
425438
self.decide(false);
@@ -481,7 +494,7 @@ impl GraphemeCursor {
481494
let offset_in_chunk = self.offset - chunk_start;
482495
if self.cat_after.is_none() {
483496
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
484-
self.cat_after = Some(gr::grapheme_category(ch));
497+
self.cat_after = Some(self.grapheme_category(ch));
485498
}
486499
if self.offset == chunk_start {
487500
let mut need_pre_context = true;
@@ -497,7 +510,7 @@ impl GraphemeCursor {
497510
}
498511
if self.cat_before.is_none() {
499512
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
500-
self.cat_before = Some(gr::grapheme_category(ch));
513+
self.cat_before = Some(self.grapheme_category(ch));
501514
}
502515
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
503516
PairResult::NotBreak => return self.decision(false),
@@ -562,14 +575,14 @@ impl GraphemeCursor {
562575
loop {
563576
if self.resuming {
564577
if self.cat_after.is_none() {
565-
self.cat_after = Some(gr::grapheme_category(ch));
578+
self.cat_after = Some(self.grapheme_category(ch));
566579
}
567580
} else {
568581
self.offset += ch.len_utf8();
569582
self.state = GraphemeState::Unknown;
570583
self.cat_before = self.cat_after.take();
571584
if self.cat_before.is_none() {
572-
self.cat_before = Some(gr::grapheme_category(ch));
585+
self.cat_before = Some(self.grapheme_category(ch));
573586
}
574587
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
575588
self.ris_count = self.ris_count.map(|c| c + 1);
@@ -578,7 +591,7 @@ impl GraphemeCursor {
578591
}
579592
if let Some(next_ch) = iter.next() {
580593
ch = next_ch;
581-
self.cat_after = Some(gr::grapheme_category(ch));
594+
self.cat_after = Some(self.grapheme_category(ch));
582595
} else if self.offset == self.len {
583596
self.decide(true);
584597
} else {
@@ -644,7 +657,7 @@ impl GraphemeCursor {
644657
return Err(GraphemeIncomplete::PrevChunk);
645658
}
646659
if self.resuming {
647-
self.cat_before = Some(gr::grapheme_category(ch));
660+
self.cat_before = Some(self.grapheme_category(ch));
648661
} else {
649662
self.offset -= ch.len_utf8();
650663
self.cat_after = self.cat_before.take();
@@ -654,12 +667,12 @@ impl GraphemeCursor {
654667
}
655668
if let Some(prev_ch) = iter.next() {
656669
ch = prev_ch;
657-
self.cat_before = Some(gr::grapheme_category(ch));
670+
self.cat_before = Some(self.grapheme_category(ch));
658671
} else if self.offset == 0 {
659672
self.decide(true);
660673
} else {
661674
self.resuming = true;
662-
self.cat_after = Some(gr::grapheme_category(ch));
675+
self.cat_after = Some(self.grapheme_category(ch));
663676
return Err(GraphemeIncomplete::PrevChunk);
664677
}
665678
}

src/sentence.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ mod fwd {
115115

116116
for next_char in ahead.chars() {
117117
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118-
match se::sentence_category(next_char) {
118+
match se::sentence_category(next_char).2 {
119119
se::SC_Lower => return true,
120120
se::SC_OLetter |
121121
se::SC_Upper |
@@ -182,7 +182,7 @@ mod fwd {
182182
let position_before = self.pos;
183183
let state_before = self.state.clone();
184184

185-
let next_cat = se::sentence_category(next_char);
185+
let next_cat = se::sentence_category(next_char).2;
186186

187187
self.pos += next_char.len_utf8();
188188
self.state = self.state.next(next_cat);

src/tables.rs

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -345,22 +345,28 @@ pub mod grapheme {
345345
GC_ZWJ,
346346
}
347347

348-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
348+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
349349
use core::cmp::Ordering::{Equal, Less, Greater};
350350
match r.binary_search_by(|&(lo, hi, _)| {
351351
if lo <= c && c <= hi { Equal }
352352
else if hi < c { Less }
353353
else { Greater }
354354
}) {
355355
Ok(idx) => {
356-
let (_, _, cat) = r[idx];
357-
cat
356+
let (lower, upper, cat) = r[idx];
357+
(lower as u32, upper as u32, cat)
358+
}
359+
Err(idx) => {
360+
(
361+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
362+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
363+
GC_Any,
364+
)
358365
}
359-
Err(_) => GC_Any
360366
}
361367
}
362368

363-
pub fn grapheme_category(c: char) -> GraphemeCat {
369+
pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
364370
bsearch_range_value_table(c, grapheme_cat_table)
365371
}
366372

@@ -980,22 +986,28 @@ pub mod word {
980986
WC_ZWJ,
981987
}
982988

983-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> WordCat {
989+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
984990
use core::cmp::Ordering::{Equal, Less, Greater};
985991
match r.binary_search_by(|&(lo, hi, _)| {
986992
if lo <= c && c <= hi { Equal }
987993
else if hi < c { Less }
988994
else { Greater }
989995
}) {
990996
Ok(idx) => {
991-
let (_, _, cat) = r[idx];
992-
cat
997+
let (lower, upper, cat) = r[idx];
998+
(lower as u32, upper as u32, cat)
999+
}
1000+
Err(idx) => {
1001+
(
1002+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1003+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1004+
WC_Any,
1005+
)
9931006
}
994-
Err(_) => WC_Any
9951007
}
9961008
}
9971009

998-
pub fn word_category(c: char) -> WordCat {
1010+
pub fn word_category(c: char) -> (u32, u32, WordCat) {
9991011
bsearch_range_value_table(c, word_cat_table)
10001012
}
10011013

@@ -1439,22 +1451,28 @@ pub mod emoji {
14391451
EC_Extended_Pictographic,
14401452
}
14411453

1442-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> EmojiCat {
1454+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
14431455
use core::cmp::Ordering::{Equal, Less, Greater};
14441456
match r.binary_search_by(|&(lo, hi, _)| {
14451457
if lo <= c && c <= hi { Equal }
14461458
else if hi < c { Less }
14471459
else { Greater }
14481460
}) {
14491461
Ok(idx) => {
1450-
let (_, _, cat) = r[idx];
1451-
cat
1462+
let (lower, upper, cat) = r[idx];
1463+
(lower as u32, upper as u32, cat)
1464+
}
1465+
Err(idx) => {
1466+
(
1467+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1468+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1469+
EC_Any,
1470+
)
14521471
}
1453-
Err(_) => EC_Any
14541472
}
14551473
}
14561474

1457-
pub fn emoji_category(c: char) -> EmojiCat {
1475+
pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
14581476
bsearch_range_value_table(c, emoji_cat_table)
14591477
}
14601478

@@ -1535,22 +1553,28 @@ pub mod sentence {
15351553
SC_Upper,
15361554
}
15371555

1538-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> SentenceCat {
1556+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
15391557
use core::cmp::Ordering::{Equal, Less, Greater};
15401558
match r.binary_search_by(|&(lo, hi, _)| {
15411559
if lo <= c && c <= hi { Equal }
15421560
else if hi < c {< 10000 /span> Less }
15431561
else { Greater }
15441562
}) {
15451563
Ok(idx) => {
1546-
let (_, _, cat) = r[idx];
1547-
cat
1564+
let (lower, upper, cat) = r[idx];
1565+
(lower as u32, upper as u32, cat)
1566+
}
1567+
Err(idx) => {
1568+
(
1569+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1570+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1571+
SC_Any,
1572+
)
15481573
}
1549-
Err(_) => SC_Any
15501574
}
15511575
}
15521576

1553-
pub fn sentence_category(c: char) -> SentenceCat {
1577+
pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
15541578
bsearch_range_value_table(c, sentence_cat_table)
15551579
}
15561580

src/word.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ enum RegionalState {
125125

126126
fn is_emoji(ch: char) -> bool {
127127
use tables::emoji;
128-
emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
128+
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
129129
}
130130

131131
impl<'a> Iterator for UWordBounds<'a> {
@@ -164,7 +164,7 @@ impl<'a> Iterator for UWordBounds<'a> {
164164
prev_zwj = cat == wd::WC_ZWJ;
165165
// if there's a category cached, grab it
166166
cat = match self.cat {
167-
None => wd::word_category(ch),
167+
None => wd::word_category(ch).2,
168168
_ => self.cat.take().unwrap()
169169
};
170170
take_cat = true;
@@ -391,7 +391,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
391391

392392
// if there's a category cached, grab it
393393
cat = match self.catb {
394-
None => wd::word_category(ch),
394+
None => wd::word_category(ch).2,
395395
_ => self.catb.take().unwrap()
396396
};
397397
take_cat = true;
@@ -533,7 +533,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
533533
if regional_state == RegionalState::Unknown {
534534
let count = self.string[..previdx]
535535
.chars().rev()
536-
.map(|c| wd::word_category(c))
536+
.map(|c| wd::word_category(c).2)
537537
.filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
538538
.take_while(|&c| c == wd::WC_Regional_Indicator)
539539
.count();
@@ -624,7 +624,7 @@ impl<'a> UWordBounds<'a> {
624624
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
625625
if nidx < self.string.len() {
626626
let nch = self.string[nidx..].chars().next().unwrap();
627-
Some(wd::word_category(nch))
627+
Some(wd::word_category(nch).2)
628628
} else {
629629
None
630630
}
@@ -635,7 +635,7 @@ impl<'a> UWordBounds<'a> {
635635
use tables::word as wd;
636636
if idx > 0 {
637637
let nch = self.string[..idx].chars().next_back().unwrap();
638-
Some(wd::word_category(nch))
638+
Some(wd::word_category(nch).2)
639639
} else {
640640
None
641641
}

0 commit comments

Comments
 (0)
0