8000 Merge pull request #77 from cessen/faster_graphemes · simmsb/unicode-segmentation@b82ed4d · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit b82ed4d

Browse files
authored
Merge pull request unicode-rs#77 from cessen/faster_graphemes
Optimization for grapheme iteration.
2 parents ac54e2d + 29ecd77 commit b82ed4d

File tree

5 files changed

+94
-48
lines changed

5 files changed

+94
-48
lines changed

scripts/unicode.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -280,22 +280,29 @@ def emit_break_module(f, break_table, break_cats, name):
280280
f.write((" %sC_" % Name[0]) + cat + ",\n")
281281
f.write(""" }
282282
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
283+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
284+
use core;
284285
use core::cmp::Ordering::{Equal, Less, Greater};
285286
match r.binary_search_by(|&(lo, hi, _)| {
286287
if lo <= c && c <= hi { Equal }
287288
else if hi < c { Less }
288289
else { Greater }
289290
}) {
290291
Ok(idx) => {
291-
let (_, _, cat) = r[idx];
292-
cat
292+
let (lower, upper, cat) = r[idx];
293+
(lower as u32, upper as u32, cat)
294+
}
295+
Err(idx) => {
296+
(
297+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
298+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
299+
%sC_Any,
300+
)
293301
}
294-
Err(_) => %sC_Any
295302
}
296303
}
297304
298-
pub fn %s_category(c: char) -> %sCat {
305+
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
299306
bsearch_range_value_table(c, %s_cat_table)
300307
}
301308

src/grapheme.rs

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
178178
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
179179
// to needing more input.
180180
resuming: bool,
181+
// Cached grapheme category and associated scalar value range.
182+
grapheme_cat_cache: (u32, u32, GraphemeCat),
181183
}
182184

183185
/// An error return indicating that not enough content was available in the
@@ -276,9 +278,20 @@ impl GraphemeCursor {
276278
pre_context_offset: None,
277279
ris_count: None,
278280
resuming: false,
281+
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
279282
}
280283
}
281284

285+
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
286+
use tables::grapheme as gr;
287+
// If this char isn't within the cached range, update the cache to the
288+
// range that includes it.
289+
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
290+
self.grapheme_cat_cache = gr::grapheme_category(ch);
291+
}
292+
self.grapheme_cat_cache.2
293+
}
294+
282295
// Not sure I'm gonna keep this, the advantage over new() seems thin.
283296

284297
/// Set the cursor to a new location in the same string.
@@ -349,7 +362,7 @@ impl GraphemeCursor {
349362
self.pre_context_offset = None;
350363
if self.is_extended && chunk_start + chunk.len() == self.offset {
351364
let ch = chunk.chars().rev().next().unwrap();
352-
if gr::grapheme_category(ch) == gr::GC_Prepend {
365+
if self.grapheme_category(ch) == gr::GC_Prepend {
353366
self.decide(false); // GB9b
354367
return;
355368
}
@@ -359,7 +372,7 @@ impl GraphemeCursor {
359372
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
360373
_ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
361374
let ch = chunk.chars().rev().next().unwrap();
362-
self.cat_before = Some(gr::grapheme_category(ch));
375+
self.cat_before = Some(self.grapheme_category(ch));
363376
},
364377
}
365378
}
@@ -393,7 +406,7 @@ impl GraphemeCursor {
393406
use tables::grapheme as gr;
394407
let mut ris_count = self.ris_count.unwrap_or(0);
395408
for ch in chunk.chars().rev() {
396-
if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
409+
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
397410
self.ris_count = Some(ris_count);
398411
self.decide((ris_count % 2) == 0);
399412
return;
@@ -413,13 +426,13 @@ impl GraphemeCursor {
413426
use tables::grapheme as gr;
414427
let mut iter = chunk.chars().rev();
415428
if let Some(ch) = iter.next() {
416-
if gr::grapheme_category(ch) != gr::GC_ZWJ {
429+
if self.grapheme_category(ch) != gr::GC_ZWJ {
417430
self.decide(true);
418431
return;
419432
}
420433
}
421434
for ch in iter {
422-
match gr::grapheme_category(ch) {
435+
match self.grapheme_category(ch) {
423436
gr::GC_Extend => (),
424437
gr::GC_Extended_Pictographic => {
425438
self.decide(false);
@@ -481,7 +494,7 @@ impl GraphemeCursor {
481494
let offset_in_chunk = self.offset - chunk_start;
482495
if self.cat_after.is_none() {
483496
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
484-
self.cat_after = Some(gr::grapheme_category(ch));
497+
self.cat_after = Some(self.grapheme_category(ch));
485498
}
486499
if self.offset == chunk_start {
487500
let mut need_pre_context = true;
@@ -497,7 +510,7 @@ impl GraphemeCursor {
497510
}
498511
if self.cat_before.is_none() {
499512
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
500-
self.cat_before = Some(gr::grapheme_category(ch));
513+
self.cat_before = Some(self.grapheme_category(ch));
501514
}
502515
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
503516
PairResult::NotBreak => return self.decision(false),
@@ -553,7 +566,6 @@ impl GraphemeCursor {
553566
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
554567
/// ```
555568
pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
556-
use tables::grapheme as gr;
557569
if self.offset == self.len {
558570
return Ok(None);
559571
}
@@ -562,14 +574,14 @@ impl GraphemeCursor {
562574
loop {
563575
if self.resuming {
564576
if self.cat_after.is_none() {
565-
self.cat_after = Some(gr::grapheme_category(ch));
577+
self.cat_after = Some(self.grapheme_category(ch));
566578
}
567579
} else {
568580
self.offset += ch.len_utf8();
569581
self.state = GraphemeState::Unknown;
570582
self.cat_before = self.cat_after.take();
571583
if self.cat_before.is_none() {
572-
self.cat_before = Some(gr::grapheme_category(ch));
584+
self.cat_before = Some(self.grapheme_category(ch));
573585
}
574586
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
575587
self.ris_count = self.ris_count.map(|c| c + 1);
@@ -578,7 +590,7 @@ impl GraphemeCursor {
578590
}
579591
if let Some(next_ch) = iter.next() {
580592
ch = next_ch;
581-
self.cat_after = Some(gr::grapheme_category(ch));
593+
self.cat_after = Some(self.grapheme_category(ch));
582594
} else if self.offset == self.len {
583595
self.decide(true);
584596
} else {
@@ -629,7 +641,6 @@ impl GraphemeCursor {
629641
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
630642
/// ```
631643
pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
632-
use tables::grapheme as gr;
633644
if self.offset == 0 {
634645
return Ok(None);
635646
}
@@ -644,7 +655,7 @@ impl GraphemeCursor {
644655
return Err(GraphemeIncomplete::PrevChunk);
645656
}
646657
if self.resuming {
647-
self.cat_before = Some(gr::grapheme_category(ch));
658+
self.cat_before = Some(self.grapheme_category(ch));
648659
} else {
649660
self.offset -= ch.len_utf8();
650661
self.cat_after = self.cat_before.take();
@@ -654,12 +665,12 @@ impl GraphemeCursor {
654665
}
655666
if let Some(prev_ch) = iter.next() {
656667
ch = prev_ch;
657-
self.cat_before = Some(gr::grapheme_category(ch));
668+
self.cat_before = Some(self.grapheme_category(ch));
658669
} else if self.offset == 0 {
659670
self.decide(true);
660671
} else {
661672
self.resuming = true;
662-
self.cat_after = Some(gr::grapheme_category(ch));
673+
self.cat_after = Some(self.grapheme_category(ch));
663674
return Err(GraphemeIncomplete::PrevChunk);
664675
}
665676
}

src/sentence.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ mod fwd {
115115

116116
for next_char in ahead.chars() {
117117
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118-
match se::sentence_category(next_char) {
118+
match se::sentence_category(next_char).2 {
119119
se::SC_Lower => return true,
120120
se::SC_OLetter |
121121
se::SC_Upper |
@@ -182,7 +182,7 @@ mod fwd {
182182
let position_before = self.pos;
183183
let state_before = self.state.clone();
184184

185-
let next_cat = se::sentence_category(next_char);
185+
let next_cat = se::sentence_category(next_char).2;
186186

187187
self.pos += next_char.len_utf8();
188188
self.state = self.state.next(next_cat);

src/tables.rs

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -345,22 +345,29 @@ pub mod grapheme {
345345
GC_ZWJ,
346346
}
347347

348-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
348+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
349+
use core;
349350
use core::cmp::Ordering::{Equal, Less, Greater};
350351
match r.binary_search_by(|&(lo, hi, _)| {
351352
if lo <= c && c <= hi { Equal }
352353
else if hi < c { Less }
353354
else { Greater }
354355
}) {
355356
Ok(idx) => {
356-
let (_, _, cat) = r[idx];
357-
cat
357+
let (lower, upper, cat) = r[idx];
358+
(lower as u32, upper as u32, cat)
359+
}
360+
Err(idx) => {
361+
(
362+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
363+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
364+
GC_Any,
365+
)
358366
}
359-
Err(_) => GC_Any
360367
}
361368
}
362369

363-
pub fn grapheme_category(c: char) -> GraphemeCat {
370+
pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
364371
bsearch_range_value_table(c, grapheme_cat_table)
365372
}
366373

@@ -980,22 +987,29 @@ pub mod word {
980987
WC_ZWJ,
981988
}
982989

983-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> WordCat {
990+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
991+
use core;
984992
use core::cmp::Ordering::{Equal, Less, Greater};
985993
match r.binary_search_by(|&(lo, hi, _)| {
986994
if lo <= c && c <= hi { Equal }
987995
else 10000 if hi < c { Less }
988996
else { Greater }
989997
}) {
990998
Ok(idx) => {
991-
let (_, _, cat) = r[idx];
992-
cat
999+
let (lower, upper, cat) = r[idx];
1000+
(lower as u32, upper as u32, cat)
1001+
}
1002+
Err(idx) => {
1003+
(
1004+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1005+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1006+
WC_Any,
1007+
)
9931008
}
994-
Err(_) => WC_Any
9951009
}
9961010
}
9971011

998-
pub fn word_category(c: char) -> WordCat {
1012+
pub fn word_category(c: char) -> (u32, u32, WordCat) {
9991013
bsearch_range_value_table(c, word_cat_table)
10001014
}
10011015

@@ -1439,22 +1453,29 @@ pub mod emoji {
14391453
EC_Extended_Pictographic,
14401454
}
14411455

1442-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> EmojiCat {
1456+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
1457+
use core;
14431458
use core::cmp::Ordering::{Equal, Less, Greater};
14441459
match r.binary_search_by(|&(lo, hi, _)| {
14451460
if lo <= c && c <= hi { Equal }
14461461
else if hi < c { Less }
14471462
else { Greater }
14481463
}) {
14491464
Ok(idx) => {
1450-
let (_, _, cat) = r[idx];
1451-
cat
1465+
let (lower, upper, cat) = r[idx];
1466+
(lower as u32, upper as u32, cat)
1467+
}
1468+
Err(idx) => {
1469+
(
1470+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1471+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1472+
EC_Any,
1473+
)
14521474
}
1453-
Err(_) => EC_Any
14541475
}
14551476
}
14561477

1457-
pub fn emoji_category(c: char) -> EmojiCat {
1478+
pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
14581479
bsearch_range_value_table(c, emoji_cat_table)
14591480
}
14601481

@@ -1535,22 +1556,29 @@ pub mod sentence {
15351556
SC_Upper,
15361557
}
15371558

1538-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> SentenceCat {
1559+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
1560+
use core;
15391561
use core::cmp::Ordering::{Equal, Less, Greater};
15401562
match r.binary_search_by(|&(lo, hi, _)| {
15411563
if lo <= c && c <= hi { Equal }
15421564
else if hi < c { Less }
15431565
else { Greater }
15441566
}) {
15451567
Ok(idx) => {
1546-
let (_, _, cat) = r[idx];
1547-
cat
1568+
let (lower, upper, cat) = r[idx];
1569+
(lower as u32, upper as u32, cat)
1570+
}
1571+
Err(idx) => {
1572+
(
1573+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1574+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1575+
SC_Any,
1576+
)
15481577
}
1549-
Err(_) => SC_Any
15501578
}
15511579
}
15521580

1553-
pub fn sentence_category(c: char) -> SentenceCat {
1581+
pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
15541582
bsearch_range_value_table(c, sentence_cat_table)
15551583
}
15561584

0 commit comments

Comments
 (0)
0