Merge pull request #79 from cessen/ascii_grapheme_optimization · YohDeadfall/unicode-segmentation@fbba2a6 · GitHub
[go: up one dir, main page]

Skip to content

Commit fbba2a6

Browse files
authored
Merge pull request unicode-rs#79 from cessen/ascii_grapheme_optimization
Implement a special-case lookup for ascii grapheme categories.
2 parents 485767a + 945dbb6 commit fbba2a6

File tree

1 file changed

+23
-5
lines changed

1 file changed

+23
-5
lines changed

src/grapheme.rs

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -284,12 +284,30 @@ impl GraphemeCursor {
284284

285285
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
286286
use tables::grapheme as gr;
287-
// If this char isn't within the cached range, update the cache to the
288-
// range that includes it.
289-
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
290-
self.grapheme_cat_cache = gr::grapheme_category(ch);
287+
use tables::grapheme::GraphemeCat::*;
288+
289+
if ch <= '\u{7e}' {
290+
// Special-case optimization for ascii, except U+007F. This
291+
// improves performance even for many primarily non-ascii texts,
292+
// due to use of punctuation and white space characters from the
293+
// ascii range.
294+
if ch >= '\u{20}' {
295+
GC_Any
296+
} else if ch == '\n' {
297+
GC_LF
298+
} else if ch == '\r' {
299+
GC_CR
300+
} else {
301+
GC_Control
302+
}
303+
} else {
304+
// If this char isn't within the cached range, update the cache to the
305+
// range that includes it.
306+
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
307+
self.grapheme_cat_cache = gr::grapheme_category(ch);
308+
}
309+
self.grapheme_cat_cache.2
291310
}
292-
self.grapheme_cat_cache.2
293311
}
294312

295313
// Not sure I'm gonna keep this, the advantage over new() seems thin.

0 commit comments

Comments
 (0)
0