Implement a special-case lookup for ascii grapheme categories. · timClicks/unicode-segmentation@c9aa6fa · GitHub
[go: up one dir, main page]

Skip to content

Commit c9aa6fa

Browse files
committed
Implement a special-case lookup for ascii grapheme categories.
This speeds up processing even for many non-ascii texts, since they often still use ascii-range punctuation and whitespace.
1 parent b82ed4d commit c9aa6fa

File tree

1 file changed

+23
-5
lines changed

1 file changed

+23
-5
lines changed

src/grapheme.rs

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -284,12 +284,30 @@ impl GraphemeCursor {
284284

285285
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
286286
use tables::grapheme as gr;
287-
// If this char isn't within the cached range, update the cache to the
288-
// range that includes it.
289-
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
290-
self.grapheme_cat_cache = gr::grapheme_category(ch);
287+
use tables::grapheme::GraphemeCat::*;
288+
289+
if ch <= '\u{7e}' {
290+
// Special-case optimization for ascii, except U+007F. This
291+
// improves performance even for many primarily non-ascii texts,
292+
// due to use of punctuation and white space characters from the
293+
// ascii range.
294+
if ch >= '\u{20}' {
295+
GC_Any
296+
} else if ch == '\u{a}' {
297+
GC_LF
298+
} else if ch == '\u{d}' {
299+
GC_CR
300+
} else {
301+
GC_Control
302+
}
303+
} else {
304+
// If this char isn't within the cached range, update the cache to the
305+
// range that includes it.
306+
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
307+
self.grapheme_cat_cache = gr::grapheme_category(ch);
308+
}
309+
self.grapheme_cat_cache.2
291310
}
292-
self.grapheme_cat_cache.2
293311
}
294312

295313
// Not sure I'm gonna keep this, the advantage over new() seems thin.

0 commit comments

Comments
 (0)
0