Merge pull request #78 from cessen/grapheme_bench · unicode-rs/unicode-segmentation@485767a · GitHub
[go: up one dir, main page]

Skip to content

Commit 485767a

Browse files
authored
Merge pull request #78 from cessen/grapheme_bench
Add grapheme iteration benchmarks for various languages.
2 parents b82ed4d + b1765ec commit 485767a

12 files changed

+3619
-1
lines changed

Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,15 @@ This crate provides Grapheme Cluster, Word and Sentence boundaries
1616
according to Unicode Standard Annex #29 rules.
1717
"""
1818

19-
exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
19+
exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "benches/texts/*", "*.txt", ]
2020

2121
[features]
2222
no_std = [] # This is a no-op, preserved for backward compatibility only.
2323

2424
[dev-dependencies]
2525
quickcheck = "0.7"
26+
bencher = "0.1"
27+
28+
[[bench]]
29+
name = "graphemes"
30+
harness = false

benches/graphemes.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate unicode_segmentation;
4+
5+
use bencher::Bencher;
6+
use unicode_segmentation::UnicodeSegmentation;
7+
use std::fs;
8+
9+
/// Shared benchmark driver: loads the text file at `path` and times one full
/// pass over its grapheme clusters per benchmark iteration.
///
/// The file is read once, outside the timed closure, so only the segmentation
/// work is measured.
///
/// # Panics
///
/// Panics if `path` cannot be read into a `String` (the read result is unwrapped).
fn graphemes(bench: &mut Bencher, path: &str) {
10+
let text = fs::read_to_string(path).unwrap();
11+
bench.iter(|| {
12+
// `true` requests extended grapheme clusters (see the unicode-segmentation API docs).
for g in UnicodeSegmentation::graphemes(&*text, true) {
13+
// Pass each cluster through `black_box` so the otherwise-unused value is not optimized away.
bencher::black_box(g);
14+
}
15+
});
16+
17+
// Record the input size in bytes on the bencher (used for throughput reporting).
bench.bytes = text.len() as u64;
18+
}
19+
20+
/// Benchmarks grapheme iteration over `benches/texts/arabic.txt`.
fn graphemes_arabic(bench: &mut Bencher) {
21+
graphemes(bench, "benches/texts/arabic.txt");
22+
}
23+
24+
/// Benchmarks grapheme iteration over `benches/texts/english.txt`.
fn graphemes_english(bench: &mut Bencher) {
25+
graphemes(bench, "benches/texts/english.txt");
26+
}
27+
28+
/// Benchmarks grapheme iteration over `benches/texts/hindi.txt`.
fn graphemes_hindi(bench: &mut Bencher) {
29+
graphemes(bench, "benches/texts/hindi.txt");
30+
}
31+
32+
/// Benchmarks grapheme iteration over `benches/texts/japanese.txt`.
fn graphemes_japanese(bench: &mut Bencher) {
33+
graphemes(bench, "benches/texts/japanese.txt");
34+
}
35+
36+
/// Benchmarks grapheme iteration over `benches/texts/korean.txt`.
fn graphemes_korean(bench: &mut Bencher) {
37+
graphemes(bench, "benches/texts/korean.txt");
38+
}
39+
40+
/// Benchmarks grapheme iteration over `benches/texts/mandarin.txt`.
fn graphemes_mandarin(bench: &mut Bencher) {
41+
graphemes(bench, "benches/texts/mandarin.txt");
42+
}
43+
44+
/// Benchmarks grapheme iteration over `benches/texts/russian.txt`.
fn graphemes_russian(bench: &mut Bencher) {
45+
graphemes(bench, "benches/texts/russian.txt");
46+
}
47+
48+
/// Benchmarks grapheme iteration over `benches/texts/source_code.txt`.
fn graphemes_source_code(bench: &mut Bencher) {
49+
graphemes(bench, "benches/texts/source_code.txt");
50+
}
51+
52+
// Collect every per-text benchmark function into a single group named `benches`.
benchmark_group!(
53+
benches,
54+
graphemes_arabic,
55+
graphemes_english,
56+
graphemes_hindi,
57+
graphemes_japanese,
58+
graphemes_korean,
59+
graphemes_mandarin,
60+
graphemes_russian,
61+
graphemes_source_code,
62+
);
63+
64+
// Generate the benchmark binary's entry point for the `benches` group; the
// `[[bench]]` entry in Cargo.toml sets `harness = false`, so no default harness supplies one.
benchmark_main!(benches);

benches/texts/LICENSE

Lines changed: 359 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/README

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
All language text files in this folder are copied from Wikipedia, under the CC-BY-SA 3.0 license
2+
(included in LICENSE). source_code.txt is from the Neovim source code, covered under the Apache 2.0
3+
license. The original source for each file is listed below:
4+
5+
- english.txt: https://en.wikipedia.org/wiki/English_language
6+
- korean.txt: https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%96%B4
7+
- japanese.txt: https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E8%AA%9E
8+
- hindi.txt: https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A4%BF%E0%A4%A8%E0%A5%8D%E0%A4%A6%E0%A5%80
9+
- mandarin.txt: https://zh.wikipedia.org/wiki/%E5%AE%98%E8%AF%9D
10+
- arabic.txt: https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D9%84%D8%BA%D8%A9_%D8%A7%D9%84%D8%B9%D8%B1%D8%A8%D9%8A%D8%A9
11+
- russian.txt: https://ru.wikipedia.org/wiki/%D0%A0%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B9_%D1%8F%D0%B7%D1%8B%D0%BA
12+
- source_code.txt: https://github.com/veonim/neovim/blob/master/src/nvim/buffer.c

benches/texts/arabic.txt

Lines changed: 106 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/english.txt

Lines changed: 222 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/hindi.txt

Lines changed: 155 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/japanese.txt

Lines changed: 269 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/korean.txt

Lines changed: 243 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/mandarin.txt

Lines changed: 356 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/russian.txt

Lines changed: 155 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
0