Add grapheme iteration benchmarks for various languages. · simmsb/unicode-segmentation@c5bc229 · GitHub
[go: up one dir, main page]

Skip to content

Commit c5bc229

Browse files
committed
Add grapheme iteration benchmarks for various languages.
1 parent b82ed4d commit c5bc229

10 files changed

+3293
-1
lines changed

Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,15 @@ This crate provides Grapheme Cluster, Word and Sentence boundaries
1616
according to Unicode Standard Annex #29 rules.
1717
"""
1818

19-
exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
19+
exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "benches/texts/*", "*.txt", ]
2020

2121
[features]
2222
no_std = [] # This is a no-op, preserved for backward compatibility only.
2323

2424
[dev-dependencies]
2525
quickcheck = "0.7"
26+
bencher = "0.1"
27+
28+
[[bench]]
29+
name = "graphemes"
30+
harness = false

benches/graphemes.rs

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate unicode_segmentation;
4+
5+
use bencher::Bencher;
6+
use unicode_segmentation::UnicodeSegmentation;
7+
8+
// Sample texts used as benchmark inputs. `include_str!` embeds each file's
// contents into the binary at compile time; the paths are resolved relative
// to this source file.
const TEXT_ARABIC: &str = include_str!("texts/arabic.txt");
9+
const TEXT_ENGLISH: &str = include_str!("texts/english.txt");
10+
const TEXT_HINDI: &str = include_str!("texts/hindi.txt");
11+
const TEXT_JAPANESE: &str = include_str!("texts/japanese.txt");
12+
const TEXT_KOREAN: &str = include_str!("texts/korean.txt");
13+
const TEXT_MANDARIN: &str = include_str!("texts/mandarin.txt");
14+
const TEXT_RUSSIAN: &str = include_str!("texts/russian.txt");
15+
const TEXT_SOURCE_CODE: &str = include_str!("texts/source_code.txt");
16+
17+
fn graphemes_arabic(bench: &mut Bencher) {
18+
bench.iter(|| {
19+
for g in UnicodeSegmentation::graphemes(TEXT_ARABIC, true) {
20+
bencher::black_box(g);
21+
}
22+
});
23+
24+
bench.bytes = TEXT_ARABIC.len() as u64;
25+
}
26+
27+
fn graphemes_english(bench: &mut Bencher) {
28+
bench.iter(|| {
29+
for g in UnicodeSegmentation::graphemes(TEXT_ENGLISH, true) {
30+
bencher::black_box(g);
31+
}
32+
});
33+
34+
bench.bytes = TEXT_ENGLISH.len() as u64;
35+
}
36+
37+
fn graphemes_hindi(bench: &mut Bencher) {
38+
bench.iter(|| {
39+
for g in UnicodeSegmentation::graphemes(TEXT_HINDI, true) {
40+
bencher::black_box(g);
41+
}
42+
});
43+
44+
bench.bytes = TEXT_HINDI.len() as u64;
45+
}
46+
47+
fn graphemes_japanese(bench: &mut Bencher) {
48+
bench.iter(|| {
49+
for g in UnicodeSegmentation::graphemes(TEXT_JAPANESE, true) {
50+
bencher::black_box(g);
51+
}
52+
});
53+
54+
bench.bytes = TEXT_JAPANESE.len() as u64;
55+
}
56+
57+
fn graphemes_korean(bench: &mut Bencher) {
58+
bench.iter(|| {
59+
for g in UnicodeSegmentation::graphemes(TEXT_KOREAN, true) {
60+
bencher::black_box(g);
61+
}
62+
});
63+
64+
bench.bytes = TEXT_KOREAN.len() as u64;
65+
}
66+
67+
fn graphemes_mandarin(bench: &mut Bencher) {
68+
bench.iter(|| {
69+
for g in UnicodeSegmentation::graphemes(TEXT_MANDARIN, true) {
70+
bencher::black_box(g);
71+
}
72+
});
73+
74+
bench.bytes = TEXT_MANDARIN.len() as u64;
75+
}
76+
77+
fn graphemes_russian(bench: &mut Bencher) {
78+
bench.iter(|| {
79+
for g in UnicodeSegmentation::graphemes(TEXT_RUSSIAN, true) {
80+
bencher::black_box(g);
81+
}
82+
});
83+
84+
bench.bytes = TEXT_RUSSIAN.len() as u64;
85+
}
86+
87+
fn graphemes_source_code(bench: &mut Bencher) {
88+
bench.iter(|| {
89+
for g in UnicodeSegmentation::graphemes(TEXT_SOURCE_CODE, true) {
90+
bencher::black_box(g);
91+
}
92+
});
93+
94+
bench.bytes = TEXT_SOURCE_CODE.len() as u64;
95+
}
96+
97+
// Registers the per-language benchmark functions under a group named `benches`.
benchmark_group!(
98+
benches,
99+
graphemes_arabic,
100+
graphemes_english,
101+
graphemes_hindi,
102+
graphemes_japanese,
103+
graphemes_korean,
104+
graphemes_mandarin,
105+
graphemes_russian,
106+
graphemes_source_code,
107+
);
108+
109+
// Hands the `benches` group to bencher's entry-point macro. Cargo.toml sets
// `harness = false` for this bench target, so the default test harness is not
// used here.
benchmark_main!(benches);

benches/texts/arabic.txt

Lines changed: 106 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/english.txt

Lines changed: 222 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/hindi.txt

Lines changed: 155 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/japanese.txt

Lines changed: 269 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/korean.txt

Lines changed: 243 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/mandarin.txt

Lines changed: 356 additions & 0 deletions
Large diffs are not rendered by default.

benches/texts/russian.txt

Lines changed: 155 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
0