Benchmark other methods mentioned in README · unicode-rs/unicode-segmentation@52dba3b · GitHub
[go: up one dir, main page]

Skip to content

Commit 52dba3b

Browse files
committed
Benchmark other methods mentioned in README
The word boundary extensions to &str/String behave in a very similar, but not identical, manner to .graphemes(). For example, Mandarin is slow(ish) on .graphemes() but fast(ish) on .word_boundaries(), whereas languages with whitespace-delimited words tend to have the same performance characteristics with the latter methods. As the library develops, it would be worthwhile to monitor the speed of the rest of the documented API.
1 parent 573b7bb commit 52dba3b

File tree

3 files changed

+136
-0
lines changed

3 files changed

+136
-0
lines changed

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,12 @@ bencher = "0.1"
2828

2929
[[bench]]
3030
name = "graphemes"
31+
harness = false
32+
33+
[[bench]]
34+
name = "unicode_words"
35+
harness = false
36+
37+
[[bench]]
38+
name = "word_bounds"
3139
harness = false

benches/unicode_words.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate unicode_segmentation;
4+
5+
use bencher::Bencher;
6+
use unicode_segmentation::UnicodeSegmentation;
7+
use std::fs;
8+
9+
fn unicode_words(bench: &mut Bencher, path: &str) {
10+
let text = fs::read_to_string(path).unwrap();
11+
bench.iter(|| {
12+
for w in text.unicode_words() {
13+
bencher::black_box(w);
14+
}
15+
});
16+
17+
bench.bytes = text.len() as u64;
18+
}
19+
20+
fn unicode_words_arabic(bench: &mut Bencher) {
21+
unicode_words(bench, "benches/texts/arabic.txt");
22+
}
23+
24+
fn unicode_words_english(bench: &mut Bencher) {
25+
unicode_words(bench, "benches/texts/english.txt");
26+
}
27+
28+
fn unicode_words_hindi(bench: &mut Bencher) {
29+
unicode_words(bench, "benches/texts/hindi.txt");
30+
}
31+
32+
fn unicode_words_japanese(bench: &mut Bencher) {
33+
unicode_words(bench, "benches/texts/japanese.txt");
34+
}
35+
36+
fn unicode_words_korean(bench: &mut Bencher) {
37+
unicode_words(bench, "benches/texts/korean.txt");
38+
}
39+
40+
fn unicode_words_mandarin(bench: &mut Bencher) {
41+
unicode_words(bench, "benches/texts/mandarin.txt");
42+
}
43+
44+
fn unicode_words_russian(bench: &mut Bencher) {
45+
unicode_words(bench, "benches/texts/russian.txt");
46+
}
47+
48+
fn unicode_words_source_code(bench: &mut Bencher) {
49+
unicode_words(bench, "benches/texts/source_code.txt");
50+
}
51+
52+
benchmark_group!(
53+
benches,
54+
unicode_words_arabic,
55+
unicode_words_english,
56+
unicode_words_hindi,
57+
unicode_words_japanese,
58+
unicode_words_korean,
59+
unicode_words_mandarin,
60+
unicode_words_russian,
61+
unicode_words_source_code,
62+
);
63+
64+
benchmark_main!(benches);

benches/word_bounds.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate unicode_segmentation;
4+
5+
use bencher::Bencher;
6+
use unicode_segmentation::UnicodeSegmentation;
7+
use std::fs;
8+
9+
fn word_bounds(bench: &mut Bencher, path: &str) {
10+
let text = fs::read_to_string(path).unwrap();
11+
bench.iter(|| {
12+
for w in text.split_word_bounds() {
13+
bencher::black_box(w);
14+
}
15+
});
16+
17+
bench.bytes = text.len() as u64;
18+
}
19+
20+
fn word_bounds_arabic(bench: &mut Bencher) {
21+
word_bounds(bench, "benches/texts/arabic.txt");
22+
}
23+
24+
fn word_bounds_english(bench: &mut Bencher) {
25+
word_bounds(bench, "benches/texts/english.txt");
26+
}
27+
28+
fn word_bounds_hindi(bench: &mut Bencher) {
29+
word_bounds(bench, "benches/texts/hindi.txt");
30+
}
31+
32+
fn word_bounds_japanese(bench: &mut Bencher) {
33+
word_bounds(bench, "benches/texts/japanese.txt");
34+
}
35+
36+
fn word_bounds_korean(bench: &mut Bencher) {
37+
word_bounds(bench, "benches/texts/korean.txt");
38+
}
39+
40+
fn word_bounds_mandarin(bench: &mut Bencher) {
41+
word_bounds(bench, "benches/texts/mandarin.txt");
42+
}
43+
44+
fn word_bounds_russian(bench: &mut Bencher) {
45+
word_bounds(bench, "benches/texts/russian.txt");
46+
}
47+
48+
fn word_bounds_source_code(bench: &mut Bencher) {
49+
word_bounds(bench, "benches/texts/source_code.txt");
50+
}
51+
52+
benchmark_group!(
53+
benches,
54+
word_bounds_arabic,
55+
word_bounds_english,
56+
word_bounds_hindi,
57+
word_bounds_japanese,
58+
word_bounds_korean,
59+
word_bounds_mandarin,
60+
word_bounds_russian,
61+
word_bounds_source_code,
62+
);
63+
64+
benchmark_main!(benches);

0 commit comments

Comments
 (0)
0