unicode-rs
diff --git a/‎Cargo.toml
Lines changed: 6 additions & 1 deletion b/‎Cargo.toml
Lines changed: 6 additions & 1 deletion
diff --git a/‎benches/graphemes.rs
Lines changed: 64 additions & 0 deletions b/‎benches/graphemes.rs
Lines changed: 64 additions & 0 deletions
diff --git a/‎benches/texts/LICENSE
Lines changed: 359 additions & 0 deletions b/‎benches/texts/LICENSE
Lines changed: 359 additions & 0 deletions
diff --git a/‎benches/texts/README
Lines changed: 12 additions & 0 deletions b/‎benches/texts/README
Lines changed: 12 additions & 0 deletions
diff --git a/‎benches/texts/arabic.txt
Lines changed: 106 additions & 0 deletions b/‎benches/texts/arabic.txt
Lines changed: 106 additions & 0 deletions
diff --git a/‎benches/texts/english.txt
Lines changed: 222 additions & 0 deletions b/‎benches/texts/english.txt
Lines changed: 222 additions & 0 deletions
diff --git a/‎benches/texts/hindi.txt
Lines changed: 155 additions & 0 deletions b/‎benches/texts/hindi.txt
Lines changed: 155 additions & 0 deletions
diff --git a/‎benches/texts/korean.txt
Lines changed: 243 additions & 0 deletions b/‎benches/texts/korean.txt
Lines changed: 243 additions & 0 deletions
diff --git a/‎benches/texts/mandarin.txt
Lines changed: 356 additions & 0 deletions b/‎benches/texts/mandarin.txt
Lines changed: 356 additions & 0 deletions
diff --git a/‎benches/texts/russian.txt
Lines changed: 155 additions & 0 deletions b/‎benches/texts/russian.txt
Lines changed: 155 additions & 0 deletions
@@ -16,10 +16,15 @@ This crate provides Grapheme Cluster, Word and Sentence boundaries
 according to Unicode Standard Annex #29 rules.
 """
 
-exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
+exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "benches/texts/*", "*.txt", ]
 
 [features]
 no_std = [] # This is a no-op, preserved for backward compatibility only.
 
 [dev-dependencies]
 quickcheck = "0.7"
+bencher = "0.1"
+
+[[bench]]
+name = "graphemes"
+harness = false
@@ -0,0 +1,64 @@
+#[macro_use]
+extern crate bencher;
+extern crate unicode_segmentation;
+
+use bencher::Bencher;
+use unicode_segmentation::UnicodeSegmentation;
+use std::fs;
+
+/// Benchmarks extended grapheme cluster iteration over the text file at `path`.
+///
+/// The file is read before `bench.iter` is called, so only the iteration over
+/// the already-loaded text runs inside the measured closure.
+///
+/// # Panics
+///
+/// Panics if `path` cannot be read as UTF-8 text; the message names the file so
+/// a missing fixture is easy to identify.
+fn graphemes(bench: &mut Bencher, path: &str) {
+    let text = fs::read_to_string(path)
+        .unwrap_or_else(|err| panic!("failed to read benchmark text {}: {}", path, err));
+    bench.iter(|| {
+        for g in text.graphemes(true) {
+            // `black_box` keeps the optimizer from discarding the unused cluster.
+            bencher::black_box(g);
+        }
+    });
+
+    // Report the input size in bytes to the bencher.
+    bench.bytes = text.len() as u64;
+}
+
+/// Runs the grapheme benchmark on the Arabic sample text.
+fn graphemes_arabic(bench: &mut Bencher) {
+    let path = "benches/texts/arabic.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the English sample text.
+fn graphemes_english(bench: &mut Bencher) {
+    let path = "benches/texts/english.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the Hindi sample text.
+fn graphemes_hindi(bench: &mut Bencher) {
+    let path = "benches/texts/hindi.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the Japanese sample text.
+fn graphemes_japanese(bench: &mut Bencher) {
+    let path = "benches/texts/japanese.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the Korean sample text.
+fn graphemes_korean(bench: &mut Bencher) {
+    let path = "benches/texts/korean.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the Mandarin sample text.
+fn graphemes_mandarin(bench: &mut Bencher) {
+    let path = "benches/texts/mandarin.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the Russian sample text.
+fn graphemes_russian(bench: &mut Bencher) {
+    let path = "benches/texts/russian.txt";
+    graphemes(bench, path);
+}
+
+/// Runs the grapheme benchmark on the source-code sample text.
+fn graphemes_source_code(bench: &mut Bencher) {
+    let path = "benches/texts/source_code.txt";
+    graphemes(bench, path);
+}
+
+// Collect the per-text benchmark functions into a single group named `benches`.
+benchmark_group!(
+    benches,
+    graphemes_arabic,
+    graphemes_english,
+    graphemes_hindi,
+    graphemes_japanese,
+    graphemes_korean,
+    graphemes_mandarin,
+    graphemes_russian,
+    graphemes_source_code,
+);
+
+// Entry point for this bench target, which is built with `harness = false`.
+benchmark_main!(benches);
@@ -0,0 +1,12 @@
+All natural-language text files in this folder are copied from Wikipedia and are licensed under
+CC-BY-SA 3.0 (included in LICENSE). source_code.txt is taken from the Neovim source code, which is
+covered by the Apache 2.0 license. The original source for each file is listed below:
+
+ - english.txt: https://en.wikipedia.org/wiki/English_language
+ - korean.txt: https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%96%B4
+ - japanese.txt: https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E8%AA%9E
+ - hindi.txt: https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A4%BF%E0%A4%A8%E0%A5%8D%E0%A4%A6%E0%A5%80
+ - mandarin.txt: https://zh.wikipedia.org/wiki/%E5%AE%98%E8%AF%9D
+ - arabic.txt: https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D9%84%D8%BA%D8%A9_%D8%A7%D9%84%D8%B9%D8%B1%D8%A8%D9%8A%D8%A9
+ - russian.txt: https://ru.wikipedia.org/wiki/%D0%A0%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B9_%D1%8F%D0%B7%D1%8B%D0%BA
+ - source_code.txt: https://github.com/veonim/neovim/blob/master/src/nvim/buffer.c