Merge pull request #24 from tomcumming/master · unicode-rs/unicode-segmentation@c7a6b6f · GitHub
[go: up one dir, main page]

Skip to content

Commit c7a6b6f

Browse files
authored
Merge pull request #24 from tomcumming/master
Unicode sentence boundaries
2 parents 8ca8e23 + 9c7abf2 commit c7a6b6f

File tree

8 files changed

+1757
-2
lines changed

8 files changed

+1757
-2
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
1212
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
1313
readme = "README.md"
1414
description = """
15-
This crate provides Grapheme Cluster and Word boundaries
15+
This crate provides Grapheme Cluster, Word and Sentence boundaries
1616
according to Unicode Standard Annex #29 rules.
1717
"""
1818

scripts/unicode.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
351351
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
352352
word_table.sort(key=lambda w: w[0])
353353
emit_break_module(rf, word_table, word_cats.keys(), "word")
354+
355+
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
356+
sentence_table = []
357+
for cat in sentence_cats:
358+
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
359+
sentence_table.sort(key=lambda w: w[0])
360+
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")

scripts/unicode_gen_breaktests.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,23 @@ def create_words_data(f):
190190
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
191191
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
192192

193+
def create_sentence_data(f):
194+
d = load_test_data("auxiliary/SentenceBreakTest.txt")
195+
196+
test = []
197+
198+
for (c, i) in d:
199+
allchars = [cn for s in c for cn in s]
200+
test.append((allchars, c))
201+
202+
wtype = "&'static [(&'static str, &'static [&'static str])]"
203+
f.write(" // official Unicode test data\n")
204+
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
205+
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
206+
193207
if __name__ == "__main__":
194208
with open("testdata.rs", "w") as rf:
195209
rf.write(unicode.preamble)
196210
create_grapheme_data(rf)
197211
create_words_data(rf)
212+
create_sentence_data(rf)

src/lib.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
11+
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
1212
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
1313
//!
1414
//! ```rust
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
6767
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6868
pub use tables::UNICODE_VERSION;
6969
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
70+
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
7071

7172
mod grapheme;
7273
mod tables;
7374
mod word;
75+
mod sentence;
7476

7577
#[cfg(test)]
7678
mod test;
@@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
174176
/// assert_eq!(&swi1[..], b);
175177
/// ```
176178
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
179+
180+
/// Returns an iterator over substrings of `self` separated on
181+
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
182+
///
183+
/// The concatenation of the substrings returned by this function is just the original string.
184+
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
185+
186+
/// Returns an iterator over substrings of `self` separated on
187+
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
188+
///
189+
/// Here, "sentences" are just those substrings which, after splitting on
190+
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
191+
/// substring must contain at least one character with the
192+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
193+
/// property, or with
194+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
195+
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
196+
197+
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
198+
/// and their offsets. See `split_sentence_bounds()` for more information.
199+
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
177200
}
178201

179202
impl UnicodeSegmentation for str {
@@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
201224
fn split_word_bound_indices(&self) -> UWordBoundIndices {
202225
word::new_word_bound_indices(self)
203226
}
227+
228+
#[inline]
229+
fn unicode_sentences(&self) -> UnicodeSentences {
230+
sentence::new_unicode_sentences(self)
231+
}
232+
233+
#[inline]
234+
fn split_sentence_bounds(&self) -> USentenceBounds {
235+
sentence::new_sentence_bounds(self)
236+
}
237+
238+
#[inline]
239+
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
240+
sentence::new_sentence_bound_indices(self)
241+
}
204242
}

0 commit comments

Comments
 (0)
0