|
8 | 8 | // option. This file may not be copied, modified, or distributed
|
9 | 9 | // except according to those terms.
|
10 | 10 |
|
11 |
| -//! Iterators which split strings on Grapheme Cluster or Word boundaries, according |
| 11 | +//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according |
12 | 12 | //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
|
13 | 13 | //!
|
14 | 14 | //! ```rust
|
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
|
67 | 67 | pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
|
68 | 68 | pub use tables::UNICODE_VERSION;
|
69 | 69 | pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
|
| 70 | +pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences}; |
70 | 71 |
|
71 | 72 | mod grapheme;
|
72 | 73 | mod tables;
|
73 | 74 | mod word;
|
| 75 | +mod sentence; |
74 | 76 |
|
75 | 77 | #[cfg(test)]
|
76 | 78 | mod test;
|
@@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
|
174 | 176 | /// assert_eq!(&swi1[..], b);
|
175 | 177 | /// ```
|
176 | 178 | fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
|
| 179 | + |
| 180 | + /// Returns an iterator over substrings of `self` separated on |
| 181 | + /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). |
| 182 | + /// |
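| 183 | + /// Here, "sentences" are just those substrings which, after splitting on UAX#29 sentence boundaries, contain any alphanumeric characters; substrings without any are skipped, so the results need not concatenate back to the original string. |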
| 184 | + fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>; |
| 185 | + |
| 186 | + /// Returns an iterator over substrings of `self` separated on |
| 187 | + /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). |
| 188 | + /// |
| 189 | + /// The concatenation of the substrings returned by this function is just the |
| 190 | + /// original string: every substring between boundaries is yielded, including |
| 191 | + /// those that contain no character with the |
| 192 | + /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
| 193 | + /// property or with |
| 194 | + /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
| 195 | + fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>; |
| 196 | + |
| 197 | + /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries, |
| 198 | + /// and their offsets. See `split_sentence_bounds()` for more information. |
| 199 | + fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>; |
177 | 200 | }
|
178 | 201 |
|
179 | 202 | impl UnicodeSegmentation for str {
|
@@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
|
201 | 224 | fn split_word_bound_indices(&self) -> UWordBoundIndices {
|
202 | 225 | word::new_word_bound_indices(self)
|
203 | 226 | }
|
| 227 | + |
| 228 | + #[inline] |
| 229 | + fn unicode_sentences(&self) -> UnicodeSentences { |
| 230 | + sentence::new_unicode_sentences(self) |
| 231 | + } |
| 232 | + |
| 233 | + #[inline] |
| 234 | + fn split_sentence_bounds(&self) -> USentenceBounds { |
| 235 | + sentence::new_sentence_bounds(self) |
| 236 | + } |
| 237 | + |
| 238 | + #[inline] |
| 239 | + fn split_sentence_bound_indices(&self) -> USentenceBoundIndices { |
| 240 | + sentence::new_sentence_bound_indices(self) |
| 241 | + } |
204 | 242 | }
|
0 commit comments