Add WSegSpace support for in word boundaries from Unicode 11 · unicode-rs/unicode-segmentation@6591535 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6591535

Browse files
committed
Add WSegSpace support for in word boundaries from Unicode 11
1 parent 0b168d5 commit 6591535

File tree

2 files changed

+27
-2
lines changed

2 files changed

+27
-2
lines changed

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
//!
3030
//! let s = "The quick (\"brown\") fox";
3131
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32-
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
32+
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
3333
//! assert_eq!(w, b);
3434
//! }
3535
//! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
156156
/// ```
157157
/// # use self::unicode_segmentation::UnicodeSegmentation;
158158
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
159-
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
159+
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
160160
///
161161
/// assert_eq!(&swu1[..], b);
162162
/// ```

src/word.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ enum UWordBoundsState {
102102
FormatExtend(FormatExtendType),
103103
Zwj,
104104
Emoji,
105+
WSegSpace,
105106
}
106107

107108
// subtypes for FormatExtend state in UWordBoundsState
@@ -156,6 +157,8 @@ impl<'a> Iterator for UWordBounds<'a> {
156157
// Whether or not the previous category was ZWJ
157158
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
158159
let mut prev_zwj;
160+
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
161+
let mut skipped_format_extend = false;
159162
for (curr, ch) in self.string.char_indices() {
160163
idx = curr;
161164
prev_zwj = cat == wd::WC_ZWJ;
@@ -177,6 +180,7 @@ impl<'a> Iterator for UWordBounds<'a> {
177180
if state != Start {
178181
match cat {
179182
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
183+
skipped_format_extend = true;
180184
continue
181185
}
182186
_ => {}
@@ -219,6 +223,7 @@ impl<'a> Iterator for UWordBounds<'a> {
219223
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
220224
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
221225
wd::WC_ZWJ => Zwj, // rule WB3c
226+
wd::WC_WSegSpace => WSegSpace, // rule WB3d
222227
_ => {
223228
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
224229
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
@@ -230,6 +235,13 @@ impl<'a> Iterator for UWordBounds<'a> {
230235
break; // rule WB999
231236
}
232237
},
238+
WSegSpace => match cat {
239+
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
240+
_ => {
241+
take_curr = false;
242+
break;
243+
}
244+
},
233245
Zwj => {
234246
// We already handle WB3c above.
235247
take_curr = false;
@@ -371,6 +383,8 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
371383
let mut savestate = Start;
372384
let mut cat = wd::WC_Any;
373385

386+
let mut skipped_format_extend = false;
387+
374388
for (curr, ch) in self.string.char_indices().rev() {
375389
previdx = idx;
376390
idx = curr;
@@ -409,6 +423,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
409423
state = savestate;
410424
previdx = saveidx;
411425
take_cat = false;
426+
skipped_format_extend = true;
412427
}
413428

414429
// Don't use `continue` in this match without updating `catb`
@@ -427,6 +442,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427442
saveidx = idx;
428443
FormatExtend(AcceptQLetter) // rule WB7a
429444
},
445+
wd::WC_WSegSpace => WSegSpace,
430446
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
431447
if state == Start {
432448
if cat == wd::WC_LF {
@@ -451,6 +467,15 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
451467
break;
452468
}
453469
},
470+
WSegSpace => match cat { // rule WB3d
471+
wd::WC_WSegSpace if !skipped_format_extend => {
472+
WSegSpace
473+
}
474+
_ => {
475+
take_curr = false;
476+
break;
477+
}
478+
},
454479
Letter | HLetter => match cat {
455480
wd::WC_ALetter => Letter, // rule WB5
456481
wd::WC_Hebrew_Letter => HLetter, // rule WB5

0 commit comments

Comments
 (0)
0