@@ -102,6 +102,7 @@ enum UWordBoundsState {
102
102
FormatExtend ( FormatExtendType ) ,
103
103
Zwj ,
104
104
Emoji ,
105
+ WSegSpace ,
105
106
}
106
107
107
108
// subtypes for FormatExtend state in UWordBoundsState
@@ -156,6 +157,8 @@ impl<'a> Iterator for UWordBounds<'a> {
156
157
// Whether or not the previous category was ZWJ
157
158
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
158
159
let mut prev_zwj;
160
+ // Whether any Extend/Format/ZWJ characters have been skipped; handles precedence of WB3d over WB4
161
+ let mut skipped_format_extend = false ;
159
162
for ( curr,
10000
ch) in self . string . char_indices ( ) {
160
163
idx = curr;
161
164
prev_zwj = cat == wd:: WC_ZWJ ;
@@ -177,6 +180,7 @@ impl<'a> Iterator for UWordBounds<'a> {
177
180
if state != Start {
178
181
match cat {
179
182
wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => {
183
+ skipped_format_extend = true ;
180
184
continue
181
185
}
182
186
_ => { }
@@ -219,6 +223,7 @@ impl<'a> Iterator for UWordBounds<'a> {
219
223
wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
220
224
wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
221
225
wd:: WC_ZWJ => Zwj , // rule WB3c
226
+ wd:: WC_WSegSpace => WSegSpace , // rule WB3d
222
227
_ => {
223
228
if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
224
229
if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd:: WC_ZWJ {
@@ -230,6 +235,13 @@ impl<'a> Iterator for UWordBounds<'a> {
230
235
break ; // rule WB999
231
236
}
232
237
} ,
238
+ WSegSpace => match cat {
239
+ wd:: WC_WSegSpace if !skipped_format_extend => WSegSpace ,
240
+ _ => {
241
+ take_curr = false ;
242
+ break ;
243
+ }
244
+ } ,
233
245
Zwj => {
234
246
// We already handle WB3c above.
235
247
take_curr = false ;
@@ -371,6 +383,8 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
371
383
let mut savestate = Start ;
372
384
let mut cat = wd:: WC_Any ;
373
385
386
+ let mut skipped_format_extend = false ;
387
+
374
388
for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
375
389
previdx = idx;
376
390
idx = curr;
@@ -409,6 +423,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
409
423
state = savestate;
410
424
previdx = saveidx;
411
425
take_cat = false ;
426
+ skipped_format_extend = true ;
412
427
}
413
428
414
429
// Don't use `continue` in this match without updating `catb`
@@ -427,6 +442,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427
442
saveidx = idx;
428
443
FormatExtend ( AcceptQLetter ) // rule WB7a
429
444
} ,
445
+ wd:: WC_WSegSpace => WSegSpace ,
430
446
wd:: WC_CR | wd:: WC_LF | wd:: WC_Newline => {
431
447
if state == Start {
432
448
if cat == wd:: WC_LF {
@@ -451,6 +467,15 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
451
467
break ;
452
468
}
453
469
} ,
470
+ WSegSpace => match cat { // rule WB3d
471
+ wd:: WC_WSegSpace if !skipped_format_extend => {
472
+ WSegSpace
473
+ }
474
+ _ => {
475
+ take_curr = false ;
476
+ break ;
477
+ }
478
+ } ,
454
479
Letter | HLetter => match cat {
455
480
wd:: WC_ALetter => Letter , // rule WB5
456
481
wd:: WC_Hebrew_Letter => HLetter , // rule WB5
0 commit comments