@@ -82,7 +82,7 @@ enum UWordBoundsState {
82
82
ExtendNumLet ,
83
83
Regional ( RegionalState ) ,
84
84
FormatExtend ( FormatExtendType ) ,
85
- Zwj ( /* tainted */ bool ) ,
85
+ Zwj ,
86
86
Emoji ,
87
87
}
88
88
@@ -130,9 +130,12 @@ impl<'a> Iterator for UWordBounds<'a> {
130
130
let mut cat = wd:: WC_Any ;
131
131
let mut savecat = wd:: WC_Any ;
132
132
133
+ // Whether or not the previous category was ZWJ
134
+ // ZWJs get collapsed, so this handles precedence of WB3c over WB4
135
+ let mut prev_zwj;
133
136
for ( curr, ch) in self . string . char_indices ( ) {
134
137
idx = curr;
135
-
138
+ prev_zwj = cat == wd :: WC_ZWJ ;
136
139
// if there's a category cached, grab it
137
140
cat = match self . cat {
138
141
None => wd:: word_category ( ch) ,
@@ -141,42 +144,49 @@ impl<'a> Iterator for UWordBounds<'a> {
141
144
take_cat = true ;
142
145
143
146
// handle rule WB4
144
- // just skip all format and extend chars
147
+ // just skip all format, extend, and zwj chars
145
148
// note that Start is a special case: if there's a bunch of Format | Extend
146
149
// characters at the beginning of a block of text, dump them out as one unit.
147
150
//
148
151
// (This is not obvious from the wording of UAX#29, but if you look at the
149
152
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
150
153
// then the "correct" interpretation of WB4 becomes apparent.)
151
- //
154
+ if state != Start {
155
+ match cat {
156
+ wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => {
157
+ continue
158
+ }
159
+ _ => { }
160
+ }
161
+ }
162
+
163
+ // rule WB3c
152
164
// WB4 makes all ZWJs collapse into the previous state
153
165
// but you can still be in a Zwj state if you started with Zwj
154
166
//
155
167
// This means that Zwj + Extend will collapse into Zwj, which is wrong,
156
168
// since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
157
169
// and that rule (WB3c) has higher priority
158
170
//
159
- // Thus, when in the Zwj state, we track if the last collapsed character is also
160
- // a ZWJ. If it isn't, we treat that as a "tainted" zwj, which is basically
161
- // of the form ZWJ (Format | Extend | ZWJ)* (Format | Extend).
162
- if state != Start {
163
- match cat {
164
- wd:: WC_Extend | wd:: WC_Format => {
165
- if let Zwj ( ref mut taint) = state {
166
- * taint = true ;
167
- }
168
- continue
169
- }
170
- wd:: WC_ZWJ => {
171
- if let Zwj ( ref mut taint) = state {
172
- * taint = false ;
173
- }
174
- continue
175
- }
176
- _ => { }
171
+ // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
172
+ // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
173
+ //
174
+ // Thus, we separately keep track of whether or not the last character
175
+ // was a ZWJ. This is an additional bit of state tracked outside of the
176
+ // state enum; the state enum represents the last non-zwj state encountered.
177
+ // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
178
+ // however we are in the previous state for the purposes of all other rules.
179
+ if prev_zwj {
180
+ match cat {
181
+ wd:: WC_Glue_After_Zwj => continue ,
182
+ wd:: WC_E_Base_GAZ => {
183
+ state = Emoji ;
184
+ continue ;
185
+ } ,
186
+ _ => ( )
177
187
}
178
188
}
179
-
189
+ // Don't use `continue` in this match without updating `cat`
180
190
state = match state {
181
191
Start if cat == wd:: WC_CR => {
182
192
idx += match self . get_next_cat ( idx) {
@@ -193,7 +203,7 @@ impl<'a> Iterator for UWordBounds<'a> {
193
203
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
194
204
wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
195
205
wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
196
- wd:: WC_ZWJ => Zwj ( false ) , // rule WB3c
206
+ wd:: WC_ZWJ => Zwj , // rule WB3c
197
207
wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
198
208
_ => {
199
209
if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
@@ -206,21 +216,13 @@ impl<'a> Iterator for UWordBounds<'a> {
206
216
break ; // rule WB999
207
217
}
208
218
} ,
209
- Zwj ( true ) => {
219
+ Zwj => {
220
+ // We already handle WB3c above. At this point,
221
+ // the current category is not GAZ or EBG,
222
+ // or the previous character was not actually a ZWJ
210
223
take_curr = false ;
211
- break
224
+ break ;
212
225
}
213
- Zwj ( false ) => match cat { // rule WB3c
214
- wd:: WC_Glue_After_Zwj => continue ,
215
- wd:: WC_E_Base_GAZ => {
216
- state = Emoji ;
217
- continue ;
218
- } ,
219
- _ => {
220
- take_curr = false ;
221
- break ;
222
- }
223
- } ,
224
226
Letter | HLetter => match cat {
225
227
wd:: WC_ALetter => Letter , // rule WB5
226
228
wd:: WC_Hebrew_Letter => HLetter , // rule WB5
@@ -294,7 +296,7 @@ impl<'a> Iterator for UWordBounds<'a> {
294
296
} ,
295
297
Regional ( _) => unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
296
298
Emoji => match cat { // rule WB14
297
- wd:: WC_E_Modifier => continue ,
299
+ wd:: WC_E_Modifier => state ,
298
300
_ => {
299
301
take_curr = false ;
300
302
break ;
@@ -358,6 +360,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
358
360
let mut state = Start ;
359
361
let mut savestate = Start ;
360
362
let mut cat = wd:: WC_Any ;
363
+
361
364
for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
362
365
previdx = idx;
363
366
idx = curr;
@@ -375,9 +378,11 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
375
378
// Hebrew Letter immediately before it.
376
379
// (2) Format and Extend char handling takes some gymnastics.
377
380
378
- if cat == wd:: WC_Extend || cat == wd:: WC_Format || cat == wd:: WC_ZWJ {
381
+ if cat == wd:: WC_Extend
382
+ || cat == wd:: WC_Format
383
+ || ( cat == wd:: WC_ZWJ && state != Zwj ) { // WB3c has higher priority so we should not
384
+ // fold in that case
379
385
if match state {
380
- Zwj ( _) if cat == wd:: WC_ZWJ => false ,
381
386
FormatExtend ( _) | Start => false ,
382
387
_ => true
383
388
} {
@@ -396,6 +401,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
396
401
take_cat = false ;
397
402
}
398
403
404
+ // Don't use `continue` in this match without updating `catb`
399
405
state = match state {
400
406
Start | FormatExtend ( AcceptAny ) => match cat {
401
407
wd:: WC_ALetter => Letter , // rule WB5, WB7, WB10, WB13b
@@ -404,7 +410,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
404
410
wd:: WC_Katakana => Katakana , // rule WB13, WB13b
405
411
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
406
412
wd:: WC_Regional_Indicator => Regional ( RegionalState :: Unknown ) , // rule WB13c
407
- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj ( false ) , // rule WB3c
413
+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
408
414
// rule WB4:
409
415
wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
410
416
wd:: WC_Single_Quote => {
@@ -427,8 +433,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427
433
} ,
428
434
_ => break // rule WB999
429
435
} ,
430
- Zwj ( _) => match cat { // rule WB3c
431
- wd:: WC_ZWJ => continue ,
436
+ Zwj => match cat { // rule WB3c
437
+ wd:: WC_ZWJ => {
438
+ FormatExtend ( AcceptAny )
439
+ }
432
440
_ => {
433
441
take_curr = false ;
434
442
break ;
@@ -515,8 +523,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
515
523
} ,
516
524
Emoji => match cat { // rule WB14
517
525
wd:: WC_E_Base | wd:: WC_E_Base_GAZ => {
518
- state = Zwj ( false ) ;
519
- continue
526
+ Zwj
520
527
} ,
521
528
_ => {
522
529
take_curr = false ;
0 commit comments