@@ -122,6 +122,11 @@ enum RegionalState {
122
122
Unknown ,
123
123
}
124
124
125
+ fn is_emoji ( ch : char ) -> bool { // true when `ch` is categorized as EC_Extended_Pictographic
126
+ use tables:: emoji; // function-scoped import of `tables::emoji`
127
+ emoji:: emoji_category ( ch) == emoji:: EmojiCat :: EC_Extended_Pictographic
128
+ }
129
+
125
130
impl < ' a > Iterator for UWordBounds < ' a > {
126
131
type Item = & ' a str ;
127
132
@@ -182,26 +187,18 @@ impl<'a> Iterator for UWordBounds<'a> {
182
187
// WB4 makes all ZWJs collapse into the previous state
183
188
// but you can still be in a Zwj state if you started with Zwj
184
189
//
185
- // This means that Zwj + Extend will collapse into Zwj, which is wrong,
186
- // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
187
- // and that rule (WB3c) has higher priority
188
- //
189
- // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
190
- // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
190
+ // This means that an EP + Zwj will collapse into EP, which is wrong,
191
+ // since EP+EP is a boundary but EP+ZWJ+EP is not (rule WB3c)
191
192
//
192
193
// Thus, we separately keep track of whether or not the last character
193
194
// was a ZWJ. This is an additional bit of state tracked outside of the
194
195
// state enum; the state enum represents the last non-zwj state encountered.
195
196
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
196
197
// however we are in the previous state for the purposes of all other rules.
197
198
if prev_zwj {
198
- match cat {
199
- wd:: WC_Glue_After_Zwj => continue ,
200
- wd:: WC_E_Base_GAZ => {
201
- state = Emoji ;
202
- continue ;
203
- } ,
204
- _ => ( )
199
+ if is_emoji ( ch) {
200
+ state = Emoji ;
201
+ continue ;
205
202
}
206
203
}
207
204
// Don't use `continue` in this match without updating `cat`
@@ -222,7 +219,6 @@ impl<'a> Iterator for UWordBounds<'a> {
222
219
wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
223
220
wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
224
221
wd:: WC_ZWJ => Zwj , // rule WB3c
225
- wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
226
222
_ => {
227
223
if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
228
224
if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd:: WC_ZWJ {
@@ -235,9 +231,7 @@ impl<'a> Iterator for UWordBounds<'a> {
235
231
}
236
232
} ,
237
233
Zwj => {
238
- // We already handle WB3c above. At this point,
239
- // the current category is not GAZ or EBG,
240
- // or the previous character was not actually a ZWJ
234
+ // We already handle WB3c above.
241
235
take_curr = false ;
242
236
break ;
243
237
}
@@ -313,12 +307,10 @@ impl<'a> Iterator for UWordBounds<'a> {
313
307
}
314
308
} ,
315
309
Regional ( _) => unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
316
- Emoji => match cat { // rule WB14
317
- wd:: WC_E_Modifier => state,
318
- _ => {
319
- take_curr = false ;
320
- break ;
321
- }
310
+ Emoji => {
311
+ // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
312
+ take_curr = false ;
313
+ break ;
322
314
} ,
323
315
FormatExtend ( t) => match t { // handle FormatExtends depending on what type
324
316
RequireNumeric if cat == wd:: WC_Numeric => Numeric , // rule WB11
@@ -422,20 +414,19 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
422
414
// Don't use `continue` in this match without updating `catb`
423
415
state = match state {
424
416
Start | FormatExtend ( AcceptAny ) => match cat {
417
+ _ if is_emoji ( ch) => Zwj ,
425
418
wd:: WC_ALetter => Letter , // rule WB5, WB7, WB10, WB13b
426
419
wd:: WC_Hebrew_Letter => HLetter , // rule WB5, WB7, WB7c, WB10, WB13b
427
420
wd:: WC_Numeric => Numeric , // rule WB8, WB9, WB11, WB13b
428
421
wd:: WC_Katakana => Katakana , // rule WB13, WB13b
429
422
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
430
423
wd:: WC_Regional_Indicator => Regional ( RegionalState :: Unknown ) , // rule WB13c
431
- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
432
424
// rule WB4:
433
425
wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
434
426
wd:: WC_Single_Quote => {
435
427
saveidx = idx;
436
428
FormatExtend ( AcceptQLetter ) // rule WB7a
437
429
} ,
438
- wd:: WC_E_Modifier => Emoji , // rule WB14
439
430
wd:: WC_CR | wd:: WC_LF | wd:: WC_Newline => {
440
431
if state == Start {
441
432
if cat == wd:: WC_LF {
@@ -539,11 +530,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
539
530
break ;
540
531
}
541
532
} ,
542
- Emoji => match cat { // rule WB14
543
- wd :: WC_E_Base | wd :: WC_E_Base_GAZ => {
533
+ Emoji => {
534
+ if is_emoji ( ch ) { // rule WB3c
544
535
Zwj
545
- } ,
546
- _ => {
536
+ } else {
547
537
take_curr = false ;
548
538
break ;
549
539
}
0 commit comments