Update emoji rules in word boundaries to Unicode 11 · simmsb/unicode-segmentation@98c9457 · GitHub
[go: up one dir, main page]

Skip to content

Commit 98c9457

Browse files
committed
Update emoji rules in word boundaries to Unicode 11
1 parent 4d58f18 commit 98c9457

File tree

1 file changed

+19
-29
lines changed

1 file changed

+19
-29
lines changed

src/word.rs

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,11 @@ enum RegionalState {
122122
Unknown,
123123
}
124124

125+
fn is_emoji(ch: char) -> bool {
126+
use tables::emoji;
127+
emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
128+
}
129+
125130
impl<'a> Iterator for UWordBounds<'a> {
126131
type Item = &'a str;
127132

@@ -182,26 +187,18 @@ impl<'a> Iterator for UWordBounds<'a> {
182187
// WB4 makes all ZWJs collapse into the previous state
183188
// but you can still be in a Zwj state if you started with Zwj
184189
//
185-
// This means that Zwj + Extend will collapse into Zwj, which is wrong,
186-
// since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
187-
// and that rule (WB3c) has higher priority
188-
//
189-
// Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
190-
// which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
190+
// This means that an EP + Zwj will collapse into EP, which is wrong,
191+
// since EP+EP is a boundary but EP+ZWJ+EP is not (WB3c: ZWJ × Extended_Pictographic)
191192
//
192193
// Thus, we separately keep track of whether or not the last character
193194
// was a ZWJ. This is an additional bit of state tracked outside of the
194195
// state enum; the state enum represents the last non-zwj state encountered.
195196
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
196197
// however we are in the previous state for the purposes of all other rules.
197198
if prev_zwj {
198-
match cat {
199-
wd::WC_Glue_After_Zwj => continue,
200-
wd::WC_E_Base_GAZ => {
201-
state = Emoji;
202-
continue;
203-
},
204-
_ => ()
199+
if is_emoji(ch) {
200+
state = Emoji;
201+
continue;
205202
}
206203
}
207204
// Don't use `continue` in this match without updating `cat`
@@ -222,7 +219,6 @@ impl<'a> Iterator for UWordBounds<'a> {
222219
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
223220
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
224221
wd::WC_ZWJ => Zwj, // rule WB3c
225-
wd::WC_E_Base | wd::WC_E_Base_GAZ => Emoji, // rule WB14
226222
_ => {
227223
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
228224
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
@@ -235,9 +231,7 @@ impl<'a> Iterator for UWordBounds<'a> {
235231
}
236232
},
237233
Zwj => {
238-
// We already handle WB3c above. At this point,
239-
// the current category is not GAZ or EBG,
240-
// or the previous character was not actually a ZWJ
234+
// We already handle WB3c above.
241235
take_curr = false;
242236
break;
243237
}
@@ -313,12 +307,10 @@ impl<'a> Iterator for UWordBounds<'a> {
313307
}
314308
},
315309
Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
316-
Emoji => match cat { // rule WB14
317-
wd::WC_E_Modifier => state,
318-
_ => {
319-
take_curr = false;
320-
break;
321-
}
310+
Emoji => {
311+
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
312+
take_curr = false;
313+
break;
322314
},
323315
FormatExtend(t) => match t { // handle FormatExtends depending on what type
324316
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
@@ -422,20 +414,19 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
422414
// Don't use `continue` in this match without updating `catb`
423415
state = match state {
424416
Start | FormatExtend(AcceptAny) => match cat {
417+
_ if is_emoji(ch) => Zwj,
425418
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
426419
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
427420
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
428421
wd::WC_Katakana => Katakana, // rule WB13, WB13b
429422
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
430423
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
431-
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj, // rule WB3c
432424
// rule WB4:
433425
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
434426
wd::WC_Single_Quote => {
435427
saveidx = idx;
436428
FormatExtend(AcceptQLetter) // rule WB7a
437429
},
438-
wd::WC_E_Modifier => Emoji, // rule WB14
439430
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
440431
if state == Start {
441432
if cat == wd::WC_LF {
@@ -539,11 +530,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
539530
break;
540531
}
541532
},
542-
Emoji => match cat { // rule WB14
543-
wd::WC_E_Base | wd::WC_E_Base_GAZ => {
533+
Emoji => {
534+
if is_emoji(ch) { // rule WB3c
544535
Zwj
545-
},
546-
_ => {
536+
} else {
547537
take_curr = false;
548538
break;
549539
}

0 commit comments

Comments
 (0)
0