Implement new rule WB14 (Emoji modifier sequences) · jmuk/unicode-segmentation@c80e5a3 · GitHub
[go: up one dir, main page]

Skip to content

Commit c80e5a3

Browse files
committed
Implement new rule WB14 (Emoji modifier sequences)
1 parent 675f347 commit c80e5a3

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

src/word.rs

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ enum UWordBoundsState {
8383
Regional,
8484
FormatExtend(FormatExtendType),
8585
Zwj,
86+
Emoji,
8687
}
8788

8889
// subtypes for FormatExtend state in UWordBoundsState
@@ -163,6 +164,7 @@ impl<'a> Iterator for UWordBounds<'a> {
163164
wd::WC_Regional_Indicator => Regional, // rule WB13c
164165
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
165166
wd::WC_ZWJ => Zwj, // rule WB3c
167+
wd::WC_E_Base | wd::WC_E_Base_GAZ => Emoji, // rule WB14
166168
_ => {
167169
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
168170
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
@@ -245,6 +247,13 @@ impl<'a> Iterator for UWordBounds<'a> {
245247
break;
246248
}
247249
},
250+
Emoji => match cat { // rule WB14
251+
wd::WC_E_Modifier => continue,
252+
_ => {
253+
take_curr = false;
254+
break;
255+
}
256+
},
248257
FormatExtend(t) => match t { // handle FormatExtends depending on what type
249258
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
250259
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
@@ -355,6 +364,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
355364
saveidx = idx;
356365
FormatExtend(AcceptQLetter) // rule WB7a
357366
},
367+
wd::WC_E_Modifier => Emoji, // rule WB14
358368
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
359369
if state == Start {
360370
if cat == wd::WC_LF {
@@ -435,6 +445,13 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
435445
break;
436446
}
437447
},
448+
Emoji => match cat { // rule WB14
449+
wd::WC_E_Base | wd::WC_E_Base_GAZ => continue,
450+
_ => {
451+
take_curr = false;
452+
break;
453+
}
454+
},
438455
FormatExtend(t) => match t {
439456
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
440457
RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6

0 commit comments

Comments
 (0)
0