Handle ZWJ in rules WB3c and WB4 · cbarrick/unicode-segmentation@f9f7076 · GitHub
[go: up one dir, main page]

Skip to content

Commit f9f7076

Browse files
committed
Handle ZWJ in rules WB3c and WB4
1 parent db6e78f commit f9f7076

File tree

1 file changed

+25
-4
lines changed

1 file changed

+25
-4
lines changed

src/word.rs

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ enum UWordBoundsState {
8282
ExtendNumLet,
8383
Regional,
8484
FormatExtend(FormatExtendType),
85+
Zwj,
8586
}
8687

8788
// subtypes for FormatExtend state in UWordBoundsState
@@ -138,8 +139,11 @@ impl<'a> Iterator for UWordBounds<'a> {
138139
// (This is not obvious from the wording of UAX#29, but if you look at the
139140
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
140141
// then the "correct" interpretation of WB4 becomes apparent.)
141-
if state != Start && (cat == wd::WC_Extend || cat == wd::WC_Format) {
142-
continue;
142+
if state != Start {
143+
match cat {
144+
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => continue,
145+
_ => {}
146+
}
143147
}
144148

145149
state = match state {
@@ -158,9 +162,10 @@ impl<'a> Iterator for UWordBounds<'a> {
158162
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
159163
wd::WC_Regional_Indicator => Regional, // rule WB13c
160164
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
165+
wd::WC_ZWJ => Zwj, // rule WB3c
161166
_ => {
162167
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
163-
if ncat == wd::WC_Format || ncat == wd::WC_Extend {
168+
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
164169
state = FormatExtend(AcceptNone);
165170
self.cat = Some(ncat);
166171
continue;
@@ -169,6 +174,13 @@ impl<'a> Iterator for UWordBounds<'a> {
169174
break; // rule WB14
170175
}
171176
},
177+
Zwj => match cat { // rule WB3c
178+
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => continue,
179+
_ => {
180+
take_curr = false;
181+
break;
182+
}
183+
},
172184
Letter | HLetter => match cat {
173185
wd::WC_ALetter => Letter, // rule WB5
174186
wd::WC_Hebrew_Letter => HLetter, // rule WB5
@@ -336,7 +348,9 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
336348
wd::WC_Katakana => Katakana, // rule WB13, WB13b
337349
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
338350
wd::WC_Regional_Indicator => Regional, // rule WB13c
339-
wd::WC_Extend | wd::WC_Format => FormatExtend(AcceptAny), // rule WB4
351+
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj, // rule WB3c
352+
// rule WB4:
353+
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
340354
wd::WC_Single_Quote => {
341355
saveidx = idx;
342356
FormatExtend(AcceptQLetter) // rule WB7a
@@ -356,6 +370,13 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
356370
},
357371
_ => break // rule WB14
358372
},
373+
Zwj => match cat { // rule WB3c
374+
wd::WC_ZWJ => continue,
375+
_ => {
376+
take_curr = false;
377+
break;
378+
}
379+
},
359380
Letter | HLetter => match cat {
360381
wd::WC_ALetter => Letter, // rule WB5
361382
wd::WC_Hebrew_Letter => HLetter, // rule WB5

0 commit comments

Comments
 (0)
0