Fix precedence of WB3c and ZWJ handling · cbarrick/unicode-segmentation@731d346 · GitHub
[go: up one dir, main page]

Skip to content

Commit 731d346

Browse files
committed
Fix precedence of WB3c and ZWJ handling
1 parent 8e06fc9 commit 731d346

File tree

2 files changed

+53
-45
lines changed

2 files changed

+53
-45
lines changed

src/test.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ fn test_words() {
9797
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"]),
9898
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
9999
("🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
100+
("\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}", &["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"]),
100101
];
101102
for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
102103
macro_rules! assert_ {

src/word.rs

Lines changed: 52 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ enum UWordBoundsState {
8282
ExtendNumLet,
8383
Regional(RegionalState),
8484
FormatExtend(FormatExtendType),
85-
Zwj(/* tainted */ bool),
85+
Zwj,
8686
Emoji,
8787
}
8888

@@ -130,9 +130,12 @@ impl<'a> Iterator for UWordBounds<'a> {
130130
let mut cat = wd::WC_Any;
131131
let mut savecat = wd::WC_Any;
132132

133+
// Whether or not the previous category was ZWJ
134+
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
135+
let mut prev_zwj;
133136
for (curr, ch) in self.string.char_indices() {
134137
idx = curr;
135-
138+
prev_zwj = cat == wd::WC_ZWJ;
136139
// if there's a category cached, grab it
137140
cat = match self.cat {
138141
None => wd::word_category(ch),
@@ -141,42 +144,49 @@ impl<'a> Iterator for UWordBounds<'a> {
141144
take_cat = true;
142145

143146
// handle rule WB4
144-
// just skip all format and extend chars
147+
// just skip all format, extend, and zwj chars
145148
// note that Start is a special case: if there's a bunch of Format | Extend
146149
// characters at the beginning of a block of text, dump them out as one unit.
147150
//
148151
// (This is not obvious from the wording of UAX#29, but if you look at the
149152
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
150153
// then the "correct" interpretation of WB4 becomes apparent.)
151-
//
154+
if state != Start {
155+
match cat {
156+
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
157+
continue
158+
}
159+
_ => {}
160+
}
161+
}
162+
163+
// rule WB3c
152164
// WB4 makes all ZWJs collapse into the previous state
153165
// but you can still be in a Zwj state if you started with Zwj
154166
//
155167
// This means that Zwj + Extend will collapse into Zwj, which is wrong,
156168
// since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
157169
// and that rule (WB3c) has higher priority
158170
//
159-
// Thus, when in the Zwj state, we track if the last collapsed character is also
160-
// a ZWJ. If it isn't, we treat that as a "tainted" zwj, which is basically
161-
// of the form ZWJ (Format | Extend | ZWJ)* (Format | Extend).
162-
if state != Start {
163-
match cat {
164-
wd::WC_Extend | wd::WC_Format => {
165-
if let Zwj(ref mut taint) = state {
166-
*taint = true;
167-
}
168-
continue
169-
}
170-
wd::WC_ZWJ => {
171-
if let Zwj(ref mut taint) = state {
172-
*taint = false;
173-
}
174-
continue
175-
}
176-
_ => {}
171+
// Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
172+
// which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
173+
//
174+
// Thus, we separately keep track of whether or not the last character
175+
// was a ZWJ. This is an additional bit of state tracked outside of the
176+
// state enum; the state enum represents the last non-zwj state encountered.
177+
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
178+
// however we are in the previous state for the purposes of all other rules.
179+
if prev_zwj {
180+
match cat {
181+
wd::WC_Glue_After_Zwj => continue,
182+
wd::WC_E_Base_GAZ => {
183+
state = Emoji;
184+
continue;
185+
},
186+
_ => ()
177187
}
178188
}
179-
189+
// Don't use `continue` in this match without updating `cat`
180190
state = match state {
181191
Start if cat == wd::WC_CR => {
182192
idx += match self.get_next_cat(idx) {
@@ -193,7 +203,7 @@ impl<'a> Iterator for UWordBounds<'a> {
193203
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
194204
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
195205
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
196-
wd::WC_ZWJ => Zwj(false), // rule WB3c
206+
wd::WC_ZWJ => Zwj, // rule WB3c
197207
wd::WC_E_Base | wd::WC_E_Base_GAZ => Emoji, // rule WB14
198208
_ => {
199209
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
@@ -206,21 +216,13 @@ impl<'a> Iterator for UWordBounds<'a> {
206216
break; // rule WB999
207217
}
208218
},
209-
Zwj(true) => {
219+
Zwj => {
220+
// We already handle WB3c above. At this point,
221+
// the current category is not GAZ or EBG,
222+
// or the previous character was not actually a ZWJ
210223
take_curr = false;
211-
break
224+
break;
212225
}
213-
Zwj(false) => match cat { // rule WB3c
214-
wd::WC_Glue_After_Zwj => continue,
215-
wd::WC_E_Base_GAZ => {
216-
state = Emoji;
217-
continue;
218-
},
219-
_ => {
220-
take_curr = false;
221-
break;
222-
}
223-
},
224226
Letter | HLetter => match cat {
225227
wd::WC_ALetter => Letter, // rule WB5
226228
wd::WC_Hebrew_Letter => HLetter, // rule WB5
@@ -294,7 +296,7 @@ impl<'a> Iterator for UWordBounds<'a> {
294296
},
295297
Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
296298
Emoji => match cat { // rule WB14
297-
wd::WC_E_Modifier => continue,
299+
wd::WC_E_Modifier => state,
298300
_ => {
299301
take_curr = false;
300302
break;
@@ -358,6 +360,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
358360
let mut state = Start;
359361
let mut savestate = Start;
360362
let mut cat = wd::WC_Any;
363+
361364
for (curr, ch) in self.string.char_indices().rev() {
362365
previdx = idx;
363366
idx = curr;
@@ -375,9 +378,11 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
375378
// Hebrew Letter immediately before it.
376379
// (2) Format and Extend char handling takes some gymnastics.
377380

378-
if cat == wd::WC_Extend || cat == wd::WC_Format || cat == wd::WC_ZWJ {
381+
if cat == wd::WC_Extend
382+
|| cat == wd::WC_Format
383+
|| (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
384+
// fold in that case
379385
if match state {
380-
Zwj(_) if cat == wd::WC_ZWJ => false,
381386
FormatExtend(_) | Start => false,
382387
_ => true
383388
} {
@@ -396,6 +401,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
396401
take_cat = false;
397402
}
398403

404+
// Don't use `continue` in this match without updating `cat`
399405
state = match state {
400406
Start | FormatExtend(AcceptAny) => match cat {
401407
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
@@ -404,7 +410,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
404410
wd::WC_Katakana => Katakana, // rule WB13, WB13b
405411
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
406412
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
407-
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj(false), // rule WB3c
413+
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj, // rule WB3c
408414
// rule WB4:
409415
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
410416
wd::WC_Single_Quote => {
@@ -427,8 +433,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427433
},
428434
_ => break // rule WB999
429435
},
430-
Zwj(_) => match cat { // rule WB3c
431-
wd::WC_ZWJ => continue,
436+
Zwj => match cat { // rule WB3c
437+
wd::WC_ZWJ => {
438+
FormatExtend(AcceptAny)
439+
}
432440
_ => {
433441
take_curr = false;
434442
break;
@@ -515,8 +523,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
515523
},
516524
Emoji => match cat { // rule WB14
517525
wd::WC_E_Base | wd::WC_E_Base_GAZ => {
518-
state = Zwj(false);
519-
continue
526+
Zwj
520527
},
521528
_ => {
522529
take_curr = false;

0 commit comments

Comments
 (0)
0