@@ -82,6 +82,7 @@ enum UWordBoundsState {
82
82
ExtendNumLet ,
83
83
Regional ,
84
84
FormatExtend ( FormatExtendType ) ,
85
+ Zwj ,
85
86
}
86
87
87
88
// subtypes for FormatExtend state in UWordBoundsState
@@ -138,8 +139,11 @@ impl<'a> Iterator for UWordBounds<'a> {
138
139
// (This is not obvious from the wording of UAX#29, but if you look at the
139
140
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
140
141
// then the "correct" interpretation of WB4 becomes apparent.)
141
- if state != Start && ( cat == wd:: WC_Extend || cat == wd:: WC_Format ) {
142
- continue ;
142
+ if state != Start {
143
+ match cat {
144
+ wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => continue ,
145
+ _ => { }
146
+ }
143
147
}
144
148
145
149
state = match state {
@@ -158,9 +162,10 @@ impl<'a> Iterator for UWordBounds<'a> {
158
162
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
159
163
wd:: WC_Regional_Indicator => Regional , // rule WB13c
160
164
wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
165
+ wd:: WC_ZWJ => Zwj , // rule WB3c
161
166
_ => {
162
167
if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
163
- if ncat == wd:: WC_Format || ncat == wd:: WC_Extend {
168
+ if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd :: WC_ZWJ {
164
169
state = FormatExtend ( AcceptNone ) ;
165
170
self . cat = Some ( ncat) ;
166
171
continue ;
@@ -169,6 +174,13 @@ impl<'a> Iterator for UWordBounds<'a> {
169
174
break ; // rule WB14
170
175
}
171
176
} ,
177
+ Zwj => match cat { // rule WB3c
178
+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => continue ,
179
+ _ => {
180
+ take_curr = false ;
181
+ break ;
182
+ }
183
+ } ,
172
184
Letter | HLetter => match cat {
173
185
wd:: WC_ALetter => Letter , // rule WB5
174
186
wd:: WC_Hebrew_Letter => HLetter , // rule WB5
@@ -336,7 +348,9 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
336
348
wd:: WC_Katakana => Katakana , // rule WB13, WB13b
337
349
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
338
350
wd:: WC_Regional_Indicator => Regional , // rule WB13c
339
- wd:: WC_Extend | wd:: WC_Format => FormatExtend ( AcceptAny ) , // rule WB4
351
+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
352
+ // rule WB4:
353
+ wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
340
354
wd:: WC_Single_Quote => {
341
355
saveidx = idx;
342
356
FormatExtend ( AcceptQLetter ) // rule WB7a
@@ -356,6 +370,13 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
356
370
} ,
357
371
_ => break // rule WB14
358
372
} ,
373
+ Zwj => match cat { // rule WB3c
374
+ wd:: WC_ZWJ => continue ,
375
+ _ => {
376
+ take_curr = false ;
377
+ break ;
378
+ }
379
+ } ,
359
380
Letter | HLetter => match cat {
360
381
wd:: WC_ALetter => Letter , // rule WB5
361
382
wd:: WC_Hebrew_Letter => HLetter , // rule WB5
0 commit comments