@@ -72,7 +72,7 @@ impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
72
72
}
73
73
74
74
// state machine for word boundary rules
75
- #[ derive( Clone , Copy , PartialEq , Eq ) ]
75
+ #[ derive( Clone , Copy , PartialEq , Eq , Debug ) ]
76
76
enum UWordBoundsState {
77
77
Start ,
78
78
Letter ,
@@ -82,12 +82,12 @@ enum UWordBoundsState {
82
82
ExtendNumLet ,
83
83
Regional ,
84
84
FormatExtend ( FormatExtendType ) ,
85
- Zwj ,
85
+ Zwj ( bool ) ,
86
86
Emoji ,
87
87
}
88
88
89
89
// subtypes for FormatExtend state in UWordBoundsState
90
- #[ derive( Clone , Copy , PartialEq , Eq ) ]
90
+ #[ derive( Clone , Copy , PartialEq , Eq , Debug ) ]
91
91
enum FormatExtendType {
92
92
AcceptAny ,
93
93
AcceptNone ,
@@ -122,6 +122,7 @@ impl<'a> Iterator for UWordBounds<'a> {
122
122
let mut state = Start ;
123
123
let mut cat = wd:: WC_Any ;
124
124
let mut savecat = wd:: WC_Any ;
125
+
125
126
for ( curr, ch) in self . string . char_indices ( ) {
126
127
idx = curr;
127
128
@@ -140,9 +141,31 @@ impl<'a> Iterator for UWordBounds<'a> {
140
141
// (This is not obvious from the wording of UAX#29, but if you look at the
141
142
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
142
143
// then the "correct" interpretation of WB4 becomes apparent.)
144
+ //
145
+ // WB4 makes all ZWJs collapse into the previous state
146
+ // but you can still be in a Zwj state if you started with Zwj
147
+ //
148
+ // This means that Zwj + Extend will collapse into Zwj, which is wrong,
149
+ // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
150
+ // and that rule (WB3c) has higher priority
151
+ //
152
+ // Thus, when in the Zwj state, we track if the last collapsed character is also
153
+ // a ZWJ. If it isn't, we treat that as a "tainted" zwj, which is basically
154
+ // of the form ZWJ (Format | Extend | ZWJ)* (Format | Extend).
143
155
if state != Start {
144
156
match cat {
145
- wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => continue ,
157
+ wd:: WC_Extend | wd:: WC_Format => {
158
+ if let Zwj ( ref mut taint) = state {
159
+ * taint = true ;
160
+ }
161
+ continue
162
+ }
163
+ wd:: WC_ZWJ => {
164
+ if let Zwj ( ref mut taint) = state {
165
+ * taint = false ;
166
+ }
167
+ continue
168
+ }
146
169
_ => { }
147
170
}
148
171
}
@@ -163,7 +186,7 @@ impl<'a> Iterator for UWordBounds<'a> {
163
186
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
164
187
wd:: WC_Regional_Indicator => Regional , // rule WB13c
165
188
wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
166
- wd:: WC_ZWJ => Zwj , // rule WB3c
189
+ wd:: WC_ZWJ => Zwj ( false ) , // rule WB3c
167
190
wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
168
191
_ => {
169
192
if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
@@ -176,8 +199,14 @@ impl<'a> Iterator for UWordBounds<'a> {
176
199
break ; // rule WB999
177
200
}
178
201
} ,
179
- Zwj => match cat { // rule WB3c
180
- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => continue ,
202
+ Zwj ( true ) => {
203
+ take_curr = false ;
204
+ break
205
+ }
206
+ Zwj ( false ) => match cat { // rule WB3c
207
+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => {
208
+ continue ;
209
+ } ,
181
210
_ => {
182
211
take_curr = false ;
183
212
break ;
@@ -329,7 +358,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
329
358
// Hebrew Letter immediately before it.
330
359
// (2) Format and Extend char handling takes some gymnastics.
331
360
332
- if cat == wd:: WC_Extend || cat == wd:: WC_Format {
361
+ if cat == wd:: WC_Extend || cat == wd:: WC_Format || cat == wd :: WC_ZWJ {
333
362
if match state {
334
363
FormatExtend ( _) | Start => false ,
335
364
_ => true
@@ -357,7 +386,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
357
386
wd:: WC_Katakana => Katakana , // rule WB13, WB13b
358
387
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
359
388
wd:: WC_Regional_Indicator => Regional , // rule WB13c
360
- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
389
+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj ( false ) , // rule WB3c
361
390
// rule WB4:
362
391
wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
363
392
wd:: WC_Single_Quote => {
@@ -380,7 +409,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
380
409
} ,
381
410
_ => break // rule WB999
382
411
} ,
383
- Zwj => match cat { // rule WB3c
412
+ Zwj ( _ ) => match cat { // rule WB3c
384
413
wd:: WC_ZWJ => continue ,
385
414
_ => {
386
415
take_curr = false ;
0 commit comments