Fix forward word boundary iteration · cmyr/unicode-segmentation@858d594 · GitHub
[go: up one dir, main page]

Skip to content

Commit 858d594

Browse files
Manishearth authored and mbrubeck committed
Fix forward word boundary iteration
1 parent c80e5a3 commit 858d594

File tree

3 files changed

+47
-15
lines changed

3 files changed

+47
-15
lines changed

src/tables.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -868,7 +868,7 @@ pub mod word {
868868
pub use self::WordCat::*;
869869

870870
#[allow(non_camel_case_types)]
871-
#[derive(Clone, Copy, PartialEq, Eq)]
871+
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
872872
pub enum WordCat {
873873
WC_ALetter,
874874
WC_Any,

src/test.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,16 @@ fn test_words() {
9292
use testdata::TEST_WORD;
9393

9494
for &(s, w) in TEST_WORD {
95+
macro_rules! assert_ {
96+
($x:expr) => (assert!($x, "Word test {} for testcase ({:?}, {:?}) failed", stringify!($x), s, w))
97+
}
9598
// test forward iterator
96-
assert!(s.split_word_bounds()
99+
assert_!(s.split_word_bounds()
97100
.zip(w.iter().cloned())
98101
.all(|(a,b)| a == b));
99102

100103
// test reverse iterator
101-
assert!(s.split_word_bounds().rev()
104+
assert_!(s.split_word_bounds().rev()
102105
.zip(w.iter().rev().cloned())
103106
.all(|(a,b)| a == b));
104107

@@ -111,12 +114,12 @@ fn test_words() {
111114
let indices = indices;
112115

113116
// test forward indices iterator
114-
assert!(s.split_word_bound_indices()
117+
assert_!(s.split_word_bound_indices()
115118
.zip(indices.iter())
116119
.all(|((l,_),m)| l == *m));
117120

118121
// test backward indices iterator
119-
assert!(s.split_word_bound_indices().rev()
122+
assert_!(s.split_word_bound_indices().rev()
120123
.zip(indices.iter().rev())
121124
.all(|((l,_),m)| l == *m));
122125
}

src/word.rs

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
7272
}
7373

7474
// state machine for word boundary rules
75-
#[derive(Clone,Copy,PartialEq,Eq)]
75+
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
7676
enum UWordBoundsState {
7777
Start,
7878
Letter,
@@ -82,12 +82,12 @@ enum UWordBoundsState {
8282
ExtendNumLet,
8383
Regional,
8484
FormatExtend(FormatExtendType),
85-
Zwj,
85+
Zwj(bool),
8686
Emoji,
8787
}
8888

8989
// subtypes for FormatExtend state in UWordBoundsState
90-
#[derive(Clone,Copy,PartialEq,Eq)]
90+
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
9191
enum FormatExtendType {
9292
AcceptAny,
9393
AcceptNone,
@@ -122,6 +122,7 @@ impl<'a> Iterator for UWordBounds<'a> {
122122
let mut state = Start;
123123
let mut cat = wd::WC_Any;
124124
let mut savecat = wd::WC_Any;
125+
125126
for (curr, ch) in self.string.char_indices() {
126127
idx = curr;
127128

@@ -140,9 +141,31 @@ impl<'a> Iterator for UWordBounds<'a> {
140141
// (This is not obvious from the wording of UAX#29, but if you look at the
141142
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
142143
// then the "correct" interpretation of WB4 becomes apparent.)
144+
//
145+
// WB4 makes all ZWJs collapse into the previous state
146+
// but you can still be in a Zwj state if you started with Zwj
147+
//
148+
// This means that Zwj + Extend will collapse into Zwj, which is wrong,
149+
// since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
150+
// and that rule (WB3c) has higher priority
151+
//
152+
// Thus, when in the Zwj state, we track if the last collapsed character is also
153+
// a ZWJ. If it isn't, we treat that as a "tainted" zwj, which is basically
154+
// of the form ZWJ (Format | Extend | ZWJ)* (Format | Extend).
143155
if state != Start {
144156
match cat {
145-
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => continue,
157+
wd::WC_Extend | wd::WC_Format => {
158+
if let Zwj(ref mut taint) = state {
159+
*taint = true;
160+
}
161+
continue
162+
}
163+
wd::WC_ZWJ => {
164+
if let Zwj(ref mut taint) = state {
165+
*taint = false;
166+
}
167+
continue
168+
}
146169
_ => {}
147170
}
148171
}
@@ -163,7 +186,7 @@ impl<'a> Iterator for UWordBounds<'a> {
163186
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
164187
wd::WC_Regional_Indicator => Regional, // rule WB13c
165188
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
166-
wd::WC_ZWJ => Zwj, // rule WB3c
189+
wd::WC_ZWJ => Zwj(false), // rule WB3c
167190
wd::WC_E_Base | wd::WC_E_Base_GAZ => Emoji, // rule WB14
168191
_ => {
169192
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
@@ -176,8 +199,14 @@ impl<'a> Iterator for UWordBounds<'a> {
176199
break; // rule WB999
177200
}
178201
},
179-
Zwj => match cat { // rule WB3c
180-
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => continue,
202+
Zwj(true) => {
203+
take_curr = false;
204+
break
205+
}
206+
Zwj(false) => match cat { // rule WB3c
207+
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => {
208+
continue;
209+
},
181210
_ => {
182211
take_curr = false;
183212
break;
@@ -329,7 +358,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
329358
// Hebrew Letter immediately before it.
330359
// (2) Format and Extend char handling takes some gymnastics.
331360

332-
if cat == wd::WC_Extend || cat == wd::WC_Format {
361+
if cat == wd::WC_Extend || cat == wd::WC_Format || cat == wd::WC_ZWJ {
333362
if match state {
334363
FormatExtend(_) | Start => false,
335364
_ => true
@@ -357,7 +386,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
357386
wd::WC_Katakana => Katakana, // rule WB13, WB13b
358387
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
359388
wd::WC_Regional_Indicator => Regional, // rule WB13c
360-
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj, // rule WB3c
389+
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj(false), // rule WB3c
361390
// rule WB4:
362391
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
363392
wd::WC_Single_Quote => {
@@ -380,7 +409,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
380409
},
381410
_ => break // rule WB999
382411
},
383-
Zwj => match cat { // rule WB3c
412+
Zwj(_) => match cat { // rule WB3c
384413
wd::WC_ZWJ => continue,
385414
_ => {
386415
take_curr = false;

0 commit comments

Comments
 (0)
0