Implement new rule GB9b (Do not break after Prepend) · cmyr/unicode-segmentation@bc121b5 · GitHub
[go: up one dir, main page]

Skip to content

Commit bc121b5

Browse files
committed
Implement new rule GB9b (Do not break after Prepend)
1 parent e3754bc commit bc121b5

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

src/grapheme.rs

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ enum GraphemeState {
5858
HangulL,
5959
HangulLV,
6060
HangulLVT,
61+
Prepend,
6162
Regional,
6263
Emoji,
6364
Zwj,
@@ -123,6 +124,7 @@ impl<'a> Iterator for Graphemes<'a> {
123124
gr::GC_L => HangulL,
124125
gr::GC_LV | gr::GC_V => HangulLV,
125126
gr::GC_LVT | gr::GC_T => HangulLVT,
127+
gr::GC_Prepend if self.extended => Prepend,
126128
gr::GC_Regional_Indicator => Regional,
127129
gr::GC_E_Base | gr::GC_E_Base_GAZ => Emoji,
128130
_ => FindExtend
@@ -155,6 +157,13 @@ impl<'a> Iterator for Graphemes<'a> {
155157
break;
156158
}
157159
},
160+
Prepend => match cat { // rule GB9b
161+
gr::GC_Control => {
162+
take_curr = false;
163+
break;
164+
}
165+
_ => continue
166+
},
158167
Regional => match cat { // rule GB12/GB13
159168
gr::GC_Regional_Indicator => FindExtend,
160169
_ => {
@@ -276,6 +285,10 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
276285
break;
277286
}
278287
},
288+
Prepend => {
289+
// not used in reverse iteration
290+
unreachable!()
291+
},
279292
Regional => { // rule GB12/GB13
280293
// Need to scan backward to find if this is preceded by an odd or even number
281294
// of Regional_Indicator characters.
@@ -340,6 +353,17 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
340353
Some(cat)
341354
};
342355

356+
if self.extended && cat != gr::GC_Control {
357+
// rule GB9b: include any preceding Prepend characters
358+
for (i, c) in self.string[..idx].char_indices().rev() {
359+
// TODO: Cache this to avoid repeated lookups in the common case.
360+
match gr::grapheme_category(c) {
361+
gr::GC_Prepend => idx = i,
362+
_ => break
363+
}
364+
}
365+
}
366+
343367
let retstr = &self.string[idx..];
344368
self.string = &self.string[..idx];
345369
Some(retstr)

0 commit comments

Comments
 (0)
0