Add comments and doc tests · rth/unicode-segmentation@0083ef5 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0083ef5

Browse files
committed
Add comments and doc tests
Some of the doc tests also encouraged me to tweak the implementation.
1 parent 4a76978 commit 0083ef5

File tree

1 file changed

+141
-12
lines changed

1 file changed

+141
-12
lines changed

src/grapheme.rs

Lines changed: 141 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -135,28 +135,51 @@ pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndice
135135
}
136136

137137
// maybe unify with PairResult?
138+
// An enum describing information about a potential boundary.
138139
#[derive(PartialEq, Eq, Clone)]
139140
enum GraphemeState {
141+
// No information is known.
140142
Unknown,
143+
// It is known to not be a boundary.
141144
NotBreak,
145+
// It is known to be a boundary.
142146
Break,
147+
// The codepoint after is LF, so a boundary iff the codepoint before is not CR. (GB3)
143148
CheckCrlf,
149+
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
150+
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
144151
Regional,
152+
// The codepoint after is in the E_Modifier category, so whether it's a boundary
153+
// depends on pre-context according to GB10.
145154
Emoji,
146155
}
147156

148157
/// Cursor-based segmenter for grapheme clusters.
149158
#[derive(Clone)]
150159
pub struct GraphemeCursor {
151-
offset: usize, // current cursor position
152-
len: usize, // total length of the string
160+
// Current cursor position.
161+
offset: usize,
162+
// Total length of the string.
163+
len: usize,
164+
// A config flag indicating whether this cursor computes legacy or extended
165+
// grapheme cluster boundaries (enables GB9a and GB9b if set).
153166
is_extended: bool,
167+
// Information about the potential boundary at `offset`
154168
state: GraphemeState,
155-
cat_before: Option<GraphemeCat>, // category of codepoint immediately preceding cursor
156-
cat_after: Option<GraphemeCat>, // category of codepoint immediately after cursor
169+
// Category of codepoint immediately preceding cursor, if known.
170+
cat_before: Option<GraphemeCat>,
171+
// Category of codepoint immediately after cursor, if known.
172+
cat_after: Option<GraphemeCat>,
173+
// If set, at least one more codepoint immediately preceding this offset
174+
// is needed to resolve whether there's a boundary at `offset`.
157175
pre_context_offset: Option<usize>,
176+
// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
177+
// is set, then counts the number of RIS between that and `offset`, otherwise
178+
// is an accurate count relative to the string.
158179
ris_count: Option<usize>,
159-
resuming: bool, // query was suspended
180+
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
181+
// to needing more input.
182+
resuming: bool,
160183
}
161184

162185
/// An error return indicating that not enough content was available in the
@@ -183,14 +206,15 @@ pub enum GraphemeIncomplete {
183206
InvalidOffset,
184207
}
185208

209+
// An enum describing the result from lookup of a pair of categories.
186210
#[derive(PartialEq, Eq)]
187211
enum PairResult {
188212
NotBreak, // definitely not a break
189213
Break, // definitely a break
190-
Extended, // a break if not in extended mode
214+
Extended, // a break iff not in extended mode
191215
CheckCrlf, // a break unless it's a CR LF pair
192216
Regional, // a break if preceded by an even number of RIS
193-
Emoji, // a break if preceded by emoji base and extend
217+
Emoji, // a break if preceded by emoji base and (Extend)*
194218
}
195219

196220
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
@@ -213,7 +237,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
213237
(_, GC_Extend) => NotBreak, // GB9
214238
(_, GC_ZWJ) => NotBreak, // GB9
215239
(_, GC_SpacingMark) => Extended, // GB9a
216-
(GC_Prepend, _) => Extended, // GB9a
240+
(GC_Prepend, _) => Extended, // GB9b
217241
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
218242
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
219243
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
@@ -230,6 +254,15 @@ impl GraphemeCursor {
230254
/// controls whether extended grapheme clusters are selected.
231255
///
232256
/// The `offset` parameter must be on a codepoint boundary.
257+
///
258+
/// ```rust
259+
/// # use unicode_segmentation::GraphemeCursor;
260+
/// let s = "हिन्दी";
261+
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
262+
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
263+
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
264+
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
265+
/// ```
233266
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
234267
let state = if offset == 0 || offset == len {
235268
GraphemeState::Break
@@ -252,6 +285,15 @@ impl GraphemeCursor {
252285
// Not sure I'm gonna keep this, the advantage over new() seems thin.
253286

254287
/// Set the cursor to a new location in the same string.
288+
///
289+
/// ```rust
290+
/// # use unicode_segmentation::GraphemeCursor;
291+
/// let s = "abcd";
292+
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
293+
/// assert_eq!(cursor.cur_cursor(), 0);
294+
/// cursor.set_cursor(2);
295+
/// assert_eq!(cursor.cur_cursor(), 2);
296+
/// ```
255297
pub fn set_cursor(&mut self, offset: usize) {
256298
if offset != self.offset {
257299
self.offset = offset;
@@ -270,13 +312,39 @@ impl GraphemeCursor {
270312
/// The current offset of the cursor. Equal to the last value provided to
271313
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
272314
/// `prev_boundary()`.
315+
///
316+
/// ```rust
317+
/// # use unicode_segmentation::GraphemeCursor;
318+
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
319+
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
320+
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
321+
/// assert_eq!(cursor.cur_cursor(), 4);
322+
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
323+
/// assert_eq!(cursor.cur_cursor(), 8);
324+
/// ```
273325
pub fn cur_cursor(&self) -> usize {
274326
self.offset
275327
}
276328

277329
/// Provide additional pre-context when it is needed to decide a boundary.
278330
/// The end of the chunk must coincide with the value given in the
279331
/// `GraphemeIncomplete::PreContext` request.
332+
///
333+
/// ```rust
334+
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
335+
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
336+
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
337+
/// // Not enough pre-context to decide if there's a boundary between the two flags.
338+
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
339+
/// // Provide one more Regional Indicator Symbol of pre-context
340+
/// cursor.provide_context(&flags[4..8], 4);
341+
/// // Still not enough context to decide.
342+
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
343+
/// // Provide additional requested context.
344+
/// cursor.provide_context(&flags[0..4], 0);
345+
/// // That's enough to decide (it always is when context goes to the start of the string)
346+
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
347+
/// ```
280348
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
281349
use tables::grapheme as gr;
282350
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
@@ -379,6 +447,15 @@ impl GraphemeCursor {
379447
/// All calls should have consistent chunk contents (ie, if a chunk provides
380448
/// content for a given slice, all further chunks covering that slice must have
381449
/// the same content for it).
450+
///
451+
/// ```rust
452+
/// # use unicode_segmentation::GraphemeCursor;
453+
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
454+
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
455+
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
456+
/// cursor.set_cursor(12);
457+
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
458+
/// ```
382459
pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
383460
use tables::grapheme as gr;
384461
if self.state == GraphemeState::Break {
@@ -388,7 +465,9 @@ impl GraphemeCursor {
388465
return Ok(false)
389466
}
390467
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
391-
return Err(GraphemeIncomplete::InvalidOffset)
468+
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
469+
return Err(GraphemeIncomplete::InvalidOffset)
470+
}
392471
}
393472
if let Some(pre_context_offset) = self.pre_context_offset {
394473
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
@@ -399,6 +478,7 @@ impl GraphemeCursor {
399478
self.cat_after = Some(gr::grapheme_category(ch));
400479
}
401480
if self.offset == chunk_start {
481+
let mut need_pre_context = true;
402482
match self.cat_after.unwrap() {
403483
gr::GC_Control => {
404484
if chunk.as_bytes()[offset_in_chunk] == b'\n' {
@@ -407,10 +487,12 @@ impl GraphemeCursor {
407487
}
408488
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
409489
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
410-
_ => ()
490+
_ => need_pre_context = self.cat_before.is_none(),
491+
}
492+
if need_pre_context {
493+
self.pre_context_offset = Some(chunk_start);
494+
return Err(GraphemeIncomplete::PreContext(chunk_start));
411495
}
412-
self.pre_context_offset = Some(chunk_start);
413-
return Err(GraphemeIncomplete::PreContext(chunk_start));
414496
}
415497
if self.cat_before.is_none() {
416498
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
@@ -457,6 +539,29 @@ impl GraphemeCursor {
457539
/// given, then retry.
458540
///
459541
/// See `is_boundary` for expectations on the provided chunk.
542+
///
543+
/// ```rust
544+
/// # use unicode_segmentation::GraphemeCursor;
545+
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
546+
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
547+
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
548+
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
549+
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
550+
/// ```
551+
///
552+
/// And an example that uses partial strings:
553+
///
554+
/// ```rust
555+
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
556+
/// let s = "abcd";
557+
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
558+
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
559+
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
560+
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
561+
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
562+
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
563+
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
564+
/// ```
460565
pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
461566
use tables::grapheme as gr;
462567
if self.offset == self.len {
@@ -509,6 +614,30 @@ impl GraphemeCursor {
509614
/// given, then retry.
510615
///
511616
/// See `is_boundary` for expectations on the provided chunk.
617+
///
618+
/// ```rust
619+
/// # use unicode_segmentation::GraphemeCursor;
620+
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
621+
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
622+
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
623+
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
624+
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
625+
/// ```
626+
///
627+
/// And an example that uses partial strings (note the exact return is not
628+
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
629+
///
630+
/// ```rust
631+
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
632+
/// let s = "abcd";
633+
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
634+
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
635+
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
636+
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
637+
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
638+
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
639+
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
640+
/// ```
512641
pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
513642
use tables::grapheme as gr;
514643
if self.offset == 0 {

0 commit comments

Comments
 (0)
0