30
30
//! # Rules for determining width
31
31
//!
32
32
//! This crate currently uses the following rules to determine the width of a
33
- //! character or string, in order of decreasing precedence. These may be tweaked in the future.
33
+ //! character or string, in order of decreasing precedence. These may be tweaked in the future;
34
+ //! however, see [guarantees](#guarantees) below.
34
35
//!
35
36
//! 1. [Emoji presentation sequences] have width 2.
36
- //! (The width of a string may therefore differ from the sum of the widths of its characters.)
37
- //! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
38
- //! have width 1:
37
+ //! 2. Outside of an East Asian context, [text presentation sequences] have width 1
38
+ //! if their base character:
39
39
//! - Has the [`Emoji_Presentation`] property, and
40
- //! - Not in the [Enclosed Ideographic Supplement] block.
40
+ //! - Is not in the [Enclosed Ideographic Supplement] block.
41
41
//! 3. The sequence `"\r\n"` has width 1.
42
42
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43
43
//! 5. The following have width 0:
44
44
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
45
- //! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
45
+ //! with the [`Default_Ignorable_Code_Point`] property.
46
46
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
47
47
//! with the [`Grapheme_Extend`] property.
48
48
//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters:
62
62
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
63
63
//! 8. All other characters have width 1.
64
64
//!
65
+ //! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
65
66
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
66
67
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
67
68
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
71
72
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
72
73
//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
73
74
//!
74
- //! [Emoji presentation sequences]: ( https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
75
- //! [text presentation sequences]: ( https://unicode.org/reports/tr51/#def_text_presentation_sequence)
75
+ //! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
76
+ //! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
76
77
//!
77
78
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
78
79
//!
79
- //! ## Canonical equivalence
80
+ //! ## Guarantees
80
81
//!
81
- //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
82
- //! However, this guarantee does not currently hold for the CJK width variants.
82
+ //! - Any two canonically equivalent strings have the same non-CJK width.
83
+ //! This will not change in any future semver-compatible version.
84
+ //! (This guarantee does not currently hold for the CJK width variants.)
85
+ //! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
86
+ //! This is unlikely to change in any future semver-compatible version.
87
+ //! (This guarantee holds for both CJK and non-CJK width.)
88
+ //!
89
+ //! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
83
90
84
91
#![ forbid( unsafe_code) ]
85
92
#![ deny( missing_docs) ]
@@ -95,6 +102,14 @@ pub use tables::UNICODE_VERSION;
95
102
mod tables;
96
103
97
104
/// Methods for determining displayed width of Unicode characters.
105
+ ///
106
+ /// **NB:** the width of a string may differ from the sum of the widths of its characters;
107
+ /// see the [crate-level documentation](crate#rules-for-determining-width) for more.
108
+ /// Instead of working with individual characters, consider using [extended grapheme clusters],
109
+ /// perhaps with the [`unicode-segmentation`] crate.
110
+ ///
111
+ /// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
112
+ /// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
98
113
pub trait UnicodeWidthChar {
99
114
/// Returns the character's displayed width in columns, or `None` if the
100
115
/// character is a control character.
0 commit comments