Merge pull request #68 from unicode-rs/unicode-11 · YohDeadfall/unicode-segmentation@b159d9e · GitHub
[go: up one dir, main page]

Skip to content

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit b159d9e

Browse files
authored
Merge pull request unicode-rs#68 from unicode-rs/unicode-11
Update to Unicode 11
2 parents 7be58ca + df71866 commit b159d9e

File tree

8 files changed

+2284
-2394
lines changed

8 files changed

+2284
-2394
lines changed

scripts/unicode.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,21 @@
5454
# these are the surrogate codepoints, which are not valid rust characters
5555
surrogate_codepoints = (0xd800, 0xdfff)
5656

57+
UNICODE_VERSION = (11, 0, 0)
58+
59+
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
60+
5761
def is_surrogate(n):
5862
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
5963

6064
def fetch(f):
6165
if not os.path.exists(os.path.basename(f)):
62-
os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
63-
% f)
66+
if "emoji" in f:
67+
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+
% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
69+
else:
70+
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+
% (UNICODE_VERSION_NUMBER, f))
6472

6573
if not os.path.exists(os.path.basename(f)):
6674
sys.stderr.write("cannot load %s" % f)
@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
262270
pub use self::%sCat::*;
263271
264272
#[allow(non_camel_case_types)]
265-
#[derive(Clone, Copy, PartialEq, Eq)]
273+
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
266274
pub enum %sCat {
267275
""" % (name, Name, Name))
268276

@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
305313
with open(r, "w") as rf:
306314
# write the file's preamble
307315
rf.write(preamble)
308-
309-
# download and parse all the data
310-
fetch("ReadMe.txt")
311-
with open("ReadMe.txt") as readme:
312-
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313-
unicode_version = re.search(pattern, readme.read()).groups()
314316
rf.write("""
315317
/// The version of [Unicode](http://www.unicode.org/)
316318
/// that this version of unicode-segmentation is based on.
317319
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
318-
""" % unicode_version)
320+
""" % UNICODE_VERSION)
319321

322+
# download and parse all the data
320323
gencats = load_gencats("UnicodeData.txt")
321324
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
322325

@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
341344
grapheme_table = []
342345
for cat in grapheme_cats:
343346
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
347+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
348+
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
344349
grapheme_table.sort(key=lambda w: w[0])
345-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
350+
last = -1
351+
for chars in grapheme_table:
352+
if chars[0] <= last:
353+
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
354+
last = chars[1]
355+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
346356
rf.write("\n")
347357

348358
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
352362
word_table.sort(key=lambda w: w[0])
353363
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
354364

365+
# There are some emoji which are also ALetter, so this needs to be stored separately
366+
# For efficiency, we could still merge the two tables and produce an ALetterEP state
367+
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
368+
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
369+
355370
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
356371
sentence_table = []
357372
for cat in sentence_cats:

scripts/unicode_gen_breaktests.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def create_grapheme_data(f):
172172
stype = "&'static [(&'static str, &'static [&'static str])]"
173173
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
174174
f.write(" // official Unicode test data\n")
175-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
175+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
176176
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
177177
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
178178

@@ -187,7 +187,7 @@ def create_words_data(f):
187187

188188
wtype = "&'static [(&'static str, &'static [&'static str])]"
189189
f.write(" // official Unicode test data\n")
190-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
190+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
191191
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
192192

193193
def create_sentence_data(f):
@@ -201,7 +201,7 @@ def create_sentence_data(f):
201201

202202
wtype = "&'static [(&'static str, &'static [&'static str])]"
203203
f.write(" // official Unicode test data\n")
204-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
204+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
205205
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
206206

207207
if __name__ == "__main__":

src/grapheme.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,8 @@ enum GraphemeState {
147147
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
148148
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
149149
Regional,
150-
// The codepoint after is in the E_Modifier category, so whether it's a boundary
151-
// depends on pre-context according to GB10.
150+
// The codepoint after is Extended_Pictographic,
151+
// so whether it's a boundary depends on pre-context according to GB11.
152152
Emoji,
153153
}
154154

@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
239239
(_, GC_ZWJ) => NotBreak, // GB9
240240
(_, GC_SpacingMark) => Extended, // GB9a
241241
(GC_Prepend, _) => Extended, // GB9b
242-
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
243-
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
244-
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
245-
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
246-
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
242+
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
247243
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
248244
(_, _) => Break, // GB999
249245
}
@@ -415,10 +411,17 @@ impl GraphemeCursor {
415411

416412
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
417413
use tables::grapheme as gr;
418-
for ch in chunk.chars().rev() {
414+
let mut iter = chunk.chars().rev();
415+
if let Some(ch) = iter.next() {
416+
if gr::grapheme_category(ch) != gr::GC_ZWJ {
417+
self.decide(true);
418+
return;
419+
}
420+
}
421+
for ch in iter {
419422
match gr::grapheme_category(ch) {
420423
gr::GC_Extend => (),
421-
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
424+
gr::GC_Extended_Pictographic => {
422425
self.decide(false);
423426
return;
424427
}
@@ -484,7 +487,7 @@ impl GraphemeCursor {
484487
let mut need_pre_context = true;
485488
match self.cat_after.unwrap() {
486489
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
487-
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
490+
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
488491
_ => need_pre_context = self.cat_before.is_none(),
489492
}
490493
if need_pre_context {

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
//!
3030
//! let s = "The quick (\"brown\") fox";
3131
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32-
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
32+
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
3333
//! assert_eq!(w, b);
3434
//! }
3535
//! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
156156
/// ```
157157
/// # use self::unicode_segmentation::UnicodeSegmentation;
158158
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
159-
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
159+
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
160160
///
161161
/// assert_eq!(&swu1[..], b);
162162
/// ```

0 commit comments

Comments
 (0)
0