Merge pull request #68 from unicode-rs/unicode-11 · unicode-rs/unicode-segmentation@b159d9e · GitHub
[go: up one dir, main page]

Skip to content

Commit b159d9e

Browse files
authored
Merge pull request #68 from unicode-rs/unicode-11
Update to Unicode 11
2 parents 7be58ca + df71866 commit b159d9e

File tree

8 files changed

+2284
-2394
lines changed

8 files changed

+2284
-2394
lines changed

scripts/unicode.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,21 @@
5454
# these are the surrogate codepoints, which are not valid rust characters
5555
surrogate_codepoints = (0xd800, 0xdfff)
5656

57+
UNICODE_VERSION = (11, 0, 0)
58+
59+
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
60+
5761
def is_surrogate(n):
5862
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
5963

6064
def fetch(f):
6165
if not os.path.exists(os.path.basename(f)):
62-
os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
63-
% f)
66+
if "emoji" in f:
67+
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+
% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
69+
else:
70+
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+
% (UNICODE_VERSION_NUMBER, f))
6472

6573
if not os.path.exists(os.path.basename(f)):
6674
sys.stderr.write("cannot load %s" % f)
@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
262270
pub use self::%sCat::*;
263271
264272
#[allow(non_camel_case_types)]
265-
#[derive(Clone, Copy, PartialEq, Eq)]
273+
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
266274
pub enum %sCat {
267275
""" % (name, Name, Name))
268276

@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
305313
with open(r, "w") as rf:
306314
# write the file's preamble
307315
rf.write(preamble)
308-
309-
# download and parse all the data
310-
fetch("ReadMe.txt")
311-
with open("ReadMe.txt") as readme:
312-
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313-
unicode_version = re.search(pattern, readme.read()).groups()
314316
rf.write("""
315317
/// The version of [Unicode](http://www.unicode.org/)
316318
/// that this version of unicode-segmentation is based on.
317319
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
318-
""" % unicode_version)
320+
""" % UNICODE_VERSION)
319321

322+
# download and parse all the data
320323
gencats = load_gencats("UnicodeData.txt")
321324
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
322325

@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
341344
grapheme_table = []
342345
for cat in grapheme_cats:
343346
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
347+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
348+
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
344349
grapheme_table.sort(key=lambda w: w[0])
345-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
350+
last = -1
351+
for chars in grapheme_table:
352+
if chars[0] <= last:
353+
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
354+
last = chars[1]
355+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
346356
rf.write("\n")
347357

348358
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
352362
word_table.sort(key=lambda w: w[0])
353363
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
354364

365+
# There are some emoji which are also ALetter, so this needs to be stored separately
366+
# For efficiency, we could still merge the two tables and produce an ALetterEP state
367+
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
368+
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
369+
355370
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
356371
sentence_table = []
357372
for cat in sentence_cats:

scripts/unicode_gen_breaktests.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def create_grapheme_data(f):
172172
stype = "&'static [(&'static str, &'static [&'static str])]"
173173
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
174174
f.write(" // official Unicode test data\n")
175-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
175+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
176176
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
177177
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
178178

@@ -187,7 +187,7 @@ def create_words_data(f):
187187

188188
wtype = "&'static [(&'static str, &'static [&'static str])]"
189189
f.write(" // official Unicode test data\n")
190-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
190+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
191191
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
192192

193193
def create_sentence_data(f):
@@ -201,7 +201,7 @@ def create_sentence_data(f):
201201

202202
wtype = "&'static [(&'static str, &'static [&'static str])]"
203203
f.write(" // official Unicode test data\n")
204-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
204+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
205205
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
206206

207207
if __name__ == "__main__":

src/grapheme.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,8 @@ enum GraphemeState {
147147
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
148148
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
149149
Regional,
150-
// The codepoint after is in the E_Modifier category, so whether it's a boundary
151-
// depends on pre-context according to GB10.
150+
// The codepoint after is Extended_Pictographic,
151+
// so whether it's a boundary depends on pre-context according to GB11.
152152
Emoji,
153153
}
154154

@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
239239
(_, GC_ZWJ) => NotBreak, // GB9
240240
(_, GC_SpacingMark) => Extended, // GB9a
241241
(GC_Prepend, _) => Extended, // GB9b
242-
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
243-
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
244-
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
245-
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
246-
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
242+
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
247243
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
248244
(_, _) => Break, // GB999
249245
}
@@ -415,10 +411,17 @@ impl GraphemeCursor {
415411

416412
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
417413
use tables::grapheme as gr;
418-
for ch in chunk.chars().rev() {
414+
let mut iter = chunk.chars().rev();
415+
if let Some(ch) = iter.next() {
416+
if gr::grapheme_category(ch) != gr::GC_ZWJ {
417+
self.decide(true);
418+
return;
419+
}
420+
}
421+
for ch in iter {
419422
match gr::grapheme_category(ch) {
420423
gr::GC_Extend => (),
421-
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
424+
gr::GC_Extended_Pictographic => {
422425
self.decide(false);
423426
return;
424427
}
@@ -484,7 +487,7 @@ impl GraphemeCursor {
484487
let mut need_pre_context = true;
485488
match self.cat_after.unwrap() {
486489
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
487-
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
490+
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
488491
_ => need_pre_context = self.cat_before.is_none(),
489492
}
490493
if need_pre_context {

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
//!
3030
//! let s = "The quick (\"brown\") fox";
3131
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32-
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
32+
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
3333
//! assert_eq!(w, b);
3434
//! }
3535
//! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
156156
/// ```
157157
/// # use self::unicode_segmentation::UnicodeSegmentation;
158158
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
159-
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
159+
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
160160
///
161161
/// assert_eq!(&swu1[..], b);
162162
/// ```

0 commit comments

Comments
 (0)
0