Move update script over to Unicode 11; make it handle emoji data · simmsb/unicode-segmentation@9d0c1e0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9d0c1e0

Browse files
committed
Move update script over to Unicode 11; make it handle emoji data
1 parent 666eeed commit 9d0c1e0

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

scripts/unicode.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
# these are the surrogate codepoints, which are not valid rust characters
5555
surrogate_codepoints = (0xd800, 0xdfff)
5656

57-
UNICODE_VERSION = (10, 0, 0)
57+
UNICODE_VERSION = (11, 0, 0)
5858

5959
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
6060

@@ -63,8 +63,12 @@ def is_surrogate(n):
6363

6464
def fetch(f):
6565
if not os.path.exists(os.path.basename(f)):
66-
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
67-
% (UNICODE_VERSION_NUMBER, f))
66+
if "emoji" in f:
67+
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+
% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
69+
else:
70+
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+
% (UNICODE_VERSION_NUMBER, f))
6872

6973
if not os.path.exists(os.path.basename(f)):
7074
sys.stderr.write("cannot load %s" % f)
@@ -266,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
266270
pub use self::%sCat::*;
267271
268272
#[allow(non_camel_case_types)]
269-
#[derive(Clone, Copy, PartialEq, Eq)]
273+
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
270274
pub enum %sCat {
271275
""" % (name, Name, Name))
272276

@@ -340,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
340344
grapheme_table = []
341345
for cat in grapheme_cats:
342346
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
347+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
348+
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
343349
grapheme_table.sort(key=lambda w: w[0])
344-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
350+
last = -1
351+
for chars in grapheme_table:
352+
if chars[0] <= last:
353+
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
354+
last = chars[1]
355+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
345356
rf.write("\n")
346357

347358
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
@@ -351,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
351362
word_table.sort(key=lambda w: w[0])
352363
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
353364

365+
# There are some emoji which are also ALetter, so this needs to be stored separately
366+
# For efficiency, we could still merge the two tables and produce an ALetterEP state
367+
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
368+
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
369+
354370
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
355371
sentence_table = []
356372
for cat in sentence_cats:

0 commit comments

Comments
 (0)
0