54
54
# These are the surrogate code points, which are not valid Rust characters.
55
55
surrogate_codepoints = (0xd800 , 0xdfff )
56
56
57
+ UNICODE_VERSION = (11 , 0 , 0 )
58
+
59
+ UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
60
+
57
61
def is_surrogate(n):
    """Return True when ``n`` falls inside the surrogate range.

    The range is read from the module-level ``surrogate_codepoints`` pair;
    both its first and second entries are treated as inclusive bounds.
    """
    lower_bound = surrogate_codepoints[0]
    upper_bound = surrogate_codepoints[1]
    return lower_bound <= n <= upper_bound
59
63
60
64
def fetch (f ):
61
65
if not os .path .exists (os .path .basename (f )):
62
- os .system ("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
63
- % f )
66
+ if "emoji" in f :
67
+ os .system ("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68
+ % (UNICODE_VERSION [0 ], UNICODE_VERSION [1 ], f ))
69
+ else :
70
+ os .system ("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71
+ % (UNICODE_VERSION_NUMBER , f ))
64
72
65
73
if not os .path .exists (os .path .basename (f )):
66
74
sys .stderr .write ("cannot load %s" % f )
@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
262
270
pub use self::%sCat::*;
263
271
264
272
#[allow(non_camel_case_types)]
265
- #[derive(Clone, Copy, PartialEq, Eq)]
273
+ #[derive(Clone, Copy, PartialEq, Eq, Debug )]
266
274
pub enum %sCat {
267
275
""" % (name , Name , Name ))
268
276
@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
305
313
with open (r , "w" ) as rf :
306
314
# write the file's preamble
307
315
rf .write (preamble )
308
-
309
- # download and parse all the data
310
- fetch ("ReadMe.txt" )
311
- with open ("ReadMe.txt" ) as readme :
312
- pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313
- unicode_version = re .search (pattern , readme .read ()).groups ()
314
316
rf .write ("""
315
317
/// The version of [Unicode](http://www.unicode.org/)
316
318
/// that this version of unicode-segmentation is based on.
317
319
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
318
- """ % unicode_version )
320
+ """ % UNICODE_VERSION )
319
321
322
+ # download and parse all the data
320
323
gencats = load_gencats ("UnicodeData.txt" )
321
324
derived = load_properties ("DerivedCoreProperties.txt" , ["Alphabetic" ])
322
325
@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
341
344
grapheme_table = []
342
345
for cat in grapheme_cats :
343
346
grapheme_table .extend ([(x , y , cat ) for (x , y ) in grapheme_cats [cat ]])
347
+ emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
348
+ grapheme_table .extend ([(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]])
344
349
grapheme_table .sort (key = lambda w : w [0 ])
345
- emit_break_module (rf , grapheme_table , list (grapheme_cats .keys ()), "grapheme" )
350
+ last = - 1
351
+ for chars in grapheme_table :
352
+ if chars [0 ] <= last :
353
+ raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
354
+ last = chars [1 ]
355
+ emit_break_module (rf , grapheme_table , list (grapheme_cats .keys ()) + ["Extended_Pictographic" ], "grapheme" )
346
356
rf .write ("\n " )
347
357
348
358
word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [])
@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
352
362
word_table .sort (key = lambda w : w [0 ])
353
363
emit_break_module (rf , word_table , list (word_cats .keys ()), "word" )
354
364
365
+ # There are some emoji which are also ALetter, so this needs to be stored separately
366
+ # For efficiency, we could still merge the two tables and produce an ALetterEP state
367
+ emoji_table = [(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]]
368
+ emit_break_module (rf , emoji_table , ["Extended_Pictographic" ], "emoji" )
369
+
355
370
sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [])
356
371
sentence_table = []
357
372
for cat in sentence_cats :
0 commit comments