17
17
# - confusables.txt
18
18
# - ReadMe.txt
19
19
# This script also uses the following Unicode UCD data:
20
+ # - DerivedCoreProperties.txt
20
21
# - Scripts.txt
21
22
#
22
23
# Since this should not require frequent updates, we just store this
@@ -53,6 +54,8 @@ def fetch(f):
53
54
sys .stderr .write ("cannot load %s\n " % f )
54
55
exit (1 )
55
56
57
+ return f
58
+
56
59
# Download a UCD table file
57
60
def fetch_unidata (f ):
58
61
if not os .path .exists (os .path .basename (f )):
@@ -63,14 +66,14 @@ def fetch_unidata(f):
63
66
sys .stderr .write ("cannot load %s" % f )
64
67
exit (1 )
65
68
66
- # Loads code point data from IdentifierStatus.txt and
67
- # IdentifierType.txt
68
- # Implementation from unicode-segmentation
69
+ return f
70
+
71
+ # Loads code point data from the provided filename f
72
+ # Implementation adapted from unicode-segmentation
69
73
def load_properties (f , interestingprops = None ):
70
- fetch (f )
71
74
props = {}
72
- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+) " )
73
- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) " )
75
+ re1 = re .compile (r"^ *([0-9A-F]+) *; *([^#\s]+) *# " )
76
+ re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *# " )
74
77
75
78
for line in fileinput .input (os .path .basename (f ), openhook = fileinput .hook_encoded ("utf-8" )):
76
79
prop = None
@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):
99
102
100
103
return props
101
104
102
- # Loads script data from Scripts.txt
103
- def load_script_properties (f , interestingprops ):
104
- fetch_unidata (f )
105
- props = {}
106
- # Note: these regexes are different from those in unicode-segmentation,
107
- # because we need to handle spaces here
108
- re1 = re .compile (r"^ *([0-9A-F]+) *; *([^#]+) *#" )
109
- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#" )
110
-
111
- for line in fileinput .input (os .path .basename (f )):
112
- prop = None
113
- d_lo = 0
114
- d_hi = 0
115
- m = re1 .match (line )
116
- if m :
117
- d_lo = m .group (1 )
118
- d_hi = m .group (1 )
119
- prop = m .group (2 ).strip ()
120
- else :
121
- m = re2 .match (line )
122
- if m :
123
- d_lo = m .group (1 )
124
- d_hi = m .group (2 )
125
- prop = m .group (3 ).strip ()
126
- else :
127
- continue
128
- if interestingprops and prop not in interestingprops :
129
- continue
130
- d_lo = int (d_lo , 16 )
131
- d_hi = int (d_hi , 16 )
132
- if prop not in props :
133
- props [prop ] = []
134
- props [prop ].append ((d_lo , d_hi ))
135
-
136
- return props
137
-
138
105
# Loads confusables data from confusables.txt
139
106
def load_confusables (f ):
140
107
fetch (f )
@@ -189,7 +156,7 @@ def load_scripts(f):
189
156
# changes are introduced, update accordingly.
190
157
191
158
(longforms , shortforms ) = aliases ()
192
- scripts = load_script_properties ( f , [])
159
+ scripts = load_properties ( fetch_unidata ( f ) , [])
193
160
194
161
script_table = []
195
162
script_list = []
@@ -546,10 +513,10 @@ def emit_identifier_module(f):
546
513
""" )
547
514
548
515
f .write (" // Identifier status table:\n " )
549
- identifier_status_table = load_properties ("IdentifierStatus.txt" )
516
+ identifier_status_table = load_properties (fetch ( "IdentifierStatus.txt" ) )
550
517
emit_table (f , "IDENTIFIER_STATUS" , identifier_status_table ['Allowed' ], "&'static [(char, char)]" , is_pub = False ,
551
518
pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])))
552
- identifier_type = load_properties ("IdentifierType.txt" )
519
+ identifier_type = load_properties (fetch ( "IdentifierType.txt" ) )
553
520
type_table = []
554
521
for ty in identifier_type :
555
522
type_table .extend ([(x , y , ty ) for (x , y ) in identifier_type [ty ]])
@@ -560,6 +527,26 @@ def emit_identifier_module(f):
560
527
pfun = lambda x : "(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]))
561
528
f .write ("}\n \n " )
562
529
530
+ def emit_default_ignorable_detection_module (f ):
531
+ f .write ("pub mod default_ignorable_code_point {" )
532
+ f .write ("""
533
+
534
+ #[inline]
535
+ pub fn default_ignorable_code_point(c: char) -> bool {
536
+ match c as usize {
537
+ _ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
538
+ }
539
+ }
540
+
541
+ """ )
542
+
543
+ f .write (" // Default ignorable code point table:\n " )
544
+ default_ignorable_table = load_properties (fetch_unidata ("DerivedCoreProperties.txt" ), ["Default_Ignorable_Code_Point" ])
545
+ emit_table (f , "DEFAULT_IGNORABLE" , default_ignorable_table ["Default_Ignorable_Code_Point" ], "&'static [(char, char)]" , is_pub = False ,
546
+ pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])))
547
+
548
+ f .write ("}\n \n " )
549
+
563
550
def emit_confusable_detection_module (f ):
564
551
f .write ("pub mod confusable_detection {" )
565
552
f .write ("""
@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
601
588
}
602
589
}
603
590
""" )
604
- identifier_status_table = load_properties ("IdentifierStatus.txt" )
591
+ identifier_status_table = load_properties (fetch ( "IdentifierStatus.txt" ) )
605
592
_ , scripts = load_scripts ("Scripts.txt" )
606
593
identifier_allowed = identifier_status_table ['Allowed' ]
607
594
(mixedscript_confusable , mixedscript_confusable_unresolved ) = load_potential_mixedscript_confusables ("confusables.txt" , identifier_allowed , scripts )
@@ -688,6 +675,8 @@ def emit_util_mod(f):
688
675
emit_util_mod (rf )
689
676
### identifier module
690
677
emit_identifier_module (rf )
678
+ ### default_ignorable_detection module
679
+ emit_default_ignorable_detection_module (rf )
691
680
### confusable_detection module
692
681
emit_confusable_detection_module (rf )
693
682
### mixed_script_confusable_detection module
0 commit comments