@@ -47,37 +47,39 @@ def fetch(f):
47
47
sys .stderr .write ("cannot load %s\n " % f )
48
48
exit (1 )
49
49
50
# Implementation from unicode-segmentation
def load_properties(f, interestingprops=None):
    """Parse a Unicode property data file into per-property code point ranges.

    ``fetch(f)`` is called first (defined elsewhere in this file; presumably it
    makes sure a local copy of the data file exists -- confirm there), then the
    file named ``os.path.basename(f)`` is read line by line.

    Two line shapes are recognized, with optional spaces around the fields:

        XXXX        ; Property
        XXXX..YYYY  ; Property

    where ``XXXX``/``YYYY`` are upper-case hexadecimal code points. Any line
    that matches neither shape (comments, blank lines, ...) is skipped.

    Args:
        f: name of the data file to load.
        interestingprops: optional collection of property names to keep.
            When falsy (``None`` or empty) every property is kept.

    Returns:
        A dict mapping each property name to a list of inclusive
        ``(lo, hi)`` integer code point ranges, in the order they appear in
        the file.
    """
    fetch(f)
    props = {}
    # Single code point, e.g. "00B7 ; Allowed". No literal space is required
    # before the code point, so lines starting at column 0 match.
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    # Code point range, e.g. "0030..0039 ; Allowed".
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(os.path.basename(f)):
        m = re1.match(line)
        if m:
            # A single code point is stored as a one-element range.
            d_lo = d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if not m:
                continue
            d_lo, d_hi, prop = m.groups()
        if interestingprops and prop not in interestingprops:
            continue
        props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    return props
81
83
82
84
def format_table_content (f , content , indent ):
83
85
line = " " * indent
@@ -115,41 +117,95 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
115
117
format_table_content (f , data , 8 )
116
118
f .write ("\n ];\n \n " )
117
119
118
def emit_identifier_module(f):
    """Write the generated Rust ``identifier`` module to ``f``.

    The module consists of:
      * the ``IdentifierType`` enum,
      * ``identifier_status_allowed`` and ``identifier_type`` lookup
        functions, which delegate to the binary-search helpers in the
        generated ``util`` module (``super::util``),
      * the ``IDENTIFIER_STATUS`` table, built from the ``Allowed`` entries of
        ``IdentifierStatus.txt``,
      * the ``IDENTIFIER_TYPE`` table, built from every entry of
        ``IdentifierType.txt`` and sorted by range start.

    Args:
        f: writable text file object receiving the Rust source.
    """
    f.write("pub mod identifier {")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    #[allow(non_camel_case_types)]
    /// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
    pub enum IdentifierType {
        // Restricted
        Not_Character,
        Deprecated,
        Default_Ignorable,
        Not_NFKC,
        Not_XID,
        Exclusion,
        Obsolete,
        Technical,
        Uncommon_Use,
        Limited_Use,

        // Allowed
        Inclusion,
        Recommended
    }
    #[inline]
    pub fn identifier_status_allowed(c: char) -> bool {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS)
        }
    }

    #[inline]
    pub fn identifier_type(c: char) -> Option<IdentifierType> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_value_table(c, IDENTIFIER_TYPE)
        }
    }
""")

    f.write("    // Identifier status table:\n")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    # Only the 'Allowed' ranges are emitted; a KeyError here means the data
    # file contained no 'Allowed' entries at all.
    emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
            pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
    identifier_type = load_properties("IdentifierType.txt")
    # Flatten {type: [(lo, hi), ...]} into (lo, hi, type) triples.
    type_table = []
    for ty in identifier_type:
        type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])

    # The generated lookup uses binary search, so rows must be ordered by
    # range start.
    type_table.sort(key=lambda w: w[0])

    emit_table(f, "IDENTIFIER_TYPE", type_table, "&'static [(char, char, IdentifierType)]", is_pub=False,
            pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    f.write("}\n\n")
152
175
176
def emit_util_mod(f):
    """Write the generated Rust ``util`` module to ``f``.

    The module holds the two binary-search helpers used by the generated
    lookup functions:
      * ``bsearch_range_table`` -- tests whether a ``char`` falls inside any
        ``(lo, hi)`` range of a table,
      * ``bsearch_range_value_table`` -- returns the value attached to the
        ``(lo, hi, value)`` range containing a ``char``, if any.

    Both helpers rely on the table being sorted by range.

    Args:
        f: writable text file object receiving the Rust source.
    """
    f.write("""
pub mod util {
    use core::result::Result::{Ok, Err};
    #[inline]
    pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
        use core::cmp::Ordering::{Equal, Less, Greater};
        r.binary_search_by(|&(lo,hi)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }).is_ok()
    }

    pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
        use core::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, cat) = r[idx];
                Some(cat)
            }
            Err(_) => None
        }
    }

}

""")
208
+
153
209
if __name__ == "__main__" :
154
210
r = "tables.rs"
155
211
if os .path .exists (r ):
@@ -164,6 +220,7 @@ def emit_identifier_status_module(f, statuses_table):
164
220
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
165
221
166
222
""" % UNICODE_VERSION )
167
- ### identifier status module
168
- identifier_status_table = load_identifier_status ()
169
- emit_identifier_status_module (rf , identifier_status_table )
223
+
224
+ emit_util_mod (rf )
225
+ ### identifier module
226
+ emit_identifier_module (rf )
0 commit comments