Merge pull request #7 from unicode-rs/identifiertype · unicode-rs/unicode-security@0ae055e · GitHub
[go: up one dir, main page]

Skip to content

Commit 0ae055e

Browse files
authored
Merge pull request #7 from unicode-rs/identifiertype
Support Identifier Type
2 parents f35d6b6 + 6dc688d commit 0ae055e

File tree

3 files changed

+1611
-50
lines changed

3 files changed

+1611
-50
lines changed

scripts/unicode.py

Lines changed: 92 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -47,37 +47,39 @@ def fetch(f):
4747
sys.stderr.write("cannot load %s\n" % f)
4848
exit(1)
4949

50-
# load identifier status data
51-
def load_identifier_status():
52-
f = "IdentifierStatus.txt"
50+
# Implementation from unicode-segmentation
51+
def load_properties(f, interestingprops = None):
5352
fetch(f)
54-
statuses = []
55-
re1 = re.compile("^([0-9A-F]+) +; +(\w+)")
56-
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; +(\w+)")
53+
props = {}
54+
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
55+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
5756

58-
for line in fileinput.input(f):
57+
for line in fileinput.input(os.path.basename(f)):
58+
prop = None
5959
d_lo = 0
6060
d_hi = 0
61-
cat = None
6261
m = re1.match(line)
6362
if m:
6463
d_lo = m.group(1)
6564
d_hi = m.group(1)
66-
cat = m.group(2)
65+
prop = m.group(2).strip()
6766
else:
6867
m = re2.match(line)
6968
if m:
7069
d_lo = m.group(1)
7170
d_hi = m.group(2)
72-
cat = m.group(3)
71+
prop = m.group(3).strip()
7372
else:
7473
continue
75-
if cat != "Allowed":
74+
if interestingprops and prop not in interestingprops:
7675
continue
7776
d_lo = int(d_lo, 16)
7877
d_hi = int(d_hi, 16)
79-
statuses.append((d_lo, d_hi))
80-
return statuses
78+
if prop not in props:
79+
props[prop] = []
80+
props[prop].append((d_lo, d_hi))
81+
82+
return props
8183

8284
def format_table_content(f, content, indent):
8385
line = " "*indent
@@ -115,41 +117,95 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
115117
format_table_content(f, data, 8)
116118
f.write("\n ];\n\n")
117119

118-
def emit_identifier_status_module(f, statuses_table):
119-
f.write("pub mod identifier_status {")
120+
def emit_identifier_module(f):
121+
f.write("pub mod identifier {")
120122
f.write("""
121-
use core::result::Result::{Ok, Err};
122123
124+
#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
125+
#[allow(non_camel_case_types)]
126+
/// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
127+
pub enum IdentifierType {
128+
// Restricted
129+
Not_Character,
130+
Deprecated,
131+
Default_Ignorable,
132+
Not_NFKC,
133+
Not_XID,
134+
Exclusion,
135+
Obsolete,
136+
Technical,
137+
Uncommon_Use,
138+
Limited_Use,
139+
140+
// Allowed
141+
Inclusion,
142+
Recommended
143+
}
123144
#[inline]
124-
fn bsearch_range_value_table(c: char, r: &'static [(char, char)]) -> bool {
125-
use core::cmp::Ordering::{Equal, Less, Greater};
126-
match r.binary_search_by(|&(lo, hi)| {
127-
if lo <= c && c <= hi { Equal }
128-
else if hi < c { Less }
129-
else { Greater }
130-
}) {
131-
Ok(_) => true,
132-
Err(_) => false
145+
pub fn identifier_status_allowed(c: char) -> bool {
146+
// FIXME: do we want to special case ASCII here?
147+
match c as usize {
148+
_ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS)
133149
}
134150
}
135-
""")
136151
137-
f.write("""
138152
#[inline]
139-
pub fn identifier_status_allowed(c: char) -> bool {
153+
pub fn identifier_type(c: char) -> Option<IdentifierType> {
140154
// FIXME: do we want to special case ASCII here?
141155
match c as usize {
142-
_ => bsearch_range_value_table(c, identifier_status_table)
156+
_ => super::util::bsearch_range_value_table(c, IDENTIFIER_TYPE)
143157
}
144158
}
145-
146159
""")
147160

148-
f.write(" // identifier status table.\n")
149-
emit_table(f, "identifier_status_table", statuses_table, "&'static [(char, char)]", is_pub=False,
161+
f.write(" // Identifier status table:\n")
162+
identifier_status_table = load_properties("IdentifierStatus.txt")
163+
emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
150164
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
165+
identifier_type = load_properties("IdentifierType.txt")
166+
type_table = []
167+
for ty in identifier_type:
168+
type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
169+
170+
type_table.sort(key=lambda w: w[0])
171+
172+
emit_table(f, "IDENTIFIER_TYPE", type_table, "&'static [(char, char, IdentifierType)]", is_pub=False,
173+
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
151174
f.write("}\n\n")
152175

176+
def emit_util_mod(f):
177+
f.write("""
178+
pub mod util {
179+
use core::result::Result::{Ok, Err};
180+
#[inline]
181+
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182+
use core::cmp::Ordering::{Equal, Less, Greater};
183+
r.binary_search_by(|&(lo,hi)| {
184+
if lo <= c && c <= hi { Equal }
185+
else if hi < c { Less }
186+
else { Greater }
187+
}).is_ok()
188+
}
189+
190+
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
191+
use core::cmp::Ordering::{Equal, Less, Greater};
192+
match r.binary_search_by(|&(lo, hi, _)| {
193+
if lo <= c && c <= hi { Equal }
194+
else if hi < c { Less }
195+
else { Greater }
196+
}) {
197+
Ok(idx) => {
198+
let (_, _, cat) = r[idx];
199+
Some(cat)
200+
}
201+
Err(_) => None
202+
}
203+
}
204+
205+
}
206+
207+
""")
208+
153209
if __name__ == "__main__":
154210
r = "tables.rs"
155211
if os.path.exists(r):
@@ -164,6 +220,7 @@ def emit_identifier_status_module(f, statuses_table):
164220
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
165221
166222
""" % UNICODE_VERSION)
167-
### identifier status module
168-
identifier_status_table = load_identifier_status()
169-
emit_identifier_status_module(rf, identifier_status_table)
223+
224+
emit_util_mod(rf)
225+
### identifier module
226+
emit_identifier_module(rf)

src/general_security_profile.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,23 @@
11
//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
22
//! for identifiers
33
4-
use crate::tables::identifier_status as is;
4+
use crate::tables::identifier;
5+
6+
pub use identifier::IdentifierType;
57

68
/// Methods for determining characters not restricted from use for identifiers.
79
pub trait GeneralSecurityProfile {
810
/// Returns whether the character is not restricted from use for identifiers.
911
fn identifier_allowed(self) -> bool;
12+
13+
/// Returns the [identifier type](https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type)
14+
fn identifier_type(self) -> Option<IdentifierType>;
1015
}
1116

1217
impl GeneralSecurityProfile for char {
1318
#[inline]
14-
fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
15-
}
16-
17-
impl GeneralSecurityProfile for &'_ str {
19+
fn identifier_allowed(self) -> bool { identifier::identifier_status_allowed(self) }
1820
#[inline]
19-
fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) }
21+
fn identifier_type(self) -> Option<IdentifierType> { identifier::identifier_type(self) }
22+
2023
}

0 commit comments

Comments
 (0)
0