Clean up the interface, add simple tests. · unicode-rs/unicode-security@a18eb9c · GitHub
[go: up one dir, main page]

Skip to content

Commit a18eb9c

Browse files
committed
Clean up the interface, add simple tests.
1 parent 741303d commit a18eb9c

File tree

6 files changed

+107
-197
lines changed

6 files changed

+107
-197
lines changed

scripts/unicode.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
239239
return True
240240
return False
241241

242-
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
242+
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
243243
# First, load all confusables data from confusables.txt
244244
confusables = load_confusables(f)
245245

@@ -248,15 +248,6 @@ def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
248248
# seen as substitutes to itself. So if the confusables.txt says A -> C, B -> C,
249249
# and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.
250250

251-
# here we first make a dict that contains all As and Bs whose corresponding C is single code point.
252-
seekup_map = {}
253-
for item in confusables:
254-
d_proto_list = item[1]
255-
d_source = item[0]
256-
assert(len(d_proto_list) > 0)
257-
if len(d_proto_list) == 1:
258-
seekup_map[escape_char(d_source)] = d_proto_list
259-
260251
# Here we're dividing all confusable lhs and rhs(prototype) operands of the substitution into equivalence classes.
261252
# Principally we'll be using the rhs operands as the representative element of their equivalence classes.
262253
# However some rhs operands are single code point, while some others are not.
@@ -275,9 +266,8 @@ def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
275266
if d_proto not in codepoint_map:
276267
codepoint_map[d_proto] = []
277268
# when we create new equivalence class, we'll check whether the representative element should be collected.
278-
# i.e. if it is not subject to substituion, and not restricted from identifier usage,
279-
# we collect it into the equivalence class.
280-
if d_proto not in seekup_map and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
269+
# i.e. if it is not restricted from identifier usage, we collect it into the equivalence class.
270+
if is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
281271
codepoint_map[d_proto].append(d_proto_list[0])
282272
# we collect the original code point to be substituted into this list.
283273
codepoint_map[d_proto].append(d_source)
@@ -562,23 +552,20 @@ def emit_confusable_detection_module(f):
562552
def escape_script_constant(name, longforms):
563553
return "Script::" + longforms[name].strip()
564554

565-
def emit_rustc_mixed_script_confusable_detection(f):
566-
f.write("pub mod rustc_mixed_script_confusable_detection {")
555+
def emit_potiential_mixed_script_confusable(f):
556+
f.write("pub mod potential_mixed_script_confusable {")
567557
f.write("""
568-
use unicode_script::Script;
569-
570558
#[inline]
571-
pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
559+
pub fn potential_mixed_script_confusable(c: char) -> bool {
572560
match c as usize {
573-
_ => super::util::bsearch_value_table(c, CONFUSABLES)
561+
_ => super::util::bsearch_table(c, CONFUSABLES)
574562
}
575563
}
576-
577564
""")
578565
identifier_status_table = load_properties("IdentifierStatus.txt")
579-
longforms, scripts = load_scripts("Scripts.txt")
566+
_, scripts = load_scripts("Scripts.txt")
580567
identifier_allowed = identifier_status_table['Allowed']
581-
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_rustc_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
568+
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
582569
debug = False
583570
if debug == True:
584571
debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
@@ -589,16 +576,21 @@ def emit_rustc_mixed_script_confusable_detection(f):
589576
source = pair[0]
590577
confusable_table.append((source, script))
591578
confusable_table.sort(key=lambda w: w[0])
592-
emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, Script)]", is_pub=False,
593-
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_script_constant(x[1], longforms)))
579+
emit_table(f, "CONFUSABLES", confusable_table, "&'static [char]", is_pub=False,
580+
pfun=lambda x: "%s" % escape_char(x[0]))
594581
f.write("}\n\n")
595582

596583

597584
def emit_util_mod(f):
598585
f.write("""
599586
pub mod util {
600587
use core::result::Result::{Ok, Err};
601-
588+
589+
#[inline]
590+
pub fn bsearch_table(c: char, r: &'static [char]) -> bool {
591+
r.binary_search(&c).is_ok()
592+
}
593+
602594
#[inline]
603595
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
604596
match r.binary_search_by_key(&c, |&(k, _)| k) {
@@ -609,7 +601,7 @@ def emit_util_mod(f):
609601
Err(_) => None
610602
}
611603
}
612-
604+
613605
#[inline]
614606
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
615607
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -619,7 +611,7 @@ def emit_util_mod(f):
619611
else { Greater }
620612
}).is_ok()
621613
}
622-
614+
623615
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
624616
use core::cmp::Ordering::{Equal, Less, Greater};
625617
match r.binary_search_by(|&(lo, hi, _)| {
@@ -660,4 +652,4 @@ def emit_util_mod(f):
660652
### confusable_detection module
661653
emit_confusable_detection_module(rf)
662654
### potential_mixed_script_confusable module
663-
emit_rustc_mixed_script_confusable_detection(rf)
655+
emit_potiential_mixed_script_confusable(rf)

src/lib.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,12 @@ pub mod confusable_detection;
6262
pub mod general_security_profile;
6363
pub mod mixed_script;
6464
pub mod restriction_level;
65-
pub mod rustc_mixed_script_confusable_detection;
6665

6766
pub use confusable_detection::skeleton;
6867
pub use general_security_profile::GeneralSecurityProfile;
68+
pub use mixed_script::is_potential_mixed_script_confusable_char;
6969
pub use mixed_script::MixedScript;
7070
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};
71-
pub use rustc_mixed_script_confusable_detection::rustc_mixed_script_confusable_codepoint;
7271

7372
#[rustfmt::skip]
7473
pub(crate) mod tables;

src/mixed_script.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,17 @@ impl MixedScript for &'_ str {
130130
self.into()
131131
}
132132
}
133+
134+
/// Checks whether a character is considered a potential mixed-script confusable.
135+
///
136+
/// If the specified character is not restricted from use for identifiers,
137+
/// this function returns whether it is considered mixed script confusable
138+
/// with another character that is not restricted from use for identifiers.
139+
///
140+
/// If the specified character is restricted from use for identifiers,
141+
/// the return value is unspecified.
142+
pub fn is_potential_mixed_script_confusable_char(c: char) -> bool {
143+
use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable;
144+
145+
// Delegate to the lookup function in the crate's `tables` module.
potential_mixed_script_confusable(c)
146+
}

src/rustc_mixed_script_confusable_detection.rs

Lines changed: 0 additions & 17 deletions
This file was deleted.

0 commit comments

Comments
 (0)
0