Merge pull request #31 from ohhithere/fix-internal-skeleton · unicode-rs/unicode-security@eb9d304 · GitHub
[go: up one dir, main page]

Skip to content

Commit eb9d304

Browse files
authored
Merge pull request #31 from ohhithere/fix-internal-skeleton
Fix internalSkeleton
2 parents 22d684a + 78707a7 commit eb9d304

File tree

4 files changed

+744
-902
lines changed

4 files changed

+744
-902
lines changed

scripts/unicode.py

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# - confusables.txt
1818
# - ReadMe.txt
1919
# This script also uses the following Unicode UCD data:
20+
# - DerivedCoreProperties.txt
2021
# - Scripts.txt
2122
#
2223
# Since this should not require frequent updates, we just store this
@@ -53,6 +54,8 @@ def fetch(f):
5354
sys.stderr.write("cannot load %s\n" % f)
5455
exit(1)
5556

57+
return f
58+
5659
# Download a UCD table file
5760
def fetch_unidata(f):
5861
if not os.path.exists(os.path.basename(f)):
@@ -63,14 +66,14 @@ def fetch_unidata(f):
6366
sys.stderr.write("cannot load %s" % f)
6467
exit(1)
6568

66-
# Loads code point data from IdentifierStatus.txt and
67-
# IdentifierType.txt
68-
# Implementation from unicode-segmentation
69+
return f
70+
71+
# Loads code point data from provided filename f
72+
# Implementation adapted from unicode-segmentation
6973
def load_properties(f, interestingprops = None):
70-
fetch(f)
7174
props = {}
72-
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
73-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
75+
re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
76+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")
7477

7578
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
7679
prop = None
@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):
99102

100103
return props
101104

102-
# Loads script data from Scripts.txt
103-
def load_script_properties(f, interestingprops):
104-
fetch_unidata(f)
105-
props = {}
106-
# Note: these regexes are different from those in unicode-segmentation,
107-
# because we need to handle spaces here
108-
re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
109-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
110-
111-
for line in fileinput.input(os.path.basename(f)):
112-
prop = None
113-
d_lo = 0
114-
d_hi = 0
115-
m = re1.match(line)
116-
if m:
117-
d_lo = m.group(1)
118-
d_hi = m.group(1)
119-
prop = m.group(2).strip()
120-
else:
121-
m = re2.match(line)
122-
if m:
123-
d_lo = m.group(1)
124-
d_hi = m.group(2)
125-
prop = m.group(3).strip()
126-
else:
127-
continue
128-
if interestingprops and prop not in interestingprops:
129-
continue
130-
d_lo = int(d_lo, 16)
131-
d_hi = int(d_hi, 16)
132-
if prop not in props:
133-
props[prop] = []
134-
props[prop].append((d_lo, d_hi))
135-
136-
return props
137-
138105
# Loads confusables data from confusables.txt
139106
def load_confusables(f):
140107
fetch(f)
@@ -189,7 +156,7 @@ def load_scripts(f):
189156
# changes are introduced, update accordingly.
190157

191158
(longforms, shortforms) = aliases()
192-
scripts = load_script_properties(f, [])
159+
scripts = load_properties(fetch_unidata(f), [])
193160

194161
script_table = []
195162
script_list = []
@@ -546,10 +513,10 @@ def emit_identifier_module(f):
546513
""")
547514

548515
f.write(" // Identifier status table:\n")
549-
identifier_status_table = load_properties("IdentifierStatus.txt")
516+
identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
550517
emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
551518
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
552-
identifier_type = load_properties("IdentifierType.txt")
519+
identifier_type = load_properties(fetch("IdentifierType.txt"))
553520
type_table = []
554521
for ty in identifier_type:
555522
type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
@@ -560,6 +527,26 @@ def emit_identifier_module(f):
560527
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
561528
f.write("}\n\n")
562529

530+
def emit_default_ignorable_detection_module(f):
531+
f.write("pub mod default_ignorable_code_point {")
532+
f.write("""
533+
534+
#[inline]
535+
pub fn default_ignorable_code_point(c: char) -> bool {
536+
match c as usize {
537+
_ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
538+
}
539+
}
540+
541+
""")
542+
543+
f.write(" // Default ignorable code point table:\n")
544+
default_ignorable_table = load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])
545+
emit_table(f, "DEFAULT_IGNORABLE", default_ignorable_table["Default_Ignorable_Code_Point"], "&'static [(char, char)]", is_pub=False,
546+
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
547+
548+
f.write("}\n\n")
549+
563550
def emit_confusable_detection_module(f):
564551
f.write("pub mod confusable_detection {")
565552
f.write("""
@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
601588
}
602589
}
603590
""")
604-
identifier_status_table = load_properties("IdentifierStatus.txt")
591+
identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
605592
_, scripts = load_scripts("Scripts.txt")
606593
identifier_allowed = identifier_status_table['Allowed']
607594
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
@@ -688,6 +675,8 @@ def emit_util_mod(f):
688675
emit_util_mod(rf)
689676
### identifier module
690677
emit_identifier_module(rf)
678+
### default_ignorable_detection module
679+
emit_default_ignorable_detection_module(rf)
691680
### confusable_detection module
692681
emit_confusable_detection_module(rf)
693682
### mixed_script_confusable_detection module

src/confusable_detection.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
3434

3535
/// Calculate skeleton for string, as defined by UTS 39
3636
pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
37+
use crate::tables::default_ignorable_code_point::default_ignorable_code_point;
3738
use unicode_normalization::UnicodeNormalization;
38-
s.chars().nfd().flat_map(char_prototype).nfd()
39+
40+
s.chars()
41+
.nfd()
42+
.filter(|c| !default_ignorable_code_point(*c))
43+
.flat_map(char_prototype)
44+
.nfd()
3945
}

0 commit comments

Comments
 (0)
0