unicode-script: Refactor load_properties · unicode-rs/unicode-security@1955790 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1955790

Browse files
committed
unicode-script: Refactor load_properties
1 parent 22d684a commit 1955790

File tree

1 file changed

+12
-46
lines changed

1 file changed

+12
-46
lines changed

scripts/unicode.py

Lines changed: 12 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ def fetch(f):
5353
sys.stderr.write("cannot load %s\n" % f)
5454
exit(1)
5555

56+
return f
57+
5658
# Download a UCD table file
5759
def fetch_unidata(f):
5860
if not os.path.exists(os.path.basename(f)):
@@ -63,14 +65,14 @@ def fetch_unidata(f):
6365
sys.stderr.write("cannot load %s" % f)
6466
exit(1)
6567

66-
# Loads code point data from IdentifierStatus.txt and
67-
# IdentifierType.txt
68-
# Implementation from unicode-segmentation
68+
return f
69+
70+
# Loads code point data from provided filename f
71+
# Implementation adapted from unicode-segmentation
6972
def load_properties(f, interestingprops = None):
70-
fetch(f)
7173
props = {}
72-
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
73-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
74+
re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
75+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")
7476

7577
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
7678
prop = None
@@ -99,42 +101,6 @@ def load_properties(f, interestingprops = None):
99101

100102
return props
101103

102-
# Loads script data from Scripts.txt
103-
def load_script_properties(f, interestingprops):
104-
fetch_unidata(f)
105-
props = {}
106-
# Note: these regexes are different from those in unicode-segmentation,
107-
# because we need to handle spaces here
108-
re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
109-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
110-
111-
for line in fileinput.input(os.path.basename(f)):
112-
prop = None
113-
d_lo = 0
114-
d_hi = 0
115-
m = re1.match(line)
116-
if m:
117-
d_lo = m.group(1)
118-
d_hi = m.group(1)
119-
prop = m.group(2).strip()
120-
else:
121-
m = re2.match(line)
122-
if m:
123-
d_lo = m.group(1)
124-
d_hi = m.group(2)
125-
prop = m.group(3).strip()
126-
else:
127-
continue
128-
if interestingprops and prop not in interestingprops:
129-
continue
130-
d_lo = int(d_lo, 16)
131-
d_hi = int(d_hi, 16)
132-
if prop not in props:
133-
props[prop] = []
134-
props[prop].append((d_lo, d_hi))
135-
136-
return props
137-
138104
# Loads confusables data from confusables.txt
139105
def load_confusables(f):
140106
fetch(f)
@@ -189,7 +155,7 @@ def load_scripts(f):
189155
# changes are introduced, update accordingly.
190156

191157
(longforms, shortforms) = aliases()
192-
scripts = load_script_properties(f, [])
158+
scripts = load_properties(fetch_unidata(f), [])
193159

194160
script_table = []
195161
script_list = []
@@ -546,10 +512,10 @@ def emit_identifier_module(f):
546512
""")
547513

548514
f.write(" // Identifier status table:\n")
549-
identifier_status_table = load_properties("IdentifierStatus.txt")
515+
identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
550516
emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
551517
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
552-
identifier_type = load_properties("IdentifierType.txt")
518+
identifier_type = load_properties(fetch("IdentifierType.txt"))
553519
type_table = []
554520
for ty in identifier_type:
555521
type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
@@ -601,7 +567,7 @@ def emit_potiential_mixed_script_confusable(f):
601567
}
602568
}
603569
""")
604-
identifier_status_table = load_properties("IdentifierStatus.txt")
570+
identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
605571
_, scripts = load_scripts("Scripts.txt")
606572
identifier_allowed = identifier_status_table['Allowed']
607573
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)

0 commit comments

Comments
 (0)
0