Replace boolean intersects() with intersect(), which actually calculates an intersection · unicode-rs/unicode-script@ba42eb6 · GitHub
[go: up one dir, main page]

Skip to content

Commit ba42eb6

Browse files
committed
Replace boolean intersects() with intersect(), which actually calculates an intersection
1 parent 66fb0d3 commit ba42eb6

File tree

3 files changed

+1485
-545
lines changed

3 files changed

+1485
-545
lines changed

scripts/unicode.py

Lines changed: 58 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def emit_search(f):
183183
}
184184
""")
185185

186-
def emit_enums(f, script_list, extension_list, longforms):
186+
def emit_enums(f, script_list, extension_list, longforms, intersections):
187187
"""
188188
Emit the Script and ScriptExtension enums as well as any related utility functions
189189
"""
@@ -278,51 +278,81 @@ def emit_enums(f, script_list, extension_list, longforms):
278278
}
279279
280280
#[inline]
281-
pub(crate) fn inner_intersects(self, other: Self) -> bool {
281+
pub(crate) fn inner_intersect(self, other: Self) -> Self {
282282
match (self, other) {
283283
(ScriptExtension::Single(Script::Unknown), _) |
284-
(_, ScriptExtension::Single(Script::Unknown)) => false,
285-
(a, b) if a == b => true,
286-
(ScriptExtension::Single(Script::Common), _) |
287-
(ScriptExtension::Single(Script::Inherited), _) |
288-
(_, ScriptExtension::Single(Script::Common)) |
289-
(_, ScriptExtension::Single(Script::Inherited)) => true,
290-
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) => o.inner_contains_script(s),
284+
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
285+
(a, b) if a == b => a,
286+
(ScriptExtension::Single(Script::Common), a) |
287+
(ScriptExtension::Single(Script::Inherited), a) |
288+
(a, ScriptExtension::Single(Script::Common)) |
289+
(a, ScriptExtension::Single(Script::Inherited)) => a,
290+
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
291291
""")
292-
intersections = compute_intersections(extension_list)
293-
for (e1, e2) in intersections:
294-
f.write(" (%s, %s) => true,\n" % (extension_name(e1), extension_name(e2)))
295-
f.write(""" _ => false,
292+
for (e1, e2, i) in intersections:
293+
f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms)))
294+
f.write(""" _ => ScriptExtension::Single(Script::Unknown),
296295
}
297296
}
298297
}
299298
""")
300299

301300

302-
# We currently do NOT have an optimized method to compute
303-
# the actual intersection between two script extensions, we
304-
# only check if they *do* intersect
305-
#
306-
# To add such a method we'd need to do an extra pass where we compute any
307-
# new ScriptExtension enums we'll need from the intersections. It doesn't
308-
# seem worth it for now
309-
def compute_intersections(extension_list):
301+
def compute_intersections_elements(extension_list):
310302
"""
311-
Compute which pairs of elements intersect. This will return duplicate pairs with
312-
the elements swapped, but that's fine.
303+
Compute all intersections between the script extensions.
304+
This will add new elements to extension_list, be sure to call it first!
313305
"""
306+
307+
# This is the only third-level intersection
308+
# It's easier to hardcode things here rather than
309+
# do the below calculation in a loop
310+
extension_list.append(['Deva', 'Knda', 'Tirh'])
314311
intersections = []
312+
# Some intersections will not exist in extension_list and we'll need to add them
313+
new_elements = []
315314
sets = [(e, set(e)) for e in extension_list]
316315
for (e1, s1) in sets:
317316
for (e2, s2) in sets:
318317
if e1 == e2:
319318
continue
320319
intersection = s1.intersection(s2)
321320
if len(intersection) > 0:
322-
intersections.append((e1, e2))
321+
intersection = [i for i in intersection]
322+
intersection.sort()
323+
if len(intersection) > 1 and intersection not in extension_list and intersection not in new_elements:
324+
new_elements.append(intersection)
325+
if (e1, e2, intersection) not in intersections:
326+
intersections.append((e1, e2, intersection))
327+
extension_list.extend(new_elements)
328+
329+
# We now go through the newly added second-level extension values and calculate their intersections
330+
# with the original set and each other
331+
new_sets = [(e, set(e)) for e in new_elements]
332+
sets = [(e, set(e)) for e in extension_list]
333+
for (e1, s1) in new_sets:
334+
for (e2, s2) in sets:
335+
if e1 == e2:
336+
continue
337+
intersection = s1.intersection(s2)
338+
if len(intersection) > 0:
339+
intersection = [i for i in intersection]
340+
intersection.sort()
341+
if len(intersection) > 1 and intersection not in extension_list:
342+
raise "Found new third-level intersection, please hardcode it"
343+
# The previous routine would automatically get both versions
344+
# of an intersection because it would iterate each pair in both orders,
345+
# but here we're working on an asymmetric pair, so we insert both in order to not
346+
# miss anything
347+
if (e1, e2, intersection) not in intersections:
348+
intersections.append((e1, e2, intersection))
349+
if (e2, e1, intersection) not in intersections:
350+
intersections.append((e2, e1, intersection))
351+
352+
intersections.sort()
323353
return intersections
324354

325-
def extension_name(ext, longforms=[]):
355+
def extension_name(ext, longforms={}):
326356
"""Get the rust source for a given ScriptExtension"""
327357
if len(ext) == 1:
328358
return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]]
@@ -373,7 +403,9 @@ def extension_name(ext, longforms=[]):
373403
extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
374404
extension_table.sort(key=lambda w: w[0])
375405

376-
emit_enums(rf, script_list, extension_list, longforms)
406+
intersections = compute_intersections_elements(extension_list)
407+
408+
emit_enums(rf, script_list, extension_list, longforms, intersections)
377409
emit_search(rf)
378410

379411
emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",

src/lib.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,13 @@ impl ScriptExtension {
4040
self.inner_contains_script(script)
4141
}
4242

43-
/// Check if this ScriptExtension has any intersection with another
44-
/// ScriptExtension
43+
/// Find the intersection between two ScriptExtensions. Returns Unknown if things
44+
/// do not intersect.
4545
///
4646
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
4747
/// everything.
48-
///
49-
/// "Unknown" intersects nothing
50-
pub fn intersects(self, other: Self) -> bool {
51-
self.inner_intersects(other)
48+
pub fn intersect(self, other: Self) -> Self {
49+
self.inner_intersect(other)
5250
}
5351
}
5452

0 commit comments

Comments
 (0)
0