bpo-37760: Factor out standard range-expanding logic in makeunicodedata. by gnprice · Pull Request #15248 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-37760: Factor out standard range-expanding logic in makeunicodedata. #15248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 14, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
bpo-37760: Factor out standard range-expanding logic in makeunicodedata.
Much like the lower-level logic in commit ef2af1a, we had
4 copies of this logic, written in a couple of different ways.
They're all implementing the same standard, so write it just once.
  • Loading branch information
gnprice committed Aug 13, 2019
commit 4a92e19abe10e39639e22bdb5e99ad5ad15c1a8e
66 changes: 34 additions & 32 deletions Tools/unicode/makeunicodedata.py
683F
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,19 @@ def open_data(template, version):
return open(local, 'rb')


def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    char_range is either a single hexadecimal code point ("0041") or
    two of them joined by ".." ("0041..005A"); both endpoints are
    inclusive.  Yields each code point in the range as an int, in
    ascending order.

    Because this is a generator, malformed input (non-hexadecimal text,
    or more than one "..") raises ValueError only when iteration starts.
    A range whose first endpoint exceeds its last yields nothing.
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    # range() excludes its stop value, hence the +1 for the inclusive end.
    yield from range(first, last + 1)


class UcdFile:
'''
A file in the standard format of the UCD.
Expand All @@ -929,6 +942,12 @@ def records(self) -> Iterator[List[str]]:
def __iter__(self) -> Iterator[List[str]]:
return self.records()

def expanded(self) -> Iterator[Tuple[int, List[str]]]:
    '''
    Iterate over the records with their code point ranges expanded.

    The first field of each record from self.records() is parsed by
    expand_range(); for every code point it produces, yields a pair
    (code point, remaining fields).  The same list object holding the
    remaining fields is yielded for each code point of one record.
    '''
    for record in self.records():
        char_range, rest = record[0], record[1:]
        for char in expand_range(char_range):
            yield char, rest


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
Expand All @@ -955,6 +974,9 @@ def __init__(self, version, cjk_check=True):
# expand first-last ranges
field = None
for i in range(0, 0x110000):
# The file UnicodeData.txt has its own distinct way of
# expressing ranges. See:
# https://www.unicode.org/reports/tr44/#Code_Point_Ranges
s = table[i]
if s:
if s[1][-6:] == "First>":
Expand Down Expand Up @@ -1019,14 +1041,8 @@ def __init__(self, version, cjk_check=True):
self.exclusions[char] = 1

widths = [None] * 0x110000
for s in UcdFile(EASTASIAN_WIDTH, version):
if '..' in s[0]:
first, last = [int(c, 16) for c in s[0].split('..')]
chars = list(range(first, last+1))
else:
chars = [int(s[0], 16)]
for char in chars:
widths[char] = s[1]
for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
widths[char] = width

for i in range(0, 0x110000):
if table[i] is not None:
Expand All @@ -1036,26 +1052,16 @@ def __init__(self, version, cjk_check=True):
if table[i] is not None:
table[i].append(set())

for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
if ".." in r:
first, last = [int(c, 16) for c in r.split('..')]
chars = list(range(first, last+1))
else:
chars = [int(r, 16)]
for char in chars:
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char][-1].add(p)

for s in UcdFile(LINE_BREAK, version):
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char][-1].add(p)

for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
continue
if '..' not in s[0]:
first = last = int(s[0], 16)
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
for char in expand_range(char_range):
table[char][-1].add('Line_Break')

# We only want the quickcheck properties
Expand All @@ -1073,11 +1079,7 @@ def __init__(self, version, cjk_check=True):
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
quickcheck_shift = qc_order.index(s[1])*2
quickcheck <<= quickcheck_shift
if '..' not in s[0]:
first = last = int(s[0], 16)
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
for char in expand_range(s[0]):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
Expand Down
0