8000 gh-53203: Fix strptime() for %c, %x and %X formats on some locales by serhiy-storchaka · Pull Request #135971 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-53203: Fix strptime() for %c, %x and %X formats on some locales #135971

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 149 additions & 41 deletions Lib/_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import time
import locale
import calendar
import re
from re import compile as re_compile
from re import sub as re_sub
from re import IGNORECASE
Expand Down Expand Up @@ -41,6 +42,21 @@ def _findall(haystack, needle):
yield i
i += len(needle)


lzh_TW_alt_digits = (
# 〇:一:二:三:四:五:六:七:八:九
'\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db',
'\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d',
# 十:十一:十二:十三:十四:十五:十六:十七:十八:十九
'\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db',
'\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', '\u5341\u4e5d',
# 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九
'\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db',
'\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', '\u5eff\u4e5d',
# 卅:卅一
'\u5345', '\u5345\u4e00')


class LocaleTime(object):
"""Stores and handles locale-specific information related to time.

Expand Down Expand Up @@ -84,6 +100,7 @@ def __init__(self):
self.__calc_weekday()
self.__calc_month()
self.__calc_am_pm()
self.__calc_alt_digits()
self.__calc_timezone()
self.__calc_date_time()
if _getlang() != self.lang:
Expand Down Expand Up @@ -119,36 +136,76 @@ def __calc_am_pm(self):
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
self.am_pm = am_pm

def __calc_alt_digits(self):
# Set self.LC_alt_digits by using time.strftime().

# The magic data should contain all decimal digits.
time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0))
s = time.strftime("%x%X", time_tuple)
if s.isascii():
# Fast path -- all digits are ASCII.
self.LC_alt_digits = ()
return

digits = ''.join(sorted(set(re.findall(r'\d', s))))
if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9:
# All 10 decimal digits from the same set.
if digits.isascii():
# All digits are ASCII.
self.LC_alt_digits = ()
return

self.LC_alt_digits = [a + b for a in digits for b in digits]
# Test whether the numbers contain leading zero.
time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0))
if self.LC_alt_digits[1] not in time.strftime("%x %X", time_tuple2):
self.LC_alt_digits[:10] = digits
return

# Either non-Gregorian calendar or non-decimal numbers.
if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s):
# lzh_TW
self.LC_alt_digits = lzh_TW_alt_digits
return

self.LC_alt_digits = None

def __calc_date_time(self):
# Set self.date_time, self.date, & self.time by using
# time.strftime().
# Set self.LC_date_time, self.LC_date, self.LC_time and
# self.LC_time_ampm by using time.strftime().

# Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
# overloaded numbers is minimized. The order in which searches for
# values within the format string is very important; it eliminates
# possible ambiguity for what something represents.
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
replacement_pairs = [
replacement_pairs = []

# Non-ASCII digits
if self.LC_alt_digits or self.LC_alt_digits is None:
for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'),
(44, '%OM'), (55, '%OS'), (17, '%Od'),
(3, '%Om'), (2, '%Ow'), (10, '%OI')]:
if self.LC_alt_digits is None:
s = chr(0x660 + n // 10) + chr(0x660 + n % 10)
replacement_pairs.append((s, d))
if n < 10:
replacement_pairs.append((s[1], d))
elif len(self.LC_alt_digits) > n:
replacement_pairs.append((self.LC_alt_digits[n], d))
else:
replacement_pairs.append((time.strftime(d, time_tuple), d))
replacement_pairs += [
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero.
('2', '%w'), ('10', '%I'),
# Non-ASCII digits
('\u0661\u0669\u0669\u0669', '%Y'),
('\u0669\u0669', '%Oy'),
('\u0662\u0662', '%OH'),
('\u0664\u0664', '%OM'),
('\u0665\u0665', '%OS'),
('\u0661\u0667', '%Od'),
('\u0660\u0663', '%Om'),
('\u0663', '%Om'),
('\u0662', '%Ow'),
('\u0661\u0660', '%OI'),
]

date_time = []
for directive in ('%c', '%x', '%X'):
for directive in ('%c', '%x', '%X', '%r'):
current_format = time.strftime(directive, time_tuple).lower()
current_format = current_format.replace('%', '%%')
# The month and the day of the week formats are treated specially
Expand All @@ -172,9 +229,10 @@ def __calc_date_time(self):
if tz:
current_format = current_format.replace(tz, "%Z")
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
current_format = re_sub(r'\d(?<![0-9])',
lambda m: chr(0x0660 + int(m[0])),
current_format)
if not current_format.isascii() and self.LC_alt_digits is None:
current_format = re_sub(r'\d(?<![0-9])',
lambda m: chr(0x0660 + int(m[0])),
current_format)
for old, new in replacement_pairs:
current_format = current_format.replace(old, new)
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
Expand All @@ -189,6 +247,7 @@ def __calc_date_time(self):
self.LC_date_time = date_time[0]
self.LC_date = date_time[1]
self.LC_time = date_time[2]
self.LC_time_ampm = date_time[3]

def __find_month_format(self, directive):
"""Find the month format appropriate for the current locale.
Expand All @@ -213,7 +272,7 @@ def __find_month_format(self, directive):
full_indices &= indices
indices = set(_findall(datetime, self.a_month[m]))
if abbr_indices is None:
abbr_indices = indices
abbr_indices = set(indices)
else:
abbr_indices &= indices
if not full_indices and not abbr_indices:
Expand Down Expand Up @@ -241,7 +300,7 @@ def __find_weekday_format(self, directive):
if self.f_weekday[wd] != self.a_weekday[wd]:
indices = set(_findall(datetime, self.a_weekday[wd]))
if abbr_indices is None:
abbr_indices = indices
abbr_indices = set(indices)
else:
abbr_indices &= indices
if not full_indices and not abbr_indices:
Expand Down Expand Up @@ -288,8 +347,10 @@ def __init__(self, locale_time=None):
# The " [1-9]" part of the regex is to make %c from ANSI C work
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})",
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
'H': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
'k': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
'l': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
'G': r"(?P<G>\d\d\d\d)",
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
Expand All @@ -312,16 +373,49 @@ def __init__(self, locale_time=None):
for tz in tz_names),
'Z'),
'%': '%'}
for d in 'dmyHIMS':
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
mapping['Ow'] = r'(?P<w>\d)'
if self.locale_time.LC_alt_digits is None:
for d in 'dmyCHIMS':
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
mapping['Ow'] = r'(?P<w>\d)'
else:
mapping.update({
'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd',
'3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'),
'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm',
'1[0-2]|0[1-9]|[1-9]'),
'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w',
'[0-6]'),
'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y',
'[0-9][0-9]'),
'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C',
'[0-9][0-9]'),
'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H',
'2[0-3]|[0-1][0-9]|[0-9]'),
'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I',
'1[0-2]|0[1-9]|[1-9]'),
'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M',
'[0-5][0-9]|[0-9]'),
'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S',
'6[0-1]|[0-5][0-9]|[0-9]'),
})
mapping.update({
'e': mapping['d'],
'Oe': mapping['Od'],
'P': mapping['p'],
'Op': mapping['p'],
'W': mapping['U'].replace('U', 'W'),
})
mapping['W'] = mapping['U'].replace('U', 'W')

base.__init__(mapping)
base.__setitem__('T', self.pattern('%H:%M:%S'))
base.__setitem__('R', self.pattern('%H:%M'))
base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm))
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))

def __seqToRE(self, to_convert, directive):
def __seqToRE(self, to_convert, directive, altregex=None):
"""Convert a list to a regex string for matching a directive.

Want possible matching values to be from longest to shortest. This
Expand All @@ -337,8 +431,9 @@ def __seqToRE(self, to_convert, directive):
else:
return ''
regex = '|'.join(re_escape(stuff) for stuff in to_convert)
regex = '(?P<%s>%s' % (directive, regex)
return '%s)' % regex
if altregex is not None:
regex += '|' + altregex
return '(?P<%s>%s)' % (directive, regex)

def pattern(self, format):
"""Return regex pattern for the format string.
Expand All @@ -365,7 +460,7 @@ def repl(m):
nonlocal day_of_month_in_format
day_of_month_in_format = True
return self[format_char]
format = re_sub(r'%([OE]?\\?.?)', repl, format)
format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format)
if day_of_month_in_format and not year_in_format:
import warnings
warnings.warn("""\
Expand Down Expand Up @@ -467,37 +562,50 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
# values
weekday = julian = None
found_dict = found.groupdict()
if locale_time.LC_alt_digits:
def parse_int(s):
try:
return locale_time.LC_alt_digits.index(s)
except ValueError:
return int(s)
else:
parse_int = int

for group_key in found_dict.keys():
# Directives not explicitly handled below:
# c, x, X
# handled by making out of other directives
# U, W
# worthless without day of the week
if group_key == 'y':
year = int(found_dict['y'])
# Open Group specification for strptime() states that a %y
#value in the range of [00, 68] is in the century 2000, while
#[69,99] is in the century 1900
if year <= 68:
year += 2000
year = parse_int(found_dict['y'])
if 'C' in found_dict:
century = parse_int(found_dict['C'])
year += century * 100
else:
year += 1900
# Open Group specification for strptime() states that a %y
#value in the range of [00, 68] is in the century 2000, while
#[69,99] is in the century 1900
if year <= 68:
year += 2000
else:
9E7A year += 1900
elif group_key == 'Y':
year = int(found_dict['Y'])
elif group_key == 'G':
iso_year = int(found_dict['G'])
elif group_key == 'm':
month = int(found_dict['m'])
month = parse_int(found_dict['m'])
elif group_key == 'B':
month = locale_time.f_month.index(found_dict['B'].lower())
elif group_key == 'b':
month = locale_time.a_month.index(found_dict['b'].lower())
elif group_key == 'd':
day = int(found_dict['d'])
day = parse_int(found_dict['d'])
elif group_key == 'H':
hour = int(found_dict['H'])
hour = parse_int(found_dict['H'])
elif group_key == 'I':
hour = int(found_dict['I'])
hour = parse_int(found_dict['I'])
ampm = found_dict.get('p', '').lower()
# If there was no AM/PM indicator, we'll treat this like AM
if ampm in ('', locale_time.am_pm[0]):
Expand All @@ -513,9 +621,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
if hour != 12:
hour += 12
elif group_key == 'M':
minute = int(found_dict['M'])
minute = parse_int(found_dict['M'])
elif group_key == 'S':
second = int(found_dict['S'])
second = parse_int(found_dict['S'])
elif group_key == 'f':
s = found_dict['f']
# Pad to always return microseconds.
Expand Down
Loading
Loading
0