gh-53203: Fix strptime() for %c, %x and %X formats on some locales (#135971) · python/cpython@07183eb · GitHub
[go: up one dir, main page]

Skip to content

Commit 07183eb

Browse files
gh-53203: Fix strptime() for %c, %x and %X formats on some locales (#135971)
* Add detection of decimal non-ASCII alt digits.
* Add support of non-decimal alt digits on locale lzh_TW.
* Accept only numbers in correct range if alt digits are known.
* Fix bug in detecting the position of the week day name on locales byn_ER and wal_ET.
* Fix support of single-digit hour on locales ar_SA and bg_BG.
* Add support for %T, %R, %r, %C, %OC.
* Prepare code to use nl_langinfo().
1 parent 0c6c09b commit 07183eb

File tree

3 files changed

+164
-56
lines changed

3 files changed

+164
-56
lines changed

Lib/_strptime.py

Lines changed: 149 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import time
1515
import locale
1616
import calendar
17+
import re
1718
from re import compile as re_compile
1819
from re import sub as re_sub
1920
from re import IGNORECASE
@@ -41,6 +42,21 @@ def _findall(haystack, needle):
4142
yield i
4243
i += len(needle)
4344

45+
46+
lzh_TW_alt_digits = (
47+
# 〇:一:二:三:四:五:六:七:八:九
48+
'\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db',
49+
'\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d',
50+
# 十:十一:十二:十三:十四:十五:十六:十七:十八:十九
51+
'\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db',
52+
'\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', '\u5341\u4e5d',
53+
# 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九
54+
'\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db',
55+
'\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', '\u5eff\u4e5d',
56+
# 卅:卅一
57+
'\u5345', '\u5345\u4e00')
58+
59+
4460
class LocaleTime(object):
4561
"""Stores and handles locale-specific information related to time.
4662
@@ -84,6 +100,7 @@ def __init__(self):
84100
self.__calc_weekday()
85101
self.__calc_month()
86102
self.__calc_am_pm()
103+
self.__calc_alt_digits()
87104
self.__calc_timezone()
88105
self.__calc_date_time()
89106
if _getlang() != self.lang:
@@ -119,36 +136,76 @@ def __calc_am_pm(self):
119136
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
120137
self.am_pm = am_pm
121138

139+
def __calc_alt_digits(self):
140+
# Set self.LC_alt_digits by using time.strftime().
141+
142+
# The magic data should contain all decimal digits.
143+
time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0))
144+
s = time.strftime("%x%X", time_tuple)
145+
if s.isascii():
146+
# Fast path -- all digits are ASCII.
147+
self.LC_alt_digits = ()
148+
return
149+
150+
digits = ''.join(sorted(set(re.findall(r'\d', s))))
151+
if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9:
152+
# All 10 decimal digits from the same set.
153+
if digits.isascii():
154+
# All digits are ASCII.
155+
self.LC_alt_digits = ()
156+
return
157+
158+
self.LC_alt_digits = [a + b for a in digits for b in digits]
159+
# Test whether the numbers contain leading zero.
160+
time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0))
161+
if self.LC_alt_digits[1] not in time.strftime("%x %X", time_tuple2):
162+
self.LC_alt_digits[:10] = digits
163+
return
164+
165+
# Either non-Gregorian calendar or non-decimal numbers.
166+
if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s):
167+
# lzh_TW
168+
self.LC_alt_digits = lzh_TW_alt_digits
169+
return
170+
171+
self.LC_alt_digits = None
172+
122173
def __calc_date_time(self):
123-
# Set self.date_time, self.date, & self.time by using
124-
# time.strftime().
174+
# Set self.LC_date_time, self.LC_date, self.LC_time and
175+
# self.LC_time_ampm by using time.strftime().
125176

126177
# Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
127178
# overloaded numbers is minimized. The order in which searches for
128179
# values within the format string is very important; it eliminates
129180
# possible ambiguity for what something represents.
130181
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
131182
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
132-
replacement_pairs = [
183+
replacement_pairs = []
184+
185+
# Non-ASCII digits
186+
if self.LC_alt_digits or self.LC_alt_digits is None:
187+
for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'),
188+
(44, '%OM'), (55, '%OS'), (17, '%Od'),
189+
(3, '%Om'), (2, '%Ow'), (10, '%OI')]:
190+
if self.LC_alt_digits is None:
191+
s = chr(0x660 + n // 10) + chr(0x660 + n % 10)
192+
replacement_pairs.append((s, d))
193+
if n < 10:
194+
replacement_pairs.append((s[1], d))
195+
elif len(self.LC_alt_digits) > n:
196+
replacement_pairs.append((self.LC_alt_digits[n], d))
197+
else:
198+
replacement_pairs.append((time.strftime(d, time_tuple), d))
199+
replacement_pairs += [
133200
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
134201
('44', '%M'), ('55', '%S'), ('76', '%j'),
135202
('17', '%d'), ('03', '%m'), ('3', '%m'),
136203
# '3' needed for when no leading zero.
137204
('2', '%w'), ('10', '%I'),
138-
# Non-ASCII digits
139-
('\u0661\u0669\u0669\u0669', '%Y'),
140-
('\u0669\u0669', '%Oy'),
141-
('\u0662\u0662', '%OH'),
142-
('\u0664\u0664', '%OM'),
143-
('\u0665\u0665', '%OS'),
144-
('\u0661\u0667', '%Od'),
145-
('\u0660\u0663', '%Om'),
146-
('\u0663', '%Om'),
147-
('\u0662', '%Ow'),
148-
('\u0661\u0660', '%OI'),
149205
]
206+
150207
date_time = []
151-
for directive in ('%c', '%x', '%X'):
208+
for directive in ('%c', '%x', '%X', '%r'):
152209
current_format = time.strftime(directive, time_tuple).lower()
153210
current_format = current_format.replace('%', '%%')
154211
# The month and the day of the week formats are treated specially
@@ -172,9 +229,10 @@ def __calc_date_time(self):
172229
if tz:
173230
current_format = current_format.replace(tz, "%Z")
174231
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
175-
current_format = re_sub(r'\d(?<![0-9])',
176-
lambda m: chr(0x0660 + int(m[0])),
177-
current_format)
232+
if not current_format.isascii() and self.LC_alt_digits is None:
233+
current_format = re_sub(r'\d(?<![0-9])',
234+
lambda m: chr(0x0660 + int(m[0])),
235+
current_format)
178236
for old, new in replacement_pairs:
179237
current_format = current_format.replace(old, new)
180238
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
@@ -189,6 +247,7 @@ def __calc_date_time(self):
189247
self.LC_date_time = date_time[0]
190248
self.LC_date = date_time[1]
191249
self.LC_time = date_time[2]
250+
self.LC_time_ampm = date_time[3]
192251

193252
def __find_month_format(self, directive):
194253
"""Find the month format appropriate for the current locale.
@@ -213,7 +272,7 @@ def __find_month_format(self, directive):
213272
full_indices &= indices
214273
indices = set(_findall(datetime, self.a_month[m]))
215274
if abbr_indices is None:
216-
abbr_indices = indices
275+
abbr_indices = set(indices)
217276
else:
218277
abbr_indices &= indices
219278
if not full_indices and not abbr_indices:
@@ -241,7 +300,7 @@ def __find_weekday_format(self, directive):
241300
if self.f_weekday[wd] != self.a_weekday[wd]:
242301
indices = set(_findall(datetime, self.a_weekday[wd]))
243302
if abbr_indices is None:
244-
abbr_indices = indices
303+
abbr_indices = set(indices)
245304
else:
246305
abbr_indices &= indices
247306
if not full_indices and not abbr_indices:
@@ -288,8 +347,10 @@ def __init__(self, locale_time=None):
288347
# The " [1-9]" part of the regex is to make %c from ANSI C work
289348
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
290349
'f': r"(?P<f>[0-9]{1,6})",
291-
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
350+
'H': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
351+
'k': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
292352
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
353+
'l': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
293354
'G': r"(?P<G>\d\d\d\d)",
294355
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
295356
'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
@@ -312,16 +373,49 @@ def __init__(self, locale_time=None):
312373
for tz in tz_names),
313374
'Z'),
314375
'%': '%'}
315-
for d in 'dmyHIMS':
316-
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
317-
mapping['Ow'] = r'(?P<w>\d)'
376+
if self.locale_time.LC_alt_digits is None:
377+
for d in 'dmyCHIMS':
378+
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
379+
mapping['Ow'] = r'(?P<w>\d)'
380+
else:
381+
mapping.update({
382+
'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd',
383+
'3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'),
384+
'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm',
385+
'1[0-2]|0[1-9]|[1-9]'),
386+
'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w',
387+
'[0-6]'),
388+
'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y',
389+
'[0-9][0-9]'),
390+
'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C',
391+
'[0-9][0-9]'),
392+
'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H',
393+
'2[0-3]|[0-1][0-9]|[0-9]'),
394+
'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I',
395+
'1[0-2]|0[1-9]|[1-9]'),
396+
'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M',
397+
'[0-5][0-9]|[0-9]'),
398+
'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S',
399+
'6[0-1]|[0-5][0-9]|[0-9]'),
400+
})
401+
mapping.update({
402+
'e': mapping['d'],
403+
'Oe': mapping['Od'],
404+
'P': mapping['p'],
405+
'Op': mapping['p'],
406+
'W': mapping['U'].replace('U', 'W'),
407+
})
318408
mapping['W'] = mapping['U'].replace('U', 'W')
409+
319410
base.__init__(mapping)
411+
base.__setitem__('T', self.pattern('%H:%M:%S'))
412+
base.__setitem__('R', self.pattern('%H:%M'))
413+
base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm))
320414
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
321415
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
322416
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
323417

324-
def __seqToRE(self, to_convert, directive):
418+
def __seqToRE(self, to_convert, directive, altregex=None):
325419
"""Convert a list to a regex string for matching a directive.
326420
327421
Want possible matching values to be from longest to shortest. This
@@ -337,8 +431,9 @@ def __seqToRE(self, to_convert, directive):
337431
else:
338432
return ''
339433
regex = '|'.join(re_escape(stuff) for stuff in to_convert)
340-
regex = '(?P<%s>%s' % (directive, regex)
341-
return '%s)' % regex
434+
if altregex is not None:
435+
regex += '|' + altregex
436+
return '(?P<%s>%s)' % (directive, regex)
342437

343438
def pattern(self, format):
344439
"""Return regex pattern for the format string.
@@ -365,7 +460,7 @@ def repl(m):
365460
nonlocal day_of_month_in_format
366461
day_of_month_in_format = True
367462
return self[format_char]
368-
format = re_sub(r'%([OE]?\\?.?)', repl, format)
463+
format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format)
369464
if day_of_month_in_format and not year_in_format:
370465
import warnings
371466
warnings.warn("""\
@@ -467,37 +562,50 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
467562
# values
468563
weekday = julian = None
469564
found_dict = found.groupdict()
565+
if locale_time.LC_alt_digits:
566+
def parse_int(s):
567+
try:
568+
return locale_time.LC_alt_digits.index(s)
569+
except ValueError:
570+
return int(s)
571+
else:
572+
parse_int = int
573+
470574
for group_key in found_dict.keys():
471575
# Directives not explicitly handled below:
472576
# c, x, X
473577
# handled by making out of other directives
474578
# U, W
475579
# worthless without day of the week
476580
if group_key == 'y':
477-
year = int(found_dict['y'])
478-
# Open Group specification for strptime() states that a %y
479-
#value in the range of [00, 68] is in the century 2000, while
480-
#[69,99] is in the century 1900
481-
if year <= 68:
482-
year += 2000
581+
year = parse_int(found_dict['y'])
582+
if 'C' in found_dict:
583+
century = parse_int(found_dict['C'])
584+
year += century * 100
483585
else:
484-
year += 1900
586+
# Open Group specification for strptime() states that a %y
587+
#value in the range of [00, 68] is in the century 2000, while
588+
#[69,99] is in the century 1900
589+
if year <= 68:
590+
year += 2000
591+
else:
592+
year += 1900
485593
elif group_key == 'Y':
486594
year = int(found_dict['Y'])
487595
elif group_key == 'G':
488596
iso_year = int(found_dict['G'])
489597
elif group_key == 'm':
490-
month = int(found_dict['m'])
598+
month = parse_int(found_dict['m'])
491599
elif group_key == 'B':
492600
month = locale_time.f_month.index(found_dict['B'].lower())
493601
elif group_key == 'b':
494602
month = locale_time.a_month.index(found_dict['b'].lower())
495603
elif group_key == 'd':
496-
day = int(found_dict['d'])
604+
day = parse_int(found_dict['d'])
497605
elif group_key == 'H':
498-
hour = int(found_dict['H'])
606+
hour = parse_int(found_dict['H'])
499607
elif group_key == 'I':
500-
hour = int(found_dict['I'])
608+
hour = parse_int(found_dict['I'])
501609
ampm = found_dict.get('p', '').lower()
502610
# If there was no AM/PM indicator, we'll treat this like AM
503611
if ampm in ('', locale_time.am_pm[0]):
@@ -513,9 +621,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
513621
if hour != 12:
514622
hour += 12
515623
elif group_key == 'M':
516-
minute = int(found_dict['M'])
624+
minute = parse_int(found_dict['M'])
517625
elif group_key == 'S':
518-
second = int(found_dict['S'])
626+
second = parse_int(found_dict['S'])
519627
elif group_key == 'f':
520628
s = found_dict['f']
521629
# Pad to always return microseconds.

0 commit comments

Comments
 (0)
0