[3.13] gh-53203: Fix strptime() for %c, %x and %X formats on some locales · python/cpython@ea25f4a · GitHub
[go: up one dir, main page]

Skip to content
  • Commit ea25f4a

    Browse files
    [3.13] gh-53203: Fix strptime() for %c, %x and %X formats on some locales (GH-135971) (GH-136020)
    * Add detection of decimal non-ASCII alt digits.
    * Add support for non-decimal alt digits on locale lzh_TW.
    * Accept only numbers in the correct range if alt digits are known.
    * Fix a bug in detecting the position of the weekday name on locales byn_ER and wal_ET.
    * Fix support for a single-digit hour on locales ar_SA and bg_BG.
    * Add support for %T, %R, %r, %C, %OC.
    * Prepare code to use nl_langinfo().

    (cherry picked from commit 07183eb)

    Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
    1 parent d105eae commit ea25f4a

    File tree

    3 files changed

    +164
    -56
    lines changed

    3 files changed

    +164
    -56
    lines changed

    Lib/_strptime.py

    Lines changed: 149 additions & 41 deletions
    Original file line numberDiff line numberDiff line change
    @@ -14,6 +14,7 @@
    1414
    import time
    1515
    import locale
    1616
    import calendar
    17+
    import re
    1718
    from re import compile as re_compile
    1819
    from re import sub as re_sub
    1920
    from re import IGNORECASE
    @@ -41,6 +42,21 @@ def _findall(haystack, needle):
    4142
    yield i
    4243
    i += len(needle)
    4344

    45+
    46+
    lzh_TW_alt_digits = (
    47+
    # 〇:一:二:三:四:五:六:七:八:九
    48+
    '\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db',
    49+
    '\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d',
    50+
    # 十:十一:十二:十三:十四:十五:十六:十七:十八:十九
    51+
    '\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db',
    52+
    '\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', '\u5341\u4e5d',
    53+
    # 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九
    54+
    '\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db',
    55+
    '\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', '\u5eff\u4e5d',
    56+
    # 卅:卅一
    57+
    '\u5345', '\u5345\u4e00')
    58+
    59+
    4460
    class LocaleTime(object):
    4561
    """Stores and handles locale-specific information related to time.
    4662
    @@ -84,6 +100,7 @@ def __init__(self):
    84100
    self.__calc_weekday()
    85101
    self.__calc_month()
    86102
    self.__calc_am_pm()
    103+
    self.__calc_alt_digits()
    87104
    self.__calc_timezone()
    88105
    self.__calc_date_time()
    89106
    if _getlang() != self.lang:
    @@ -119,36 +136,76 @@ def __calc_am_pm(self):
    119136
    am_pm.append(time.strftime("%p", time_tuple).lower().strip())
    120137
    self.am_pm = am_pm
    121138

    139+
    def __calc_alt_digits(self):
    140+
    # Set self.LC_alt_digits by using time.strftime().
    141+
    142+
    # The magic data should contain all decimal digits.
    143+
    time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0))
    144+
    s = time.strftime("%x%X", time_tuple)
    145+
    if s.isascii():
    146+
    # Fast path -- all digits are ASCII.
    147+
    self.LC_alt_digits = ()
    148+
    return
    149+
    150+
    digits = ''.join(sorted(set(re.findall(r'\d', s))))
    151+
    if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9:
    152+
    # All 10 decimal digits from the same set.
    153+
    if digits.isascii():
    154+
    # All digits are ASCII.
    155+
    self.LC_alt_digits = ()
    156+
    return
    157+
    158+
    self.LC_alt_digits = [a + b for a in digits for b in digits]
    159+
    # Test whether the numbers contain leading zero.
    160+
    time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0))
    161+
    if self.LC_alt_digits[1] not in time.strftime("%x %X", time_tuple2):
    162+
    self.LC_alt_digits[:10] = digits
    163+
    return
    164+
    165+
    # Either non-Gregorian calendar or non-decimal numbers.
    166+
    if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s):
    167+
    # lzh_TW
    168+
    self.LC_alt_digits = lzh_TW_alt_digits
    169+
    return
    170+
    171+
    self.LC_alt_digits = None
    172+
    122173
    def __calc_date_time(self):
    123-
    # Set self.date_time, self.date, & self.time by using
    124-
    # time.strftime().
    174+
    # Set self.LC_date_time, self.LC_date, self.LC_time and
    175+
    # self.LC_time_ampm by using time.strftime().
    125176

    126177
    # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
    127178
    # overloaded numbers is minimized. The order in which searches for
    128179
    # values within the format string is very important; it eliminates
    129180
    # possible ambiguity for what something represents.
    130181
    time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
    131182
    time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
    132-
    replacement_pairs = [
    183+
    replacement_pairs = []
    184+
    185+
    # Non-ASCII digits
    186+
    if self.LC_alt_digits or self.LC_alt_digits is None:
    187+
    for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'),
    188+
    (44, '%OM'), (55, '%OS'), (17, '%Od'),
    189+
    (3, '%Om'), (2, '%Ow'), (10, '%OI')]:
    190+
    if self.LC_alt_digits is None:
    191+
    s = chr(0x660 + n // 10) + chr(0x660 + n % 10)
    192+
    replacement_pairs.append((s, d))
    193+
    if n < 10:
    194+
    replacement_pairs.append((s[1], d))
    195+
    elif len(self.LC_alt_digits) > n:
    196+
    replacement_pairs.append((self.LC_alt_digits[n], d))
    197+
    else:
    198+
    replacement_pairs.append((time.strftime(d, time_tuple), d))
    199+
    replacement_pairs += [
    133200
    ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
    134201
    ('44', '%M'), ('55', '%S'), ('76', '%j'),
    135202
    ('17', '%d'), ('03', '%m'), ('3', '%m'),
    136203
    # '3' needed for when no leading zero.
    137204
    ('2', '%w'), ('10', '%I'),
    138-
    # Non-ASCII digits
    139-
    ('\u0661\u0669\u0669\u0669', '%Y'),
    140-
    ('\u0669\u0669', '%Oy'),
    141-
    ('\u0662\u0662', '%OH'),
    142-
    ('\u0664\u0664', '%OM'),
    143-
    ('\u0665\u0665', '%OS'),
    144-
    ('\u0661\u0667', '%Od'),
    145-
    ('\u0660\u0663', '%Om'),
    146-
    ('\u0663', '%Om'),
    147-
    ('\u0662', '%Ow'),
    148-
    ('\u0661\u0660', '%OI'),
    149205
    ]
    206+
    150207
    date_time = []
    151-
    for directive in ('%c', '%x', '%X'):
    208+
    for directive in ('%c', '%x', '%X', '%r'):
    152209
    current_format = time.strftime(directive, time_tuple).lower()
    153210
    current_format = current_format.replace('%', '%%')
    154211
    # The month and the day of the week formats are treated specially
    @@ -172,9 +229,10 @@ def __calc_date_time(self):
    172229
    if tz:
    173230
    current_format = current_format.replace(tz, "%Z")
    174231
    # Transform all non-ASCII digits to digits in range U+0660 to U+0669.
    175-
    current_format = re_sub(r'\d(?<![0-9])',
    176-
    lambda m: chr(0x0660 + int(m[0])),
    177-
    current_format)
    232+
    if not current_format.isascii() and self.LC_alt_digits is None:
    233+
    current_format = re_sub(r'\d(?<![0-9])',
    234+
    lambda m: chr(0x0660 + int(m[0])),
    235+
    current_format)
    178236
    for old, new in replacement_pairs:
    179237
    current_format = current_format.replace(old, new)
    180238
    # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
    @@ -189,6 +247,7 @@ def __calc_date_time(self):
    189247
    self.LC_date_time = date_time[0]
    190248
    self.LC_date = date_time[1]
    191249
    self.LC_time = date_time[2]
    250+
    self.LC_time_ampm = date_time[3]
    192251

    193252
    def __find_month_format(self, directive):
    194253
    """Find the month format appropriate for the current locale.
    @@ -213,7 +272,7 @@ def __find_month_format(self, directive):
    213272
    full_indices &= indices
    214273
    indices = set(_findall(datetime, self.a_month[m]))
    215274
    if abbr_indices is None:
    216-
    abbr_indices = indices
    275+
    abbr_indices = set(indices)
    217276
    else:
    218277
    abbr_indices &= indices
    219278
    if not full_indices and not abbr_indices:
    @@ -241,7 +300,7 @@ def __find_weekday_format(self, directive):
    241300
    if self.f_weekday[wd] != self.a_weekday[wd]:
    242301
    indices = set(_findall(datetime, self.a_weekday[wd]))
    243302
    if abbr_indices is None:
    244-
    abbr_indices = indices
    303+
    abbr_indices = set(indices)
    245304
    else:
    246305
    abbr_indices &= indices
    247306
    if not full_indices and not abbr_indices:
    @@ -288,8 +347,10 @@ def __init__(self, locale_time=None):
    288347
    # The " [1-9]" part of the regex is to make %c from ANSI C work
    289348
    'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
    290349
    'f': r"(?P<f>[0-9]{1,6})",
    291-
    'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
    350+
    'H': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
    351+
    'k': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
    292352
    'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
    353+
    'l': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
    293354
    'G': r"(?P<G>\d\d\d\d)",
    294355
    'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
    295356
    'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
    @@ -312,16 +373,49 @@ def __init__(self, locale_time=None):
    312373
    for tz in tz_names),
    313374
    'Z'),
    314375
    '%': '%'}
    315-
    for d in 'dmyHIMS':
    316-
    mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
    317-
    mapping['Ow'] = r'(?P<w>\d)'
    376+
    if self.locale_time.LC_alt_digits is None:
    377+
    for d in 'dmyCHIMS':
    378+
    mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
    379+
    mapping['Ow'] = r'(?P<w>\d)'
    380+
    else:
    381+
    mapping.update({
    382+
    'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd',
    383+
    '3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'),
    384+
    'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm',
    385+
    '1[0-2]|0[1-9]|[1-9]'),
    386+
    'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w',
    387+
    '[0-6]'),
    388+
    'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y',
    389+
    '[0-9][0-9]'),
    390+
    'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C',
    391+
    '[0-9][0-9]'),
    392+
    'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H',
    393+
    '2[0-3]|[0-1][0-9]|[0-9]'),
    394+
    'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I',
    395+
    '1[0-2]|0[1-9]|[1-9]'),
    396+
    'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M',
    397+
    '[0-5][0-9]|[0-9]'),
    398+
    'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S',
    399+
    '6[0-1]|[0-5][0-9]|[0-9]'),
    400+
    })
    401+
    mapping.update({
    402+
    'e': mapping['d'],
    403+
    'Oe': mapping['Od'],
    404+
    'P': mapping['p'],
    405+
    'Op': mapping['p'],
    406+
    'W': mapping['U'].replace('U', 'W'),
    407+
    })
    318408
    mapping['W'] = mapping['U'].replace('U', 'W')
    409+
    319410
    base.__init__(mapping)
    411+
    base.__setitem__('T', self.pattern('%H:%M:%S'))
    412+
    base.__setitem__('R', self.pattern('%H:%M'))
    413+
    base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm))
    320414
    base.__setitem__('X', self.pattern(self.locale_time.LC_time))
    321415
    base.__setitem__('x', self.pattern(self.locale_time.LC_date))
    322416
    base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
    323417

    324-
    def __seqToRE(self, to_convert, directive):
    418+
    def __seqToRE(self, to_convert, directive, altregex=None):
    325419
    """Convert a list to a regex string for matching a directive.
    326420
    327421
    Want possible matching values to be from longest to shortest. This
    @@ -337,8 +431,9 @@ def __seqToRE(self, to_convert, directive):
    337431
    else:
    338432
    return ''
    339433
    regex = '|'.join(re_escape(stuff) for stuff in to_convert)
    340-
    regex = '(?P<%s>%s' % (directive, regex)
    341-
    return '%s)' % regex
    434+
    if altregex is not None:
    435+
    regex += '|' + altregex
    436+
    return '(?P<%s>%s)' % (directive, regex)
    342437

    343438
    def pattern(self, format):
    344439
    """Return regex pattern for the format string.
    @@ -365,7 +460,7 @@ def repl(m):
    365460
    nonlocal day_of_month_in_format
    366461
    day_of_month_in_format = True
    367462
    return self[format_char]
    368-
    format = re_sub(r'%([OE]?\\?.?)', repl, format)
    463+
    format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format)
    369464
    if day_of_month_in_format and not year_in_format:
    370465
    import warnings
    371466
    warnings.warn("""\
    @@ -467,37 +562,50 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    467562
    # values
    468563
    weekday = julian = None
    469564
    found_dict = found.groupdict()
    565+
    if locale_time.LC_alt_digits:
    566+
    def parse_int(s):
    567+
    try:
    568+
    return locale_time.LC_alt_digits.index(s)
    569+
    except ValueError:
    570+
    return int(s)
    571+
    else:
    572+
    parse_int = int
    573+
    470574
    for group_key in found_dict.keys():
    471575
    # Directives not explicitly handled below:
    472576
    # c, x, X
    473577
    # handled by making out of other directives
    474578
    # U, W
    475579
    # worthless without day of the week
    476580
    if group_key == 'y':
    477-
    year = int(found_dict['y'])
    478-
    # Open Group specification for strptime() states that a %y
    479-
    #value in the range of [00, 68] is in the century 2000, while
    480-
    #[69,99] is in the century 1900
    481-
    if year <= 68:
    482-
    year += 2000
    581+
    year = parse_int(found_dict['y'])
    582+
    if 'C' in found_dict:
    583+
    century = parse_int(found_dict['C'])
    584+
    year += century * 100
    483585
    else:
    484-
    year += 1900
    586+
    # Open Group specification for strptime() states that a %y
    587+
    #value in the range of [00, 68] is in the century 2000, while
    588+
    #[69,99] is in the century 1900
    589+
    if year <= 68:
    590+
    year += 2000
    591+
    else:
    592+
    year += 1900
    485593
    elif group_key == 'Y':
    486594
    year = int(found_dict['Y'])
    487595
    elif group_key == 'G':
    488596
    iso_year = int(found_dict['G'])
    489597
    elif group_key == 'm':
    490-
    month = int(found_dict['m'])
    598+
    month = parse_int(found_dict['m'])
    491599
    elif group_key == 'B':
    492600
    month = locale_time.f_month.index(found_dict['B'].lower())
    493601
    elif group_key == 'b':
    494602
    month = locale_time.a_month.index(found_dict['b'].lower())
    495603
    elif group_key == 'd':
    496-
    day = int(found_dict['d'])
    604+
    day = parse_int(found_dict['d'])
    497605
    elif group_key == 'H':
    498-
    hour = int(found_dict['H'])
    606+
    hour = parse_int(found_dict['H'])
    499607
    elif group_key == 'I':
    500-
    hour = int(found_dict['I'])
    608+
    hour = parse_int(found_dict['I'])
    501609
    ampm = found_dict.get('p', '').lower()
    502610
    # If there was no AM/PM indicator, we'll treat this like AM
    503611
    if ampm in ('', locale_time.am_pm[0]):
    @@ -513,9 +621,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
    513621
    if hour != 12:
    514622
    hour += 12
    515623
    elif group_key == 'M':
    516-
    minute = int(found_dict['M'])
    624+
    minute = parse_int(found_dict['M'])
    517625
    elif group_key == 'S':
    518-
    second = int(found_dict['S'])
    626+
    second = parse_int(found_dict['S'])
    519627
    elif group_key == 'f':
    520628
    s = found_dict['f']
    521629
    # Pad to always return microseconds.

    0 commit comments

    Comments
     (0)
    0