bpo-30215: Make re.compile() locale agnostic. (#1361) · python/cpython@898ff03 · GitHub
[go: up one dir, main page]

Skip to content

Commit 898ff03

Browse files
bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching.
1 parent 647c3d3 commit 898ff03

File tree

9 files changed

+141
-23
lines changed

9 files changed

+141
-23
lines changed

Doc/library/re.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,11 @@ form.
559559
:const:`re.LOCALE` can be used only with bytes patterns and is
560560
not compatible with :const:`re.ASCII`.
561561

562+
.. versionchanged:: 3.7
563+
Compiled regular expression objects with the :const:`re.LOCALE` flag no
564+
longer depend on the locale at compile time. Only the locale at
565+
matching time affects the result of matching.
566+
562567

563568
.. data:: M
564569
MULTILINE

Lib/re.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,7 @@ def escape(pattern):
268268
def _compile(pattern, flags):
269269
# internal: compile pattern
270270
try:
271-
p, loc = _cache[type(pattern), pattern, flags]
272-
if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
273-
return p
271+
return _cache[type(pattern), pattern, flags]
274272
except KeyError:
275273
pass
276274
if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@ def _compile(pattern, flags):
284282
if not (flags & DEBUG):
285283
if len(_cache) >= _MAXCACHE:
286284
_cache.clear()
287-
if p.flags & LOCALE:
288-
if not _locale:
289-
return p
290-
loc = _locale.setlocale(_locale.LC_CTYPE)
291-
else:
292-
loc = None
293-
_cache[type(pattern), pattern, flags] = p, loc
285+
_cache[type(pattern), pattern, flags] = p
294286
return p
295287

296288
@functools.lru_cache(_MAXCACHE)

Lib/sre_compile.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
7878
fixes = None
7979
for op, av in pattern:
8080
if op in LITERAL_CODES:
81-
if flags & SRE_FLAG_IGNORECASE:
81+
if not flags & SRE_FLAG_IGNORECASE:
82+
emit(op)
83+
emit(av)
84+
elif flags & SRE_FLAG_LOCALE:
85+
emit(OP_LOC_IGNORE[op])
86+
emit(av)
87+
else:
8288
lo = _sre.getlower(av, flags)
8389
if fixes and lo in fixes:
8490
emit(IN_IGNORE)
@@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
9399
else:
94100
emit(OP_IGNORE[op])
95101
emit(lo)
96-
else:
97-
emit(op)
98-
emit(av)
99102
elif op is IN:
100-
if flags & SRE_FLAG_IGNORECASE:
101-
emit(OP_IGNORE[op])
102-
def fixup(literal, flags=flags):
103-
return _sre.getlower(literal, flags)
104-
else:
103+
if not flags & SRE_FLAG_IGNORECASE:
105104
emit(op)
106105
fixup = None
106+
elif flags & SRE_FLAG_LOCALE:
107+
emit(IN_LOC_IGNORE)
108+
fixup = None
109+
else:
110+
emit(IN_IGNORE)
111+
def fixup(literal, flags=flags):
112+
return _sre.getlower(literal, flags)
107113
skip = _len(code); emit(0)
108114
_compile_charset(av, flags, code, fixup, fixes)
109115
code[skip] = _len(code) - skip

Lib/sre_constants.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
# update when constants are added or removed
1515

16-
MAGIC = 20140917
16+
MAGIC = 20170530
1717

1818
from _sre import MAXREPEAT, MAXGROUPS
1919

@@ -87,6 +87,9 @@ def _makecodes(names):
8787
SUBPATTERN
8888
MIN_REPEAT_ONE
8989
RANGE_IGNORE
90+
LITERAL_LOC_IGNORE
91+
NOT_LITERAL_LOC_IGNORE
92+
IN_LOC_IGNORE
9093
9194
MIN_REPEAT MAX_REPEAT
9295
""")
@@ -124,6 +127,11 @@ def _makecodes(names):
124127
RANGE: RANGE_IGNORE,
125128
}
126129

130+
OP_LOC_IGNORE = {
131+
LITERAL: LITERAL_LOC_IGNORE,
132+
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
133+
}
134+
127135
AT_MULTILINE = {
128136
AT_BEGINNING: AT_BEGINNING_LINE,
129137
AT_END: AT_END_LINE

Lib/test/test_re.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1730,6 +1730,38 @@ def check_en_US_utf8(self):
17301730
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
17311731
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
17321732

1733+
def test_locale_compiled(self):
1734+
oldlocale = locale.setlocale(locale.LC_CTYPE)
1735+
self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1736+
for loc in 'en_US.iso88591', 'en_US.utf8':
1737+
try:
1738+
locale.setlocale(locale.LC_CTYPE, loc)
1739+
except locale.Error:
1740+
# Unsupported locale on this system
1741+
self.skipTest('test needs %s locale' % loc)
1742+
1743+
locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1744+
p1 = re.compile(b'\xc5\xe5', re.L|re.I)
1745+
p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
1746+
p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
1747+
p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
1748+
for p in p1, p2, p3:
1749+
self.assertTrue(p.match(b'\xc5\xe5'))
1750+
self.assertTrue(p.match(b'\xe5\xe5'))
1751+
self.assertTrue(p.match(b'\xc5\xc5'))
1752+
self.assertIsNone(p4.match(b'\xe5\xc5'))
1753+
self.assertIsNone(p4.match(b'\xe5\xe5'))
1754+
self.assertIsNone(p4.match(b'\xc5\xc5'))
1755+
1756+
locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1757+
for p in p1, p2, p3:
1758+
self.assertTrue(p.match(b'\xc5\xe5'))
1759+
self.assertIsNone(p.match(b'\xe5\xe5'))
1760+
self.assertIsNone(p.match(b'\xc5\xc5'))
1761+
self.assertTrue(p4.match(b'\xe5\xc5'))
1762+
self.assertIsNone(p4.match(b'\xe5\xe5'))
1763+
self.assertIsNone(p4.match(b'\xc5\xc5'))
1764+
17331765
def test_error(self):
17341766
with self.assertRaises(re.error) as cm:
17351767
re.compile('(\u20ac))')

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,10 @@ Extension Modules
317317
Library
318318
-------
319319

320+
- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
321+
longer depend on the locale at compile time. Only the locale at matching
322+
time affects the result of matching.
323+
320324
- bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
321325
when Ctrl-C is received.
322326

Modules/_sre.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
15881588
case SRE_OP_NOT_LITERAL:
15891589
case SRE_OP_LITERAL_IGNORE:
15901590
case SRE_OP_NOT_LITERAL_IGNORE:
1591+
case SRE_OP_LITERAL_LOC_IGNORE:
1592+
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
15911593
GET_ARG;
15921594
/* The arg is just a character, nothing to check */
15931595
break;
@@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
16251627

16261628
case SRE_OP_IN:
16271629
case SRE_OP_IN_IGNORE:
1630+
case SRE_OP_IN_LOC_IGNORE:
16281631
GET_SKIP;
16291632
/* Stop 1 before the end; we check the FAILURE below */
16301633
if (!_validate_charset(code, code+skip-2))

Modules/sre_constants.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* See the _sre.c file for information on usage and redistribution.
1212
*/
1313

14-
#define SRE_MAGIC 20140917
14+
#define SRE_MAGIC 20170530
1515
#define SRE_OP_FAILURE 0
1616
#define SRE_OP_SUCCESS 1
1717
#define SRE_OP_ANY 2
@@ -45,6 +45,9 @@
4545
#define SRE_OP_SUBPATTERN 30
4646
#define SRE_OP_MIN_REPEAT_ONE 31
4747
#define SRE_OP_RANGE_IGNORE 32
48+
#define SRE_OP_LITERAL_LOC_IGNORE 33
49+
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
50+
#define SRE_OP_IN_LOC_IGNORE 35
4851
#define SRE_AT_BEGINNING 0
4952
#define SRE_AT_BEGINNING_LINE 1
5053
#define SRE_AT_BEGINNING_STRING 2

Modules/sre_lib.h

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
100100
return 0;
101101
}
102102

103+
LOCAL(int)
104+
SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
105+
{
106+
return ch == pattern
107+
|| (SRE_CODE) state->lower(ch) == pattern
108+
|| (SRE_CODE) state->upper(ch) == pattern;
109+
}
110+
103111
LOCAL(int)
104112
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
105113
{
@@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
187195
}
188196
}
189197

198+
LOCAL(int)
199+
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
200+
{
201+
SRE_CODE lo, up;
202+
lo = state->lower(ch);
203+
if (SRE(charset)(state, set, lo))
204+
return 1;
205+
206+
up = state->upper(ch);
207+
return up != lo && SRE(charset)(state, set, up);
208+
}
209+
190210
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
191211

192212
LOCAL(Py_ssize_t)
@@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
247267
ptr++;
248268
break;
249269

270+
case SRE_OP_LITERAL_LOC_IGNORE:
271+
/* repeated literal */
272+
chr = pattern[1];
273+
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
274+
while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
275+
ptr++;
276+
break;
277+
250278
case SRE_OP_NOT_LITERAL:
251279
/* repeated non-literal */
252280
chr = pattern[1];
@@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
269297
ptr++;
270298
break;
271299

300+
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
301+
/* repeated non-literal */
302+
chr = pattern[1];
303+
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
304+
while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
305+
ptr++;
306+
break;
307+
272308
default:
273309
/* repeated single character pattern */
274310
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
@@ -651,7 +687,17 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
651687
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
652688
ctx->pattern, ctx->ptr, ctx->pattern[0]));
653689
if (ctx->ptr >= end ||
654-
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
690+
state->lower(*ctx->ptr) != *ctx->pattern)
691+
RETURN_FAILURE;
692+
ctx->pattern++;
693+
ctx->ptr++;
694+
break;
695+
696+
case SRE_OP_LITERAL_LOC_IGNORE:
697+
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
698+
ctx->pattern, ctx->ptr, ctx->pattern[0]));
699+
if (ctx->ptr >= end
700+
|| !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
655701
RETURN_FAILURE;
656702
ctx->pattern++;
657703
ctx->ptr++;
@@ -661,7 +707,17 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
661707
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
662708
ctx->pattern, ctx->ptr, *ctx->pattern));
663709
if (ctx->ptr >= end ||
664-
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
710+
state->lower(*ctx->ptr) == *ctx->pattern)
711+
RETURN_FAILURE;
712+
ctx->pattern++;
713+
ctx->ptr++;
714+
break;
715+
716+
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
717+
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
718+
ctx->pattern, ctx->ptr, *ctx->pattern));
719+
if (ctx->ptr >= end
720+
|| SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
665721
RETURN_FAILURE;
666722
ctx->pattern++;
667723
ctx->ptr++;
@@ -677,6 +733,15 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
677733
ctx->ptr++;
678734
break;
679735

736+
case SRE_OP_IN_LOC_IGNORE:
737+
TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
738+
if (ctx->ptr >= end
739+
|| !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
740+
RETURN_FAILURE;
741+
ctx->pattern += ctx->pattern[0];
742+
ctx->ptr++;
743+
break;
744+
680745
case SRE_OP_JUMP:
681746
case SRE_OP_INFO:
682747
/* jump forward */

0 commit comments

Comments
 (0)
0