extmod/modure: Convert byte offsets to unicode indices when necessary. · lowfatcode/micropython@e90b85c · GitHub
[go: up one dir, main page]

Skip to content

Commit e90b85c

Browse files
jepler authored and dpgeorge committed
extmod/modure: Convert byte offsets to unicode indices when necessary.
And add a test. Fixes issue micropython#9202. Signed-off-by: Jeff Epler <jepler@gmail.com>
1 parent 719dbbf commit e90b85c

File tree

2 files changed

+48
-0
lines changed

2 files changed

+48
-0
lines changed

extmod/modure.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@
3333
#include "py/objstr.h"
3434
#include "py/stackctrl.h"
3535

36+
#if MICROPY_PY_BUILTINS_STR_UNICODE
37+
#include "py/unicode.h"
38+
#endif
39+
3640
#if MICROPY_PY_URE
3741

3842
#define re1_5_stack_chk() MP_STACK_CHECK()
@@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
121125
e = self->caps[no * 2 + 1] - begin;
122126
}
123127

128+
#if MICROPY_PY_BUILTINS_STR_UNICODE
129+
if (mp_obj_get_type(self->str) == &mp_type_str) {
130+
const byte *begin = (const byte *)mp_obj_str_get_str(self->str);
131+
if (s != -1) {
132+
s = utf8_ptr_to_index(begin, begin + s);
133+
}
134+
if (e != -1) {
135+
e = utf8_ptr_to_index(begin, begin + e);
136+
}
137+
}
138+
#endif
139+
124140
span[0] = mp_obj_new_int(s);
125141
span[1] = mp_obj_new_int(e);
126142
}

tests/unicode/unicode_ure.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# test match.span() for unicode strings
2+
3+
try:
4+
import ure as re
5+
except ImportError:
6+
try:
7+
import re
8+
except ImportError:
9+
print("SKIP")
10+
raise SystemExit
11+
12+
try:
13+
m = re.match(".", "a")
14+
m.span
15+
except AttributeError:
16+
print("SKIP")
17+
raise SystemExit
18+
19+
20+
def print_spans(match):
21+
print("----")
22+
try:
23+
i = 0
24+
while True:
25+
print(match.span(i), match.start(i), match.end(i))
26+
i += 1
27+
except IndexError:
28+
pass
29+
30+
31+
m = re.match(r"([0-9]*)(([a-z]*)([0-9]*))", "1234\u2764567")
32+
print_spans(m)

0 commit comments

Comments
 (0)
0