py/objstr: Add check for valid UTF-8 when making a str from bytes. · devmonkZA/micropython@68c2817 · GitHub
[go: up one dir, main page]

Skip to content

Commit 68c2817

Browse files
TonyLianLong authored and dpgeorge committed
py/objstr: Add check for valid UTF-8 when making a str from bytes.
This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
1 parent 069fc48 commit 68c2817

File tree

5 files changed

+58
-0
lines changed

5 files changed

+58
-0
lines changed

py/mpconfig.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,11 @@ typedef double mp_float_t;
691691
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
692692
#endif
693693

694+
// Whether to check for valid UTF-8 when converting bytes to str
695+
#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
696+
#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE)
697+
#endif
698+
694699
// Whether str.center() method provided
695700
#ifndef MICROPY_PY_BUILTINS_STR_CENTER
696701
#define MICROPY_PY_BUILTINS_STR_CENTER (0)

py/objstr.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,13 +161,23 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
161161
if (str_hash == 0) {
162162
str_hash = qstr_compute_hash(str_data, str_len);
163163
}
164+
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
165+
if (!utf8_check(str_data, str_len)) {
166+
mp_raise_msg(&mp_type_UnicodeError, NULL);
167+
}
168+
#endif
164169
mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
165170
o->data = str_data;
166171
o->hash = str_hash;
167172
return MP_OBJ_FROM_PTR(o);
168173
} else {
169174
mp_buffer_info_t bufinfo;
170175
mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
176+
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
177+
if (!utf8_check(bufinfo.buf, bufinfo.len)) {
178+
mp_raise_msg(&mp_type_UnicodeError, NULL);
179+
}
180+
#endif
171181
return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
172182
}
173183
}

py/unicode.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
182182
}
183183
return n;
184184
}
185+
186+
// Check whether the len bytes starting at p form a structurally valid UTF-8
// sequence.
//
// A buffer is accepted when:
//   - every byte below 0x80 stands on its own (ASCII);
//   - every lead byte in 0xc0..0xf7 is followed by exactly the number of
//     continuation bytes it announces (1, 2 or 3);
//   - every continuation byte has the form 10xxxxxx (0x80..0xbf);
//   - the buffer does not end in the middle of a multi-byte character.
//
// Returns true for a well-formed buffer (including an empty one), false
// otherwise.
//
// Note: this is a structural check only.  Overlong encodings, UTF-16
// surrogates and code points above U+10FFFF are not rejected.
bool utf8_check(const byte *p, size_t len) {
    uint8_t need = 0; // continuation bytes still expected for the current char
    const byte *end = p + len;
    for (; p < end; p++) {
        byte c = *p;
        if (need) {
            // Inside a multi-byte character: only 10xxxxxx is acceptable.
            // A plain "c >= 0x80" test would also let lead bytes and the
            // invalid bytes 0xf8..0xff pass as continuation bytes.
            if ((c & 0xc0) == 0x80) {
                need--;
            } else {
                // mismatch
                return false;
            }
        } else {
            if (c >= 0xc0) {
                if (c >= 0xf8) {
                    // mismatch: 0xf8..0xff never occur in UTF-8
                    return false;
                }
                // Bits 4 and 5 of the lead byte select a 2-bit field of the
                // packed table 0xe5 (0b11100101):
                //   0xc0..0xdf -> 1, 0xe0..0xef -> 2, 0xf0..0xf7 -> 3
                need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
            } else if (c >= 0x80) {
                // mismatch: continuation byte without a preceding lead byte
                return false;
            }
        }
    }
    return need == 0; // no pending fragments allowed
}

py/unicode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,6 @@
3030
#include "py/misc.h"
3131

3232
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
33+
bool utf8_check(const byte *p, size_t len);
3334

3435
#endif // MICROPY_INCLUDED_PY_UNICODE_H

tests/unicode/unicode.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,17 @@
3333
int('\u0200')
3434
except ValueError:
3535
print('ValueError')
36+
37+
# test invalid UTF-8 string
38+
try:
39+
str(b'ab\xa1', 'utf8')
40+
except UnicodeError:
41+
print('UnicodeError')
42+
try:
43+
str(b'ab\xf8', 'utf8')
44+
except UnicodeError:
45+
print('UnicodeError')
46+
try:
47+
str(bytearray(b'ab\xc0a'), 'utf8')
48+
except UnicodeError:
49+
print('UnicodeError')

0 commit comments

Comments
 (0)
0