py/objstr: Add check for valid UTF-8 when making a str from bytes.

This patch adds a function utf8_check() that checks whether a byte sequence is a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled by default if Unicode support is enabled. It costs about 110 bytes of code size on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
author: tll <1040424979@qq.com> 2017-06-24 08:38:32 +0800
committer: Damien George <damien.p.george@gmail.com> 2017-09-06 16:43:09 +1000
commit: 68c28174d0e0ec3f6b1461aea3a0b6a1b84610bb (patch)
tree: 441a42ce59c5f965b66722bd6a5a5b24525c6bcf /py/objstr.c
parent: 069fc48bf60b31fca4339d26cee7b4a415b185f9 (diff)
1 files changed, 10 insertions, 0 deletions
diff --git a/py/objstr.c b/py/objstr.c
index 4c287af04..f6214f80c 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -161,6 +161,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
                 if (str_hash == 0) {
                     str_hash = qstr_compute_hash(str_data, str_len);
                 }
+                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+                if (!utf8_check(str_data, str_len)) {
+                    mp_raise_msg(&mp_type_UnicodeError, NULL);
+                }
+                #endif
                 mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
                 o->data = str_data;
                 o->hash = str_hash;
@@ -168,6 +173,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
             } else {
                 mp_buffer_info_t bufinfo;
                 mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
+                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+                if (!utf8_check(bufinfo.buf, bufinfo.len)) {
+                    mp_raise_msg(&mp_type_UnicodeError, NULL);
+                }
+                #endif
                 return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
             }
     }
author	tll <1040424979@qq.com>	2017-06-24 08:38:32 +0800
committer	Damien George <damien.p.george@gmail.com>	2017-09-06 16:43:09 +1000
commit	68c28174d0e0ec3f6b1461aea3a0b6a1b84610bb (patch)
tree	441a42ce59c5f965b66722bd6a5a5b24525c6bcf /py/objstr.c
parent	069fc48bf60b31fca4339d26cee7b4a415b185f9 (diff)