aboutsummaryrefslogtreecommitdiff
path: root/py/lexer.c
diff options
context:
space:
mode:
authorDamien George <damien.p.george@gmail.com>2017-02-17 12:12:40 +1100
committerDamien George <damien.p.george@gmail.com>2017-02-17 12:12:40 +1100
commit534b7c368dc2af7720f3aaed0c936ef46d773957 (patch)
treee1b71ef80c0b883728a8871fe277da33b4889c14 /py/lexer.c
parent773278ec3030ea9ed809c5a248fde2278ce4b557 (diff)
py: Do adjacent str/bytes literal concatenation in lexer, not compiler.
It's much more efficient in RAM and code size to do implicit literal string concatenation in the lexer, as opposed to the compiler. RAM usage is reduced because the concatenation can be done right away in the tokeniser by just accumulating the string/bytes literals into the lexer's vstr. Prior to this patch adjacent strings/bytes would create a parse tree (one node per string/bytes) and then in the compiler a whole new chunk of memory was allocated to store the concatenated string, which used more than double the memory compared to just accumulating in the lexer. This patch also significantly reduces code size: bare-arm: -204 minimal: -204 unix x64: -328 stmhal: -208 esp8266: -284 cc3200: -224
Diffstat (limited to 'py/lexer.c')
-rw-r--r--py/lexer.c355
1 file changed, 198 insertions, 157 deletions
diff --git a/py/lexer.c b/py/lexer.c
index ad4fe3fcb..329875ab0 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -63,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
-/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
-*/
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
return lex->chr1 == c1 || lex->chr1 == c2;
@@ -106,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
+STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
+ return is_char_or(lex, '\'', '\"')
+ || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+ || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
+ && is_char_following_following_or(lex, '\'', '\"'));
+}
+
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
@@ -272,14 +277,144 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
return true;
}
-void mp_lexer_to_next(mp_lexer_t *lex) {
- // start new token text
- vstr_reset(&lex->vstr);
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+ // get first quoting character
+ char quote_char = '\'';
+ if (is_char(lex, '\"')) {
+ quote_char = '\"';
+ }
+ next_char(lex);
- // skip white space and comments
+ // work out if it's a single or triple quoted literal
+ size_t num_quotes;
+ if (is_char_and(lex, quote_char, quote_char)) {
+ // triple quotes
+ next_char(lex);
+ next_char(lex);
+ num_quotes = 3;
+ } else {
+ // single quotes
+ num_quotes = 1;
+ }
+
+ size_t n_closing = 0;
+ while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
+ if (is_char(lex, quote_char)) {
+ n_closing += 1;
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+ } else {
+ n_closing = 0;
+ if (is_char(lex, '\\')) {
+ next_char(lex);
+ unichar c = CUR_CHAR(lex);
+ if (is_raw) {
+ // raw strings allow escaping of quotes, but the backslash is also emitted
+ vstr_add_char(&lex->vstr, '\\');
+ } else {
+ switch (c) {
+ // note: "c" can never be MP_LEXER_EOF because next_char
+ // always inserts a newline at the end of the input stream
+ case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
+ case '\\': break;
+ case '\'': break;
+ case '"': break;
+ case 'a': c = 0x07; break;
+ case 'b': c = 0x08; break;
+ case 't': c = 0x09; break;
+ case 'n': c = 0x0a; break;
+ case 'v': c = 0x0b; break;
+ case 'f': c = 0x0c; break;
+ case 'r': c = 0x0d; break;
+ case 'u':
+ case 'U':
+ if (lex->tok_kind == MP_TOKEN_BYTES) {
+ // b'\u1234' == b'\\u1234'
+ vstr_add_char(&lex->vstr, '\\');
+ break;
+ }
+ // Otherwise fall through.
+ case 'x':
+ {
+ mp_uint_t num = 0;
+ if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
+ // not enough hex chars for escape sequence
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ c = num;
+ break;
+ }
+ case 'N':
+ // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
+ // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
+ // 3MB of text; even gzip-compressed and with minimal structure, it'll take
+ // roughly half a meg of storage. This form of Unicode escape may be added
+ // later on, but it's definitely not a priority right now. -- CJA 20140607
+ mp_not_implemented("unicode name escapes");
+ break;
+ default:
+ if (c >= '0' && c <= '7') {
+ // Octal sequence, 1-3 chars
+ mp_uint_t digits = 3;
+ mp_uint_t num = c - '0';
+ while (is_following_odigit(lex) && --digits != 0) {
+ next_char(lex);
+ num = num * 8 + (CUR_CHAR(lex) - '0');
+ }
+ c = num;
+ } else {
+ // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
+ vstr_add_char(&lex->vstr, '\\');
+ }
+ break;
+ }
+ }
+ if (c != MP_LEXER_EOF) {
+ if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
+ if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
+ vstr_add_char(&lex->vstr, c);
+ } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
+ vstr_add_byte(&lex->vstr, c);
+ } else {
+ // unicode character out of range
+ // this raises a generic SyntaxError; could provide more info
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ } else {
+ // without unicode everything is just added as an 8-bit byte
+ if (c < 0x100) {
+ vstr_add_byte(&lex->vstr, c);
+ } else {
+ // 8-bit character out of range
+ // this raises a generic SyntaxError; could provide more info
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ }
+ }
+ } else {
+ // Add the "character" as a byte so that we remain 8-bit clean.
+ // This way, strings are parsed correctly whether or not they contain utf-8 chars.
+ vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
+ }
+ }
+ next_char(lex);
+ }
+
+ // check we got the required end quotes
+ if (n_closing < num_quotes) {
+ lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
+ }
+
+ // cut off the end quotes from the token text
+ vstr_cut_tail_bytes(&lex->vstr, n_closing);
+}
+
+STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
+ if (stop_at_newline && lex->nested_bracket_level == 0) {
+ break;
+ }
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
@@ -298,6 +433,15 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
break;
}
}
+ return had_physical_newline;
+}
+
+void mp_lexer_to_next(mp_lexer_t *lex) {
+ // start new token text
+ vstr_reset(&lex->vstr);
+
+ // skip white space and comments
+ bool had_physical_newline = skip_whitespace(lex, false);
// set token source information
lex->tok_line = lex->line;
@@ -332,168 +476,65 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
- } else if (is_char_or(lex, '\'', '\"')
- || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
- || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
+ } else if (is_string_or_bytes(lex)) {
// a string or bytes literal
- // parse type codes
- bool is_raw = false;
- bool is_bytes = false;
- if (is_char(lex, 'u')) {
- next_char(lex);
- } else if (is_char(lex, 'b')) {
- is_bytes = true;
- next_char(lex);
- if (is_char(lex, 'r')) {
- is_raw = true;
- next_char(lex);
- }
- } else if (is_char(lex, 'r')) {
- is_raw = true;
- next_char(lex);
- if (is_char(lex, 'b')) {
- is_bytes = true;
- next_char(lex);
- }
- }
+ // Python requires adjacent string/bytes literals to be automatically
+ // concatenated. We do it here in the tokeniser to make efficient use of RAM,
+ // because then the lexer's vstr can be used to accumulate the string literal,
+ // in contrast to creating a parse tree of strings and then joining them later
+ // in the compiler. It's also more compact in code size to do it here.
- // set token kind
- if (is_bytes) {
- lex->tok_kind = MP_TOKEN_BYTES;
- } else {
- lex->tok_kind = MP_TOKEN_STRING;
- }
+ // MP_TOKEN_END is used to indicate that this is the first string token
+ lex->tok_kind = MP_TOKEN_END;
- // get first quoting character
- char quote_char = '\'';
- if (is_char(lex, '\"')) {
- quote_char = '\"';
- }
- next_char(lex);
+ // Loop to accumulate string/bytes literals
+ do {
+ // parse type codes
+ bool is_raw = false;
+ mp_token_kind_t kind = MP_TOKEN_STRING;
+ int n_char = 0;
+ if (is_char(lex, 'u')) {
+ n_char = 1;
+ } else if (is_char(lex, 'b')) {
+ kind = MP_TOKEN_BYTES;
+ n_char = 1;
+ if (is_char_following(lex, 'r')) {
+ is_raw = true;
+ n_char = 2;
+ }
+ } else if (is_char(lex, 'r')) {
+ is_raw = true;
+ n_char = 1;
+ if (is_char_following(lex, 'b')) {
+ kind = MP_TOKEN_BYTES;
+ n_char = 2;
+ }
+ }
- // work out if it's a single or triple quoted literal
- mp_uint_t num_quotes;
- if (is_char_and(lex, quote_char, quote_char)) {
- // triple quotes
- next_char(lex);
- next_char(lex);
- num_quotes = 3;
- } else {
- // single quotes
- num_quotes = 1;
- }
+ // Set or check token kind
+ if (lex->tok_kind == MP_TOKEN_END) {
+ lex->tok_kind = kind;
+ } else if (lex->tok_kind != kind) {
+ // Can't concatenate string with bytes
+ break;
+ }
- // parse the literal
- mp_uint_t n_closing = 0;
- while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
- if (is_char(lex, quote_char)) {
- n_closing += 1;
- vstr_add_char(&lex->vstr, CUR_CHAR(lex));
- } else {
- n_closing = 0;
- if (is_char(lex, '\\')) {
+ // Skip any type code characters
+ if (n_char != 0) {
+ next_char(lex);
+ if (n_char == 2) {
next_char(lex);
- unichar c = CUR_CHAR(lex);
- if (is_raw) {
- // raw strings allow escaping of quotes, but the backslash is also emitted
- vstr_add_char(&lex->vstr, '\\');
- } else {
- switch (c) {
- // note: "c" can never be MP_LEXER_EOF because next_char
- // always inserts a newline at the end of the input stream
- case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
- case '\\': break;
- case '\'': break;
- case '"': break;
- case 'a': c = 0x07; break;
- case 'b': c = 0x08; break;
- case 't': c = 0x09; break;
- case 'n': c = 0x0a; break;
- case 'v': c = 0x0b; break;
- case 'f': c = 0x0c; break;
- case 'r': c = 0x0d; break;
- case 'u':
- case 'U':
- if (is_bytes) {
- // b'\u1234' == b'\\u1234'
- vstr_add_char(&lex->vstr, '\\');
- break;
- }
- // Otherwise fall through.
- case 'x':
- {
- mp_uint_t num = 0;
- if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
- // not enough hex chars for escape sequence
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- c = num;
- break;
- }
- case 'N':
- // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
- // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
- // 3MB of text; even gzip-compressed and with minimal structure, it'll take
- // roughly half a meg of storage. This form of Unicode escape may be added
- // later on, but it's definitely not a priority right now. -- CJA 20140607
- mp_not_implemented("unicode name escapes");
- break;
- default:
- if (c >= '0' && c <= '7') {
- // Octal sequence, 1-3 chars
- mp_uint_t digits = 3;
- mp_uint_t num = c - '0';
- while (is_following_odigit(lex) && --digits != 0) {
- next_char(lex);
- num = num * 8 + (CUR_CHAR(lex) - '0');
- }
- c = num;
- } else {
- // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
- vstr_add_char(&lex->vstr, '\\');
- }
- break;
- }
- }
- if (c != MP_LEXER_EOF) {
- if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
- if (c < 0x110000 && !is_bytes) {
- vstr_add_char(&lex->vstr, c);
- } else if (c < 0x100 && is_bytes) {
- vstr_add_byte(&lex->vstr, c);
- } else {
- // unicode character out of range
- // this raises a generic SyntaxError; could provide more info
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- } else {
- // without unicode everything is just added as an 8-bit byte
- if (c < 0x100) {
- vstr_add_byte(&lex->vstr, c);
- } else {
- // 8-bit character out of range
- // this raises a generic SyntaxError; could provide more info
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- }
- }
- } else {
- // Add the "character" as a byte so that we remain 8-bit clean.
- // This way, strings are parsed correctly whether or not they contain utf-8 chars.
- vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
}
}
- next_char(lex);
- }
- // check we got the required end quotes
- if (n_closing < num_quotes) {
- lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
- }
+ // Parse the literal
+ parse_string_literal(lex, is_raw);
+
+ // Skip whitespace so we can check if there's another string following
+ skip_whitespace(lex, true);
- // cut off the end quotes from the token text
- vstr_cut_tail_bytes(&lex->vstr, n_closing);
+ } while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;