-
-
Notifications
You must be signed in to change notification settings - Fork 8.2k
py: Implement partial PEP-498 (f-string) support #4998
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,6 +62,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) { | |
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3; | ||
} | ||
|
||
// Return true when the lexer's current character (chr0) equals any one of the
// four candidate bytes c1, c2, c3 or c4.
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
    if (lex->chr0 == c1 || lex->chr0 == c2) {
        return true;
    }
    return lex->chr0 == c3 || lex->chr0 == c4;
}
|
||
// Return true when the first lookahead character (chr1, the one right after
// the current character) equals c.
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    const bool next_matches = (lex->chr1 == c);
    return next_matches;
}
|
@@ -105,7 +109,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) { | |
|
||
// Return true when the lexer is positioned at the start of a string or bytes
// literal. Three shapes are accepted:
//   - a quote character (' or ") as the current character;
//   - a single prefix letter (r, u, b or f) with a quote as the next character;
//   - a two-letter prefix pair (r/f or r/b, either order) with a quote tested
//     by is_char_following_following_or.
// The earlier three-letter check (r, u, b) is omitted because the four-letter
// check below already covers every case it matched.
// NOTE(review): the 'f' prefix is recognised here unconditionally, while the
// f-string handling elsewhere is guarded by MICROPY_PY_FSTRING -- confirm this
// is intended when the feature is disabled.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    return is_char_or(lex, '\'', '\"')
           || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
           || ((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
               && is_char_following_following_or(lex, '\'', '\"'))
           || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
               && is_char_following_following_or(lex, '\'', '\"'));
}
|
@@ -119,6 +125,29 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) { | |
return is_head_of_identifier(lex) || is_digit(lex); | ||
} | ||
|
||
STATIC void swap_char_banks(mp_lexer_t *lex) { | ||
if (lex->vstr_postfix_processing) { | ||
lex->chr3 = lex->chr0; | ||
lex->chr4 = lex->chr1; | ||
lex->chr5 = lex->chr2; | ||
lex->chr0 = lex->vstr_postfix.buf[0]; | ||
lex->chr1 = lex->vstr_postfix.buf[1]; | ||
lex->chr2 = lex->vstr_postfix.buf[2]; | ||
|
||
lex->vstr_postfix_idx = 3; | ||
} else { | ||
// blindly reset to the "backup" bank when done postfix processing | ||
// this restores control to the mp_reader | ||
lex->chr0 = lex->chr3; | ||
lex->chr1 = lex->chr4; | ||
lex->chr2 = lex->chr5; | ||
// willfully ignoring setting chr3-5 here - WARNING consider those garbage data now | ||
|
||
vstr_reset(&lex->vstr_postfix); | ||
lex->vstr_postfix_idx = 0; | ||
Review comment: Does `vstr_postfix_idx` need to be reset? Does the vstr need to be reset?
Reply: Given the current implementation: I think since I blindly write [inline code lost in extraction; presumably the `.format(` prefix, which is only added when `vstr_postfix.len == 0`] [text lost in extraction]. If you're implying it'd be lighter on code size/CPU cycles/memory usage to just maintain a constantly growing [inline code lost in extraction; presumably `vstr_postfix`] [remainder of this reply lost in extraction].
Reply: The reset is cheap, I just thought it might be better to reset it before use (to be guaranteed it's reset), rather than after use (when you may not use it again). |
||
} | ||
} | ||
|
||
STATIC void next_char(mp_lexer_t *lex) { | ||
if (lex->chr0 == '\n') { | ||
// a new line | ||
|
@@ -134,7 +163,16 @@ STATIC void next_char(mp_lexer_t *lex) { | |
|
||
lex->chr0 = lex->chr1; | ||
lex->chr1 = lex->chr2; | ||
lex->chr2 = lex->reader.readbyte(lex->reader.data); | ||
|
||
if (lex->vstr_postfix_processing) { | ||
if (lex->vstr_postfix_idx == lex->vstr_postfix.len) { | ||
lex->chr2 = '\0'; | ||
} else { | ||
lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++]; | ||
} | ||
} else { | ||
lex->chr2 = lex->reader.readbyte(lex->reader.data); | ||
} | ||
|
||
if (lex->chr1 == '\r') { | ||
// CR is a new line, converted to LF | ||
|
@@ -149,6 +187,11 @@ STATIC void next_char(mp_lexer_t *lex) { | |
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') { | ||
lex->chr2 = '\n'; | ||
} | ||
|
||
if (lex->vstr_postfix_processing && lex->chr0 == '\0') { | ||
lex->vstr_postfix_processing = false; | ||
swap_char_banks(lex); | ||
} | ||
} | ||
|
||
STATIC void indent_push(mp_lexer_t *lex, size_t indent) { | ||
|
@@ -270,7 +313,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) { | |
return true; | ||
} | ||
|
||
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { | ||
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) { | ||
// get first quoting character | ||
char quote_char = '\''; | ||
if (is_char(lex, '\"')) { | ||
|
@@ -291,15 +334,69 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { | |
} | ||
|
||
size_t n_closing = 0; | ||
# if MICROPY_PY_FSTRING | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There shouldn't be a space between the `#` and the `if` (i.e. write `#if MICROPY_PY_FSTRING`, not `# if MICROPY_PY_FSTRING`). |
||
bool in_expression = false; | ||
bool expression_eat = true; | ||
# endif | ||
|
||
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { | ||
if (is_char(lex, quote_char)) { | ||
n_closing += 1; | ||
vstr_add_char(&lex->vstr, CUR_CHAR(lex)); | ||
} else { | ||
n_closing = 0; | ||
|
||
# if MICROPY_PY_FSTRING | ||
if (is_fstring && is_char(lex, '{')) { | ||
vstr_add_char(&lex->vstr, CUR_CHAR(lex)); | ||
in_expression = !in_expression; | ||
expression_eat = in_expression; | ||
|
||
if (lex->vstr_postfix.len == 0) { | ||
vstr_add_str(&lex->vstr_postfix, ".format("); | ||
} | ||
|
||
next_char(lex); | ||
continue; | ||
} | ||
|
||
if (is_fstring && is_char(lex, '}')) { | ||
vstr_add_char(&lex->vstr, CUR_CHAR(lex)); | ||
|
||
if (in_expression) { | ||
in_expression = false; | ||
vstr_add_char(&lex->vstr_postfix, ','); | ||
} | ||
|
||
next_char(lex); | ||
continue; | ||
} | ||
|
||
if (in_expression) { | ||
// throw errors for illegal chars inside f-string expressions | ||
if (is_char(lex, '#') || is_char(lex, '\\')) { | ||
lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING; | ||
return; | ||
} else if (is_char(lex, ':')) { | ||
expression_eat = false; | ||
} | ||
|
||
unichar c = CUR_CHAR(lex); | ||
if (expression_eat) { | ||
vstr_add_char(&lex->vstr_postfix, c); | ||
} else { | ||
vstr_add_char(&lex->vstr, c); | ||
} | ||
|
||
next_char(lex); | ||
continue; | ||
} | ||
# endif | ||
|
||
if (is_char(lex, '\\')) { | ||
next_char(lex); | ||
unichar c = CUR_CHAR(lex); | ||
|
||
if (is_raw) { | ||
// raw strings allow escaping of quotes, but the backslash is also emitted | ||
vstr_add_char(&lex->vstr, '\\'); | ||
|
@@ -448,6 +545,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { | |
} | ||
|
||
void mp_lexer_to_next(mp_lexer_t *lex) { | ||
if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) { | ||
// end format call injection | ||
vstr_add_char(&lex->vstr_postfix, ')'); | ||
lex->vstr_postfix_processing = true; | ||
swap_char_banks(lex); | ||
} | ||
|
||
// start new token text | ||
vstr_reset(&lex->vstr); | ||
|
||
|
@@ -503,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) { | |
do { | ||
// parse type codes | ||
bool is_raw = false; | ||
bool is_fstring = false; | ||
mp_token_kind_t kind = MP_TOKEN_STRING; | ||
int n_char = 0; | ||
if (is_char(lex, 'u')) { | ||
|
@@ -521,7 +626,23 @@ void mp_lexer_to_next(mp_lexer_t *lex) { | |
kind = MP_TOKEN_BYTES; | ||
n_char = 2; | ||
} | ||
# if MICROPY_PY_FSTRING | ||
if (is_char_following(lex, 'f')) { | ||
lex->tok_kind = MP_TOKEN_FSTRING_RAW; | ||
break; | ||
} | ||
# endif | ||
} | ||
# if MICROPY_PY_FSTRING | ||
else if (is_char(lex, 'f')) { | ||
if (is_char_following(lex, 'r')) { | ||
lex->tok_kind = MP_TOKEN_FSTRING_RAW; | ||
break; | ||
} | ||
n_char = 1; | ||
is_fstring = true; | ||
} | ||
# endif | ||
|
||
// Set or check token kind | ||
if (lex->tok_kind == MP_TOKEN_END) { | ||
|
@@ -540,13 +661,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) { | |
} | ||
|
||
// Parse the literal | ||
parse_string_literal(lex, is_raw); | ||
parse_string_literal(lex, is_raw, is_fstring); | ||
|
||
// Skip whitespace so we can check if there's another string following | ||
skip_whitespace(lex, true); | ||
|
||
} while (is_string_or_bytes(lex)); | ||
|
||
} else if (is_head_of_identifier(lex)) { | ||
lex->tok_kind = MP_TOKEN_NAME; | ||
|
||
|
@@ -700,6 +820,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) { | |
lex->num_indent_level = 1; | ||
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level); | ||
vstr_init(&lex->vstr, 32); | ||
vstr_init(&lex->vstr_postfix, 0); | ||
|
||
// store sentinel for first indentation level | ||
lex->indent_level[0] = 0; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t { | |
MP_TOKEN_INVALID, | ||
MP_TOKEN_DEDENT_MISMATCH, | ||
MP_TOKEN_LONELY_STRING_OPEN, | ||
# if MICROPY_PY_FSTRING | ||
MP_TOKEN_MALFORMED_FSTRING, | ||
MP_TOKEN_FSTRING_RAW, | ||
# endif | ||
|
||
MP_TOKEN_NEWLINE, | ||
MP_TOKEN_INDENT, | ||
|
@@ -157,6 +161,7 @@ typedef struct _mp_lexer_t { | |
mp_reader_t reader; // stream source | ||
|
||
unichar chr0, chr1, chr2; // current cached characters from source | ||
unichar chr3, chr4, chr5; // current cached characters from alt source | ||
|
||
size_t line; // current source line | ||
size_t column; // current source column | ||
|
@@ -172,6 +177,9 @@ typedef struct _mp_lexer_t { | |
size_t tok_column; // token source column | ||
mp_token_kind_t tok_kind; // token kind | ||
vstr_t vstr; // token data | ||
vstr_t vstr_postfix; // postfix to apply to string | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably should guard these additions in `#if MICROPY_PY_FSTRING` ... `#endif`. |
||
bool vstr_postfix_processing; | ||
uint16_t vstr_postfix_idx; | ||
} mp_lexer_t; | ||
|
||
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since the default is disabled this line doesn't need to be here