8000 py: Implement partial PEP-498 (f-string) support by klardotsh · Pull Request #4998 · micropython/micropython · GitHub
[go: up one dir, main page]

Skip to content

py: Implement partial PEP-498 (f-string) support #4998

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ports/bare-arm/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#define MICROPY_PY_ARRAY (0)
#define MICROPY_PY_ATTRTUPLE (0)
#define MICROPY_PY_COLLECTIONS (0)
#define MICROPY_PY_FSTRING (0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the default is disabled, this line doesn't need to be here.

#define MICROPY_PY_MATH (0)
#define MICROPY_PY_CMATH (0)
#define MICROPY_PY_IO (0)
Expand Down
1 change: 1 addition & 0 deletions ports/unix/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
#define MICROPY_PY_SYS_EXC_INFO (1)
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
#define MICROPY_PY_FSTRING (1)
#ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
#endif
Expand Down
1 change: 1 addition & 0 deletions ports/windows/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
#define MICROPY_PY_SYS_EXC_INFO (1)
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
#define MICROPY_PY_FSTRING (1)
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
#define MICROPY_PY_MATH_ISCLOSE (1)
#define MICROPY_PY_CMATH (1)
Expand Down
131 changes: 126 additions & 5 deletions py/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

// Return true when the lexer's current character (chr0) equals any one of the
// four candidates c1..c4.
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
    if (lex->chr0 == c1 || lex->chr0 == c2) {
        return true;
    }
    return lex->chr0 == c3 || lex->chr0 == c4;
}

// Return true when the first look-ahead character (chr1, the one that becomes
// current after the next advance) equals c.
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    return c == lex->chr1;
}
Expand Down Expand Up @@ -105,7 +109,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {

// Return true when the lexer is positioned at the start of a string or bytes
// literal. Accepted openings are:
//   - a bare quote (' or ");
//   - a one-letter prefix followed by a quote: r, u, b, plus f when
//     MICROPY_PY_FSTRING is enabled;
//   - a two-letter prefix followed by a quote: rb/br, plus rf/fr when
//     MICROPY_PY_FSTRING is enabled.
// The f-related prefixes are only accepted when f-string support is compiled
// in, because the code that consumes the 'f' prefix is itself guarded by
// MICROPY_PY_FSTRING; accepting them unconditionally would hand the string
// parser a literal whose prefix was never skipped.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }

    // MICROPY_PY_FSTRING is a compile-time 0/1 constant; using it as an
    // expression keeps both helpers referenced in either configuration.
    bool one_letter_prefix = MICROPY_PY_FSTRING
        ? is_char_or4(lex, 'r', 'u', 'b', 'f')
        : is_char_or3(lex, 'r', 'u', 'b');
    if (one_letter_prefix && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }

    bool two_letter_prefix = is_char_and(lex, 'r', 'b')
        || is_char_and(lex, 'b', 'r')
        || (MICROPY_PY_FSTRING
            && (is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r')));
    return two_letter_prefix && is_char_following_following_or(lex, '\'', '\"');
}
Expand All @@ -119,6 +125,29 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
return is_head_of_identifier(lex) || is_digit(lex);
}

// Switch the lexer's three-character look-ahead window (chr0..chr2) between
// the two character sources, depending on lex->vstr_postfix_processing:
//   - true:  stash chr0..chr2 in chr3..chr5 and load the window from the start
//            of vstr_postfix, leaving vstr_postfix_idx at the next unread byte;
//   - false: restore chr0..chr2 from chr3..chr5 and clear vstr_postfix.
// The caller sets vstr_postfix_processing before calling this function.
STATIC void swap_char_banks(mp_lexer_t *lex) {
if (lex->vstr_postfix_processing) {
// save the window that was read from the reader
lex->chr3 = lex->chr0;
lex->chr4 = lex->chr1;
lex->chr5 = lex->chr2;
// NOTE(review): reads buf[0..2] without checking vstr_postfix.len; the
// visible callers only get here after ".format(" and ")" were appended,
// so len should be >= 3 - confirm no other path reaches this branch
lex->chr0 = lex->vstr_postfix.buf[0];
lex->chr1 = lex->vstr_postfix.buf[1];
lex->chr2 = lex->vstr_postfix.buf[2];

// the first three bytes are now in the window; continue reading from index 3
lex->vstr_postfix_idx = 3;
} else {
// blindly reset to the "backup" bank when done postfix processing
// this restores control to the mp_reader
lex->chr0 = lex->chr3;
lex->chr1 = lex->chr4;
lex->chr2 = lex->chr5;
// willfully ignoring setting chr3-5 here - WARNING consider those garbage data now

// empty the postfix buffer and rewind its read index
vstr_reset(&lex->vstr_postfix);
lex->vstr_postfix_idx = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does idx need to be reset? Do the vstr need to be reset?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the current implementation: I think since I blindly write .format( and the postfix translations to vstr_postfix at least the string should be reset/cleared. vstr_postfix_idx needs reset somewhere, though there may well be better places to do it.

If you're implying it'd be lighter on code size/CPU cycles/memory usage to just maintain a constantly growing vstr that starts to look roughly like .format(x).format(y).format(z), with an ever-increasing vstr_postfix_idx, I'm totally open to trying that. I'm not sure how expensive the vstr_resets are

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reset is cheap, I just thought it might be better to reset it before use (to be guaranteed it's reset), rather than after use (when you may not use it again).

}
}

STATIC void next_char(mp_lexer_t *lex) {
if (lex->chr0 == '\n') {
// a new line
Expand All @@ -134,7 +163,16 @@ STATIC void next_char(mp_lexer_t *lex) {

lex->chr0 = lex->chr1;
lex->chr1 = lex->chr2;
lex->chr2 = lex->reader.readbyte(lex->reader.data);

if (lex->vstr_postfix_processing) {
if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
lex->chr2 = '\0';
} else {
lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
}
} else {
lex->chr2 = lex->reader.readbyte(lex->reader.data);
}

if (lex->chr1 == '\r') {
// CR is a new line, converted to LF
Expand All @@ -149,6 +187,11 @@ STATIC void next_char(mp_lexer_t *lex) {
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
lex->chr2 = '\n';
}

if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
lex->vstr_postfix_processing = false;
swap_char_banks(lex);
}
}

STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
Expand Down Expand Up @@ -270,7 +313,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
return true;
}

STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
// get first quoting character
char quote_char = '\'';
if (is_char(lex, '\"')) {
Expand All @@ -291,15 +334,69 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
}

size_t n_closing = 0;
# if MICROPY_PY_FSTRING
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There shouldn't be a space between the # and the if (the same applies to the endif below, and to the other directives further down).

bool in_expression = false;
bool expression_eat = true;
# endif

while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;

# if MICROPY_PY_FSTRING
if (is_fstring && is_char(lex, '{')) {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
in_expression = !in_expression;
expression_eat = in_expression;

if (lex->vstr_postfix.len == 0) {
vstr_add_str(&lex->vstr_postfix, ".format(");
}

next_char(lex);
continue;
}

if (is_fstring && is_char(lex, '}')) {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));

if (in_expression) {
in_expression = false;
vstr_add_char(&lex->vstr_postfix, ',');
}

next_char(lex);
continue;
}

if (in_expression) {
// throw errors for illegal chars inside f-string expressions
if (is_char(lex, '#') || is_char(lex, '\\')) {
lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING;
return;
} else if (is_char(lex, ':')) {
expression_eat = false;
}

unichar c = CUR_CHAR(lex);
if (expression_eat) {
vstr_add_char(&lex->vstr_postfix, c);
} else {
vstr_add_char(&lex->vstr, c);
}

next_char(lex);
continue;
}
# endif

if (is_char(lex, '\\')) {
next_char(lex);
unichar c = CUR_CHAR(lex);

if (is_raw) {
// raw strings allow escaping of quotes, but the backslash is also emitted
vstr_add_char(&lex->vstr, '\\');
Expand Down Expand Up @@ -448,6 +545,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
}

void mp_lexer_to_next(mp_lexer_t *lex) {
if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
// end format call injection
vstr_add_char(&lex->vstr_postfix, ')');
lex->vstr_postfix_processing = true;
swap_char_banks(lex);
}

// start new token text
vstr_reset(&lex->vstr);

Expand Down Expand Up @@ -503,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
do {
// parse type codes
bool is_raw = false;
bool is_fstring = false;
mp_token_kind_t kind = MP_TOKEN_STRING;
int n_char = 0;
if (is_char(lex, 'u')) {
Expand All @@ -521,7 +626,23 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
kind = MP_TOKEN_BYTES;
n_char = 2;
}
# if MICROPY_PY_FSTRING
if (is_char_following(lex, 'f')) {
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
break;
}
# endif
}
# if MICROPY_PY_FSTRING
else if (is_char(lex, 'f')) {
if (is_char_following(lex, 'r')) {
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
break;
}
n_char = 1;
is_fstring = true;
}
# endif

// Set or check token kind
if (lex->tok_kind == MP_TOKEN_END) {
Expand All @@ -540,13 +661,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
}

// Parse the literal
parse_string_literal(lex, is_raw);
parse_string_literal(lex, is_raw, is_fstring);

// Skip whitespace so we can check if there's another string following
skip_whitespace(lex, true);

} while (is_string_or_bytes(lex));

} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;

Expand Down Expand Up @@ -700,6 +820,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
lex->num_indent_level = 1;
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
vstr_init(&lex->vstr, 32);
vstr_init(&lex->vstr_postfix, 0);

// store sentinel for first indentation level
lex->indent_level[0] = 0;
Expand Down
8 changes: 8 additions & 0 deletions py/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_INVALID,
MP_TOKEN_DEDENT_MISMATCH,
MP_TOKEN_LONELY_STRING_OPEN,
# if MICROPY_PY_FSTRING
MP_TOKEN_MALFORMED_FSTRING,
MP_TOKEN_FSTRING_RAW,
# endif

MP_TOKEN_NEWLINE,
MP_TOKEN_INDENT,
Expand Down Expand Up @@ -157,6 +161,7 @@ typedef struct _mp_lexer_t {
mp_reader_t reader; // stream source

unichar chr0, chr1, chr2; // current cached characters from source
unichar chr3, chr4, chr5; // current cached characters from alt source

size_t line; // current source line
size_t column; // current source column
Expand All @@ -172,6 +177,9 @@ typedef struct _mp_lexer_t {
size_t tok_column; // token source column
mp_token_kind_t tok_kind; // token kind
vstr_t vstr; // token data
vstr_t vstr_postfix; // postfix to apply to string
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These additions should probably be guarded by #if MICROPY_PY_FSTRING, because otherwise they'll unnecessarily use up RAM. All code that uses these new members then also needs to be guarded (hopefully that's not too messy...)

bool vstr_postfix_processing;
uint16_t vstr_postfix_idx;
} mp_lexer_t;

mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);
Expand Down
6 changes: 6 additions & 0 deletions py/mpconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -1108,6 +1108,12 @@ typedef double mp_float_t;
#define MICROPY_PY_COLLECTIONS_NAMEDTUPLE__ASDICT (0)
#endif

// Whether to include support for PEP-498 f-strings
#ifndef MICROPY_PY_FSTRING
#define MICROPY_PY_FSTRING (0)
#endif


// Whether to provide "math" module
#ifndef MICROPY_PY_MATH
#define MICROPY_PY_MATH (1)
Expand Down
8 changes: 8 additions & 0 deletions py/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
MP_ERROR_TEXT("unindent doesn't match any outer indent level"));
# if MICROPY_PY_FSTRING
} else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
MP_ERROR_TEXT("malformed f-string"));
} else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
MP_ERROR_TEXT("raw f-strings are not supported"));
# endif
} else {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
MP_ERROR_TEXT("invalid syntax"));
Expand Down
Loading
0