py: Implement partial PEP-498 (f-string) support (v3) by jimmo · Pull Request #7649 · micropython/micropython · GitHub
[go: up one dir, main page]

Skip to content

py: Implement partial PEP-498 (f-string) support (v3) #7649

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mpy-cross/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@
#define MICROPY_PY_IO (0)
#define MICROPY_PY_SYS (0)

#define MICROPY_PY_FSTRING (1)

// type definitions for the specific machine

#ifdef __LP64__
Expand Down
1 change: 1 addition & 0 deletions ports/esp32/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
#define MICROPY_PY_MATH_ISCLOSE (1)
#define MICROPY_PY_CMATH (1)
#define MICROPY_PY_FSTRING (1)
#define MICROPY_PY_GC (1)
#define MICROPY_PY_IO (1)
#define MICROPY_PY_IO_IOBASE (1)
Expand Down
1 change: 1 addition & 0 deletions ports/rp2/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
#define MICROPY_PY_FUNCTION_ATTRS (1)
#define MICROPY_PY_DESCRIPTORS (1)
#define MICROPY_PY_DELATTR_SETATTR (1)
#define MICROPY_PY_FSTRING (1)
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
#define MICROPY_PY_BUILTINS_STR_CENTER (1)
#define MICROPY_PY_BUILTINS_STR_PARTITION (1)
Expand Down
3 changes: 3 additions & 0 deletions ports/stm32/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@
#define MICROPY_PY_MATH_ISCLOSE (1)
#define MICROPY_PY_MATH_FACTORIAL (1)
#define MICROPY_PY_CMATH (1)
#ifndef MICROPY_PY_FSTRING
#define MICROPY_PY_FSTRING (1)
#endif
#define MICROPY_PY_IO (1)
#define MICROPY_PY_IO_IOBASE (1)
#define MICROPY_PY_IO_FILEIO (MICROPY_VFS_FAT || MICROPY_VFS_LFS1 || MICROPY_VFS_LFS2)
Expand Down
1 change: 1 addition & 0 deletions ports/unix/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@
#define MICROPY_PY_SYS_EXC_INFO (1)
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
#define MICROPY_PY_FSTRING (1)
#ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
#endif
Expand Down
1 change: 1 addition & 0 deletions ports/windows/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
#define MICROPY_PY_SYS_EXC_INFO (1)
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
#define MICROPY_PY_FSTRING (1)
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
#define MICROPY_PY_MATH_ISCLOSE (1)
#define MICROPY_PY_CMATH (1)
Expand Down
130 changes: 126 additions & 4 deletions py/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

#if MICROPY_PY_FSTRING
// Return true when the lexer's current character (chr0) equals any one of
// the four candidate bytes c1, c2, c3 or c4.
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
    if (lex->chr0 == c1 || lex->chr0 == c2) {
        return true;
    }
    return lex->chr0 == c3 || lex->chr0 == c4;
}
#endif

STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
Expand Down Expand Up @@ -105,7 +111,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {

// Return true when the lexer is positioned at the start of a string or bytes
// literal.  The checks are tried in this order:
//   1. a bare opening quote (' or ");
//   2. a one-letter prefix followed by a quote: r, u, b, and additionally f
//      when MICROPY_PY_FSTRING is enabled;
//   3. (MICROPY_PY_FSTRING only) the two-letter prefixes rf / fr followed by
//      a quote, so that the string-lexing code gets a chance to see them;
//   4. the two-letter prefixes rb / br followed by a quote.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    // 1. plain quoted literal
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }

    #if MICROPY_PY_FSTRING
    // 2. single prefix letter, including 'f'
    if (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }
    // 3. raw f-string prefixes
    if ((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
        && is_char_following_following_or(lex, '\'', '\"')) {
        return true;
    }
    #else
    // 2. single prefix letter
    if (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }
    #endif

    // 4. raw bytes prefixes
    if (is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) {
        return is_char_following_following_or(lex, '\'', '\"');
    }
    return false;
}
Expand All @@ -132,9 +144,35 @@ STATIC void next_char(mp_lexer_t *lex) {
++lex->column;
}

// shift the input queue forward
lex->chr0 = lex->chr1;
lex->chr1 = lex->chr2;
lex->chr2 = lex->reader.readbyte(lex->reader.data);

// and add the next byte from either the fstring args or the reader
#if MICROPY_PY_FSTRING
if (lex->fstring_args_idx) {
// if there are saved chars, then we're currently injecting fstring args
if (lex->fstring_args_idx < lex->fstring_args.len) {
lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
} else {
// no more fstring arg bytes
lex->chr2 = '\0';
}

if (lex->chr0 == '\0') {
// consumed all fstring data, restore saved input queue
lex->chr0 = lex->chr0_saved;
lex->chr1 = lex->chr1_saved;
lex->chr2 = lex->chr2_saved;
// stop consuming fstring arg data
vstr_reset(&lex->fstring_args);
lex->fstring_args_idx = 0;
}
} else
#endif
{
lex->chr2 = lex->reader.readbyte(lex->reader.data);
}

if (lex->chr1 == '\r') {
// CR is a new line, converted to LF
Expand Down Expand Up @@ -272,7 +310,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
return true;
}

STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
// get first quoting character
char quote_char = '\'';
if (is_char(lex, '\"')) {
Expand All @@ -293,15 +331,61 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
}

size_t n_closing = 0;
#if MICROPY_PY_FSTRING
if (is_fstring) {
// assume there's going to be interpolation, so prep the injection data
// fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
// only when fstring_args_idx>0 will we consume the arg data
// note: lex->fstring_args will be empty already (it's reset when finished)
vstr_add_str(&lex->fstring_args, ".format(");
}
#endif

while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;

#if MICROPY_PY_FSTRING
while (is_fstring && is_char(lex, '{')) {
next_char(lex);
if (is_char(lex, '{')) {
// "{{" is passed through unchanged to be handled by str.format
vstr_add_byte(&lex->vstr, '{');
next_char(lex);
} else {
// remember the start of this argument (if we need it for f'{a=}').
size_t i = lex->fstring_args.len;
// extract characters inside the { until we reach the
// format specifier or closing }.
// (MicroPython limitation) note: this is completely unaware of
// Python syntax and will not handle any expression containing '}' or ':'.
// e.g. f'{"}"}' or f'{foo({})}'.
while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
// like the default case at the end of this function, stay 8-bit clean
vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
next_char(lex);
}
if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
// if the last character of the arg was '=', then inject "arg=" before the '{'.
// f'{a=}' --> 'a={}'.format(a)
vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
// remove the trailing '='
lex->fstring_args.len--;
}
// comma-separate args
vstr_add_byte(&lex->fstring_args, ',');
}
vstr_add_byte(&lex->vstr, '{');
}
#endif

if (is_char(lex, '\\')) {
next_char(lex);
unichar c = CUR_CHAR(lex);

if (is_raw) {
// raw strings allow escaping of quotes, but the backslash is also emitted
vstr_add_char(&lex->vstr, '\\');
Expand Down Expand Up @@ -451,6 +535,23 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
}

void mp_lexer_to_next(mp_lexer_t *lex) {
#if MICROPY_PY_FSTRING
if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
// moving onto the next token means the literal string is complete.
// switch into injecting the format args.
vstr_add_byte(&lex->fstring_args, ')');
lex->chr0_saved = lex->chr0;
lex->chr1_saved = lex->chr1;
lex->chr2_saved = lex->chr2;
lex->chr0 = lex->fstring_args.buf[0];
lex->chr1 = lex->fstring_args.buf[1];
lex->chr2 = lex->fstring_args.buf[2];
// we've already extracted 3 chars, but setting this non-zero also
// means we'll start consuming the fstring data
lex->fstring_args_idx = 3;
}
#endif

// start new token text
vstr_reset(&lex->vstr);

Expand Down Expand Up @@ -506,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
do {
// parse type codes
bool is_raw = false;
bool is_fstring = false;
mp_token_kind_t kind = MP_TOKEN_STRING;
int n_char = 0;
if (is_char(lex, 'u')) {
Expand All @@ -524,7 +626,25 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
kind = MP_TOKEN_BYTES;
n_char = 2;
}
#if MICROPY_PY_FSTRING
if (is_char_following(lex, 'f')) {
// raw-f-strings unsupported, immediately return (invalid) token.
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
break;
}
#endif
}
#if MICROPY_PY_FSTRING
else if (is_char(lex, 'f')) {
if (is_char_following(lex, 'r')) {
// raw-f-strings unsupported, immediately return (invalid) token.
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
break;
}
n_char = 1;
is_fstring = true;
}
#endif

// Set or check token kind
if (lex->tok_kind == MP_TOKEN_END) {
Expand All @@ -543,13 +663,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
}

// Parse the literal
parse_string_literal(lex, is_raw);
parse_string_literal(lex, is_raw, is_fstring);

// Skip whitespace so we can check if there's another string following
skip_whitespace(lex, true);

} while (is_string_or_bytes(lex));

} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;

Expand Down Expand Up @@ -703,6 +822,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
lex->num_indent_level = 1;
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
vstr_init(&lex->vstr, 32);
#if MICROPY_PY_FSTRING
vstr_init(&lex->fstring_args, 0);
#endif

// store sentinel for first indentation level
lex->indent_level[0] = 0;
Expand Down
11 changes: 11 additions & 0 deletions py/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_INVALID,
MP_TOKEN_DEDENT_MISMATCH,
MP_TOKEN_LONELY_STRING_OPEN,
#if MICROPY_PY_FSTRING
MP_TOKEN_MALFORMED_FSTRING,
MP_TOKEN_FSTRING_RAW,
#endif

MP_TOKEN_NEWLINE,
MP_TOKEN_INDENT,
Expand Down Expand Up @@ -158,6 +162,9 @@ typedef struct _mp_lexer_t {
mp_reader_t reader; // stream source

unichar chr0, chr1, chr2; // current cached characters from source
#if MICROPY_PY_FSTRING
unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
#endif

size_t line; // current source line
size_t column; // current source column
Expand All @@ -173,6 +180,10 @@ typedef struct _mp_lexer_t {
size_t tok_column; // token source column
mp_token_kind_t tok_kind; // token kind
vstr_t vstr; // token data
#if MICROPY_PY_FSTRING
vstr_t fstring_args; // extracted arguments to pass to .format()
uint16_t fstring_args_idx; // how many bytes of fstring_args have been read
#endif
} mp_lexer_t;

mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);
Expand Down
6 changes: 6 additions & 0 deletions py/mpconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -1153,6 +1153,12 @@ typedef double mp_float_t;
#define MICROPY_PY_COLLECTIONS_NAMEDTUPLE__ASDICT (0)
#endif

// Whether to include support for PEP-498 f-strings
#ifndef MICROPY_PY_FSTRING
#define MICROPY_PY_FSTRING (0)
#endif


// Whether to provide "math" module
#ifndef MICROPY_PY_MATH
#define MICROPY_PY_MATH (1)
Expand Down
8 changes: 8 additions & 0 deletions py/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
MP_ERROR_TEXT("unindent doesn't match any outer indent level"));
#if MICROPY_PY_FSTRING
} else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
MP_ERROR_TEXT("malformed f-string"));
} else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
MP_ERROR_TEXT("raw f-strings are not supported"));
#endif
} else {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
MP_ERROR_TEXT("invalid syntax"));
Expand Down
57 changes: 57 additions & 0 deletions tests/basics/string_fstring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Test script for f-string literals: plain literals, replacement fields with
# format specs and the '=' form, arbitrary expressions, brace escaping, and
# malformed inputs.  Each case prints its result.

# Helper functions that are called from inside replacement fields below.
# g() accepts (and ignores) one argument so that a call containing a string
# literal can appear inside an f-string expression.
def f():
return 4
def g(_):
return 5
def h():
return 6

# f-strings without any replacement field, using each quoting style.
print(f'no interpolation')
print(f"no interpolation")
print(f"""no interpolation""")

# Simple names: bare, with a format spec, with the '=' form, and with both;
# first as the whole string, then embedded in surrounding text alongside a
# second field.
x, y = 1, 2
print(f'{x}')
print(f'{x:08x}')
print(f'{x=}')
print(f'{x=:08x}')
print(f'a {x} b {y} c')
print(f'a {x:08x} b {y} c')
print(f'a {x=} b {y} c')
print(f'a {x=:08x} b {y} c')

# Expressions inside the braces: a string literal, and a sum of function
# calls (again bare, with '=', and with '=' plus a format spec).
print(f'a {"hello"} b')
print(f'a {f() + g("foo") + h()} b')
print(f'a {f() + g("foo") + h()=} b')
print(f'a {f() + g("foo") + h()=:08x} b')

# An f-string inside a function that mixes module-level names (x, y) with
# the function's own arguments (a, b).
def foo(a, b):
return f'{x}{y}{a}{b}'
print(foo(7, 8))

# PEP 498 specifies that a backslash and '#' must be disallowed explicitly
# inside the expression part of an f-string, whereas MicroPython relies on
# the syntax error that results from the substitution.  Outside of a
# replacement field both characters are accepted (the two prints below); the
# eval() calls check that using them inside the braces is a SyntaxError.

print(f"\\")
print(f'#')
try:
eval("f'{\}'")
except SyntaxError:
print('SyntaxError')
try:
eval("f'{#}'")
except SyntaxError:
print('SyntaxError')


# PEP 498 specifies that doubled braces '{{' and '}}' should be handled the
# same way str.format handles them.
print(f'{{}}')
print(f'{{{4*10}}}', '{40}')

# A single (unpaired) closing brace in an f-string should raise a syntax
# error, unlike with str.format.  MicroPython instead raises ValueError at
# runtime from the substitution, so both exception types are accepted here
# and the same text is printed either way.
try:
eval("f'{{}'")
except (ValueError, SyntaxError):
# MicroPython incorrectly raises ValueError here.
print('SyntaxError')
1 change: 1 addition & 0 deletions tests/cmdline/cmd_parsetree.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
e = b"a very long bytes that will not be interned"
f = 123456789012345678901234567890
g = 123
h = f"fstring: '{b}'"
Loading
0