py: Implement partial PEP-498 (f-string) support. · micropython/micropython@6942eb4 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6942eb4

Browse files
committed
py: Implement partial PEP-498 (f-string) support.
This implements (most of) the PEP-498 spec for f-strings and is based on #4998 by @klardotsh. It is implemented in the lexer as a syntax translation to `str.format`: f"{a}" --> "{}".format(a) It also supports: f"{a=}" --> "a={}".format(a) This is done by extracting the arguments into a temporary vstr buffer, then after the string has been tokenized, the lexer input queue is saved and the contents of the temporary vstr buffer are injected into the lexer instead. There are four main limitations: - raw f-strings (`fr` or `rf` prefixes) are not supported and will raise `SyntaxError: raw f-strings are not supported`. - literal concatenation of f-strings with adjacent strings will fail "{}" f"{a}" --> "{}{}".format(a) (str.format will incorrectly use the braces from the non-f-string) f"{a}" f"{a}" --> "{}".format(a) "{}".format(a) (cannot concatenate) - PEP-498 requires the full parser to understand the interpolated argument, however because this entirely runs in the lexer it cannot resolve nested braces in expressions like f"{'}'}" - The !r, !s, and !a conversions are not supported. Includes tests and cpydiffs. Signed-off-by: Jim Mussared <jim.mussared@gmail.com>
1 parent 42d1a16 commit 6942eb4

18 files changed

+298
-22
lines changed

mpy-cross/mpconfigport.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
#define MICROPY_PY_IO (0)
9494
#define MICROPY_PY_SYS (0)
9595

96+
#define MICROPY_PY_FSTRING (1)
97+
9698
// type definitions for the specific machine
9799

98100
#ifdef __LP64__

ports/unix/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@
123123
#define MICROPY_PY_SYS_EXC_INFO (1)
124124
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
125125
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
126+
#define MICROPY_PY_FSTRING (1)
126127
#ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
127128
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
128129
#endif

ports/windows/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
#define MICROPY_PY_SYS_EXC_INFO (1)
9494
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
9595
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
96+
#define MICROPY_PY_FSTRING (1)
9697
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
9798
#define MICROPY_PY_MATH_ISCLOSE (1)
9899
#define MICROPY_PY_CMATH (1)

py/lexer.c

Lines changed: 126 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
6262
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
6363
}
6464

65+
#if MICROPY_PY_FSTRING
66+
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
67+
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
68+
}
69+
#endif
70+
6571
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
6672
return lex->chr1 == c;
6773
}
@@ -105,7 +111,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
105111

106112
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
107113
return is_char_or(lex, '\'', '\"')
114+
#if MICROPY_PY_FSTRING
115+
|| (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
116+
|| (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
117+
&& is_char_following_following_or(lex, '\'', '\"')))
118+
#else
108119
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
120+
#endif
109121
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
110122
&& is_char_following_following_or(lex, '\'', '\"'));
111123
}
@@ -132,9 +144,35 @@ STATIC void next_char(mp_lexer_t *lex) {
132144
++lex->column;
133145
}
134146

147+
// shift the input queue forward
135148
lex->chr0 = lex->chr1;
136149
lex->chr1 = lex->chr2;
137-
lex->chr2 = lex->reader.readbyte(lex->reader.data);
150+
151+
// and add the next byte from either the fstring args or the reader
152+
#if MICROPY_PY_FSTRING
153+
if (lex->fstring_args_idx) {
154+
// if there are saved chars, then we're currently injecting fstring args
155+
if (lex->fstring_args_idx < lex->fstring_args.len) {
156+
lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
157+
} else {
158+
// no more fstring arg bytes
159+
lex->chr2 = '\0';
160+
}
161+
162+
if (lex->chr0 == '\0') {
163+
// consumed all fstring data, restore saved input queue
164+
lex->chr0 = lex->chr0_saved;
165+
lex->chr1 = lex->chr1_saved;
166+
lex->chr2 = lex->chr2_saved;
167+
// stop consuming fstring arg data
168+
vstr_reset(&lex->fstring_args);
169+
lex->fstring_args_idx = 0;
170+
}
171+
} else
172+
#endif
173+
{
174+
lex->chr2 = lex->reader.readbyte(lex->reader.data);
175+
}
138176

139177
if (lex->chr1 == '\r') {
140178
// CR is a new line, converted to LF
@@ -272,7 +310,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
272310
return true;
273311
}
274312

275-
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
313+
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
276314
// get first quoting character
277315
char quote_char = '\'';
278316
if (is_char(lex, '\"')) {
@@ -293,15 +331,61 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
293331
}
294332

295333
size_t n_closing = 0;
334+
#if MICROPY_PY_FSTRING
335+
if (is_fstring) {
336+
// assume there's going to be interpolation, so prep the injection data
337+
// fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
338+
// only when fstring_args_idx>0 will we consume the arg data
339+
// note: lex->fstring_args will be empty already (it's reset when finished)
340+
vstr_add_str(&lex->fstring_args, ".format(");
341+
}
342+
#endif
343+
296344
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
297345
if (is_char(lex, quote_char)) {
298346
n_closing += 1;
299347
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
300348
} else {
301349
n_closing = 0;
350+
351+
#if MICROPY_PY_FSTRING
352+
while (is_fstring && is_char(lex, '{')) {
353+
next_char(lex);
354+
if (is_char(lex, '{')) {
355+
// "{{" is passed through unchanged to be handled by str.format
356+
vstr_add_byte(&lex->vstr, '{');
357+
next_char(lex);
358+
} else {
359+
// remember the start of this argument (if we need it for f'{a=}').
360+
size_t i = lex->fstring_args.len;
361+
// extract characters inside the { until we reach the
362+
// format specifier or closing }.
363+
// (MicroPython limitation) note: this is completely unaware of
364+
// Python syntax and will not handle any expression containing '}' or ':'.
365+
// e.g. f'{"}"}' or f'{foo({})}'.
366+
while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
367+
// like the default case at the end of this function, stay 8-bit clean
368+
vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
369+
next_char(lex);
370+
}
371+
if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
372+
// if the last character of the arg was '=', then inject "arg=" before the '{'.
373+
// f'{a=}' --> 'a={}'.format(a)
374+
vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
375+
// remove the trailing '='
376+
lex->fstring_args.len--;
377+
}
378+
// comma-separate args
379+
vstr_add_byte(&lex->fstring_args, ',');
380+
}
381+
vstr_add_byte(&lex->vstr, '{');
382+
}
383+
#endif
384+
302385
if (is_char(lex, '\\')) {
303386
next_char(lex);
304387
unichar c = CUR_CHAR(lex);
388+
305389
if (is_raw) {
306390
// raw strings allow escaping of quotes, but the backslash is also emitted
307391
vstr_add_char(&lex->vstr, '\\');
@@ -451,6 +535,23 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
451535
}
452536

453537
void mp_lexer_to_next(mp_lexer_t *lex) {
538+
#if MICROPY_PY_FSTRING
539+
if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
540+
// moving onto the next token means the literal string is complete.
541+
// switch into injecting the format args.
542+
vstr_add_byte(&lex->fstring_args, ')');
543+
lex->chr0_saved = lex->chr0;
544+
lex->chr1_saved = lex->chr1;
545+
lex->chr2_saved = lex->chr2;
546+
lex->chr0 = lex->fstring_args.buf[0];
547+
lex->chr1 = lex->fstring_args.buf[1];
548+
lex->chr2 = lex->fstring_args.buf[2];
549+
// we've already extracted 3 chars, but setting this non-zero also
550+
// means we'll start consuming the fstring data
551+
lex->fstring_args_idx = 3;
552+
}
553+
#endif
554+
454555
// start new token text
455556
vstr_reset(&lex->vstr);
456557

@@ -506,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
506607
do {
507608
// parse type codes
508609
bool is_raw = false;
610+
bool is_fstring = false;
509611
mp_token_kind_t kind = MP_TOKEN_STRING;
510612
int n_char = 0;
511613
if (is_char(lex, 'u')) {
@@ -524,7 +626,25 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
524626
kind = MP_TOKEN_BYTES;
525627
n_char = 2;
526628
}
629+
#if MICROPY_PY_FSTRING
630+
if (is_char_following(lex, 'f')) {
631+
// raw-f-strings unsupported, immediately return (invalid) token.
632+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
633+
break;
634+
}
635+
#endif
636+
}
637+
#if MICROPY_PY_FSTRING
638+
else if (is_char(lex, 'f')) {
639+
if (is_char_following(lex, 'r')) {
640+
// raw-f-strings unsupported, immediately return (invalid) token.
641+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
642+
break;
643+
}
644+
n_char = 1;
645+
is_fstring = true;
527646
}
647+
#endif
528648

529649
// Set or check token kind
530650
if (lex->tok_kind == MP_TOKEN_END) {
@@ -543,13 +663,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
543663
}
544664

545665
// Parse the literal
546-
parse_string_literal(lex, is_raw);
666+
parse_string_literal(lex, is_raw, is_fstring);
547667

548668
// Skip whitespace so we can check if there's another string following
549669
skip_whitespace(lex, true);
550670

551671
} while (is_string_or_bytes(lex));
552-
553672
} else if (is_head_of_identifier(lex)) {
554673
lex->tok_kind = MP_TOKEN_NAME;
555674

@@ -703,6 +822,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
703822
lex->num_indent_level = 1;
704823
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
705824
vstr_init(&lex->vstr, 32);
825+
#if MICROPY_PY_FSTRING
826+
vstr_init(&lex->fstring_args, 0);
827+
#endif
706828

707829
// store sentinel for first indentation level
708830
lex->indent_level[0] = 0;

py/lexer.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t {
4444
MP_TOKEN_INVALID,
4545
MP_TOKEN_DEDENT_MISMATCH,
4646
MP_TOKEN_LONELY_STRING_OPEN,
47+
#if MICROPY_PY_FSTRING
48+
MP_TOKEN_MALFORMED_FSTRING,
49+
MP_TOKEN_FSTRING_RAW,
50+
#endif
4751

4852
MP_TOKEN_NEWLINE,
4953
MP_TOKEN_INDENT,
@@ -158,6 +162,9 @@ typedef struct _mp_lexer_t {
158162
mp_reader_t reader; // stream source
159163

160164
unichar chr0, chr1, chr2; // current cached characters from source
165+
#if MICROPY_PY_FSTRING
166+
unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
167+
#endif
161168

162169
size_t line; // current source line
163170
size_t column; // current source column
@@ -173,6 +180,10 @@ typedef struct _mp_lexer_t {
173180
size_t tok_column; // token source column
174181
mp_token_kind_t tok_kind; // token kind
175182
vstr_t vstr; // token data
183+
#if MICROPY_PY_FSTRING
184+
vstr_t fstring_args; // extracted arguments to pass to .format()
185+
uint16_t fstring_args_idx; // how many bytes of fstring_args have been read
186+
#endif
176187
} mp_lexer_t;
177188

178189
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);

py/mpconfig.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,12 @@ typedef double mp_float_t;
11531153
#define MICROPY_PY_COLLECTIONS_NAMEDTUPLE__ASDICT (0)
11541154
#endif
11551155

1156+
// Whether to include support for PEP-498 f-strings
1157+
#ifndef MICROPY_PY_FSTRING
1158+
#define MICROPY_PY_FSTRING (0)
1159+
#endif
1160+
1161+
11561162
// Whether to provide "math" module
11571163
#ifndef MICROPY_PY_MATH
11581164
#define MICROPY_PY_MATH (1)

py/parse.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,6 +1152,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
11521152
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
11531153
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
11541154
MP_ERROR_TEXT("unindent doesn't match any outer indent level"));
1155+
#if MICROPY_PY_FSTRING
1156+
} else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) {
1157+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1158+
MP_ERROR_TEXT("malformed f-string"));
1159+
} else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) {
1160+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1161+
MP_ERROR_TEXT("raw f-strings are not supported"));
1162+
#endif
11551163
} else {
11561164
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
11571165
MP_ERROR_TEXT("invalid syntax"));

tests/basics/string_fstring.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
def f():
2+
return 4
3+
def g(_):
4+
return 5
5+
def h():
6+
return 6
7+
8+
print(f'no interpolation')
9+
print(f"no interpolation")
10+
print(f"""no interpolation""")
11+
12+
x, y = 1, 2
13+
print(f'{x}')
14+
print(f'{x:08x}')
15+
print(f'{x=}')
16+
print(f'{x=:08x}')
17+
print(f'a {x} b {y} c')
18+
print(f'a {x:08x} b {y} c')
19+
print(f'a {x=} b {y} c')
20+
print(f'a {x=:08x} b {y} c')
21+
22+
print(f'a {"hello"} b')
23+
print(f'a {f() + g("foo") + h()} b')
24+
print(f'a {f() + g("foo") + h()=} b')
25+
print(f'a {f() + g("foo") + h()=:08x} b')
26+
27+
def foo(a, b):
28+
return f'{x}{y}{a}{b}'
29+
print(foo(7, 8))
30+
31+
# PEP-0498 specifies that '\\' and '#' must be disallowed explicitly, whereas
32+
# MicroPython relies on the syntax error as a result of the substitution.
33+
34+
print(f"\\")
35+
print(f'#')
36+
try:
37+
eval("f'{\}'")
38+
except SyntaxError:
39+
print('SyntaxError')
40+
try:
41+
eval("f'{#}'")
42+
except SyntaxError:
43+
print('SyntaxError')
44+
45+
46+
# PEP-0498 specifies that handling of double braces '{{' or '}}' should
47+
# behave like str.format.
48+
print(f'{{}}')
49+
print(f'{{{4*10}}}', '{40}')
50+
51+
# A single closing brace, unlike str.format should raise a syntax error.
52+
# MicroPython instead raises ValueError at runtime from the substitution.
53+
try:
54+
eval("f'{{}'")
55+
except (ValueError, SyntaxError):
56+
# MicroPython incorrectly raises ValueError here.
57+
print('SyntaxError')

tests/cmdline/cmd_parsetree.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@
1010
e = b"a very long bytes that will not be interned"
1111
f = 123456789012345678901234567890
1212
g = 123
13+
h = f"fstring: '{b}'"

0 commit comments

Comments
 (0)
0