py: Implement partial PEP-498 (f-string) support. · jimmo/micropython@765e275 · GitHub
[go: up one dir, main page]

Skip to content

Commit 765e275

Browse files
committed
py: Implement partial PEP-498 (f-string) support.
This implements (most of) the PEP-498 spec for f-strings and is based on micropython#4998 by @klardotsh. It is implemented in the lexer as a syntax translation to `str.format`: f"{a}" --> "{}".format(a) It also supports: f"{a=}" --> "a={}".format(a) This is done by extracting the arguments into a temporary vstr buffer, then after the string has been tokenized, the lexer input queue is saved and the contents of the temporary vstr buffer are injected into the lexer instead. There are four main limitations: - raw f-strings (`fr` or `rf` prefixes) are not supported and will raise `SyntaxError: raw f-strings are not supported`. - literal concatenation of f-strings with adjacent strings will fail "{}" f"{a}" --> "{}{}".format(a) (str.format will incorrectly use the braces from the non-f-string) f"{a}" f"{a}" --> "{}".format(a) "{}".format(a) (cannot concatenate) - PEP-498 requires the full parser to understand the interpolated argument, however because this entirely runs in the lexer it cannot resolve nested braces in expressions like f"{'}'}" - The !r, !s, and !a conversions are not supported. Includes tests and cpydiffs. Signed-off-by: Jim Mussared <jim.mussared@gmail.com>
1 parent 671d7cc commit 765e275

16 files changed

+196
-171
lines changed

mpy-cross/mpconfigport.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
#define MICROPY_PY_IO (0)
9494
#define MICROPY_PY_SYS (0)
9595

96+
#define MICROPY_PY_FSTRING (1)
97+
9698
// type definitions for the specific machine
9799

98100
#ifdef __LP64__

ports/unix/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@
123123
#define MICROPY_PY_SYS_EXC_INFO (1)
124124
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
125125
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
126+
#define MICROPY_PY_FSTRING (1)
126127
#ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
127128
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
128129
#endif

ports/windows/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
#define MICROPY_PY_SYS_EXC_INFO (1)
9494
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
9595
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
96+
#define MICROPY_PY_FSTRING (1)
9697
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
9798
#define MICROPY_PY_MATH_ISCLOSE (1)
9899
#define MICROPY_PY_CMATH (1)

py/lexer.c

Lines changed: 68 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -131,31 +131,6 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
131131
return is_head_of_identifier(lex) || is_digit(lex);
132132
}
133133

134-
#if MICROPY_PY_FSTRING
135-
STATIC void swap_char_banks(mp_lexer_t *lex) {
136-
if (lex->vstr_postfix_processing) {
137-
lex->chr3 = lex->chr0;
138-
lex->chr4 = lex->chr1;
139-
lex->chr5 = lex->chr2;
140-
lex->chr0 = lex->vstr_postfix.buf[0];
141-
lex->chr1 = lex->vstr_postfix.buf[1];
142-
lex->chr2 = lex->vstr_postfix.buf[2];
143-
144-
lex->vstr_postfix_idx = 3;
145-
} else {
146-
// blindly reset to the "backup" bank when done postfix processing
147-
// this restores control to the mp_reader
148-
lex->chr0 = lex->chr3;
149-
lex->chr1 = lex->chr4;
150-
lex->chr2 = lex->chr5;
151-
// willfully ignoring setting chr3-5 here - WARNING consider those garbage data now
152-
153-
vstr_reset(&lex->vstr_postfix);
154-
lex->vstr_postfix_idx = 0;
155-
}
156-
}
157-
#endif
158-
159134
STATIC void next_char(mp_lexer_t *lex) {
160135
if (lex->chr0 == '\n') {
161136
// a new line
@@ -169,15 +144,29 @@ STATIC void next_char(mp_lexer_t *lex) {
169144
++lex->column;
170145
}
171146

147+
// shift the input queue forward
172148
lex->chr0 = lex->chr1;
173149
lex->chr1 = lex->chr2;
174150

151+
// and add the next byte from either the fstring args or the reader
175152
#if MICROPY_PY_FSTRING
176-
if (lex->vstr_postfix_processing) {
177-
if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
178-
lex->chr2 = '\0';
153+
if (lex->fstring_args_idx) {
154+
// if there are saved chars, then we're currently injecting fstring args
155+
if (lex->fstring_args_idx < lex->fstring_args.len) {
156+
lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
179157
} else {
180-
lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
158+
// no more fstring arg bytes
159+
lex->chr2 = '\0';
160+
}
161+
162+
if (lex->chr0 == '\0') {
163+
// consumed all fstring data, restore saved input queue
164+
lex->chr0 = lex->chr0_saved;
165+
lex->chr1 = lex->chr1_saved;
166+
lex->chr2 = lex->chr2_saved;
167+
// stop consuming fstring arg data
168+
vstr_reset(&lex->fstring_args);
169+
lex->fstring_args_idx = 0;
181170
}
182171
} else
183172
#endif
@@ -198,13 +187,6 @@ STATIC void next_char(mp_lexer_t *lex) {
198187
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
199188
lex->chr2 = '\n';
200189
}
201-
202-
#if MICROPY_PY_FSTRING
203-
if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
204-
lex->vstr_postfix_processing = false;
205-
swap_char_banks(lex);
206-
}
207-
#endif
208190
}
209191

210192
STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -350,8 +332,13 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
350332

351333
size_t n_closing = 0;
352334
#if MICROPY_PY_FSTRING
353-
bool in_expression = false;
354-
bool expression_eat = true;
335+
if (is_fstring) {
336+
// assume there's going to be interpolation, so prep the injection data
337+
// fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
338+
// only when fstring_args_idx>0 will we consume the arg data
339+
// note: lex->fstring_args will be empty already (it's reset when finished)
340+
vstr_add_str(&lex->fstring_args, ".format(");
341+
}
355342
#endif
356343

357344
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
@@ -362,49 +349,36 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
362349
n_closing = 0;
363350

364351
#if MICROPY_PY_FSTRING
365-
if (is_fstring && is_char(lex, '{')) {
366-
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
367-
in_expression = !in_expression;
368-
expression_eat = in_expression;
369-
370-
if (lex->vstr_postfix.len == 0) {
371-
vstr_add_str(&lex->vstr_postfix, ".format(");
372-
}
373-
352+
while (is_fstring && is_char(lex, '{')) {
374353
next_char(lex);
375-
continue;
376-
}
377-
378-
if (is_fstring && is_char(lex, '}')) {
379-
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
380-
381-
if (in_expression) {
382-
in_expression = false;
383-
vstr_add_char(&lex->vstr_postfix, ',');
384-
}
385-
386-
next_char(lex);
387-
continue;
388-
}
389-
390-
if (in_expression) {
391-
// throw errors for illegal chars inside f-string expressions
392-
if (is_char(lex, '#') || is_char(lex, '\\')) {
393-
lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING;
394-
return;
395-
} else if (is_char(lex, ':')) {
396-
expression_eat = false;
397-
}
398-
399-
unichar c = CUR_CHAR(lex);
400-
if (expression_eat) {
401-
vstr_add_char(&lex->vstr_postfix, c);
354+
if (is_char(lex, '{')) {
355+
// "{{" is passed through unchanged to be handled by str.format
356+
vstr_add_byte(&lex->vstr, '{');
357+
next_char(lex);
402358
} else {
403-
vstr_add_char(&lex->vstr, c);
359+
// remember the start of this argument (if we need it for f'{a=}').
360+
size_t i = lex->fstring_args.len;
361+
// extract characters inside the { until we reach the
362+
// format specifier or closing }.
363+
// (MicroPython limitation) note: this is completely unaware of
364+
// Python syntax and will not handle any expression containing '}' or ':'.
365+
// e.g. f'{"}"}' or f'{foo({})}'.
366+
while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
367+
// like the default case at the end of this function, stay 8-bit clean
368+
vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
369+
next_char(lex);
370+
}
371+
if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
372+
// if the last character of the arg was '=', then inject "arg=" before the '{'.
373+
// f'{a=}' --> 'a={}'.format(a)
374+
vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
375+
// remove the trailing '='
376+
lex->fstring_args.len--;
377+
}
378+
// comma-separate args
379+
vstr_add_byte(&lex->fstring_args, ',');
404380
}
405-
406-
next_char(lex);
407-
continue;
381+
vstr_add_byte(&lex->vstr, '{');
408382
}
409383
#endif
410384

@@ -562,11 +536,19 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
562536

563537
void mp_lexer_to_next(mp_lexer_t *lex) {
564538
#if MICROPY_PY_FSTRING
565-
if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
566-
// end format call injection
567-
vstr_add_char(&lex->vstr_postfix, ')');
568-
lex->vstr_postfix_processing = true;
569-
swap_char_banks(lex);
539+
if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
540+
// moving onto the next token means the literal string is complete.
541+
// switch into injecting the format args.
542+
vstr_add_byte(&lex->fstring_args, ')');
543+
lex->chr0_saved = lex->chr0;
544+
lex->chr1_saved = lex->chr1;
545+
lex->chr2_saved = lex->chr2;
546+
lex->chr0 = lex->fstring_args.buf[0];
547+
lex->chr1 = lex->fstring_args.buf[1];
548+
lex->chr2 = lex->fstring_args.buf[2];
549+
// we've already extracted 3 chars, but setting this non-zero also
550+
// means we'll start consuming the fstring data
551+
lex->fstring_args_idx = 3;
570552
}
571553
#endif
572554

@@ -646,6 +628,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
646628
}
647629
#if MICROPY_PY_FSTRING
648630
if (is_char_following(lex, 'f')) {
631+
// raw-f-strings unsupported, immediately return (invalid) token.
649632
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
650633
break;
651634
}
@@ -654,6 +637,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
654637
#if MICROPY_PY_FSTRING
655638
else if (is_char(lex, 'f')) {
656639
if (is_char_following(lex, 'r')) {
640+
// raw-f-strings unsupported, immediately return (invalid) token.
657641
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
658642
break;
659643
}
@@ -839,7 +823,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
839823
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
840824
vstr_init(&lex->vstr, 32);
841825
#if MICROPY_PY_FSTRING
842-
vstr_init(&lex->vstr_postfix, 0);
826+
vstr_init(&lex->fstring_args, 0);
843827
#endif
844828

845829
// store sentinel for first indentation level

py/lexer.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,9 @@ typedef struct _mp_lexer_t {
162162
mp_reader_t reader; // stream source
163163

164164
unichar chr0, chr1, chr2; // current cached characters from source
165-
unichar chr3, chr4, chr5; // current cached characters from alt source
165+
#if MICROPY_PY_FSTRING
166+
unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
167+
#endif
166168

167169
size_t line; // current source line
168170
size_t column; // current source column
@@ -179,9 +181,8 @@ typedef struct _mp_lexer_t {
179181
mp_token_kind_t tok_kind; // token kind
180182
vstr_t vstr; // token data
181183
#if MICROPY_PY_FSTRING
182-
vstr_t vstr_postfix; // postfix to apply to string
183-
bool vstr_postfix_processing;
184-
uint16_t vstr_postfix_idx;
184+
vstr_t fstring_args; // extracted arguments to pass to .format()
185+
uint16_t fstring_args_idx; // how many bytes of fstring_args have been read
185186
#endif
186187
} mp_lexer_t;
187188

tests/basics/string_fstring.py

Lines changed: 33 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,57 @@
1-
# Tests against https://www.python.org/dev/peps/pep-0498/
2-
3-
import sys
1+
def f():
2+
return 4
3+
def g(_):
4+
return 5
5+
def h():
6+
return 6
47

58
print(f'no interpolation')
69
print(f"no interpolation")
10+
print(f"""no interpolation""")
711

8-
# Quoth the PEP:
9-
# Backslashes may not appear anywhere within expressions. Comments, using the
10-
# '#' character, are not allowed inside an expression
11-
#
12-
# CPython (3.7.4 on Linux) raises a SyntaxError here:
13-
# >>> f'{#}'
14-
# File "<stdin>", line 1
15-
# SyntaxError: f-string expression part cannot include '#'
16-
# >>> f'{\}'
17-
# File "<stdin>", line 1
18-
# SyntaxError: f-string expression part cannot include a backslash
19-
# >>> f'{\\}'
20-
# File "<stdin>", line 1
21-
# SyntaxError: f-string expression part cannot include a backslash
22-
# >>> f'{\#}'
23-
# File "<stdin>", line 1
24-
# SyntaxError: f-string expression part cannot include a backslash
12+
x, y = 1, 2
13+
print(f'{x}')
14+
print(f'{x:08x}')
15+
print(f'{x=}')
16+
print(f'{x=:08x}')
17+
print(f'a {x} b {y} c')
18+
print(f'a {x:08x} b {y} c')
19+
print(f'a {x=} b {y} c')
20+
print(f'a {x=:08x} b {y} c')
21+
22+
print(f'a {"hello"} b')
23+
print(f'a {f() + g("foo") + h()} b')
24+
print(f'a {f() + g("foo") + h()=} b')
25+
print(f'a {f() + g("foo") + h()=:08x} b')
26+
27+
def foo(a, b):
28+
return f'{x}{y}{a}{b}'
29+
print(foo(7, 8))
30+
31+
# PEP-0498 specifies that '\\' and '#' must be disallowed explicitly, whereas
32+
# MicroPython relies on the syntax error as a result of the substitution.
2533

26-
# Backslashes and comments allowed outside expression
2734
print(f"\\")
2835
print(f'#')
29-
30-
## But not inside
3136
try:
3237
eval("f'{\}'")
3338
except SyntaxError:
3439
print('SyntaxError')
35-
else:
36-
print('f-string with backslash in expression did not raise SyntaxError')
37-
3840
try:
3941
eval("f'{#}'")
4042
except SyntaxError:
4143
print('SyntaxError')
42-
else:
43-
print('f-string with \'#\' in expression did not raise SyntaxError')
4444

45-
# Quoth the PEP:
46-
# While scanning the string for expressions, any doubled braces '{{' or '}}'
47-
# inside literal portions of an f-string are replaced by the corresponding
48-
# single brace. Doubled literal opening braces do not signify the start of an
49-
# expression. A single closing curly brace '}' in the literal portion of a
50-
# string is an error: literal closing curly braces must be doubled '}}' in
51-
# order to represent a single closing brace.
52-
#
53-
# CPython (3.7.4 on Linux) raises a SyntaxError for the last case:
54-
# >>> f'{{}'
55-
# File "<stdin>", line 1
56-
# SyntaxError: f-string: single '}' is not allowed
5745

46+
# PEP-0498 specifies that handling of double braces '{{' or '}}' should
47+
# behave like str.format.
5848
print(f'{{}}')
49+
print(f'{{{4*10}}}', '{40}')
5950

51+
# A single closing brace, unlike str.format, should raise a syntax error.
52+
# MicroPython instead raises ValueError at runtime from the substitution.
6053
try:
6154
eval("f'{{}'")
6255
except (ValueError, SyntaxError):
6356
# MicroPython incorrectly raises ValueError here.
6457
print('SyntaxError')
65-
else:
66-
print('Expected ValueError for invalid f-string literal bracing')
67-
68-
x = 1
69-
print(f'{x}')
70-
71-
# Quoth the PEP:
72-
# The expressions that are extracted from the string are evaluated in the
73-
# context where the f-string appeared. This means the expression has full
74-
# access to local and global variables. Any valid Python expression can be
75-
# used, including function and method calls. Because the f-strings are
76-
# evaluated where the string appears in the source code, there is no additional
77-
# expressiveness available with f-strings. There are also no additional
78-
# security concerns: you could have also just written the same expression, not
79-
# inside of an f-string:
80-
81-
def foo():
82-
return 20
83-
84-
print(f'result={foo()}', 'result=20')
85-
print(f'result={foo()}', 'result={}'.format(foo()))
86-
print(f'result={foo()}', 'result={result}'.format(result=foo()))
87-
88-
# Other tests
89-
print(f'{{{4*10}}}', '{40}')

tests/cmdline/cmd_parsetree.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@
1010
e = b"a very long bytes that will not be interned"
1111
f = 123456789012345678901234567890
1212
g = 123
13+
h = f"fstring: '{b}'"

0 commit comments

Comments
 (0)
0