gh-105017: Include CRLF lines in strings and column numbers by mgmacias95 · Pull Request #105030 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-105017: Include CRLF lines in strings and column numbers #105030

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 28, 2023
Prev Previous commit
Next Next commit
Only preserve crlf in the Python-tokenize extension
  • Loading branch information
pablogsal committed May 27, 2023
commit 37f77adc0f865a579e22a8c9db3196ae7330cdab
4 changes: 2 additions & 2 deletions Parser/pegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen

struct tok_state *tok;
if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
tok = _PyTokenizer_FromUTF8(str, exec_input);
tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
} else {
tok = _PyTokenizer_FromString(str, exec_input);
tok = _PyTokenizer_FromString(str, exec_input, 0);
}
if (tok == NULL) {
if (PyErr_Occurred()) {
Expand Down
32 changes: 22 additions & 10 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,9 @@ translate_into_utf8(const char* str, const char* enc) {


static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
translate_newlines(const char *s, int exec_input, int preserve_crlf,
struct tok_state *tok) {
int skip_next_lf = 0;
size_t needed_length = strlen(s) + 2, final_length;
char *buf, *current;
char c = '\0';
Expand All @@ -783,8 +785,18 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
}
for (current = buf; *s; s++, current++) {
c = *s;
if (!c)
break;
if (skip_next_lf) {
skip_next_lf = 0;
if (c == '\n') {
c = *++s;
if (!c)
break;
}
}
if (!preserve_crlf && c == '\r') {
skip_next_lf = 1;
c = '\n';
}
*current = c;
}
/* If this is exec input, add a newline to the end of the string if
Expand All @@ -811,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
inside TOK. */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
PyObject* utf8 = NULL;
char *str;
const char *s;
const char *newl[2] = {NULL, NULL};
int lineno = 0;
tok->input = str = translate_newlines(input, single, tok);
tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
if (str == NULL)
return NULL;
tok->enc = NULL;
Expand Down Expand Up @@ -870,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
/* Set up tokenizer for string */

struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input)
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
char *decoded;

if (tok == NULL)
return NULL;
decoded = decode_str(str, exec_input, tok);
decoded = decode_str(str, exec_input, tok, preserve_crlf);
if (decoded == NULL) {
_PyTokenizer_Free(tok);
return NULL;
Expand All @@ -891,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
/* Set up tokenizer for UTF-8 string */

struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input)
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
char *translated;
if (tok == NULL)
return NULL;
tok->input = translated = translate_newlines(str, exec_input, tok);
tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
if (translated == NULL) {
_PyTokenizer_Free(tok);
return NULL;
Expand Down Expand Up @@ -1039,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) {
}
char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
if (newtok != NULL) {
char *translated = translate_newlines(newtok, 0, tok);
char *translated = translate_newlines(newtok, 0, 0, tok);
PyMem_Free(newtok);
if (translated == NULL) {
return 0;
Expand Down
4 changes: 2 additions & 2 deletions Parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ struct tok_state {
#endif
};

extern struct tok_state *_PyTokenizer_FromString(const char *, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
Expand Down
2 changes: 1 addition & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
if (filename == NULL) {
return NULL;
}
self->tok = _PyTokenizer_FromUTF8(source, 1);
self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
if (self->tok == NULL) {
Py_DECREF(filename);
return NULL;
Expand Down
0