gh-105017: Include CRLF lines in strings and column numbers by mgmacias95 · Pull Request #105030 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-105017: Include CRLF lines in strings and column numbers #105030

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 28, 2023
Next commit
fix(tokenizer): Include CRLF lines in strings and column numbers
  • Loading branch information
mgmacias95 committed May 27, 2023
commit ad130da3f980daed1f1f8d8ca63cffc605749869
19 changes: 14 additions & 5 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,20 @@ def test_basic(self):
DEDENT '' (5, 0) (5, 0)
""")

self.check_tokenize("foo='bar'\r\n", """\
NAME 'foo' (1, 0) (1, 3)
OP '=' (1, 3) (1, 4)
STRING "'bar'" (1, 4) (1, 9)
NEWLINE '\\n' (1, 9) (1, 10)
self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\
NAME 'if' (1, 0) (1, 2)
NAME 'True' (1, 3) (1, 7)
OP ':' (1, 7) (1, 8)
NEWLINE '\\r\\n' (1, 8) (1, 10)
COMMENT '# NL' (2, 4) (2, 8)
NL '\\r\\n' (2, 8) (2, 10)
INDENT ' ' (3, 0) (3, 4)
NAME 'foo' (3, 4) (3, 7)
OP '=' (3, 7) (3, 8)
STRING "\'bar\'" (3, 8) (3, 13)
NEWLINE '\\r\\n' (3, 13) (3, 15)
NL '\\r\\n' (4, 0) (4, 2)
DEDENT '' (5, 0) (5, 0)
""")

indent_error_file = b"""\
Expand Down
23 changes: 8 additions & 15 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,6 @@ translate_into_utf8(const char* str, const char* enc) {

static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
int skip_next_lf = 0;
size_t needed_length = strlen(s) + 2, final_length;
char *buf, *current;
char c = '\0';
Expand All @@ -784,18 +783,8 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
}
for (current = buf; *s; s++, current++) {
c = *s;
if (skip_next_lf) {
skip_next_lf = 0;
if (c == '\n') {
c = *++s;
if (!c)
break;
}
}
if (c == '\r') {
skip_next_lf = 1;
c = '\n';
}
if (!c)
break;
*current = c;
}
/* If this is exec input, add a newline to the end of the string if
Expand Down Expand Up @@ -1693,7 +1682,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
tok_backup(tok, c);
if (c == '#' || c == '\n') {
if (c == '#' || c == '\n' || c == '\r') {
/* Lines with only whitespace and/or comments
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
Expand Down Expand Up @@ -1822,7 +1811,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
const char *prefix, *type_start;
int current_starting_col_offset;

while (c != EOF && c != '\n') {
while (c != EOF && c != '\n' && c != '\r') {
c = tok_nextc(tok);
}

Expand Down Expand Up @@ -2002,6 +1991,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(NAME);
}

if (c == '\r') {
c = tok_nextc(tok);
}

/* Newline */
if (c == '\n') {
tok->atbol = 1;
Expand Down
6 changes: 5 additions & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,11 @@ tokenizeriter_next(tokenizeriterobject *it)
type = NAME;
}
else if (type == NEWLINE) {
str = PyUnicode_FromString("\n");
if (it->tok->start[0] == '\r') {
str = PyUnicode_FromString("\r\n");
} else {
str = PyUnicode_FromString("\n");
}
end_col_offset++;
}
}
Expand Down
0