gh-102856: Python tokenizer implementation for PEP 701 by mgmacias95 · Pull Request #104323 · python/cpython


Merged
merged 20 commits on May 21, 2023
Changes from 1 commit
Fix more bugs
pablogsal committed May 18, 2023
commit 6833b1aea88632fb9d3c28bf9f617bb4d1faef96
1 change: 0 additions & 1 deletion Lib/inspect.py
@@ -2187,7 +2187,6 @@ def _signature_strip_non_python_syntax(signature):
if string == ',':
current_parameter += 1

# if (type == ERRORTOKEN) and (string == '$'):
if (type == OP) and (string == '$'):
assert self_parameter is None
self_parameter = current_parameter
42 changes: 14 additions & 28 deletions Lib/test/test_tokenize.py
@@ -1,9 +1,9 @@
from test import support
from test.support import os_helper
from tokenize import (tokenize, tokenize2, _tokenize, untokenize, NUMBER, NAME, OP,
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
@@ -46,7 +46,7 @@ def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
# The ENDMARKER and final NEWLINE are omitted.
f = BytesIO(s.encode('utf-8'))
result = stringify_tokens_from_source(tokenize2(f.readline), s)
result = stringify_tokens_from_source(tokenize(f.readline), s)
self.assertEqual(result,
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
expected.rstrip().splitlines())
@@ -1128,33 +1128,16 @@ def readline():
nonlocal first
if not first:
first = True
return line
yield line
else:
return b''
yield b''

# skip the initial encoding token and the end tokens
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")

def test__tokenize_does_not_decode_with_encoding_none(self):
Review comment: This test is being removed because it was testing the old _tokenize implementation, which no longer exists and is not public.

literal = '"ЉЊЈЁЂ"'
first = False
def readline():
nonlocal first
if not first:
first = True
return literal
else:
return b''

# skip the end tokens
tokens = list(_tokenize(readline, encoding=None))[:-2]
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
self.assertEqual(tokens, expected_tokens,
"string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

@@ -1412,7 +1395,7 @@ def test_open_error(self):

class TestTokenize(TestCase):

def test_tokenizee(self):
def test_tokenize(self):
import tokenize as tokenize_module
encoding = "utf-8"
encoding_used = None
@@ -1424,7 +1407,10 @@ def mock__tokenize(readline, encoding):
encoding_used = encoding
out = []
while True:
next_line = readline()
try:
next_line = next(readline)
except StopIteration:
return out
if next_line:
out.append(next_line)
continue
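For context, the mock above reflects that the private _tokenize helper now consumes an iterable of byte lines instead of calling a readline function; a plain readline callable can be adapted with the standard iter(callable, sentinel) idiom. A rough sketch (the names below are illustrative only):

    import io

    raw = io.BytesIO(b"first\nsecond\n")
    lines = iter(raw.readline, b"")   # stops when readline() returns b""
    print(next(lines))                # b'first\n'
    print(next(lines))                # b'second\n'
    # A further next(lines) raises StopIteration, which the updated mock
    # catches to terminate the loop.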
@@ -1444,7 +1430,7 @@ def mock_readline():
tokenize_module._tokenize = mock__tokenize
try:
results = tokenize(mock_readline)
self.assertEqual(list(results),
self.assertEqual(list(results)[1:],
[b'first', b'second', b'1', b'2', b'3', b'4'])
finally:
tokenize_module.detect_encoding = orig_detect_encoding
@@ -1740,7 +1726,7 @@ def test_random_files(self):
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
# with self.subTest(file=testfile):
self.check_roundtrip(f)


246 changes: 34 additions & 212 deletions Lib/tokenize.py
@@ -213,6 +213,14 @@ def untokenize(self, iterable):
self.tokens.append(indent)
self.prev_col = len(indent)
startline = False
elif tok_type == FSTRING_MIDDLE:
if '{' in token or '}' in token:
end_line, end_col = end
end = (end_line, end_col + token.count('{') + token.count('}'))
token = re.sub('{', '{{', token)
token = re.sub('}', '}}', token)


self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
@@ -255,6 +263,11 @@ def compat(self, token, iterable):
elif startline and indents:
toks_append(indents[-1])
startline = False
elif toknum == FSTRING_MIDDLE:
if '{' in tokval or '}' in tokval:
tokval = re.sub('{', '{{', tokval)
tokval = re.sub('}', '}}', tokval)

toks_append(tokval)
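Both branches above (in untokenize() and in compat()) re-escape literal braces because the tokenizer hands back FSTRING_MIDDLE text with '{{' and '}}' already collapsed to single braces. A rough round-trip sketch, assuming Python 3.12+ where f-strings are tokenized per PEP 701:

    import io
    from tokenize import generate_tokens, untokenize

    src = 'f"{{literal}} {value}"\n'
    toks = list(generate_tokens(io.StringIO(src).readline))
    # The FSTRING_MIDDLE token carries '{literal} ' with single braces, so
    # untokenize() must double them again to regenerate valid source.
    print(untokenize(toks))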


@@ -404,36 +417,6 @@ def open(filename):
buffer.close()
raise

def tokenize2(readline):
encoding, consumed = detect_encoding(readline)
rl_gen = _itertools.chain(consumed, iter(readline, b""))
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
yield from _tokenize2(rl_gen, encoding)

def _tokenize2(rl_gen, encoding):
source = b"".join(rl_gen)
token = None
for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True):
# TODO: Marta -> clean this up
if 6 < token.type <= 54:
token = token._replace(type=OP)
if token.type in {ASYNC, AWAIT}:
token = token._replace(type=NAME)
if token.type == NEWLINE:
l_start, c_start = token.start
l_end, c_end = token.end
token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

yield token
if token is not None:
last_line, _ = token.start
yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')


def tokenize(readline):
"""
The tokenize() generator requires one argument, readline, which
@@ -454,194 +437,33 @@ def tokenize(readline):
which tells you which encoding was used to decode the bytes stream.
"""
encoding, consumed = detect_encoding(readline)
empty = _itertools.repeat(b"")
rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
lnum = parenlev = continued = 0
numchars = '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]

rl_gen = _itertools.chain(consumed, iter(readline, b""))
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
last_line = b''
line = b''
while True: # loop over lines in stream
try:
# We capture the value of the line variable here because
# readline uses the empty string '' to signal end of input,
# hence `line` itself will always be overwritten at the end
# of this loop.
last_line = line
line = readline()
except StopIteration:
line = b''

if encoding is not None:
line = line.decode(encoding)
lnum += 1
pos, max = 0, len(line)

if contstr: # continued string
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
yield TokenInfo(STRING, contstr + line[:end],
strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
yield TokenInfo(ERRORTOKEN, contstr + line,
strstart, (lnum, len(line)), contline)
contstr = ''
contline = None
continue
else:
contstr = contstr + line
contline = contline + line
continue

elif parenlev == 0 and not continued: # new statement
if not line: break
column = 0
while pos < max: # measure leading whitespace
if line[pos] == ' ':
column += 1
elif line[pos] == '\t':
column = (column//tabsize + 1)*tabsize
elif line[pos] == '\f':
column = 0
else:
break
pos += 1
if pos == max:
break

if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
yield TokenInfo(COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
pos += len(comment_token)

yield TokenInfo(NL, line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue

if column > indents[-1]: # count indents or dedents
indents.append(column)
yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
while column < indents[-1]:
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line))
indents = indents[:-1]

yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0

while pos < max:
pseudomatch = _compile(PseudoToken).match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
if start == end:
continue
token, initial = line[start:end], line[start]

if (initial in numchars or # ordinary number
(initial == '.' and token != '.' and token != '...')):
yield TokenInfo(NUMBER, token, spos, epos, line)
elif initial in '\r\n':
if parenlev > 0:
yield TokenInfo(NL, token, spos, epos, line)
else:
yield TokenInfo(NEWLINE, token, spos, epos, line)

elif initial == '#':
assert not token.endswith("\n")
yield TokenInfo(COMMENT, token, spos, epos, line)

elif token in triple_quoted:
endprog = _compile(endpats[token])
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break

# Check up to the first 3 chars of the token to see if
# they're in the single_quoted set. If so, they start
# a string.
# We're using the first 3, because we're looking for
# "rb'" (for example) at the start of the token. If
# we switch to longer prefixes, this needs to be
# adjusted.
# Note that initial == token[:1].
# Also note that single quote checking must come after
# triple quote checking (above).
elif (initial in single_quoted or
token[:2] in single_quoted or
token[:3] in single_quoted):
if token[-1] == '\n': # continued string
strstart = (lnum, start)
# Again, using the first 3 chars of the
# token. This is looking for the matching end
# regex for the correct type of quote
# character. So it's really looking for
# endpats["'"] or endpats['"'], by trying to
# skip string prefix characters, if any.
endprog = _compile(endpats.get(initial) or
endpats.get(token[1]) or
endpats.get(token[2]))
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
yield TokenInfo(STRING, token, spos, epos, line)

elif initial.isidentifier(): # ordinary name
yield TokenInfo(NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt
continued = 1
else:
if initial in '([{':
parenlev += 1
elif initial in ')]}':
parenlev -= 1
yield TokenInfo(OP, token, spos, epos, line)
else:
yield TokenInfo(ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos += 1
yield from _tokenize(rl_gen, encoding)

def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
token = None
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
# TODO: Marta -> clean this up
if 6 < token.type <= 54:
token = token._replace(type=OP)
if token.type in {ASYNC, AWAIT}:
token = token._replace(type=NAME)
if token.type == NEWLINE:
l_start, c_start = token.start
l_end, c_end = token.end
token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

# Add an implicit NEWLINE if the input doesn't end in one
if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
yield token
if token is not None:
last_line, _ = token.start
yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')

tokenize = tokenize2
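The loop above preserves the compatibility behaviour of the old pure-Python tokenizer: exact operator token types are collapsed to OP and ASYNC/AWAIT are reported as NAME. A small sketch of what callers observe (illustrative only):

    import io
    import tokenize

    src = b"async def f():\n    await g()\n"
    for tok in tokenize.tokenize(io.BytesIO(src).readline):
        print(tokenize.tok_name[tok.type], tokenize.tok_name[tok.exact_type], repr(tok.string))
    # '(' is reported with type OP but exact_type LPAR, and 'async'/'await'
    # come back as NAME tokens, matching the previous pure-Python output.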

def generate_tokens(readline):
"""Tokenize a source reading Python code as unicode strings.
Expand All @@ -658,7 +480,7 @@ def _gen():
if not line:
return
yield line.encode()
return _tokenize2(_gen(), 'utf-8')
return _tokenize(_gen(), 'utf-8')
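generate_tokens() keeps its documented str-based interface; it simply encodes each line and feeds the same helper. A minimal usage sketch:

    import io
    from tokenize import generate_tokens, tok_name

    for tok in generate_tokens(io.StringIO("x = 1\n").readline):
        print(tok_name[tok.type], repr(tok.string))
    # No ENCODING token is emitted because the input is already text; the
    # stream still ends with NEWLINE and ENDMARKER.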

def main():
import argparse