bpo-35808: Retire pgen and use pgen2 to generate the parser by pablogsal · Pull Request #11814 · python/cpython · GitHub

bpo-35808: Retire pgen and use pgen2 to generate the parser #11814


Merged
merged 9 commits on Mar 1, 2019
Use Lib/token.py and Lib/tokenize.py as the source of tokens
pablogsal committed Feb 21, 2019
commit 3d593ef6d1c13a3e43f4538b986abe3ce2c84f92
41 changes: 10 additions & 31 deletions Parser/pgen/__main__.py
@@ -1,41 +1,12 @@
import os
import sys
import argparse
import collections

from lib2to3.pgen2 import grammar, tokenize

from . import token
from . import grammar as pgen_grammar

def monkey_patch_pgen2(token_lines):
tokens = dict(token.generate_tokens(token_lines))
for name, value in tokens.items():
setattr(tokenize, name, value)

from .pgen import ParserGenerator


def main(grammar_file, tokens_file, gramminit_h_file, gramminit_c_file, verbose):
with open(tokens_file) as tok_file:
token_lines = tok_file.readlines()

monkey_patch_pgen2(token_lines)

p = ParserGenerator(grammar_file, token_lines, verbose=verbose)
grammar = p.make_grammar()
grammar.produce_graminit_h(gramminit_h_file.write)
grammar.produce_graminit_c(gramminit_c_file.write)


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser(description="Parser generator main program.")
parser.add_argument(
"grammar", type=str, help="The file with the grammar definition in EBNF format"
)
parser.add_argument(
"tokens", type=str, help="The file with the token definition"
)
parser.add_argument(
"gramminit_h",
Member
Normally this is spelled with one ‘m’. (Also below.)

type=argparse.FileType('w'),
@@ -48,4 +19,12 @@ def main(grammar_file, tokens_file, gramminit_h_file, gramminit_c_file, verbose)
)
parser.add_argument("--verbose", "-v", action="count")
args = parser.parse_args()
main(args.grammar, args.tokens, args.gramminit_h, args.gramminit_c, args.verbose)

p = ParserGenerator(args.grammar, verbose=args.verbose)
grammar = p.make_grammar()
grammar.produce_graminit_h(args.gramminit_h.write)
grammar.produce_graminit_c(args.gramminit_c.write)


if __name__ == "__main__":
main()
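
For reference, a minimal sketch of driving the generator programmatically, mirroring the new main() above; the import path and output file names are assumptions for illustration rather than code from the PR:

# Sketch only: mirrors main() above. Assumes the Parser directory is on
# sys.path so the pgen package is importable; file names are placeholders.
from pgen.pgen import ParserGenerator

p = ParserGenerator("Grammar/Grammar", verbose=True)
grammar = p.make_grammar()
with open("graminit.h", "w") as h_file, open("graminit.c", "w") as c_file:
    grammar.produce_graminit_h(h_file.write)
    grammar.produce_graminit_c(c_file.write)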
3 changes: 0 additions & 3 deletions Parser/pgen/grammar.py
@@ -1,8 +1,5 @@
from lib2to3.pgen2 import grammar
Member
Maybe also copy this, so we’re completely independent from lib2to3?


from . import token


class Grammar(grammar.Grammar):

def produce_graminit_h(self, writer):
57 changes: 39 additions & 18 deletions Parser/pgen/pgen.py
@@ -1,17 +1,38 @@
import os
import sys
import collections
from lib2to3.pgen2 import tokenize
import importlib.machinery

from . import token, grammar
# Use Lib/token.py and Lib/tokenize.py to obtain the tokens. To maintain this
Member
I would get them directly from Grammar/Tokens (a sketch of this idea follows the diff below)

# compatible with older versions of Python, we need to make sure that we only
# import these two files (and not any of the dependencies of these files).

CURRENT_FOLDER_LOCATION = os.path.dirname(os.path.realpath(__file__))
LIB_LOCATION = os.path.realpath(os.path.join(CURRENT_FOLDER_LOCATION, '..', '..', 'Lib'))
TOKEN_LOCATION = os.path.join(LIB_LOCATION, 'token.py')
TOKENIZE_LOCATION = os.path.join(LIB_LOCATION, 'tokenize.py')

token = importlib.machinery.SourceFileLoader('token',
TOKEN_LOCATION).load_module()
# Add token to the module cache so tokenize.py uses that exact one instead of
# the one in the stdlib of the interpreter executing this file.
sys.modules['token'] = token
tokenize = importlib.machinery.SourceFileLoader('tokenize',
Member
This still looks fragile. Why do we need to use the latest tokenize to parse the Grammar file? The “meta” grammar is super simple, it just has NAME, string literals, and some basic punctuation and operators. The tokenize module from Python 2.4 can handle this. :-)

Member Author
@pablogsal pablogsal Feb 21, 2019

The tokenize module from Python 2.4 can handle this.

:)

Why do we need to use the latest tokenize to parse the Grammar file?

It's not that tokenize cannot handle the grammar; it's that the tokenizer uses different values for the tokens, so it fails when constructing the DFAs when calling self.parse():

Traceback (most recent call last):
  File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/src/Parser/pgen/__main__.py", line 36, in <module>
    main()
  File "/src/Parser/pgen/__main__.py", line 29, in main
    p = ParserGenerator(args.grammar, token_lines, verbose=args.verbose)
  File "/src/Parser/pgen/pgen.py", line 20, in __init__
    self.dfas, self.startsymbol = self.parse()
  File "/src/Parser/pgen/pgen.py", line 173, in parse
    self.expect(self.tokens['OP'], ":")
  File "/src/Parser/pgen/pgen.py", line 337, in expect
    type, value, self.type, self.value)
  File "/src/Parser/pgen/pgen.py", line 356, in raise_error
    self.end[1], self.line))
  File "./Grammar/Grammar", line 13
    single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
                ^
SyntaxError: expected 54/:, got 52/:

This is because OP has the value 52 in Python 3.5 (in this example) and 54 in the tokens that we construct from Grammar/Tokens (or in Lib/token.py). The value 52 is what tokenize (from Python 3.5) yields when calling next(self.generator) in gettoken. Maybe I am missing something here, but that is the problem I found when trying to use the tokenize from the running Python :(
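
A minimal sketch of the mismatch described above (an illustration under an assumed checkout layout, not code from the PR): the numeric value of OP seen by the interpreter running pgen can differ from the value defined by the in-tree Lib/token.py, which is why pgen loads token.py and tokenize.py from the source tree.

# Sketch only: compare the OP token value of the running interpreter with the
# one defined by the in-tree Lib/token.py (the path is an assumption).
import importlib.machinery
import token as interpreter_token

IN_TREE_TOKEN = "Lib/token.py"  # hypothetical path inside the checkout being built

in_tree_token = importlib.machinery.SourceFileLoader(
    "in_tree_token", IN_TREE_TOKEN).load_module()

# On an older interpreter (e.g. 3.5) these can disagree (52 vs 54), so token
# numbers must come from the source tree, not from the interpreter's own modules.
print("interpreter OP:", interpreter_token.OP)
print("in-tree OP:", in_tree_token.OP)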

TOKENIZE_LOCATION).load_module()

from . import grammar

class ParserGenerator(object):

def __init__(self, filename, tokens, stream=None, verbose=False):
def __init__(self, filename, stream=None, verbose=False):
close_stream = None
if stream is None:
stream = open(filename)
close_stream = stream.close
self.tokens = dict(token.generate_tokens(tokens))
self.opmap = dict(token.generate_opmap(tokens))
self.tokens = token
self.opmap = token.EXACT_TOKEN_TYPES
# Manually add <> so it does not collide with !=
self.opmap['<>'] = self.tokens.NOTEQUAL
self.verbose = verbose
self.filename = filename
self.stream = stream
@@ -87,9 +108,9 @@ def make_label(self, c, label):
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
itoken = self.tokens.get(label, None)
itoken = getattr(self.tokens, label, None)
assert isinstance(itoken, int), label
assert itoken in self.tokens.values(), label
assert itoken in self.tokens.tok_name, label
if itoken in c.tokens:
return c.tokens[itoken]
else:
@@ -105,12 +126,12 @@ def make_label(self, c, label):
if value in c.keywords:
return c.keywords[value]
else:
c.labels.append((self.tokens['NAME'], value))
c.labels.append((self.tokens.NAME, value))
c.keywords[value] = ilabel
return ilabel
else:
# An operator (any non-numeric token)
itoken = self.tokens[self.opmap[value]] # Fails if unknown token
itoken = self.opmap[value] # Fails if unknown token
if itoken in c.tokens:
return c.tokens[itoken]
else:
@@ -163,16 +184,16 @@ def parse(self):
dfas = collections.OrderedDict()
startsymbol = None
# MSTART: (NEWLINE | RULE)* ENDMARKER
while self.type != self.tokens['ENDMARKER']:
while self.type == self.tokens['NEWLINE']:
while self.type != self.tokens.ENDMARKER:
while self.type == self.tokens.NEWLINE:
self.gettoken()
# RULE: NAME ':' RHS NEWLINE
name = self.expect(self.tokens['NAME'])
name = self.expect(self.tokens.NAME)
if self.verbose:
print("Processing rule {dfa_name}".format(dfa_name=name))
self.expect(self.tokens['OP'], ":")
self.expect(self.tokens.OP, ":")
a, z = self.parse_rhs()
self.expect(self.tokens['NEWLINE'])
self.expect(self.tokens.NEWLINE)
if self.verbose:
self.dump_nfa(name, a, z)
dfa = self.make_dfa(a, z)
@@ -288,7 +309,7 @@ def parse_alt(self):
# ALT: ITEM+
a, b = self.parse_item()
while (self.value in ("(", "[") or
self.type in (self.tokens['NAME'], self.tokens['STRING'])):
self.type in (self.tokens.NAME, self.tokens.STRING)):
c, d = self.parse_item()
b.addarc(c)
b = d
@@ -299,7 +320,7 @@ def parse_item(self):
if self.value == "[":
self.gettoken()
a, z = self.parse_rhs()
self.expect(self.tokens['OP'], "]")
self.expect(self.tokens.OP, "]")
a.addarc(z)
return a, z
else:
@@ -319,9 +340,9 @@ def parse_atom(self):
if self.value == "(":
self.gettoken()
a, z = self.parse_rhs()
self.expect(self.tokens['OP'], ")")
self.expect(self.tokens.OP, ")")
return a, z
elif self.type in (self.tokens['NAME'], self.tokens['STRING']):
elif self.type in (self.tokens.NAME, self.tokens.STRING):
a = NFAState()
z = NFAState()
a.addarc(z, self.value)
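
Picking up the review note above about getting the tokens directly from Grammar/Tokens: a minimal sketch of that idea, assuming the Grammar/Tokens format of one token name per line, optionally followed by its literal string, with '#' starting a comment; this is an illustration, not code from the PR.

# Sketch only: derive token numbers and the exact-token map from Grammar/Tokens.
def parse_tokens_file(path):
    tokens = {}        # token name -> numeric value (assigned by position)
    exact_tokens = {}  # literal string (e.g. ':') -> numeric value
    with open(path) as f:
        for line in f:
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            parts = line.split()
            name = parts[0]
            value = len(tokens)
            tokens[name] = value
            if len(parts) > 1:
                exact_tokens[parts[1].strip("'\"")] = value
    return tokens, exact_tokens

# Hypothetical usage from the repository root:
# tokens, opmap = parse_tokens_file("Grammar/Tokens")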
40 changes: 0 additions & 40 deletions Parser/pgen/token.py
This file was deleted.
