bpo-35808: Retire pgen and use pgen2 to generate the parser #11814
Changes from 1 commit
@@ -1,41 +1,12 @@
import os
import sys
import argparse
import collections

from lib2to3.pgen2 import grammar, tokenize

from . import token
from . import grammar as pgen_grammar

def monkey_patch_pgen2(token_lines):
    tokens = dict(token.generate_tokens(token_lines))
    for name, value in tokens.items():
        setattr(tokenize, name, value)

from .pgen import ParserGenerator


def main(grammar_file, tokens_file, gramminit_h_file, gramminit_c_file, verbose):
    with open(tokens_file) as tok_file:
        token_lines = tok_file.readlines()

    monkey_patch_pgen2(token_lines)

    p = ParserGenerator(grammar_file, token_lines, verbose=verbose)
    grammar = p.make_grammar()
    grammar.produce_graminit_h(gramminit_h_file.write)
    grammar.produce_graminit_c(gramminit_c_file.write)


if __name__ == "__main__":
def main():
    parser = argparse.ArgumentParser(description="Parser generator main program.")
    parser.add_argument(
        "grammar", type=str, help="The file with the grammar definition in EBNF format"
    )
    parser.add_argument(
        "tokens", type=str, help="The file with the token definition"
    )
    parser.add_argument(
        "gramminit_h",
Review comment: Normally this is spelled with one ‘m’. (Also below.)
        type=argparse.FileType('w'),
@@ -48,4 +19,12 @@ def main(grammar_file, tokens_file, gramminit_h_file, gramminit_c_file, verbose)
    )
    parser.add_argument("--verbose", "-v", action="count")
    args = parser.parse_args()
    main(args.grammar, args.tokens, args.gramminit_h, args.gramminit_c, args.verbose)

    p = ParserGenerator(args.grammar, verbose=args.verbose)
    grammar = p.make_grammar()
    grammar.produce_graminit_h(args.gramminit_h.write)
    grammar.produce_graminit_c(args.gramminit_c.write)


if __name__ == "__main__":
    main()
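A minimal usage sketch of the reworked entry point, mirroring the body of the new main() above. The import path "pgen.pgen" and the input/output file names are assumptions for illustration; they are not taken from this diff.

# Hypothetical driver mirroring main() above; the module path and file names are assumed.
from pgen.pgen import ParserGenerator

p = ParserGenerator("Grammar/Grammar", verbose=True)
grammar = p.make_grammar()
with open("graminit.h", "w") as gram_h, open("graminit.c", "w") as gram_c:
    grammar.produce_graminit_h(gram_h.write)
    grammar.produce_graminit_c(gram_c.write)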
@@ -1,8 +1,5 @@
from lib2to3.pgen2 import grammar

Review comment: Maybe also copy this, so we’re completely independent from lib2to3?

from . import token


class Grammar(grammar.Grammar):

    def produce_graminit_h(self, writer):
@@ -1,17 +1,38 @@
import os
import sys
import collections
from lib2to3.pgen2 import tokenize
import importlib.machinery

from . import token, grammar
# Use Lib/token.py and Lib/tokenize.py to obtain the tokens. To maintain this
Review comment: I would get them directly from Grammar/Tokens.
# compatible with older versions of Python, we need to make sure that we only
# import these two files (and not any of the dependencies of these files).

CURRENT_FOLDER_LOCATION = os.path.dirname(os.path.realpath(__file__))
LIB_LOCATION = os.path.realpath(os.path.join(CURRENT_FOLDER_LOCATION, '..', '..', 'Lib'))
TOKEN_LOCATION = os.path.join(LIB_LOCATION, 'token.py')
TOKENIZE_LOCATION = os.path.join(LIB_LOCATION, 'tokenize.py')

token = importlib.machinery.SourceFileLoader('token',
                                             TOKEN_LOCATION).load_module()
# Add token to the module cache so tokenize.py uses that exact one instead of
# the one in the stdlib of the interpreter executing this file.
sys.modules['token'] = token
tokenize = importlib.machinery.SourceFileLoader('tokenize',
                                                TOKENIZE_LOCATION).load_module()

Review comment: This still looks fragile. Why do we need to use the latest tokenize to parse the Grammar file? The “meta” grammar is super simple, it just has NAME, string literals, and some basic punctuation and operators. The tokenize module from Python 2.4 can handle this. :-)

Reply: :) It’s not that tokenize cannot handle the grammar; it’s that the tokenizer uses different values for the tokens, so it fails when constructing the DFAs. This is because OP has the value 52 in Python 3.5 (in this example) and 54 in the tokens that we construct.
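An illustrative sketch of the version skew described in the reply above: the numeric value of a token such as OP can differ between the interpreter running pgen and the in-tree Lib/token.py, so the generator loads the in-tree module (and registers it in sys.modules so a later tokenize import picks it up) rather than relying on the host interpreter's copy. The path and the example values below are assumptions taken from the review comment, not guaranteed outputs.

import importlib.machinery
import sys

# Record the OP value used by the interpreter running this script.
import token as host_token
host_op_value = host_token.OP   # e.g. 52 on Python 3.5, per the reply above

# Load the in-tree token definitions (path is an assumption for this sketch)
# and register them so a subsequently loaded tokenize.py uses them too.
intree_token = importlib.machinery.SourceFileLoader(
    'token', 'Lib/token.py').load_module()
sys.modules['token'] = intree_token

# The two numberings can disagree, e.g. 52 vs 54 for OP.
print(host_op_value, intree_token.OP)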

from . import grammar


class ParserGenerator(object):

    def __init__(self, filename, tokens, stream=None, verbose=False):
    def __init__(self, filename, stream=None, verbose=False):
        close_stream = None
        if stream is None:
            stream = open(filename)
            close_stream = stream.close
        self.tokens = dict(token.generate_tokens(tokens))
        self.opmap = dict(token.generate_opmap(tokens))
        self.tokens = token
        self.opmap = token.EXACT_TOKEN_TYPES
        # Manually add <> so it does not collide with !=
        self.opmap['<>'] = self.tokens.NOTEQUAL
        self.verbose = verbose
        self.filename = filename
        self.stream = stream
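A small sketch of how the operator table built above behaves, assuming a token module that exposes EXACT_TOKEN_TYPES (a mapping from operator strings such as ':' or '!=' to token numbers, as the in-tree Lib/token.py does and as the stdlib token module does from Python 3.8 onward). The grammar still spells "not equal" as <>, which is why it is aliased manually to the same number as !=.

import token  # assumes a token module providing EXACT_TOKEN_TYPES (Python 3.8+)

opmap = dict(token.EXACT_TOKEN_TYPES)
opmap['<>'] = token.NOTEQUAL        # manual alias, mirroring the constructor above

assert opmap['<>'] == opmap['!=']   # both resolve to the NOTEQUAL token number
print(opmap[':'], opmap['<>'])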
@@ -87,9 +108,9 @@ def make_label(self, c, label):
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = self.tokens.get(label, None)
                itoken = getattr(self.tokens, label, None)
                assert isinstance(itoken, int), label
                assert itoken in self.tokens.values(), label
                assert itoken in self.tokens.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
@@ -105,12 +126,12 @@ def make_label(self, c, label):
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((self.tokens['NAME'], value))
                    c.labels.append((self.tokens.NAME, value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = self.tokens[self.opmap[value]]  # Fails if unknown token
                itoken = self.opmap[value]  # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
@@ -163,16 +184,16 @@ def parse(self):
        dfas = collections.OrderedDict()
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != self.tokens['ENDMARKER']:
            while self.type == self.tokens['NEWLINE']:
        while self.type != self.tokens.ENDMARKER:
            while self.type == self.tokens.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(self.tokens['NAME'])
            name = self.expect(self.tokens.NAME)
            if self.verbose:
                print("Processing rule {dfa_name}".format(dfa_name=name))
            self.expect(self.tokens['OP'], ":")
            self.expect(self.tokens.OP, ":")
            a, z = self.parse_rhs()
            self.expect(self.tokens['NEWLINE'])
            self.expect(self.tokens.NEWLINE)
            if self.verbose:
                self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
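The meta-grammar parsed above only ever sees NAME, STRING, operator and NEWLINE tokens, which is what the earlier review exchange refers to. A quick illustrative check with the stdlib tokenize module (the rule text below is just an example, not taken from this diff):

import io
import tokenize

rule = "single_input: NEWLINE | simple_stmt | compound_stmt\n"
for tok in tokenize.generate_tokens(io.StringIO(rule).readline):
    # Prints pairs such as NAME 'single_input', OP ':', OP '|', NEWLINE '\n'
    print(tokenize.tok_name[tok.type], repr(tok.string))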
@@ -288,7 +309,7 @@ def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (self.tokens['NAME'], self.tokens['STRING'])):
               self.type in (self.tokens.NAME, self.tokens.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
@@ -299,7 +320,7 @@ def parse_item(self):
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(self.tokens['OP'], "]")
            self.expect(self.tokens.OP, "]")
            a.addarc(z)
            return a, z
        else:
@@ -319,9 +340,9 @@ def parse_atom(self):
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(self.tokens['OP'], ")")
            self.expect(self.tokens.OP, ")")
            return a, z
        elif self.type in (self.tokens['NAME'], self.tokens['STRING']):
        elif self.type in (self.tokens.NAME, self.tokens.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)