diff --git a/.gitignore b/.gitignore index 73d97fec..ce463d19 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ stats.prof # We have no interest in built Sphinx files /doc/_build + +venv diff --git a/html5lib/constants.py b/html5lib/constants.py index 5735d7b6..d1477b31 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3088,12 +3088,19 @@ "ParseError": 7, "JinjaStatementStartTag": 8, "JinjaStatementEndTag": 9, - "JinjaStatementTag": 10, + "JinjaStatement": 10, "JinjaVariableStartTag": 11, "JinjaVariableEndTag": 12, "JinjaVariable": 13, "JinjaFilter": 14, - "JinjaPipe": 15 + "JinjaPipe": 15, + "JinjaArgumentStartTag": 16, + "JinjaArgumentEndTag": 17, + "JinjaArgument": 18, + "JinjaExtendTag": 19, + "JinjaIncludeTag": 20, + "JinjaImportTag": 21, + "JinjaComment": 22 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 9d836e16..9768ab06 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -168,12 +168,19 @@ def mainLoop(self): ParseErrorToken = tokenTypes["ParseError"] JinjaStatementStartTag = tokenTypes["JinjaStatementStartTag"] JinjaStatementEndTag = tokenTypes["JinjaStatementEndTag"] - JinjaStatementTag = tokenTypes["JinjaStatementTag"] + JinjaStatement = tokenTypes["JinjaStatement"] JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] JinjaVariable = tokenTypes["JinjaVariable"] JinjaPipe = tokenTypes["JinjaPipe"] JinjaFilter = tokenTypes["JinjaFilter"] + JinjaArgumentStartTag = tokenTypes["JinjaArgumentStartTag"] + JinjaArgumentEndTag = tokenTypes["JinjaArgumentEndTag"] + JinjaArgument = tokenTypes["JinjaArgument"] + JinjaExtendTag = tokenTypes["JinjaExtendTag"] + JinjaIncludeTag = tokenTypes["JinjaIncludeTag"] + JinjaImportTag = tokenTypes["JinjaImportTag"] + JinjaComment = tokenTypes["JinjaComment"] for token in self.normalizedTokens(): new_token = token @@ -190,8 +197,11 @@ def mainLoop(self): new_token = None else: if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter, JinjaPipe): - log.debug(u"Type is a jinja tag") phase = self.phases["inJinjaVariable"] + elif type in (JinjaStatementStartTag, JinjaStatementEndTag, JinjaStatement): + phase = self.phases["inJinjaStatement"] + elif type in (JinjaArgumentStartTag, JinjaArgumentEndTag, JinjaArgument): + phase = self.phases["inJinjaArgument"] elif ( len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or @@ -224,8 +234,8 @@ def mainLoop(self): new_token = phase.processJinjaStatementStartTag(new_token) elif type == JinjaStatementEndTag: new_token = phase.processJinjaStatementEndTag(new_token) - elif type == JinjaStatementTag: - new_token = phase.processJinjaStatementTag(new_token) + elif type == JinjaStatement: + new_token = phase.processJinjaStatement(new_token) elif type == JinjaVariableStartTag: new_token = phase.processJinjaVariableStartTag(new_token) elif type == JinjaVariableEndTag: @@ -234,8 +244,22 @@ def mainLoop(self): new_token = phase.processJinjaVariable(new_token) elif type == JinjaPipe: new_token = phase.processJinjaPipe(new_token) + elif type == JinjaComment: + new_token = phase.processJinjaComment(new_token) elif type == JinjaFilter: new_token = phase.processJinjaFilter(new_token) + elif type == JinjaArgumentStartTag: + new_token = phase.processJinjaArgumentStartTag(new_token) + elif type == JinjaArgumentEndTag: + new_token = phase.processJinjaArgumentEndTag(new_token) + elif type == JinjaArgument: + new_token = phase.processJinjaArgument(new_token) + elif type == JinjaExtendTag: + new_token = phase.processJinjaExtendTag(new_token) + elif type == JinjaIncludeTag: + new_token = phase.processJinjaIncludeTag(new_token) + elif type == JinjaImportTag: + new_token = phase.processJinjaImportTag(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -432,7 +456,6 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break - #log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -450,7 +473,6 @@ def parseRCDataRawtext(self, token, contentType): self.originalPhase = self.phase - log.debug(u"Changing phase to text") self.phase = self.phases["text"] @@ -517,7 +539,7 @@ def processJinjaStatementStartTag(self, token): def processJinjaStatementEndTag(self, token): pass - def processJinjaStatementTag(self, token): + def processJinjaStatement(self, token): pass def processJinjaVariableStartTag(self, token): @@ -529,6 +551,31 @@ def processJinjaVariableEndTag(self, token): def processJinjaVariable(self, token): pass + def processJinjaExtendTag(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + def processJinjaIncludeTag(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + def processJinjaComment(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + def processJinjaImportTag(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + def processJinjaArgumentStartTag(self, token): + pass + + def processJinjaArgumentEndTag(self, token): + pass + + def processJinjaArgument(self, token): + pass + def processJinjaPipe(self, token): pass @@ -553,19 +600,13 @@ def processEndTag(self, token): class InJinjaVariablePhase(Phase): def processJinjaVariableStartTag(self, token): - log = logging.getLogger('html5lib') - log.debug(u"InJinja: Start Tag") self.tree.reconstructActiveFormattingElements() self.tree.insertElement(token) def processJinjaVariableEndTag(self, token): - log = logging.getLogger('html5lib') - log.debug(u"InJinja: End Tag {}".format(token["name"])) for node in self.tree.openElements[::-1]: - log.debug(u"InJinja: Open tag {} token {}".format(node, token)) if node.name == token["name"]: self.tree.generateImpliedEndTags(exclude=token["name"]) - log.debug(u"InJinja: Implied end tag {} {}".format(self.tree.openElements[-1].name, token["name"])) if self.tree.openElements[-1].name != token["name"]: self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while self.tree.openElements.pop() != node: @@ -573,7 +614,6 @@ def processJinjaVariableEndTag(self, token): break else: if node.nameTuple in specialElements: - log.debug(u"Nametuple {} in {}".format(node.nameTuple, specialElements)) self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) break @@ -589,6 +629,68 @@ def processJinjaFilter(self, token): element = self.tree.createElementWithoutNamespace(token) self.tree.openElements[-1].appendChild(element) + class InJinjaStatementPhase(Phase): + def processJinjaStatementStartTag(self, token): + if token['name'] == 'jinjaelse': + self.closeOpenIf(token) + elif token['name'] == 'jinjaelif': + self.closeOpenIf(token) + + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + + def closeOpenIf(self, token): + #import logging + #log = logging.getLogger(u"html5lib") + + for node in self.tree.openElements[::-1]: + #log.debug(u"Prev {} Cur {}".format(node.name, token['name'])) + + if node.name == token["name"] or (node.name in ["jinjaif", "jinjaelif"] and token["name"] in ["jinjaelse", "jinjaelif"]): + self.tree.generateImpliedEndTags(exclude=token["name"]) + + if self.tree.openElements[-1].name in ["jinjaif", "jinjaelif"] and token["name"] in ["jinjaelse", "jinjaelif"]: + pass + elif self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + while self.tree.openElements.pop() != node: + pass + + break + else: + if node.nameTuple in specialElements: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + break + + def processJinjaStatementEndTag(self, token): + import logging + log = logging.getLogger(u"html5lib") + + for node in self.tree.openElements[::-1]: + if node.name == token["name"] or (node.name in ["jinjaelse", "jinjaelif"] and token["name"] == "jinjaif"): + self.tree.generateImpliedEndTags(exclude=token["name"]) + + if self.tree.openElements[-1].name in ["jinjaelse", "jinjaelif"] and token["name"] == "jinjaif": + pass + elif self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + while self.tree.openElements.pop() != node: + pass + break + else: + log.debug(u"Node {}".format(node.name)) + self.tree.openElements.pop() + + def processJinjaStatement(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + class InJinjaArgumentPhase(Phase): + def processJinjaArgument(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].childNodes[-1].appendChild(element) + class InitialPhase(Phase): def processSpaceCharacters(self, token): pass @@ -882,8 +984,6 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s" % node.name - log = logging.getLogger(u"html5lib") - log.debug(u"Switching phase to afterHead") self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): @@ -894,8 +994,6 @@ def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): - log = logging.getLogger(u"html5lib") - log.debug(u"Implied end head tag") self.endTagHead(impliedTagToken("head")) # XXX If we implement a parser for which scripting is disabled we need to @@ -966,8 +1064,6 @@ def endTagOther(self, token): def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) - log = logging.getLogger(u"html5lib") - log.debug(u"Changing phase to body") self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True @@ -1080,6 +1176,9 @@ def processEOF(self): "tfoot", "th", "thead", "tr", "body", "html")) for node in self.tree.openElements[::-1]: + if node.name.startswith("jinja"): + continue + if node.name not in allowed_elements: self.parser.parseError("expected-closing-tag-but-got-eof") break @@ -2794,6 +2893,8 @@ def processEndTag(self, token): # XXX "inHeadNoscript": InHeadNoScriptPhase, "afterHead": AfterHeadPhase, "inJinjaVariable": InJinjaVariablePhase, + "inJinjaStatement": InJinjaStatementPhase, + "inJinjaArgument": InJinjaArgumentPhase, "inBody": InBodyPhase, "text": TextPhase, "inTable": InTablePhase, diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..6eb74fbd 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -4,6 +4,7 @@ import codecs import re +import logging from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import encodings, ReparseException @@ -43,6 +44,8 @@ class BufferedIOBase(object): # Cache for charsUntil() charsUntilRegEx = {} +log = logging.getLogger(u"html5lib") + class BufferedStream(object): """Buffering for streams that do not have buffering of their own diff --git a/html5lib/tests/test_jinja.py b/html5lib/tests/test_jinja.py new file mode 100644 index 00000000..9194faf8 --- /dev/null +++ b/html5lib/tests/test_jinja.py @@ -0,0 +1,549 @@ +import html5lib +import unittest +import logging + +log = logging.getLogger(__name__) + + +def dump(tree, tabs=0): + log.debug(u"{}Tag '{}' - {} children - Value = {} - Text = {}".format( + "".join(["\t" for i in range(tabs)]), tree.tag, len(tree), tree.attrib['value'] if 'value' in tree.attrib else None, tree.text)) + + for child in tree: + dump(child, tabs + 1) + + +class JinjaTestCase(unittest.TestCase): + def setUp(self): + self.parser = html5lib.HTMLParser(strict=True, namespaceHTMLElements=False, tree=html5lib.treebuilders.getTreeBuilder("etree", fullTree=True)) + + def assertTree(self, root, spec): + self.assertEqual(len(root), len(spec)) + + for child, spec_child in zip(root, spec): + self.assertEqual(child.tag, spec_child['tag']) + + if 'text' in spec_child: + self.assertEqual(child.text, spec_child['text']) + + if 'value' in spec_child: + self.assertEqual(child.attrib['value'], spec_child['value']) + + if 'children' in spec_child: + self.assertTree(child, spec_child['children']) + else: + self.assertEqual(len(child), 0) + + if 'attrs' in spec_child: + for k, v in spec_child['attrs'].iteritems(): + self.assertIn(k, child.attrib) + self.assertEqual(v, child.attrib[k]) + + def test_var_1(self): + html_string = """
Praesent dapibus, neque id cursus faucibus, tortor neque egestas augue, eu vulputate magna eros eu erat. Faucibus, tortor praesent neque id dapibus.
+Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Donec odio. Quisque volutpat tmattis eros. Nullam malesuada erat ut turpis. Suspendisse urna nibh, viverra non, semper suscipit, posuere a, pede. Donec nec justo eget felis facilisis fermentum. Aliquam porttitor mauris sit amet orci. Aenean dignissim pellentesque felis.Morbi in sem quis dui placerat ornare. Pellentesque odio nisi, euismod in, pharetra a, ultricies in, diam. Sed arcu. Cras consequat.
+Praesent dapibus, neque id cursus faucibus, tortor neque egestas augue, eu vulputate magna eros eu erat. Aliquam erat volutpat. Nam dui mi, tincidunt quis, accumsan porttitor, facilisis luctus, metus.
+