Many fixes to the treewalkers and added preliminary treewalkers tests… · awesome-python/html5lib-python@917094a · GitHub
[go: up one dir, main page]

Skip to content

Commit 917094a

Browse files
committed
Many fixes to the treewalkers and added preliminary treewalkers tests (for now using the parser tests and serializing the treewalkers result tokens to the appropriate parser-tests format; all tests pass !!!)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40619
1 parent 4017fa8 commit 917094a

File tree

9 files changed

+239
-35
lines changed

9 files changed

+239
-35
lines changed

src/treebuilders/etree.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -244,6 +244,6 @@ def getDocument(self):
244244
return self.document._element.find("html")
245245

246246
def getFragment(self):
247-
return ElementTree.ElementTree(_base.TreeBuilder.getFragment(self)._element)
247+
return _base.TreeBuilder.getFragment(self)._element
248248

249249
return locals()

src/treewalkers/_base.py

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
import gettext
2+
_ = gettext.gettext
3+
14
from constants import voidElements, spaceCharacters
25
spaceCharacters = u"".join(spaceCharacters)
36

@@ -9,7 +12,7 @@ def walkChildren(self, node):
912
raise NodeImplementedError
1013

1114
def error(self, msg):
12-
yield {"type": "SerializeError", "data": msg}
15+
return {"type": "SerializeError", "data": msg}
1316

1417
def normalizeAttrs(self, attrs):
1518
if not attrs:
@@ -18,14 +21,14 @@ def normalizeAttrs(self, attrs):
1821
attrs = attrs.items()
1922
return attrs
2023

21-
def element(self, name, attrs, hasChildren):
24+
def element(self, node, name, attrs, hasChildren):
2225
if name in voidElements:
2326
for token in self.emptyTag(name, attrs, hasChildren):
2427
yield token
2528
else:
2629
yield self.startTag(name, attrs)
2730
if hasChildren:
28-
for token in self.serializeChildren(node):
31+
for token in self.walkChildren(node):
2932
yield token
3033
yield self.endTag(name)
3134

@@ -47,14 +50,13 @@ def text(self, data):
4750
left = data[:len(data)-len(middle)]
4851
if left:
4952
yield {"type": "SpaceCharacters", "data": left}
53+
data = middle
54+
middle = data.rstrip(spaceCharacters)
55+
right = data[len(middle):]
5056
if middle:
51-
data = middle
52-
middle = data.rstrip(spaceCharacters)
53-
right = data[len(data)-len(middle):]
54-
if middle:
55-
yield {"type": "Characters", "data": middle}
56-
if right:
57-
yield {"type": "SpaceCharacters", "data": right}
57+
yield {"type": "Characters", "data": middle}
58+
if right:
59+
yield {"type": "SpaceCharacters", "data": right}
5860

5961
def comment(self, data):
6062
return {"type": "Comment", "data": data}

src/treewalkers/dom.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7,29 +7,29 @@
77

88
class TreeWalker(_base.TreeWalker):
99
def walk(self, node):
10-
if node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
11-
for token in self.walkChildren(node):
12-
yield token
13-
14-
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
10+
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
1511
yield self.doctype(node.nodeName)
1612

1713
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
1814
for token in self.text(node.nodeValue):
1915
yield token
2016

2117
elif node.nodeType == Node.ELEMENT_NODE:
22-
for token in self.element(node.nodeName, \
18+
for token in self.element(node, node.nodeName, \
2319
node.attributes.items(), node.childNodes):
2420
yield token
2521

2622
elif node.nodeType == Node.COMMENT_NODE:
2723
yield self.comment(node.nodeValue)
2824

25+
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
26+
for token in self.walkChildren(node):
27+
yield token
28+
2929
else:
3030
yield self.unknown(node.nodeType)
3131

3232
def walkChildren(self, node):
3333
for childNode in node.childNodes:
34-
for token in self.walk(node):
34+
for token in self.walk(childNode):
3535
yield token

src/treewalkers/etree.py

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import copy
66

77
import _base
8+
from constants import voidElements
89

910
moduleCache = {}
1011

@@ -24,28 +25,33 @@ def getETreeBuilder(ElementTreeImplementation):
2425

2526
class TreeWalker(_base.TreeWalker):
2627
def walk(self, node):
27-
if type(element) == type(ElementTree.ElementTree):
28-
element = element.getroot()
28+
if not(hasattr(node, "tag")):
29+
node = node.getroot()
2930

3031
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
3132
for token in self.walkChildren(node):
3233
yield token
3334

3435
elif node.tag == "<!DOCTYPE>":
3536
yield self.doctype(node.text)
36-
if node.tail:
37-
for token in self.text(node.tail):
38-
yield token
3937

4038
elif type(node.tag) == type(ElementTree.Comment):
4139
yield self.comment(node.text)
42-
if node.tail:
43-
for token in self.text(node.tail):
44-
yield token
4540

4641
else:
4742
#This is assumed to be an ordinary element
48-
for token in self.element(node):
43+
if node.tag in voidElements:
44+
for token in self.emptyTag(node.tag, \
45+
node.attrib.items(), len(node) or node.text):
46+
yield token
47+
else:
48+
yield self.startTag(node.tag, node.attrib.items())
49+
for token in self.walkChildren(node):
50+
yield token
51+
yield self.endTag(node.tag)
52+
53+
if node.tail:
54+
for token in self.text(node.tail):
4955
yield token
5056

5157
def walkChildren(self, node):
@@ -55,8 +61,5 @@ def walkChildren(self, node):
5561
for childNode in node.getchildren():
5662
for token in self.walk(childNode):
5763
yield token
58-
if node.tail:
59-
for token in self.text(node.tail):
60-
yield token
6164

6265
return locals()

src/treewalkers/simpletree.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ class TreeWalker(_base.TreeWalker):
77
def walk(self, node):
88
# testing node.type allows us not to import treebuilders.simpletree
99
if node.type in (1, 2): # Document or DocumentFragment
10-
for token in self.serializeChildren(node):
10+
for token in self.walkChildren(node):
1111
yield token
1212

1313
elif node.type == 3: # DocumentType
@@ -18,7 +18,7 @@ def walk(self, node):
1818
yield token
1919

2020
elif node.type == 5: # Element
21-
for token in self.element(node.name, \
21+
for token in self.element(node, node.name, \
2222
node.attributes.items(), node.childNodes):
2323
yield token
2424

src/treewalkers/soup.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,8 @@ def walk(self, node):
2222
yield token
2323

2424
elif isinstance(node, Tag): # Element
25-
for token in self.element(node):
25+
for token in self.element(node, node.name, \
26+
node.attrs.items(), node.contents):
2627
yield token
2728

2829
else:

tests/test_parser.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,6 @@
6060
except ImportError:
6161
pass
6262

63-
sys.stdout.write('Testing trees '+ " ".join(treeTypes.keys()) + "\n")
64-
6563
#Run the parse error checks
6664
checkParseErrors = False
6765

@@ -141,6 +139,7 @@ def runParserTest(self, innerHTML, input, expected, errors, treeClass):
141139
self.assertEquals(len(p.errors), len(errors), errorMsg2)
142140

143141
def test_parser():
142+
sys.stdout.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
144143
for name, cls in treeTypes.iteritems():
145144
for filename in glob.glob('tree-construction/*.dat'):
146145
f = open(filename)

tests/test_serializer.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,13 @@ def buildTestSuite():
9292
return unittest.TestLoader().loadTestsFromTestCase(TestCase)
9393

9494
def main():
95+
# the following is temporary while the unit tests for parse errors are
96+
# still in flux
97+
if '-p' in sys.argv: # suppress check for serialize errors
98+
sys.argv.remove('-p')
99+
global checkSerializeErrors
100+
checkSerializeErrors = False
101+
95102
buildTestSuite()
96103
unittest.main()
97104

0 commit comments

Comments
 (0)
0