Refactoring of treewalkers using a base class in treewalkers._base · awesome-python/html5lib-python@b2e045f · GitHub
[go: up one dir, main page]

Skip to content

Commit b2e045f

Browse files
committed
Refactoring of treewalkers using a base class in treewalkers._base
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40613
1 parent 94025dc commit b2e045f

File tree

8 files changed

+224
-135
lines changed

8 files changed

+224
-135
lines changed

src/serializer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
from constants import voidElements, booleanAttributes
2-
31
try:
42
frozenset
53
except NameError:
@@ -9,6 +7,9 @@
97
import gettext
108
_ = gettext.gettext
119

10+
from constants import voidElements, booleanAttributes, spaceCharacters
11+
spaceCharacters = u"".join(spaceCharacters)
12+
1213
def _slide(iterator):
1314
previous = None
1415
for token in iterator:
@@ -64,6 +65,8 @@ def serialize(self, treewalker):
6465
elif in_cdata:
6566
self.serializeError(_("Unexpected child element of a CDATA element"))
6667
attrs = token["data"]
68+
if hasattr(attrs, "items"):
69+
attrs = attrs.items()
6770
attrs.sort()
6871
attributes = []
6972
for k,v in attrs:

src/treewalkers/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313

1414
import dom
1515
import simpletree
16-
import etree
16+
import etree
17+
import pulldom
1718
try:
1819
import soup as beautifulsoup
1920
except:
@@ -28,7 +29,8 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
2829
2930
"simpletree" - a built-in DOM-ish tree type with support for some
3031
more pythonic idioms.
31-
"dom" - The xml.dom.minidom DOM implementation
32+
"dom" - The xml.dom.minidom DOM implementation
33+
"pulldom" - The xml.dom.pulldom event stream
3234
"etree" - A generic builder for tree implementations exposing an
3335
elementtree-like interface (known to work with
3436
ElementTree, cElementTree and lxml.etree).
@@ -39,7 +41,7 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
3941
lxml.etree."""
4042

4143
treeType = treeType.lower()
42-
if treeType in ("dom", "simpletree", "beautifulsoup"):
44+
if treeType in ("dom", "pulldom", "simpletree", "beautifulsoup"):
4345
return globals()[treeType].TreeWalker
4446
elif treeType == "etree":
4547
return etree.getETreeModule(implementation, **kwargs).TreeWalker

src/treewalkers/_base.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from gettext import gettext as _

from constants import voidElements, spaceCharacters
2+
spaceCharacters = u"".join(spaceCharacters)
3+
4+
class TreeWalker(object):
    """Base class for tree walkers that turn a tree into a token stream.

    Subclasses implement walk() and walkChildren() for one concrete tree
    type and use the helpers below to build the tokens.  Every token is a
    dict with a "type" key plus "name" and/or "data" keys.
    """

    def walk(self, node):
        """Yield the tokens for node; subclasses must override this."""
        raise NotImplementedError

    def walkChildren(self, node):
        """Yield the tokens for node's children; subclasses must override."""
        raise NotImplementedError

    def error(self, msg):
        """Return a SerializeError token carrying msg.

        This returns the token itself (not a generator) because every
        caller either yields or returns the result as a single token.
        """
        return {"type": "SerializeError", "data": msg}

    def normalizeAttrs(self, attrs):
        """Return attrs as a sequence of (name, value) pairs.

        A falsy value becomes an empty list, a mapping is replaced by its
        items(), and anything else is passed through unchanged.
        """
        if not attrs:
            attrs = []
        elif hasattr(attrs, 'items'):
            attrs = attrs.items()
        return attrs

    def element(self, name, attrs, hasChildren, node=None):
        """Yield the tokens for one element and, if any, its children.

        name        -- the element name
        attrs       -- the attributes, in any form normalizeAttrs accepts
        hasChildren -- truthy when the element has children; callers may
                       pass the sequence of child nodes itself
        node        -- optional: the element node; when given, its children
                       are produced by walkChildren(node)

        A void element produces an EmptyTag token (followed by an error
        token if it has children); any other element produces StartTag,
        the tokens of its children and EndTag.
        """
        if name in voidElements:
            for token in self.emptyTag(name, attrs, hasChildren):
                yield token
        else:
            yield self.startTag(name, attrs)
            if hasChildren:
                if node is not None:
                    children = self.walkChildren(node)
                else:
                    # No node to delegate to: treat hasChildren as the
                    # sequence of child nodes and walk each one.
                    children = self._walkEach(hasChildren)
                for token in children:
                    yield token
            yield self.endTag(name)

    def _walkEach(self, childNodes):
        """Yield the tokens of every node in childNodes, in order."""
        try:
            nodes = iter(childNodes)
        except TypeError:
            raise TypeError("element() needs the node argument when "
                            "hasChildren is not a sequence of child nodes")
        for childNode in nodes:
            for token in self.walk(childNode):
                yield token

    def emptyTag(self, name, attrs, hasChildren=False):
        """Yield an EmptyTag token, then an error token if hasChildren."""
        yield {"type": "EmptyTag", "name": name,
               "data": self.normalizeAttrs(attrs)}
        if hasChildren:
            yield self.error(_("Void element has children"))

    def startTag(self, name, attrs):
        """Return a StartTag token for name with normalized attrs."""
        return {"type": "StartTag", "name": name,
                "data": self.normalizeAttrs(attrs)}

    def endTag(self, name):
        """Return an EndTag token for name."""
        return {"type": "EndTag", "name": name, "data": []}

    def text(self, data):
        """Yield the tokens for a run of text.

        Leading and trailing space characters are split off into their own
        SpaceCharacters tokens; the remainder, if any, becomes a single
        Characters token.
        """
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        if middle:
            # middle starts with a non-space character, so stripping its
            # right end can never leave it empty.
            core = middle.rstrip(spaceCharacters)
            right = middle[len(core):]
            yield {"type": "Characters", "data": core}
            if right:
                yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Return a Comment token carrying data."""
        return {"type": "Comment", "data": data}

    def doctype(self, name):
        """Return a Doctype token; "data" is True when name is "html"
        (compared case-insensitively)."""
        return {"type": "Doctype", "name": name, "data": name.upper() == "HTML"}

    def unknown(self, nodeType):
        """Return an error token reporting an unsupported node type.

        nodeType is formatted with %s because walkers pass integer type
        codes as well as class names.
        """
        return self.error(_("Unknown node type: ") + ("%s" % (nodeType,)))
67+

src/treewalkers/dom.py

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,42 +3,33 @@
33
import gettext
44
_ = gettext.gettext
55

6-
from constants import voidElements, spaceCharacters
6+
import _base
77

8-
spaceCharacters = u''.join(spaceCharacters)
9-
10-
class TreeWalker(object):
11-
def serialize(self, node):
8+
class TreeWalker(_base.TreeWalker):
9+
def walk(self, node):
1210
if node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
13-
for childNode in node.childNodes:
14-
for token in self.serialize(childNode):
15-
yield token
11+
for token in self.walkChildren(node):
12+
yield token
1613

1714
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
18-
yield {"type": "Doctype", "name": node.nodeName, "data": False}
15+
yield self.doctype(node.nodeName)
1916

2017
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
21-
yield {"type": node.nodeValue.lstrip(spaceCharacters) and "Characters" or "SpaceCharacters",
22-
"data": node.nodeValue}
18+
for token in self.text(node.nodeValue):
19+
yield token
2320

2421
elif node.nodeType == Node.ELEMENT_NODE:
25-
if node.nodeName in voidElements:
26-
yield {"type": "EmptyTag", "name": node.nodeName,
27-
"data": node.attributes.items()}
28-
if node.childNodes:
29-
yield {"type": "SerializeError",
30-
"data": _("Void element has children")}
31-
else:
32-
yield {"type": "StartTag", "name": node.name,
33-
"data": node.attributes.items()}
34-
for childNode in node.childNodes:
35-
for token in self.serialize(childNode):
36-
yield token
37-
yield {"type": "EndTag", "name": node.nodeName, "data": []}
22+
for token in self.element(node.nodeName, \
23+
node.attributes.items(), node.childNodes):
24+
yield token
3825

3926
elif node.nodeType == Node.COMMENT_NODE:
40-
yield {"type": "Comment", "data": node.nodeValue}
27+
yield self.comment(node.nodeValue)
4128

4229
else:
43-
yield {"type": "SerializeError",
44-
"data": _("Unknown node type: " + node.nodeType)}
30+
yield self.unknown(node.nodeType)
31+
32+
def walkChildren(self, node):
    """Yield the tokens of every child of node, in document order."""
    for childNode in node.childNodes:
        # Walk the child, not node: walking node again from here would
        # repeat the parent and recurse without end.
        for token in self.walk(childNode):
            yield token

src/treewalkers/etree.py

Lines changed: 27 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
import gettext
22
_ = gettext.gettext
33

4-
from constants import voidElements, spaceCharacters
5-
6-
spaceCharacters = u''.join(spaceCharacters)
7-
84
import new
9-
import copy
5+
import copy
6+
7+
import _base
108

119
moduleCache = {}
1210

@@ -24,47 +22,41 @@ def getETreeModule(ElementTreeImplementation):
2422
def getETreeBuilder(ElementTreeImplementation):
2523
ElementTree = ElementTreeImplementation
2624

27-
def _charactersToken(characters):
28-
return {"type": characters.lstrip(spaceCharacters) and "Characters" or "SpaceCharacters",
29-
"data": characters}
30-
31-
class TreeWalker(object):
32-
def serialize(self, node):
25+
class TreeWalker(_base.TreeWalker):
26+
def walk(self, node):
3327
if type(element) == type(ElementTree.ElementTree):
3428
element = element.getroot()
3529

3630
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
37-
if node.text:
38-
yield self.charactersToken(node.text)
39-
for childNode in node.getchildren():
40-
for token in self.serialize(childNode):
41-
yield token
31+
for token in self.walkChildren(node):
32+
yield token
4233

4334
elif node.tag == "<!DOCTYPE>":
44-
yield {"type": "Doctype", "name": node.text, "data": False}
35+
yield self.doctype(node.text)
36+
if node.tail:
37+
for token in self.text(node.tail):
38+
yield token
4539

4640
elif type(node.tag) == type(ElementTree.Comment):
47-
yield {"type": "Comment", "data": node.text}
41+
yield self.comment(node.text)
42+
if node.tail:
43+
for token in self.text(node.tail):
44+
yield token
4845

4946
else:
5047
#This is assumed to be an ordinary element
51-
if node.name in voidElements:
52-
yield {"type": "EmptyTag", "name": node.tag,
53-
"data": node.attrib.items()}
54-
if node.childNodes or node.text:
55-
yield {"type": "SerializeError",
56-
"data": _("Void element has children")}
57-
else:
58-
yield {"type": "StartTag", "name": node.name,
59-
"data": node.attrib.items()}
60-
if node.text:
61-
yield self.charactersToken(node.text)
62-
for childNode in node.getchildren():
63-
for token in self.serialize(childNode):
64-
yield token
65-
yield {"type": "EndTag", "name": node.tag, "data": []}
66-
48+
for token in self.element(node):
49+
yield token
50+
51+
def walkChildren(self, node):
52+
if node.text:
53+
for token in self.text(node.text):
54+
yield token
55+
for childNode in node.getchildren():
56+
for token in self.walk(childNode):
57+
yield token
6758
if node.tail:
68-
yield self.charactersToken(node.tail)
59+
for token in self.text(node.tail):
60+
yield token
6961

7062
return locals()

src/treewalkers/simpletree.py

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,34 @@
11
import gettext
22
_ = gettext.gettext
33

4-
from constants import voidElements, spaceCharacters
4+
import _base
55

6-
spaceCharacters = u''.join(spaceCharacters)
7-
8-
class TreeWalker(object):
9-
def serialize(self, node):
6+
class TreeWalker(_base.TreeWalker):
7+
def walk(self, node):
    """Yield the serializer tokens for a simpletree node and its subtree.

    The node kind is read from the numeric node.type code; the tokens
    themselves are built by the helper methods inherited from
    _base.TreeWalker.
    """
    # testing node.type allows us not to import treebuilders.simpletree
    if node.type in (1, 2): # Document or DocumentFragment
        # walkChildren is the child-walking method this class defines.
        for token in self.walkChildren(node):
            yield token

    elif node.type == 3: # DocumentType
        yield self.doctype(node.name)

    elif node.type == 4: # TextNode
        for token in self.text(node.value):
            yield token

    elif node.type == 5: # Element
        # The child list is passed as the hasChildren argument.
        for token in self.element(node.name, \
                node.attributes.items(), node.childNodes):
            yield token

    elif node.type == 6: # CommentNode
        yield self.comment(node.data)

    else:
        yield self.unknown(node.type)
30+
31+
def walkChildren(self, node):
    """Yield, in order, the tokens produced by walking each of node's
    childNodes."""
    for child in node.childNodes:
        for childToken in self.walk(child):
            yield childToken

src/treewalkers/soup.py

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,42 +3,32 @@
33

44
from BeautifulSoup import BeautifulSoup, Declaration, Comment
55

6-
from constants import voidElements, spaceCharacters
6+
import _base
77

8-
spaceCharacters = u''.join(spaceCharacters)
9-
10-
class TreeWalker(object):
11-
def serialize(self, node):
8+
class TreeWalker(_base.TreeWalker):
9+
def walk(self, node):
    """Yield the serializer tokens for a BeautifulSoup node and its subtree.

    The node kind is decided by isinstance checks; the tokens themselves
    are built by the helper methods inherited from _base.TreeWalker.
    """
    # Tag is not among the names imported at the top of this module, so
    # bring it into scope here for the element check below.
    from BeautifulSoup import Tag

    if isinstance(node, BeautifulSoup): # Document or DocumentFragment
        # Walk the children of node itself.
        for token in self.walkChildren(node):
            yield token

    elif isinstance(node, Declaration): # DocumentType
        yield self.doctype(node.string)

    elif isinstance(node, Comment):
        yield self.comment(node.data)

    elif isinstance(node, unicode): # TextNode
        for token in self.text(node):
            yield token

    elif isinstance(node, Tag): # Element
        # element() takes (name, attrs, hasChildren); the child list is
        # passed as the hasChildren argument.
        for token in self.element(node.name, node.attrs, node.contents):
            yield token

    else:
        yield self.unknown(node.__class__.__name__)
30+
31+
def walkChildren(self, node):
    """Yield, in order, the tokens produced by walking each entry of
    node.contents."""
    for child in node.contents:
        for childToken in self.walk(child):
            yield childToken

0 commit comments

Comments
 (0)
0