8000 Added Genshi treewalker (converts Genshi's Stream events into html5li… · html5lib/html5lib-python@52a6acb · GitHub
[go: up one dir, main page]

Skip to content 8000

Commit 52a6acb

Browse files
committed
Added Genshi treewalker (converts Genshi's Stream events into html5lib tokens)
For now, unit tests are only run if there is at least one ElementTree implementation. The GenshiAdapter (adapted from genshi.input.ET) needs to be rewritten to walk an xml.dom.minidom tree (see the PullDOMAdapter) --HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40665
1 parent f1b39f7 commit 52a6acb

File tree

3 files changed

+122
-9
lines changed

3 files changed

+122
-9
lines changed

src/treewalkers/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
2727
elementtree-like interface (known to work with
2828
ElementTree, cElementTree and lxml.etree).
2929
"beautifulsoup" - Beautiful soup (if installed)
30+
"genshi" - a Genshi stream
3031
3132
implementation - (Currently applies to the "etree" tree type only). A module
3233
implementing the tree type e.g. xml.etree.ElementTree or
@@ -37,6 +38,9 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
3738
if treeType in ("dom", "pulldom", "simpletree"):
3839
mod = __import__(treeType, globals())
3940
treeWalkerCache[treeType] = mod.TreeWalker
41+
elif treeType == "genshi":
42+
import genshistream
43+
treeWalkerCache[treeType] = genshistream.TreeWalker
4044
elif treeType == "beautifulsoup":
4145
import soup
4246
treeWalkerCache[treeType] = soup.TreeWalker

src/treewalkers/genshistream.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
2+
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
3+
from genshi.output import NamespaceFlattener
4+
5+
import _base
6+
7+
from constants import voidElements
8+
9+
class TreeWalker(_base.TreeWalker):
    """Tree walker that converts a Genshi event stream into html5lib tokens.

    ``self.tree`` is run through ``NamespaceFlattener`` and then consumed as
    a sequence of ``(kind, data, pos)`` events.
    """

    def __iter__(self):
        """Yield the tokens produced for each event of the stream, in order.

        Events are buffered by one so that ``tokens`` always receives the
        event that follows the one being converted (``None`` for the last
        event).  An empty stream yields nothing.

        Raises:
            ValueError: if the stream ends while ``ignore_until`` is still
                set to a level deeper than the current depth.
        """
        depth = 0
        ignore_until = None
        previous = None
        for event in NamespaceFlattener()(self.tree):
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                # Test for None explicitly: ordering None against an int only
                # happens to work on Python 2 and raises TypeError on
                # Python 3.
                if ignore_until is not None and ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event

        if previous is None:
            # Empty stream: there is no buffered event left to flush, and
            # handing None to tokens() would fail while unpacking the event.
            return

        # Flush the last buffered event; it has no successor.
        if ignore_until is None or ignore_until <= depth:
            for token in self.tokens(previous, None):
                yield token
        else:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Yield the html5lib token(s) corresponding to one Genshi event.

        Args:
            event: the ``(kind, data, pos)`` event to convert.
            next: the event that follows ``event`` in the stream, or ``None``
                when ``event`` is the last one.  It is only inspected for
                void elements, to tell whether the START is immediately
                followed by its matching END.
        """
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            if tag in voidElements:
                # The flag is true unless the very next event closes this tag.
                has_children = (not next or next[0] != END
                                or next[1] != tag)
                for token in self.emptyTag(tag, list(attrib), has_children):
                    yield token
            else:
                yield self.startTag(tag, list(attrib))

        elif kind == END:
            # Void elements were already emitted as a single empty-tag token.
            if data not in voidElements:
                yield self.endTag(data)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(data)

        elif kind in (XML_DECL, START_NS, END_NS,
                      START_CDATA, END_CDATA, PI):
            # These event kinds produce no token.
            pass

        else:
            yield self.unknown(kind)

tests/test_treewalkers.py

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -81,24 +81,24 @@ def PullDOMAdapter(node):
8181
pass
8282

8383
try:
84-
import xml.etree.cElementTree as cElementTree
84+
import xml.etree.cElementTree as ElementTree
8585
treeTypes['cElementTree'] = \
86-
{"builder": treebuilders.getTreeBuilder("etree", cElementTree),
87-
"walker": treewalkers.getTreeWalker("etree", cElementTree)}
86+
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
87+
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
8888
except ImportError:
8989
try:
90-
import cElementTree
90+
import cElementTree as ElementTree
9191
treeTypes['cElementTree'] = \
92-
{"builder": treebuilders.getTreeBuilder("etree", cElementTree),
93-
"walker": treewalkers.getTreeWalker("etree", cElementTree)}
92+
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
93+
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
9494
except ImportError:
9595
pass
9696

9797
try:
98-
import lxml.etree as lxml
98+
import lxml.etree as ElementTree
9999
treeTypes['lxml'] = \
100-
{"builder": treebuilders.getTreeBuilder("etree", lxml),
101-
"walker": treewalkers.getTreeWalker("etree", lxml)}
100+
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
101+
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
102102
except ImportError:
103103
pass
104104

@@ -110,6 +110,51 @@ def PullDOMAdapter(node):
110110
except ImportError:
111111
pass
112112

113+
# The Genshi walker is only registered when an ElementTree implementation was
# imported above (under the name ``ElementTree``) and Genshi is installed.
if "ElementTree" in locals():
    try:
        from genshi.core import Attrs, QName
        from genshi.core import START, END, DOCTYPE, TEXT, COMMENT

        def GenshiAdapter(tree):
            """Yield Genshi ``(kind, data, pos)`` events for an etree tree.

            ``tree`` may be an element or any object exposing ``getroot()``.
            Every event carries the placeholder position ``(None, -1, -1)``.
            """
            if not hasattr(tree, "tag"):
                tree = tree.getroot()

            if tree.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                # Synthetic container nodes: emit only their content.
                if tree.text:
                    yield TEXT, tree.text, (None, -1, -1)
                # Iterate the element directly; Element.getchildren() is
                # deprecated and absent from recent ElementTree versions.
                for child in tree:
                    for item in GenshiAdapter(child):
                        yield item

            elif tree.tag == "<!DOCTYPE>":
                yield DOCTYPE, QName(tree.text), (None, -1, -1)

            elif type(tree.tag) == type(ElementTree.Comment):
                # The tag is a factory callable rather than a string.
                yield COMMENT, tree.text, (None, -1, -1)

            else:
                tag_name = QName(tree.tag.lstrip('{'))
                attrs = Attrs([(QName(attr.lstrip('{')), value)
                               for attr, value in tree.items()])

                yield START, (tag_name, attrs), (None, -1, -1)
                if tree.text:
                    yield TEXT, tree.text, (None, -1, -1)
                for child in tree:
                    for item in GenshiAdapter(child):
                        yield item
                yield END, tag_name, (None, -1, -1)

            # Text that follows this node belongs to the parent's content.
            if tree.tail:
                yield TEXT, tree.tail, (None, -1, -1)

        treeTypes["genshi"] = \
            {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
             "adapter": GenshiAdapter,
             "walker": treewalkers.getTreeWalker("genshi")}
    except ImportError:
        # Genshi is optional; without it the "genshi" tree type is skipped.
        pass
157+
113158
def concatenateCharacterTokens(tokens):
114159
charactersToken = None
115160
for token in tokens:

0 commit comments

Comments
 (0)
0