Add beautiful soup support · awesome-python/html5lib-python@852a776 · GitHub
[go: up one dir, main page]

Skip to content

Commit 852a776

Browse files
committed
Add beautiful soup support
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40607
1 parent 03459af commit 852a776

File tree

3 files changed

+174
-4
lines changed

3 files changed

+174
-4
lines changed

src/treebuilders/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,27 +36,32 @@
3636
import dom
3737
import simpletree
3838
import etree
39+
try:
40+
import soup as beautifulsoup
41+
except:
42+
pass
3943

4044

4145
def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree" and "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - The xml.dom.minidom DOM implementation
               "etree" - A generic builder for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "beautifulsoup" - Beautiful soup (if installed)

    implementation - (Currently applies to the "etree" tree type only). A module
                     implementing the tree type e.g. xml.etree.ElementTree or
                     lxml.etree.

    Any extra keyword arguments are passed to etree.getETreeModule when the
    "etree" tree type is requested.

    Raises ValueError if treeType is not one of the supported names."""

    treeType = treeType.lower()
    if treeType in ("dom", "simpletree", "beautifulsoup"):
        # These builder modules are bound at module level under exactly these
        # names, so the module is looked up by name.  "beautifulsoup" is only
        # bound when the optional import succeeded; otherwise this lookup
        # raises KeyError.
        return globals()[treeType].TreeBuilder
    elif treeType == "etree":
        return etree.getETreeModule(implementation, **kwargs).TreeBuilder
    # Fail loudly rather than falling off the end and returning None, which
    # only surfaces later as an obscure "NoneType is not callable" error.
    raise ValueError('Unrecognised treebuilder "%s"' % treeType)

src/treebuilders/etree.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import _base
22
import new
3+
import copy
34

45
moduleCache = {}
56

@@ -99,7 +100,8 @@ def insertText(self, data, insertBefore=None):
99100

100101
def cloneNode(self):
    """Return a new Element with the same name and a copy of the attributes.

    Attributes are copied one by one into the new element rather than
    assigning this node's attribute mapping to it directly.
    """
    clone = Element(self.name)
    for attrName, attrValue in self.attributes.iteritems():
        clone.attributes[attrName] = attrValue
    return clone
104106

105107
def reparentChildren(self, newParent):

src/treebuilders/soup.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
2+
import sys
3+
import copy
4+
5+
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
6+
7+
import _base
8+
9+
class AttrList(object):
    """Dict-like view over the attributes of a BeautifulSoup element.

    Reads are served from a dict built from ``element.attrs`` when the view
    is created; writes are applied to the element and mirrored into that
    dict so later reads on the same view see them.
    """

    def __init__(self, element):
        self.element = element
        # element.attrs is converted with dict(), so it must be a sequence
        # of (name, value) pairs or a mapping.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iterating yields (name, value) pairs, not bare names.
        return iter(self.attrs.items())

    def __setitem__(self, name, value):
        self.element[name] = value
        # Keep the local snapshot in step with the element; without this a
        # lookup on this same view would miss the value just written.
        self.attrs[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
26+
27+
28+
class Element(_base.Node):
    """Tree-builder node wrapping a BeautifulSoup element.

    element - the wrapped BeautifulSoup object (its ``name`` is passed to
              _base.Node as the node name)
    soup - the BeautifulSoup document used when creating new tags
    """

    def __init__(self, element, soup):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup

    def appendChild(self, node):
        """Append node as the last child, merging adjacent plain text.

        The exact-class comparison is deliberate: only objects whose class
        is NavigableString itself are merged, not instances of subclasses.
        """
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Replace the trailing text node with one holding both strings.
            newNode = TextNode(NavigableString(
                self.element.contents[-1] + node.element), self.soup)
            self.element.contents[-1].extract()
            self.appendChild(newNode)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        """Return a fresh AttrList view of the wrapped element."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Set every (name, value) from attributes on the wrapped element.

        A false value (None or an empty mapping) is ignored.
        """
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert data as text before insertBefore, or append it if not given."""
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert node immediately before refNode, merging adjacent plain text.

        If refNode's element is not among this element's contents, the
        exception raised by ``contents.index`` propagates.
        """
        #XXX What exception should we throw here?
        index = self.element.contents.index(refNode.element)
        # Only look at the preceding sibling when there is one: with
        # index == 0, contents[index-1] would wrap around to the *last*
        # child and merge the new text with the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            newNode = TextNode(NavigableString(
                self.element.contents[index-1] + node.element), self.soup)
            self.element.contents[index-1].extract()
            # Recurse so the index is recomputed after the extraction.
            self.insertBefore(newNode, refNode)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach node's element from the tree and clear its parent."""
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        """Move every child of this element, in order, onto newParent."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            # Tags are wrapped as Element; everything else as TextNode.
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Return a new Element with the same tag name and attributes.

        Children are not copied.
        """
        node = Element(Tag(self.soup, self.element.name), self.soup)
        # Iterating the AttrList yields (name, value) pairs.
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the element's contents, which is truthy if there are any."""
        return self.element.contents
96+
97+
class TextNode(Element):
    """Node wrapping a non-tag BeautifulSoup object; it has no node name."""

    def __init__(self, element, soup):
        # Element.__init__ is bypassed on purpose: it reads element.name,
        # whereas a text node is registered with a name of None.
        _base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError
105+
106+
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces a BeautifulSoup tree."""

    def documentClass(self):
        """Start a fresh, empty soup and return it wrapped as the document."""
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup)

    def insertDoctype(self, name):
        """Insert a Declaration for name at the very start of the soup."""
        doctype = Declaration(name)
        self.soup.insert(0, doctype)

    def elementClass(self, name):
        """Return an Element wrapping a new Tag called name."""
        tag = Tag(self.soup, name)
        return Element(tag, self.soup)

    def commentClass(self, data):
        """Return a TextNode wrapping a Comment holding data."""
        comment = Comment(data)
        return TextNode(comment, self.soup)

    def fragmentClass(self):
        """Start a fresh soup marked as a fragment and return it wrapped."""
        self.soup = BeautifulSoup("")
        # testSerializer recognises a fragment root by this name.
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup)

    def appendChild(self, node):
        """Append node's element at the end of the soup's top-level contents."""
        position = len(self.soup.contents)
        self.soup.insert(position, node.element)

    def testSerializer(self, element):
        """Serialize element using the module-level testSerializer function."""
        return testSerializer(element)

    def getDocument(self):
        """Return the underlying soup object."""
        return self.soup

    def getFragment(self):
        """Return the element wrapped by the base class's fragment node."""
        fragment = _base.TreeBuilder.getFragment(self)
        return fragment.element
136+
137+
def testSerializer(element):
    """Serialize a BeautifulSoup tree into the one-line-per-node test format.

    Returns the lines joined with newlines.  Each nesting level adds two
    spaces of indentation after the leading "|"; the document or fragment
    root is written without the "|" prefix.
    """
    output = []

    def walk(node, depth=0):
        pad = ' ' * depth
        if isinstance(node, Declaration):
            output.append("|%s<!DOCTYPE %s>" % (pad, node.string))
        elif isinstance(node, BeautifulSoup):
            if node.name == "[document_fragment]":
                output.append("#document-fragment")
            else:
                output.append("#document")
        elif isinstance(node, Comment):
            output.append("|%s<!-- %s -->" % (pad, node.string))
        elif isinstance(node, unicode):
            output.append("|%s\"%s\"" % (pad, node))
        else:
            output.append("|%s<%s>" % (pad, node.name))
            if node.attrs:
                # Attributes are listed one level deeper than their tag.
                attrPad = ' ' * (depth + 2)
                for attrName, attrValue in node.attrs:
                    output.append('|%s%s="%s"' % (attrPad, attrName, attrValue))
        # Only nodes with a contents attribute have children to descend into.
        if hasattr(node, "contents"):
            for child in node.contents:
                walk(child, depth + 2)

    walk(element, 0)

    return "\n".join(output)

0 commit comments

Comments
 (0)
0