Remove `simpletree`, changing the default tree builder to `etree`. · lroggendorff/html5lib-python@96da7f5 · GitHub
[go: up one dir, main page]

Skip to content

Commit 96da7f5

Browse files
ambv authored and gsnedders committed
Remove simpletree, changing the default tree builder to etree.
1 parent b0dda81 commit 96da7f5

15 files changed

+95
-413
lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Change Log
66

77
Released on XXX, 2013
88

9+
* Removed ``simpletree`` from the package. The default tree builder is
10+
now ``etree`` (using the ``xml.etree.ElementTree/cElementTree``
11+
implementation).
12+
913

1014
0.95
1115
~~~~

html5lib/html5parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from . import treebuilders
1010
from .treebuilders._base import Marker
11-
from .treebuilders import simpletree
1211

1312
from . import utils
1413
from . import constants
@@ -20,15 +19,15 @@
2019
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
2120

2221

23-
def parse(doc, treebuilder="simpletree", encoding=None,
22+
def parse(doc, treebuilder="etree", encoding=None,
2423
namespaceHTMLElements=True):
2524
"""Parse a string or file-like object into a tree"""
2625
tb = treebuilders.getTreeBuilder(treebuilder)
2726
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
2827
return p.parse(doc, encoding=encoding)
2928

3029

31-
def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
30+
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
3231
namespaceHTMLElements=True):
3332
tb = treebuilders.getTreeBuilder(treebuilder)
3433
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
@@ -51,9 +50,8 @@ class HTMLParser(object):
5150
"""HTML parser. Generates a tree structure from a stream of (possibly
5251
malformed) HTML"""
5352

54-
def __init__(self, tree=simpletree.TreeBuilder,
55-
tokenizer=tokenizer.HTMLTokenizer, strict=False,
56-
namespaceHTMLElements=True, debug=False):
53+
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
54+
strict=False, namespaceHTMLElements=True, debug=False):
5755
"""
5856
strict - raise an exception when a parse error is encountered
5957
@@ -69,6 +67,8 @@ def __init__(self, tree=simpletree.TreeBuilder,
6967
# Raise an exception on the first error encountered
7068
self.strict = strict
7169

70+
if tree is None:
71+
tree = treebuilders.getTreeBuilder("etree")
7272
self.tree = tree(namespaceHTMLElements)
7373
self.tokenizer_class = tokenizer
7474
self.errors = []

html5lib/serializer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .htmlserializer import HTMLSerializer
66

77

8-
def serialize(input, tree="simpletree", format="html", encoding=None,
8+
def serialize(input, tree="etree", format="html", encoding=None,
99
**serializer_opts):
1010
# XXX: Should we cache this?
1111
walker = treewalkers.getTreeWalker(tree)

html5lib/tests/support.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
del base_path
1717

1818
# Build a dict of avaliable trees
19-
treeTypes = {"simpletree": treebuilders.getTreeBuilder("simpletree"),
20-
"DOM": treebuilders.getTreeBuilder("dom")}
19+
treeTypes = {"DOM": treebuilders.getTreeBuilder("dom")}
2120

2221
# Try whatever etree implementations are avaliable from a list that are
2322
#"supposed" to work
@@ -64,7 +63,7 @@ def __getitem__(self, key):
6463

6564
class TestData(object):
6665
def __init__(self, filename, newTestHeading="data", encoding="utf8"):
67-
if encoding == None:
66+
if encoding is None:
6867
self.f = open(filename, mode="rb")
6968
else:
7069
self.f = codecs.open(filename, encoding=encoding)

html5lib/tests/test_parser2.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from . import support # flake8: noqa
66
from html5lib import html5parser
77
from html5lib.constants import namespaces
8-
from html5lib.treebuilders import dom
8+
from html5lib import treebuilders
99

1010
import unittest
1111

@@ -14,29 +14,42 @@
1414

1515
class MoreParserTests(unittest.TestCase):
1616

17+
def setUp(self):
18+
self.dom_tree = treebuilders.getTreeBuilder("dom")
19+
1720
def test_assertDoctypeCloneable(self):
18-
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
21+
parser = html5parser.HTMLParser(tree=self.dom_tree)
1922
doc = parser.parse('<!DOCTYPE HTML>')
2023
self.assertTrue(doc.cloneNode(True))
2124

2225
def test_line_counter(self):
2326
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
24-
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
27+
parser = html5parser.HTMLParser(tree=self.dom_tree)
2528
parser.parse("<pre>\nx\n&gt;\n</pre>")
2629

27-
def test_namespace_html_elements_0(self):
30+
def test_namespace_html_elements_0_dom(self):
31+
parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=True)
32+
doc = parser.parse("<html></html>")
33+
self.assertTrue(doc.childNodes[0].namespaceURI == namespaces["html"])
34+
35+
def test_namespace_html_elements_1_dom(self):
36+
parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=False)
37+
doc = parser.parse("<html></html>")
38+
self.assertTrue(doc.childNodes[0].namespaceURI is None)
39+
40+
def test_namespace_html_elements_0_etree(self):
2841
parser = html5parser.HTMLParser(namespaceHTMLElements=True)
2942
doc = parser.parse("<html></html>")
30-
self.assertTrue(doc.childNodes[0].namespace == namespaces["html"])
43+
self.assertTrue(list(doc)[0].tag == "{%s}html" % (namespaces["html"],))
3144

32-
def test_namespace_html_elements_1(self):
45+
def test_namespace_html_elements_1_etree(self):
3346
parser = html5parser.HTMLParser(namespaceHTMLElements=False)
3447
doc = parser.parse("<html></html>")
35-
self.assertTrue(doc.childNodes[0].namespace == None)
48+
self.assertTrue(list(doc)[0].tag == "html")
3649

3750
def test_unicode_file(self):
3851
parser = html5parser.HTMLParser()
39-
doc = parser.parse(io.StringIO("a"))
52+
parser.parse(io.StringIO("a"))
4053

4154

4255
def buildTestSuite():

html5lib/tests/test_sanitizer.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,43 @@
55
except ImportError:
66
import simplejson as json
77

8-
from html5lib import html5parser, sanitizer, constants
8+
from html5lib import html5parser, sanitizer, constants, treebuilders
99

1010

11-
def runSanitizerTest(name, expected, input):
12-
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
13-
parseFragment(expected).childNodes])
11+
def toxmlFactory():
12+
tree = treebuilders.getTreeBuilder("etree")
13+
14+
def toxml(element):
15+
# encode/decode roundtrip required for Python 2.6 compatibility
16+
result_bytes = tree.implementation.tostring(element, encoding="utf-8")
17+
return result_bytes.decode("utf-8")
18+
19+
return toxml
20+
21+
22+
def runSanitizerTest(name, expected, input, toxml=None):
23+
if toxml is None:
24+
toxml = toxmlFactory()
25+
expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
26+
parseFragment(expected)])
1427
expected = json.loads(json.dumps(expected))
1528
assert expected == sanitize_html(input)
1629

1730

18-
def sanitize_html(stream):
19-
return ''.join([token.toxml() for token in
31+
def sanitize_html(stream, toxml=None):
32+
if toxml is None:
33+
toxml = toxmlFactory()
34+
return ''.join([toxml(token) for token in
2035
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
21-
parseFragment(stream).childNodes])
36+
parseFragment(stream)])
2237

2338

2439
def test_should_handle_astral_plane_characters():
25-
assert "<p>\U0001d4b5 \U0001d538</p>" == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
40+
assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
2641

2742

2843
def test_sanitizer():
44+
toxml = toxmlFactory()
2945
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
3046
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
3147
continue # TODO
@@ -34,25 +50,30 @@ def test_sanitizer():
3450
if tag_name == 'image':
3551
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
3652
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
37-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
53+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
54+
toxml)
3855
elif tag_name == 'br':
3956
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
4057
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
41-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
58+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
59+
toxml)
4260
elif tag_name in constants.voidElements:
4361
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
4462
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
45-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
63+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
64+
toxml)
4665
else:
4766
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
4867
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
49-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
68+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
69+
toxml)
5070

5171
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
5272
tag_name = tag_name.upper()
5373
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
5474
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
55-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
75+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
76+
toxml)
5677

5778
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
5879
if attribute_name != attribute_name.lower():
@@ -61,20 +82,24 @@ def test_sanitizer():
6182
continue
6283
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
6384
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
64-
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
85+
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
86+
toxml)
6587

6688
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
6789
attribute_name = attribute_name.upper()
6890
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
6991
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
70-
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
92+
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
93+
toxml)
7194

7295
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
7396
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
7497
"<a href=\"%s\">foo</a>" % protocol,
75-
"""<a href="%s">foo</a>""" % protocol)
98+
"""<a href="%s">foo</a>""" % protocol,
99+
toxml)
76100

77101
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
78102
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
79103
"<a href=\"%s\">foo</a>" % protocol,
80-
"""<a href="%s">foo</a>""" % protocol)
104+
"""<a href="%s">foo</a>""" % protocol,
105+
toxml)

html5lib/tests/test_treewalkers.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,11 @@ def PullDOMAdapter(node):
4545
raise NotImplementedError("Node type not supported: " + str(node.nodeType))
4646

4747
treeTypes = {
48-
"simpletree": {"builder": treebuilders.getTreeBuilder("simpletree"),
49-
"walker": treewalkers.getTreeWalker("simpletree")},
50-
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
51-
"walker": treewalkers.getTreeWalker("dom")},
52-
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
53-
"adapter": PullDOMAdapter,
54-
"walker": treewalkers.getTreeWalker("pulldom")},
48+
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
49+
"walker": treewalkers.getTreeWalker("dom")},
50+
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
51+
"adapter": PullDOMAdapter,
52+
"walker": treewalkers.getTreeWalker("pulldom")},
5553
}
5654

5755
# Try whatever etree implementations are available from a list that are
@@ -103,7 +101,7 @@ def PullDOMAdapter(node):
103101
else:
104102
def GenshiAdapter(tree):
105103
text = None
106-
for token in treewalkers.getTreeWalker("simpletree")(tree):
104+
for token in treewalkers.getTreeWalker("dom")(tree):
107105
type = token["type"]
108106
if type in ("Characters", "SpaceCharacters"):
109107
if text is None:
@@ -147,7 +145,7 @@ def GenshiAdapter(tree):
147145
yield TEXT, text, (None, -1, -1)
148146

149147
treeTypes["genshi"] = \
150-
{"builder": treebuilders.getTreeBuilder("simpletree"),
148+
{"builder": treebuilders.getTreeBuilder("dom"),
151149
"adapter": GenshiAdapter,
152150
"walker": treewalkers.getTreeWalker("genshi")}
153151

html5lib/treebuilders/__init__.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
1) A set of classes for various types of elements: Document, Doctype,
88
Comment, Element. These must implement the interface of
99
_base.treebuilders.Node (although comment nodes have a different
10-
signature for their constructor, see treebuilders.simpletree.Comment)
10+
signature for their constructor, see treebuilders.etree.Comment)
1111
Textual content may also be implemented as another node type, or not, as
1212
your tree implementation requires.
1313
@@ -24,10 +24,6 @@
2424
testSerializer method on your treebuilder which accepts a node and
2525
returns a string containing Node and its children serialized according
2626
to the format used in the unittests
27-
28-
The supplied simpletree module provides a python-only implementation
29-
of a full treebuilder and is a useful reference for the semantics of
30-
the various methods.
3127
"""
3228

3329
from __future__ import absolute_import, division, unicode_literals
@@ -39,10 +35,8 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
3935
"""Get a TreeBuilder class for various types of tree with built-in support
4036
4137
treeType - the name of the tree type required (case-insensitive). Supported
42-
values are "simpletree", "dom", and "etree"
38+
values are:
4339
44-
"simpletree" - a built-in DOM-ish tree type with support for some
45-
more pythonic idioms.
4640
"dom" - A generic builder for DOM implementations, defaulting to
4741
a xml.dom.minidom based implementation for the sake of
4842
backwards compatibility (as releases up until 0.10 had a
@@ -65,9 +59,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
6559
implementation = minidom
6660
# XXX: NEVER cache here, caching is done in the dom submodule
6761
return dom.getDomModule(implementation, **kwargs).TreeBuilder
68-
elif treeType == "simpletree":
69-
from . import simpletree
70-
treeBuilderCache[treeType] = simpletree.TreeBuilder
7162
elif treeType == "lxml":
7263
from . import etree_lxml
7364
treeBuilderCache[treeType] = etree_lxml.TreeBuilder

html5lib/treebuilders/dom.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ def insertText(self, data, parent=None):
163163
self.dom._child_node_types.append(Node.TEXT_NODE)
164164
self.dom.appendChild(self.dom.createTextNode(data))
165165

166+
implementation = DomImplementation
166167
name = None
167168

168169
def testSerializer(element):

html5lib/treebuilders/etree.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ class TreeBuilder(_base.TreeBuilder):
313313
elementClass = Element
314314
commentClass = Comment
315315
fragmentClass = DocumentFragment
316+
implementation = ElementTreeImplementation
316317

317318
def testSerializer(self, element):
318319
return testSerializer(element)

html5lib/treebuilders/etree_lxml.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def serializeElement(element, indent=0):
122122
if element.text:
123123
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
124124
indent += 2
125-
for child in element.getchildren():
125+
for child in element:
126126
serializeElement(child, indent)
127127
if hasattr(element, "tail") and element.tail:
128128
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
@@ -163,7 +163,7 @@ def serializeElement(element):
163163
if element.text:
164164
rv.append(element.text)
165165

166-
for child in element.getchildren():
166+
for child in element:
167167
serializeElement(child)
168168

169169
rv.append("</%s>" % (element.tag,))
@@ -185,6 +185,7 @@ class TreeBuilder(_base.TreeBuilder):
185185
elementClass = None
186186
commentClass = None
187187
fragmentClass = Document
188+
implementation = etree
188189

189190
def __init__(self, namespaceHTMLElements, fullTree=False):
190191
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
@@ -280,7 +281,7 @@ def getFragment(self):
280281
element = self.openElements[0]._element
281282
if element.text:
282283
fragment.append(element.text)
283-
fragment.extend(element.getchildren())
284+
fragment.extend(list(element))
284285
if element.tail:
285286
fragment.append(element.tail)
286287
return fragment

0 commit comments

Comments
 (0)
0