Remove `simpletree`, changing the default tree builder to `etree`. · html5lib/html5lib-python@96da7f5 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 96da7f5

Browse files
ambv authored and gsnedders committed
Remove simpletree, changing the default tree builder to etree.
1 parent b0dda81 commit 96da7f5

File tree

15 files changed

+95
-413
lines changed

15 files changed

+95
-413
lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Change Log
66

77
Released on XXX, 2013
88

9+
* Removed ``simpletree`` from the package. The default tree builder is
10+
now ``etree`` (using the ``xml.etree.ElementTree/cElementTree``
11+
implementation).
12+
913

1014
0.95
1115
~~~~

html5lib/html5parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from . import treebuilders
1010
from .treebuilders._base import Marker
11-
from .treebuilders import simpletree
1211

1312
from . import utils
1413
from . import constants
@@ -20,15 +19,15 @@
2019
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
2120

2221

23-
def parse(doc, treebuilder="simpletree", encoding=None,
22+
def parse(doc, treebuilder="etree", encoding=None,
2423
namespaceHTMLElements=True):
2524
"""Parse a string or file-like object into a tree"""
2625
tb = treebuilders.getTreeBuilder(treebuilder)
2726
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
2827
return p.parse(doc, encoding=encoding)
2928

3029

31-
def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
30+
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
3231
namespaceHTMLElements=True):
3332
tb = treebuilders.getTreeBuilder(treebuilder)
3433
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
@@ -51,9 +50,8 @@ class HTMLParser(object):
5150
"""HTML parser. Generates a tree structure from a stream of (possibly
5251
malformed) HTML"""
5352

54-
def __init__(self, tree=simpletree.TreeBuilder,
55-
tokenizer=tokenizer.HTMLTokenizer, strict=False,
56-
namespaceHTMLElements=True, debug=False):
53+
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
54+
strict=False, namespaceHTMLElements=True, debug=False):
5755
"""
5856
strict - raise an exception when a parse error is encountered
5957
@@ -69,6 +67,8 @@ def __init__(self, tree=simpletree.TreeBuilder,
6967
# Raise an exception on the first error encountered
7068
self.strict = strict
7169

70+
if tree is None:
71+
tree = treebuilders.getTreeBuilder("etree")
7272
self.tree = tree(namespaceHTMLElements)
7373
self.tokenizer_class = tokenizer
7474
self.errors = []

html5lib/serializer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .htmlserializer import HTMLSerializer
66

77

8-
def serialize(input, tree="simpletree", format="html", encoding=None,
8+
def serialize(input, tree="etree", format="html", encoding=None,
99
**serializer_opts):
1010
# XXX: Should we cache this?
1111
walker = treewalkers.getTreeWalker(tree)

html5lib/tests/support.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
del base_path
1717

1818
# Build a dict of avaliable trees
19-
treeTypes = {"simpletree": treebuilders.getTreeBuilder("simpletree"),
20-
"DOM": treebuilders.getTreeBuilder("dom")}
19+
treeTypes = {"DOM": treebuilders.getTreeBuilder("dom")}
2120

2221
# Try whatever etree implementations are avaliable from a list that are
2322
#"supposed" to work
@@ -64,7 +63,7 @@ def __getitem__(self, key):
6463

6564
class TestData(object):
6665
def __init__(self, filename, newTestHeading="data", encoding="utf8"):
67-
if encoding == None:
66+
if encoding is None:
6867
self.f = open(filename, mode="rb")
6968
else:
7069
self.f = codecs.open(filename, encoding=encoding)

html5lib/tests/test_parser2.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from . import support # flake8: noqa
66
from html5lib import html5parser
77
from html5lib.constants import namespaces
8-
from html5lib.treebuilders import dom
8+
from html5lib import treebuilders
99

1010
import unittest
1111

@@ -14,29 +14,42 @@
1414

1515
class MoreParserTests(unittest.TestCase):
1616

17+
def setUp(self):
18+
self.dom_tree = treebuilders.getTreeBuilder("dom")
19+
1720
def test_assertDoctypeCloneable(self):
18-
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
21+
parser = html5parser.HTMLParser(tree=self.dom_tree)
1922
doc = parser.parse('<!DOCTYPE HTML>')
2023
self.assertTrue(doc.cloneNode(True))
2124

2225
def test_line_counter(self):
2326
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
24-
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
27+
parser = html5parser.HTMLParser(tree=self.dom_tree)
2528
parser.parse("<pre>\nx\n&gt;\n</pre>")
2629

27-
def test_namespace_html_elements_0(self):
30+
def test_namespace_html_elements_0_dom(self):
31+
parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=True)
32+
doc = parser.parse("<html></html>")
33+
self.assertTrue(doc.childNodes[0].namespaceURI == namespaces["html"])
34+
35+
def test_namespace_html_elements_1_dom(self):
36+
parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=False)
37+
doc = parser.parse("<html></html>")
38+
self.assertTrue(doc.childNodes[0].namespaceURI is None)
39+
40+
def test_namespace_html_elements_0_etree(self):
2841
parser = html5parser.HTMLParser(namespaceHTMLElements=True)
2942
doc = parser.parse("<html></html>")
30-
self.assertTrue(doc.childNodes[0].namespace == namespaces["html"])
43+
self.assertTrue(list(doc)[0].tag == "{%s}html" % (namespaces["html"],))
3144

32-
def test_namespace_html_elements_1(self):
45+
def test_namespace_html_elements_1_etree(self):
3346
parser = html5parser.HTMLParser(namespaceHTMLElements=False)
3447
doc = parser.parse("<html></html>")
35-
self.assertTrue(doc.childNodes[0].namespace == None)
48+
self.assertTrue(list(doc)[0].tag == "html")
3649

3750
def test_unicode_file(self):
3851
parser = html5parser.HTMLParser()
39-
doc = parser.parse(io.StringIO("a"))
52+
parser.parse(io.StringIO("a"))
4053

4154

4255
def buildTestSuite():

html5lib/tests/test_sanitizer.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,43 @@
55
except ImportError:
66
import simplejson as json
77

8-
from html5lib import html5parser, sanitizer, constants
8+
from html5lib import html5parser, sanitizer, constants, treebuilders
99

1010

11-
def runSanitizerTest(name, expected, input):
12-
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
13-
parseFragment(expected).childNodes])
11+
def toxmlFactory():
12+
tree = treebuilders.getTreeBuilder("etree")
13+
14+
def toxml(element):
15+
# encode/decode roundtrip required for Python 2.6 compatibility
16+
result_bytes = tree.implementation.tostring(element, encoding="utf-8")
17+
return result_bytes.decode("utf-8")
18+
19+
return toxml
20+
21+
22+
def runSanitizerTest(name, expected, input, toxml=None):
23+
if toxml is None:
24+
toxml = toxmlFactory()
25+
expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
26+
parseFragment(expected)])
1427
expected = json.loads(json.dumps(expected))
1528
assert expected == sanitize_html(input)
1629

1730

18-
def sanitize_html(stream):
19-
return ''.join([token.toxml() for token in
31+
def sanitize_html(stream, toxml=None):
32+
if toxml is None:
33+
toxml = toxmlFactory()
34+
return ''.join([toxml(token) for token in
2035
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
21-
parseFragment(stream).childNodes])
36+
parseFragment(stream)])
2237

2338

2439
def test_should_handle_astral_plane_characters():
25-
assert "<p>\U0001d4b5 \U0001d538</p>" == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
40+
assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
2641

2742

2843
def test_sanitizer():
44+
toxml = toxmlFactory()
2945
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
3046
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
3147
continue # TODO
@@ -34,25 +50,30 @@ def test_sanitizer():
3450
if tag_name == 'image':
3551
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
3652
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
37-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
53+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
54+
toxml)
3855
elif tag_name == 'br':
3956
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
4057
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
41-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
58+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
59+
toxml)
4260
elif tag_name in constants.voidElements:
4361
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
4462
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
45-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
63+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
64+
toxml)
4665
else:
4766
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
4867
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
49-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
68+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
69+
toxml)
5070

5171
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
5272
tag_name = tag_name.upper()
5373
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
5474
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
55-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
75+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
76+
toxml)
5677

5778
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
5879
if attribute_name != attribute_name.lower():
@@ -61,20 +82,24 @@ def test_sanitizer():
6182
continue
6283
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
6384
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
64-
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
85+
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
86+
toxml)
6587

6688
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
6789
attribute_name = attribute_name.upper()
6890
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
6991
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
70-
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
92+
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
93+
toxml)
7194

7295
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
7396
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
7497
"<a href=\"%s\">foo</a>" % protocol,
75-
"""<a href="%s">foo</a>""" % protocol)
98+
"""<a href="%s">foo</a>""" % protocol,
99+
toxml)
76100

77101
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
78102
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
79103
"<a href=\"%s\">foo</a>" % protocol,
80-
"""<a href="%s">foo</a>""" % protocol)
104+
"""<a href="%s">foo</a>""" % protocol,
105+
toxml)

html5lib/tests/test_treewalkers.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,11 @@ def PullDOMAdapter(node):
4545
raise NotImplementedError("Node type not supported: " + str(node.nodeType))
4646

4747
treeTypes = {
48-
"simpletree": {"builder": treebuilders.getTreeBuilder("simpletree"),
49-
"walker": treewalkers.getTreeWalker("simpletree")},
50-
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
51-
"walker": treewalkers.getTreeWalker("dom")},
52-
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
53-
"adapter": PullDOMAdapter,
54-
"walker": treewalkers.getTreeWalker("pulldom")},
48+
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
49+
"walker": treewalkers.getTreeWalker("dom")},
50+
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
51+
"adapter": PullDOMAdapter,
52+
"walker": treewalkers.getTreeWalker("pulldom")},
5553
}
5654

5755
# Try whatever etree implementations are available from a list that are
@@ -103,7 +101,7 @@ def PullDOMAdapter(node):
103101
else:
104102
def GenshiAdapter(tree):
105103
text = None
106-
for token in treewalkers.getTreeWalker("simpletree")(tree):
104+
for token in treewalkers.getTreeWalker("dom")(tree):
107105
type = token["type"]
108106
if type in ("Characters", "SpaceCharacters"):
109107
if text is None:
@@ -147,7 +145,7 @@ def GenshiAdapter(tree):
147145
yield TEXT, text, (None, -1, -1)
148146

149147
treeTypes["genshi"] = \
150-
{"builder": treebuilders.getTreeBuilder("simpletree"),
148+
{"builder": treebuilders.getTreeBuilder("dom"),
151149
"adapter": GenshiAdapter,
152150
"walker": treewalkers.getTreeWalker("genshi")}
153151

html5lib/treebuilders/__init__.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
1) A set of classes for various types of elements: Document, Doctype,
88
Comment, Element. These must implement the interface of
99
_base.treebuilders.Node (although comment nodes have a different
10-
signature for their constructor, see treebuilders.simpletree.Comment)
10+
signature for their constructor, see treebuilders.etree.Comment)
1111
Textual content may also be implemented as another node type, or not, as
1212
your tree implementation requires.
1313
@@ -24,10 +24,6 @@
2424
testSerializer method on your treebuilder which accepts a node and
2525
returns a string containing Node and its children serialized according
2626
to the format used in the unittests
27-
28-
The supplied simpletree module provides a python-only implementation
29-
of a full treebuilder and is a useful reference for the semantics of
30-
the various methods.
3127
"""
3228

3329
from __future__ import absolute_import, division, unicode_literals
@@ -39,10 +35,8 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
3935
"""Get a TreeBuilder class for various types of tree with built-in support
4036
4137
treeType - the name of the tree type required (case-insensitive). Supported
42-
values are "simpletree", "dom", and "etree"
38+
values are:
4339
44-
"simpletree" - a built-in DOM-ish tree type with support for some
45-
more pythonic idioms.
4640
"dom" - A generic builder for DOM implementations, defaulting to
4741
a xml.dom.minidom based implementation for the sake of
4842
backwards compatibility (as releases up until 0.10 had a
@@ -65,9 +59,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
6559
implementation = minidom
6660
# XXX: NEVER cache here, caching is done in the dom submodule
6761
return dom.getDomModule(implementation, **kwargs).TreeBuilder
68-
elif treeType == "simpletree":
69-
from . import simpletree
70-
treeBuilderCache[treeType] = simpletree.TreeBuilder
7162
elif treeType == "lxml":
7263
from . import etree_lxml
7364
treeBuilderCache[treeType] = etree_lxml.TreeBuilder

html5lib/treebuilders/dom.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ def insertText(self, data, parent=None):
163163
self.dom._child_node_types.append(Node.TEXT_NODE)
164164
self.dom.appendChild(self.dom.createTextNode(data))
165165

166+
implementation = DomImplementation
166167
name = None
167168

168169
def testSerializer(element):

html5lib/treebuilders/etree.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ class TreeBuilder(_base.TreeBuilder):
313313
elementClass = Element
314314
commentClass = Comment
315315
fragmentClass = DocumentFragment
316+
implementation = ElementTreeImplementation
316317

317318
def testSerializer(self, element):
318319
return testSerializer(element)

html5lib/treebuilders/etree_lxml.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def serializeElement(element, indent=0):
122122
if element.text:
123123
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
124124
indent += 2
125-
for child in element.getchildren():
125+
for child in element:
126126
serializeElement(child, indent)
127127
if hasattr(element, "tail") and element.tail:
128128
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
@@ -163,7 +163,7 @@ def serializeElement(element):
163163
if element.text:
164164
rv.append(element.text)
165165

166-
for child in element.getchildren():
166+
for child in element:
167167
serializeElement(child)
168168

169169
rv.append("</%s>" % (element.tag,))
@@ -185,6 +185,7 @@ class TreeBuilder(_base.TreeBuilder):
185185
elementClass = None
186186
commentClass = None
187187
fragmentClass = Document
188+
implementation = etree
188189

189190
def __init__(self, namespaceHTMLElements, fullTree=False):
190191
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
@@ -280,7 +281,7 @@ def getFragment(self):
280281
element = self.openElements[0]._element
281282
if element.text:
282283
fragment.append(element.text)
283-
fragment.extend(element.getchildren())
284+
fragment.extend(list(element))
284285
if element.tail:
285286
fragment.append(element.tail)
286287
return fragment

0 commit comments

Comments
 (0)
0