Make lxml work · awesome-python/html5lib-python@8edc330 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8edc330

Browse files
committed
Make lxml work
--HG-- branch : svgmathml extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401298
1 parent f1adba6 commit 8edc330

File tree

3 files changed

+78
-39
lines changed

3 files changed

+78
-39
lines changed

src/html5lib/treebuilders/etree.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1-
import _base
21
import new
2+
import re
33

4+
import _base
45
from html5lib import ihatexml
56

7+
tag_regexp = re.compile("{([^}]*)}(.*)")
8+
69
moduleCache = {}
710

811
def getETreeModule(ElementTreeImplementation, fullTree=False):
@@ -20,24 +23,38 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
2023
ElementTree = ElementTreeImplementation
2124
class Element(_base.Node):
2225
def __init__(self, name, namespace=None):
23-
if namespace is None:
24-
etree_tag = name
25-
else:
26-
etree_tag = "{%s}%s"%(name, namespace)
27-
self._element = ElementTree.Element(etree_tag)
28-
self.name = name
29-
self.namespace = namespace
26+
self._name = name
27+
self._namespace = namespace
28+
self._element = ElementTree.Element(self._getETreeTag(name,
29+
namespace))
3030
self.parent = None
3131
self._childNodes = []
3232
self._flags = []
33+
34+
def _getETreeTag(self, name, namespace):
35+
if namespace is None:
36+
etree_tag = name
37+
else:
38+
etree_tag = "{%s}%s"%(namespace, name)
39+
return etree_tag
3340

3441
def _setName(self, name):
35-
self._element.tag = name
42+
self._name = name
43+
self._element.tag = self._getETreeTag(self._name, self._namespace)
3644

3745
def _getName(self):
38-
return self._element.tag
39-
46+
return self._name
47+
4048
name = property(_getName, _setName)
49+
50+
def _setNamespace(self, namespace):
51+
self._namespace = namespace
52+
self._element.tag = self._getETreeTag(self._name, self._namespace)
53+
54+
def _getNamespace(self):
55+
return self._namespace
56+
57+
namespace = property(_getNamespace, _setNamespace)
4158

4259
def _getAttributes(self):
4360
return self._element.attrib
@@ -197,7 +214,7 @@ def serializeElement(element, indent=0):
197214
elif type(element.tag) == type(ElementTree.Comment):
198215
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
199216
else:
200-
if element.namespave == self.defaultNamespace:
217+
if element.namespace == self.defaultNamespace:
201218
name = element.tag
202219
else:
203220
ns, name = element.tag.split("}")

src/html5lib/treebuilders/etree_lxml.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
import _base
21
import new
32
import warnings
3+
import re
4+
5+
import _base
46
from html5lib.constants import DataLossWarning
7+
import html5lib.constants as constants
58
import etree as etree_builders
69
from html5lib import ihatexml
710

@@ -25,9 +28,7 @@
2528

2629
class DocumentType(object):
2730
def __init__(self, name, publicId, systemId):
28-
self.name = name
29-
if name != name.lower():
30-
warnings.warn("lxml does not preserve doctype case", DataLossWarning)
31+
self.name = name
3132
self.publicId = publicId
3233
self.systemId = systemId
3334

@@ -80,11 +81,36 @@ def serializeElement(element, indent=0):
8081
elif type(element.tag) == type(etree.Comment):
8182
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
8283
else:
83-
rv.append("|%s<%s>"%(' '*indent, filter.fromXmlName(element.tag)))
84+
nsmatch = etree_builders.tag_regexp.match(element.tag)
85+
if nsmatch is not None:
86+
ns = nsmatch.group(1)
87+
tag = nsmatch.group(2)
88+
prefix = constants.prefixes[ns]
89+
if prefix != "html":
90+
rv.append("|%s<%s %s>"%(' '*indent, prefix,
91+
filter.fromXmlName(tag)))
92+
else:
93+
rv.append("|%s<%s>"%(' '*indent,
94+
filter.fromXmlName(tag)))
95+
else:
96+
rv.append("|%s<%s>"%(' '*indent,
97+
filter.fromXmlName(element.tag)))
98+
8499
if hasattr(element, "attrib"):
85100
for name, value in element.attrib.iteritems():
86-
rv.append('|%s%s="%s"' % (' '*(indent+2),
87-
filter.fromXmlName(name), value))
101+
nsmatch = etree_builders.tag_regexp.match(name)
102+
if nsmatch:
103+
ns = nsmatch.group(1)
104+
name = nsmatch.group(2)
105+
prefix = constants.prefixes[ns]
106+
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
107+
prefix,
108+
filter.fromXmlName(name),
109+
value))
110+
else:
111+
rv.append('|%s%s="%s"' % (' '*(indent+2),
112+
filter.fromXmlName(name),
113+
value))
88114
if element.text:
89115
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
90116
indent += 2
@@ -160,38 +186,33 @@ def __init__(self, element, value={}):
160186
dict.__init__(self, value)
161187
for key, value in self.iteritems():
162188
if isinstance(key, tuple):
163-
name = "{%s}%s"%(key[2], key[1])
189+
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
164190
else:
165-
name = key
166-
self._element._element.attrib[filter.coerceAttribute(name)] = value
191+
name = filter.coerceAttribute(key)
192+
self._element._element.attrib[name] = value
167193

168194
def __setitem__(self, key, value):
169195
dict.__setitem__(self, key, value)
170196
if isinstance(key, tuple):
171-
name = "{%s}%s"%(key[2], key[1])
197+
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
172198
else:
173-
name = key
174-
self._element._element.attrib[filter.coerceAttribute(key)] = value
199+
name = filter.coerceAttribute(key)
200+
self._element._element.attrib[name] = value
175201

176202
class Element(builder.Element):
177-
def __init__(self, name, namespace = None):
203+
def __init__(self, name, namespace):
178204
name = filter.coerceElement(name)
179-
if namespace is None:
180-
etree_tag = name
181-
else:
182-
etree_tag = "{%s}%s"%(namespace, name)
183-
self._name = name
184-
self.namespace = namespace
185-
builder.Element.__init__(self, name)
205+
builder.Element.__init__(self, name, namespace=namespace)
186206
self._attributes = Attributes(self)
187207

188208
def _setName(self, name):
189-
self._name = name
190-
self._element.tag = filter.coerceElement(name)
191-
209+
self._name = filter.coerceElement(name)
210+
self._element.tag = self._getETreeTag(
211+
self._name, self._namespace)
212+
192213
def _getName(self):
193214
return self._name
194-
215+
195216
name = property(_getName, _setName)
196217

197218
def _getAttributes(self):
@@ -281,7 +302,8 @@ def insertRoot(self, token):
281302
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
282303
self.doctype.systemId or "")
283304
docStr += ">"
284-
docStr += "<html></html>"
305+
#TODO - this needs to work when elements are not put into the default ns
306+
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
285307

286308
try:
287309
root = etree.fromstring(docStr)

tests/test_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
import lxml.html as lxml
4242
except ImportError:
4343
import lxml.etree as lxml
44-
#treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml", lxml, fullTree=True)
44+
treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml", lxml, fullTree=True)
4545
except ImportError:
4646
pass
4747

0 commit comments

Comments
 (0)
0