Make beautiful soup treewalker pass all but one test · awesome-python/html5lib-python@f01e037 · GitHub
[go: up one dir, main page]

Skip to content

Commit f01e037

Browse files
committed
Make beautiful soup treewalker pass all but one test
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40697
1 parent 2b70d2a commit f01e037

File tree

3 files changed

+54
-78
lines changed

3 files changed

+54
-78
lines changed

examples/validate.cgi

Lines changed: 49 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -6,78 +6,55 @@ does not report any of the other (many) possible types of conformance
66
errors that may exist in a HTML5 document"""
77

88
import sys
9-
import urllib2
109
import cgi
10+
import copy
1111

12-
import html5lib
13-
14-
htmlTemplate = u"""<html>
15-
<head>
16-
<title>%(title)s</title>
17-
</head>
18-
<body>
19-
<h1>%(title)s</h1>
20-
%(body)s
21-
</body>
22-
</html>"""
23-
24-
def parseDocument(document):
25-
"""Parse the document and return a list of errors and a parse tree"""
26-
p = html5lib.HTMLParser()
27-
tree = p.parse(document)
28-
return p.errors, cgi.escape(tree.printTree(), True)
29-
30-
def getDocument(uri):
31-
if uri.startswith("http://") or uri.startswith("https://"):
32-
#Why is string conversion necessary here?
33-
document = "".join(urllib2.urlopen(uri).readlines())[:-1]
34-
#print "<--!%s-->"%(document,)
35-
else:
36-
raise ValueError, "Unrecognised URI type"
37-
return document
38-
39-
def writeValid(uri, treeStr):
40-
bodyText = """<p><strong>%s is valid HTML5!</strong></p>
41-
<h2>Parse Tree:</h2>
42-
<pre>
43-
%s
44-
</pre>"""%(uri, treeStr)
45-
writeOutput(htmlTemplate%{"title":"Validation Results", "body":bodyText})
46-
47-
def writeInvalid(uri, treeStr, errors):
48-
errList=[]
49-
for pos, message in errors:
50-
errList.append("Line %i Col %i"%pos + " " + message)
51-
errStr = "<br>\n".join(errList)
52-
bodyText = """<p><strong>%s is not valid HTML5</strong></p>
53-
<h2>Errors:</h2>
54-
%s
55-
<h2>Parse Tree:</h2>
56-
<pre>
57-
%s
58-
</pre>"""%(uri, errStr, treeStr)
59-
writeOutput(htmlTemplate%{"title":"Validation Results", "body":bodyText})
12+
import httplib2
13+
import lxml
14+
from genshi.template import MarkupTemplate
6015

61-
def writeErr(uri):
62-
bodyText = "<p>Failed to load URI %s</p>"%(uri,)
63-
writeOutput(htmlTemplate%{"title":"Error", "body":bodyText})
64-
65-
def writeOutput(s):
66-
print s.encode('utf-8')
67-
68-
print "Content-type: text/html"
69-
print ""
70-
71-
try:
72-
form = cgi.FieldStorage()
73-
uri = form.getvalue("uri")
74-
document = getDocument(uri)
75-
except:
76-
writeErr(uri)
77-
sys.exit(1)
78-
79-
errors, tree = parseDocument(document)
80-
if errors:
81-
writeInvalid(uri, tree, errors)
82-
else:
83-
writeValid(uri, tree)
16+
import html5lib
17+
from html5lib import treebuilders
18+
19+
class Resource(object):
    """A resource identified by a URI whose content is fetched over HTTP.

    Subclasses are expected to implement `parse` and may override `load`.
    """

    # A single HTTP client shared by every Resource instance.
    http = httplib2.Http()

    def __init__(self, uri):
        """Remember `uri`; nothing is fetched until `load` is called."""
        self.uri = uri
        # Both attributes are filled in by load(). They are initialised
        # here so they can be inspected safely before the resource has
        # been fetched (previously `response` did not exist until then).
        self.response = None
        self.content = None

    def load(self):
        """Request `self.uri` and store the response and its content."""
        self.response, self.content = self.http.request(self.uri)

    def parse(self):
        """Parse the loaded content; must be provided by subclasses."""
        raise NotImplementedError
30+
31+
class Schema(Resource):
    """A RELAX NG schema read from the local file named by `self.uri`."""

    def load(self):
        """Read the schema file into `self.content`."""
        #This will just be a network operation eventually
        # Read in binary mode so the XML parser sees the raw bytes and can
        # honour any encoding declaration; the `with` block guarantees the
        # file handle is closed.
        with open(self.uri, "rb") as schemaFile:
            self.content = schemaFile.read()

    def parse(self):
        """Build `self.tree` and the `self.relaxng` validator from the content.

        `load` must have been called first.
        """
        # A bare `import lxml` does not load the `etree` submodule, so
        # import it explicitly where it is used.
        from lxml import etree

        # `self.content` holds the schema text itself, not a filename, so it
        # has to go through fromstring(); etree.parse() would treat it as a
        # path to open. getroottree() keeps `self.tree` an ElementTree.
        self.tree = etree.fromstring(self.content).getroottree()
        self.relaxng = etree.RelaxNG(self.tree)
39+
40+
class Document(Resource):
41+
42+
def parse(self):
43+
parser = html5lib.HTMLParser(
44+
tree=treebuilders.getTreeBuilder("etree", lxml.etree))
45+
self.tree = parser.parse(self.content)
46+
self.parseErrors = parser.parseErrors
47+
self.hasSyntaxErrors = not(self.parseErrors)
48+
49+
def check(self, schema):
50+
self.hasConformaceErrors = schema.relaxng.validate(self.tree)
51+
self.relaxErrors = schema.relaxng.error_log
52+
53+
class Response(object):
    """Renders the validation results for a document through a template."""

    # Path of the template file read by __init__.
    templateFilename = "response.html"

    def __init__(self):
        """Load the template named by `templateFilename`."""
        # Use a context manager so the template file is closed promptly
        # instead of being left to the garbage collector.
        with open(self.templateFilename) as templateFile:
            self.template = MarkupTemplate(templateFile.read())

    def render(self, document):
        """Return the template rendered with `document` bound to `doc`."""
        stream = self.template.generate(doc=document)
        return stream.render(doctype=("html", "", ""))

src/treebuilders/soup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ def insertText(self, data, insertBefore=None):
6060
self.appendChild(text)
6161

6262
def insertBefore(self, node, refNode):
63-
#XXX What exception should we throw here?
6463
index = self.element.contents.index(refNode.element)
6564
if (node.element.__class__ == NavigableString and self.element.contents
6665
and self.element.contents[index-1].__class__ == NavigableString):

src/treewalkers/soup.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import gettext
22
_ = gettext.gettext
33

4-
from BeautifulSoup import BeautifulSoup, Declaration, Comment
4+
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
55

66
import _base
77

@@ -11,18 +11,18 @@ def getNodeDetails(self, node):
1111
return (_base.DOCUMENT,)
1212

1313
elif isinstance(node, Declaration): # DocumentType
14-
return _base.DOCTYPE, node.string
14+
#Slice needed to remove markup added during unicode conversion
15+
return _base.DOCTYPE, unicode(node.string)[2:-1]
1516

1617
elif isinstance(node, Comment):
17-
return _base.COMMENT, node.data
18+
return _base.COMMENT, unicode(node.string)[4:-3]
1819

1920
elif isinstance(node, unicode): # TextNode
2021
return _base.TEXT, node
2122

2223
elif isinstance(node, Tag): # Element
2324
return _base.ELEMENT, node.name, \
24-
node.attrs.items(), node.contents
25-
25+
dict(node.attrs).items(), node.contents
2626
else:
2727
return _base.UNKNOWN, node.__class__.__name__
2828

0 commit comments

Comments
 (0)
0