Welcome to Python 3. · html5lib/html5lib-python@376b4a5 · GitHub
[go: up one dir, main page]

Skip to content

Commit 376b4a5

Browse files
committed
Welcome to Python 3.
We now fail the same three tests on both Py2 and Py3. I'm fairly certain the meta-preparser among other things is broken on Py3, but we have no tests for it. (We should fix that.)
1 parent eb7f702 commit 376b4a5

File tree

12 files changed

+115
-82
lines changed

12 files changed

+115
-82
lines changed

html5lib/inputstream.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
2424
0x10FFFE, 0x10FFFF])
2525

26-
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
26+
ascii_punctuation_re = re.compile(u"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
2727

2828
# Cache for charsUntil()
2929
charsUntilRegEx = {}
@@ -193,7 +193,8 @@ def openStream(self, source):
193193
else:
194194
# Otherwise treat source as a string and convert to a file object
195195
if isinstance(source, unicode):
196-
source = source.encode('utf-8')
196+
# XXX: we should handle lone surrogates here
197+
source = source.encode('utf-8', errors="replace")
197198
self.charEncoding = ("utf-8", "certain")
198199
try:
199200
from io import BytesIO
@@ -230,7 +231,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
230231
detector = UniversalDetector()
231232
while not detector.done:
232233
buffer = self.rawStream.read(self.numBytesChardet)
233-
assert isinstance(buffer, str)
234+
assert isinstance(buffer, bytes)
234235
if not buffer:
235236
break
236237
buffers.append(buffer)
@@ -279,7 +280,7 @@ def detectBOM(self):
279280

280281
# Go to beginning of file and read in 4 bytes
281282
string = self.rawStream.read(4)
282-
assert isinstance(string, str)
283+
assert isinstance(string, bytes)
283284

284285
# Try detecting the BOM using bytes from the string
285286
encoding = bomDict.get(string[:3]) # UTF-8
@@ -302,7 +303,7 @@ def detectEncodingMeta(self):
302303
"""Report the encoding declared by the meta element
303304
"""
304305
buffer = self.rawStream.read(self.numBytesMeta)
305-
assert isinstance(buffer, str)
306+
assert isinstance(buffer, bytes)
306307
parser = EncodingParser(buffer)
307308
self.rawStream.seek(0)
308309
encoding = parser.getEncoding()
@@ -781,7 +782,7 @@ def parse(self):
781782
def codecName(encoding):
782783
"""Return the python codec name corresponding to an encoding or None if the
783784
string doesn't correspond to a valid encoding."""
784-
if (encoding is not None and type(encoding) in types.StringTypes):
785+
if encoding:
785786
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
786787
return encodings.get(canonicalName, None)
787788
else:

html5lib/serializer/htmlserializer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ def serialize(self, treewalker, encoding=None):
303303

304304
def render(self, treewalker, encoding=None):
305305
if encoding:
306-
return "".join(list(self.serialize(treewalker, encoding)))
306+
return b"".join(list(self.serialize(treewalker, encoding)))
307307
else:
308308
return u"".join(list(self.serialize(treewalker)))
309309

html5lib/tests/support.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464

6565
def html5lib_test_files(subdirectory, files='*.dat'):
6666
return glob.glob(os.path.join(test_dir,subdirectory,files))
67+
html5lib_test_files.__test__ = False
6768

6869
class DefaultDict(dict):
6970
def __init__(self, default, *args, **kwargs):
@@ -77,6 +78,9 @@ class TestData(object):
7778
def __init__(self, filename, newTestHeading="data"):
7879
self.f = codecs.open(filename, encoding="utf8")
7980
self.newTestHeading = newTestHeading
81+
82+
def __del__(self):
83+
self.f.close()
8084

8185
def __iter__(self):
8286
data = DefaultDict(None)
@@ -114,14 +118,14 @@ def normaliseOutput(self, data):
114118
def convert(stripChars):
115119
def convertData(data):
116120
"""convert the output of str(document) to the format used in the testcases"""
117-
data = data.split("\n")
121+
data = data.split(u"\n")
118122
rv = []
119123
for line in data:
120-
if line.startswith("|"):
124+
if line.startswith(u"|"):
121125
rv.append(line[stripChars:])
122126
else:
123127
rv.append(line)
124-
return "\n".join(rv)
128+
return u"\n".join(rv)
125129
return convertData
126130

127131
convertExpected = convert(2)

html5lib/tests/test_encoding.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,27 @@
1+
import re
12
import os
23
import unittest
3-
from support import html5lib_test_files, TestData, test_dir
44

5-
from html5lib import HTMLParser, inputstream
5+
try:
6+
unittest.TestCase.assertEqual
7+
except AttributeError:
8+
unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
69

7-
import re, unittest
10+
from support import html5lib_test_files, TestData, test_dir
11+
from html5lib import HTMLParser, inputstream
812

913
class Html5EncodingTestCase(unittest.TestCase):
10-
def test_codec_name(self):
11-
self.assertEquals(inputstream.codecName("utf-8"), "utf-8")
12-
self.assertEquals(inputstream.codecName("utf8"), "utf-8")
13-
self.assertEquals(inputstream.codecName(" utf8 "), "utf-8")
14-
self.assertEquals(inputstream.codecName("ISO_8859--1"), "windows-1252")
14+
def test_codec_name_a(self):
15+
self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
16+
17+
def test_codec_name_b(self):
18+
self.assertEqual(inputstream.codecName("utf8"), "utf-8")
19+
20+
def test_codec_name_c(self):
21+
self.assertEqual(inputstream.codecName(" utf8 "), "utf-8")
22+
23+
def test_codec_name_d(self):
24+
self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
1525

1626
def buildTestSuite():
1727
for filename in html5lib_test_files("encoding"):

html5lib/tests/test_parser.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
#XXX - There should just be one function here but for some reason the testcase
1919
#format differs from the treedump format by a single space character
2020
def convertTreeDump(data):
21-
return "\n".join(convert(3)(data).split("\n")[1:])
21+
return u"\n".join(convert(3)(data).split(u"\n")[1:])
2222

23-
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
23+
namespaceExpected = re.compile(ur"^(\s*)<(\S+)>", re.M).sub
2424

2525

2626
def runParserTest(innerHTML, input, expected, errors, treeClass,
@@ -44,17 +44,17 @@ def runParserTest(innerHTML, input, expected, errors, treeClass,
4444
except:
4545
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
4646
u"\nTraceback:", traceback.format_exc().decode('utf8')])
47-
assert False, errorMsg.encode("utf8")
47+
assert False, errorMsg
4848

4949
output = convertTreeDump(p.tree.testSerializer(document))
5050

5151
expected = convertExpected(expected)
5252
if namespaceHTMLElements:
53-
expected = namespaceExpected(r"\1<html \2>", expected)
53+
expected = namespaceExpected(ur"\1<html \2>", expected)
5454

5555
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
5656
u"\nReceived:", output])
57-
assert expected == output, errorMsg.encode("utf8")
57+
assert expected == output, errorMsg
5858
errStr = [u"Line: %i Col: %i %s"%(line, col,
5959
constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
6060
((line,col), errorcode, datavars) in p.errors]
@@ -63,7 +63,7 @@ def runParserTest(innerHTML, input, expected, errors, treeClass,
6363
u"\nExpected errors (" + unicode(len(errors)) + u"):\n" + u"\n".join(errors),
6464
u"\nActual errors (" + unicode(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
6565
if checkParseErrors:
66-
assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
66+
assert len(p.errors) == len(errors), errorMsg2
6767

6868
def test_parser():
6969
sys.stderr.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
@@ -87,6 +87,3 @@ def test_parser():
8787
print input
8888
yield (runParserTest, innerHTML, input, expected, errors, treeCls,
8989
namespaceHTMLElements)
90-
break
91-
92-

html5lib/tests/test_serializer.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
except ImportError:
88
import simplejson as json
99

10+
try:
11+
unittest.TestCase.assertEqual
12+
except AttributeError:
13+
unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
14+
1015
import html5lib
1116
from html5lib import html5parser, serializer, constants
1217
from html5lib.treewalkers._base import TreeWalker
@@ -83,7 +88,16 @@ def serialize_xhtml(input, options):
8388
options = dict([(str(k),v) for k,v in options.iteritems()])
8489
return serializer.XHTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
8590

86-
def make_test(input, expected, xhtml, options):
91+
def runSerializerTest(input, expected, xhtml, options):
92+
encoding = options.get("encoding", None)
93+
94+
if encoding:
95+
encode = lambda x: x.encode(encoding)
96+
expected = map(encode, expected)
97+
if xhtml:
98+
xhtml = map(encode, xhtml)
99+
100+
87101
result = serialize_html(input, options)
88102
if len(expected) == 1:
89103
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:False\n%s"%(expected[0], result, str(options))
@@ -114,13 +128,12 @@ def testDoctypeSystemId(self):
114128
self.throwsWithLatin1([["Doctype", u"potato", u"potato", u"\u0101"]])
115129

116130
def testCdataCharacters(self):
117-
self.assertEquals("<style>&amacr;", serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
118-
["Characters", u"\u0101"]],
119-
{"encoding": "iso-8859-1"}))
131+
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", u"\u0101"]],
132+
[u"<style>&amacr;"], None, {"encoding": "iso-8859-1"})
120133

121134
def testCharacters(self):
122-
self.assertEquals("&amacr;", serialize_html([["Characters", u"\u0101"]],
123-
{"encoding": "iso-8859-1"}))
135+
runSerializerTest([["Characters", u"\u0101"]],
136+
[u"&amacr;"], None, {"encoding": "iso-8859-1"})
124137

125138
def testStartTagName(self):
126139
self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])
@@ -132,9 +145,9 @@ def testAttributeName(self):
132145
self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"span", [{"namespace": None, "name": u"\u0101", "value": u"potato"}]]])
133146

134147
def testAttributeValue(self):
135-
self.assertEquals("<span potato=&amacr;>", serialize_html([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
136-
[{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
137-
{"encoding": "iso-8859-1"}))
148+
runSerializerTest([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
149+
[{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
150+
[u"<span potato=&amacr;>"], None, {"encoding": "iso-8859-1"})
138151

139152
def testEndTagName(self):
140153
self.throwsWithLatin1([["EndTag", u"http://www.w3.org/1999/xhtml", u"\u0101"]])
@@ -154,27 +167,27 @@ def testEntityReplacement(self):
154167
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
155168
tree = etree.fromstring(doc, parser = self.parser).getroottree()
156169
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
157-
self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)
170+
self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)
158171

159172
def testEntityXML(self):
160173
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
161174
tree = etree.fromstring(doc, parser = self.parser).getroottree()
162175
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
163-
self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)
176+
self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)
164177

165178
def testEntityNoResolve(self):
166179
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
167180
tree = etree.fromstring(doc, parser = self.parser).getroottree()
168181
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
169182
resolve_entities=False)
170-
self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
183+
self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
171184

172185
def test_serializer():
173186
for filename in html5lib_test_files('serializer', '*.test'):
174-
tests = json.load(file(filename))
187+
tests = json.load(open(filename))
175188
test_name = os.path.basename(filename).replace('.test','')
176189
for index, test in enumerate(tests['tests']):
177190
xhtml = test.get("xhtml", test["expected"])
178191
if test_name == 'optionaltags':
179192
xhtml = None
180-
yield make_test, test["input"], test["expected"], xhtml, test.get("options", {})
193+
yield runSerializerTest, test["input"], test["expected"], xhtml, test.get("options", {})

html5lib/tests/test_tokenizer.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import with_statement
2+
13
import sys
24
import os
35
import unittest
@@ -137,6 +139,7 @@ def decode(inp):
137139
del token[2][key]
138140
token[2][decode(key)] = decode(value)
139141
return test
142+
unescape_test.__test__ = False
140143

141144

142145
def runTokenizerTest(test):
@@ -161,7 +164,7 @@ def runTokenizerTest(test):
161164
"\nInput:", unicode(test['input']),
162165
"\nExpected:", unicode(expected),
163166
"\nreceived:", unicode(tokens)])
164-
errorMsg = errorMsg.encode("utf-8")
167+
errorMsg = errorMsg
165168
ignoreErrorOrder = test.get('ignoreErrorOrder', False)
166169
assert tokensMatch(expected, received, ignoreErrorOrder), errorMsg
167170

@@ -179,15 +182,16 @@ def capitalize(s):
179182

180183
def test_tokenizer():
181184
for filename in html5lib_test_files('tokenizer', '*.test'):
182-
tests = json.load(file(filename))
183-
testName = os.path.basename(filename).replace(".test","")
184-
if 'tests' in tests:
185-
for index,test in enumerate(tests['tests']):
186-
#Skip tests with a self closing flag
187-
skip = False
188-
if 'initialStates' not in test:
189-
test["initialStates"] = ["Data state"]
190-
for initialState in test["initialStates"]:
191-
test["initialState"] = capitalize(initialState)
192-
yield runTokenizerTest, test
185+
with open(filename) as fp:
186+
tests = json.load(fp)
187+
testName = os.path.basename(filename).replace(".test","")
188+
if 'tests' in tests:
189+
for index,test in enumerate(tests['tests']):
190+
#Skip tests with a self closing flag
191+
skip = False
192+
if 'initialStates' not in test:
193+
test["initialStates"] = ["Data state"]
194+
for initialState in test["initialStates"]:
195+
test["initialState"] = capitalize(initialState)
196+
yield runTokenizerTest, test
193197

html5lib/tests/test_treewalkers.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
import unittest
44
import warnings
55

6+
try:
7+
unittest.TestCase.assertEqual
8+
except AttributeError:
9+
unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
10+
611
warnings.simplefilter("error")
712

813
from support import html5lib_test_files, TestData, convertExpected
@@ -263,9 +268,9 @@ def test_all_tokens(self):
263268
document = treeCls.get("adapter", lambda x: x)(document)
264269
output = treeCls["walker"](document)
265270
for expectedToken, outputToken in zip(expected, output):
266-
self.assertEquals(expectedToken, outputToken)
271+
self.assertEqual(expectedToken, outputToken)
267272

268-
def run_test(innerHTML, input, expected, errors, treeClass):
273+
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
269274
try:
270275
p = html5parser.HTMLParser(tree = treeClass["builder"])
271276
if innerHTML:
@@ -305,6 +310,6 @@ def test_treewalker():
305310
"document-fragment",
306311
"document")]
307312
errors = errors.split("\n")
308-
yield run_test, innerHTML, input, expected, errors, treeCls
313+
yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls
309314

310315

html5lib/tests/test_whitespace_filter.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44
from html5lib.constants import spaceCharacters
55
spaceCharacters = u"".join(spaceCharacters)
66

7+
try:
8+
unittest.TestCase.assertEqual
9+
except AttributeError:
10+
unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
11+
712
class TestCase(unittest.TestCase):
813
def runTest(self, input, expected):
914
output = list(Filter(input))
1015
errorMsg = "\n".join(["\n\nInput:", str(input),
1116
"\nExpected:", str(expected),
1217
"\nReceived:", str(output)])
13-
self.assertEquals(output, expected, errorMsg)
18+
self.assertEqual(output, expected, errorMsg)
1419

1520
def runTestUnmodifiedOutput(self, input):
1621
self.runTest(input, input)

html5lib/treebuilders/_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def __init__(self, name):
4242
self.childNodes = []
4343
self._flags = []
4444

45-
def __unicode__(self):
45+
def __str__(self):
4646
attributesStr = " ".join(["%s=\"%s\""%(name, value)
4747
for name, value in
4848
self.attributes.iteritems()])

0 commit comments

Comments
 (0)
0