html5lib
diff --git a/‎html5lib/inputstream.py
Lines changed: 7 additions & 6 deletions b/‎html5lib/inputstream.py
Lines changed: 7 additions & 6 deletions
diff --git a/‎html5lib/serializer/htmlserializer.py
Lines changed: 1 addition & 1 deletion b/‎html5lib/serializer/htmlserializer.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎html5lib/tests/support.py
Lines changed: 7 additions & 3 deletions b/‎html5lib/tests/support.py
Lines changed: 7 additions & 3 deletions
diff --git a/‎html5lib/tests/test_encoding.py
Lines changed: 18 additions & 8 deletions b/‎html5lib/tests/test_encoding.py
Lines changed: 18 additions & 8 deletions
diff --git a/‎html5lib/tests/test_parser.py
Lines changed: 6 additions & 9 deletions b/‎html5lib/tests/test_parser.py
Lines changed: 6 additions & 9 deletions
diff --git a/‎html5lib/tests/test_serializer.py
Lines changed: 27 additions & 14 deletions b/‎html5lib/tests/test_serializer.py
Lines changed: 27 additions & 14 deletions
diff --git a/‎html5lib/tests/test_tokenizer.py
Lines changed: 16 additions & 12 deletions b/‎html5lib/tests/test_tokenizer.py
Lines changed: 16 additions & 12 deletions
diff --git a/‎html5lib/tests/test_treewalkers.py
Lines changed: 8 additions & 3 deletions b/‎html5lib/tests/test_treewalkers.py
Lines changed: 8 additions & 3 deletions
diff --git a/‎html5lib/tests/test_whitespace_filter.py
Lines changed: 6 additions & 1 deletion b/‎html5lib/tests/test_whitespace_filter.py
Lines changed: 6 additions & 1 deletion
diff --git a/‎html5lib/treebuilders/_base.py
Lines changed: 1 addition & 1 deletion b/‎html5lib/treebuilders/_base.py
Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
                                   0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                   0x10FFFE, 0x10FFFF])
 
-ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+ascii_punctuation_re = re.compile(u"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
 
 # Cache for charsUntil()
 charsUntilRegEx = {}
@@ -193,7 +193,8 @@ def openStream(self, source):
         else:
             # Otherwise treat source as a string and convert to a file object
             if isinstance(source, unicode):
-                source = source.encode('utf-8')
+                # XXX: we should handle lone surrogates here
+                source = source.encode('utf-8', errors="replace")
                 self.charEncoding = ("utf-8", "certain")
             try:
                 from io import BytesIO
@@ -230,7 +231,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
                 detector = UniversalDetector()
                 while not detector.done:
                     buffer = self.rawStream.read(self.numBytesChardet)
-                    assert isinstance(buffer, str)
+                    assert isinstance(buffer, bytes)
                     if not buffer:
                         break
                     buffers.append(buffer)
@@ -279,7 +280,7 @@ def detectBOM(self):
 
         # Go to beginning of file and read in 4 bytes
         string = self.rawStream.read(4)
-        assert isinstance(string, str)
+        assert isinstance(string, bytes)
 
         # Try detecting the BOM using bytes from the string
         encoding = bomDict.get(string[:3])         # UTF-8
@@ -302,7 +303,7 @@ def detectEncodingMeta(self):
         """Report the encoding declared by the meta element
         """
         buffer = self.rawStream.read(self.numBytesMeta)
-        assert isinstance(buffer, str)
+        assert isinstance(buffer, bytes)
         parser = EncodingParser(buffer)
         self.rawStream.seek(0)
         encoding = parser.getEncoding()
@@ -781,7 +782,7 @@ def parse(self):
 def codecName(encoding):
     """Return the python codec name corresponding to an encoding or None if the
     string doesn't correspond to a valid encoding."""
-    if (encoding is not None and type(encoding) in types.StringTypes):
+    if encoding:
         canonicalName = ascii_punctuation_re.sub("", encoding).lower()
         return encodings.get(canonicalName, None)
     else:
 
@@ -303,7 +303,7 @@ def serialize(self, treewalker, encoding=None):
 
     def render(self, treewalker, encoding=None):
         if encoding:
-            return "".join(list(self.serialize(treewalker, encoding)))
+            return b"".join(list(self.serialize(treewalker, encoding)))
         else:
             return u"".join(list(self.serialize(treewalker)))
 
 
@@ -64,6 +64,7 @@
 
 def html5lib_test_files(subdirectory, files='*.dat'):
     return glob.glob(os.path.join(test_dir,subdirectory,files))
+html5lib_test_files.__test__ = False
 
 class DefaultDict(dict):
     def __init__(self, default, *args, **kwargs):
@@ -77,6 +78,9 @@ class TestData(object):
     def __init__(self, filename, newTestHeading="data"):
         self.f = codecs.open(filename, encoding="utf8")
         self.newTestHeading = newTestHeading
+
+    def __del__(self):
+        self.f.close()
 
     def __iter__(self):
         data = DefaultDict(None)
@@ -114,14 +118,14 @@ def normaliseOutput(self, data):
 def convert(stripChars):
     def convertData(data):
         """convert the output of str(document) to the format used in the testcases"""
-        data = data.split("\n")
+        data = data.split(u"\n")
         rv = []
         for line in data:
-            if line.startswith("|"):
+            if line.startswith(u"|"):
                 rv.append(line[stripChars:])
             else:
                 rv.append(line)
-        return "\n".join(rv)
+        return u"\n".join(rv)
     return convertData
 
 convertExpected = convert(2)
@@ -1,17 +1,27 @@
+import re
 import os
 import unittest
-from support import html5lib_test_files, TestData, test_dir
 
-from html5lib import HTMLParser, inputstream
+try:
+    unittest.TestCase.assertEqual
+except AttributeError:
+    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
 
-import re, unittest
+from support import html5lib_test_files, TestData, test_dir
+from html5lib import HTMLParser, inputstream
 
 class Html5EncodingTestCase(unittest.TestCase):
-    def test_codec_name(self):
-        self.assertEquals(inputstream.codecName("utf-8"), "utf-8")
-        self.assertEquals(inputstream.codecName("utf8"), "utf-8")
-        self.assertEquals(inputstream.codecName("  utf8  "), "utf-8")
-        self.assertEquals(inputstream.codecName("ISO_8859--1"), "windows-1252")
+    def test_codec_name_a(self):
+        self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
+
+    def test_codec_name_b(self):
+        self.assertEqual(inputstream.codecName("utf8"), "utf-8")
+
+    def test_codec_name_c(self):
+        self.assertEqual(inputstream.codecName("  utf8  "), "utf-8")
+
+    def test_codec_name_d(self):
+        self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 def buildTestSuite():
     for filename in html5lib_test_files("encoding"):
 
@@ -18,9 +18,9 @@
 #XXX - There should just be one function here but for some reason the testcase
 #format differs from the treedump format by a single space character
 def convertTreeDump(data):
-    return "\n".join(convert(3)(data).split("\n")[1:])
+    return u"\n".join(convert(3)(data).split(u"\n")[1:])
 
-namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
+namespaceExpected = re.compile(ur"^(\s*)<(\S+)>", re.M).sub
 
 
 def runParserTest(innerHTML, input, expected, errors, treeClass,
@@ -44,17 +44,17 @@ def runParserTest(innerHTML, input, expected, errors, treeClass,
     except:
         errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                                u"\nTraceback:", traceback.format_exc().decode('utf8')])
-        assert False, errorMsg.encode("utf8")
+        assert False, errorMsg
 
     output = convertTreeDump(p.tree.testSerializer(document))
 
     expected = convertExpected(expected)
     if namespaceHTMLElements:
-        expected = namespaceExpected(r"\1<html \2>", expected)
+        expected = namespaceExpected(ur"\1<html \2>", expected)
 
     errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                            u"\nReceived:", output])
-    assert expected == output, errorMsg.encode("utf8")
+    assert expected == output, errorMsg
     errStr = [u"Line: %i Col: %i %s"%(line, col, 
                                       constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
               ((line,col), errorcode, datavars) in p.errors]
@@ -63,7 +63,7 @@ def runParserTest(innerHTML, input, expected, errors, treeClass,
                             u"\nExpected errors (" + unicode(len(errors)) + u"):\n" + u"\n".join(errors),
                             u"\nActual errors (" + unicode(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
     if checkParseErrors:
-            assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
+            assert len(p.errors) == len(errors), errorMsg2
 
 def test_parser():
     sys.stderr.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
@@ -87,6 +87,3 @@ def test_parser():
                     print input
                     yield (runParserTest, innerHTML, input, expected, errors, treeCls,
                            namespaceHTMLElements)
-                    break
-                
-                
@@ -7,6 +7,11 @@
 except ImportError:
     import simplejson as json
 
+try:
+    unittest.TestCase.assertEqual
+except AttributeError:
+    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
+
 import html5lib
 from html5lib import html5parser, serializer, constants
 from html5lib.treewalkers._base import TreeWalker
@@ -83,7 +88,16 @@ def serialize_xhtml(input, options):
     options = dict([(str(k),v) for k,v in options.iteritems()])
     return serializer.XHTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
 
-def make_test(input, expected, xhtml, options):
+def runSerializerTest(input, expected, xhtml, options):
+    encoding = options.get("encoding", None)
+
+    if encoding:
+        encode = lambda x: x.encode(encoding)
+        expected = map(encode, expected)
+        if xhtml:
+            xhtml = map(encode, xhtml)
+        
+    
     result = serialize_html(input, options)
     if len(expected) == 1:
         assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:False\n%s"%(expected[0], result, str(options))
@@ -114,13 +128,12 @@ def testDoctypeSystemId(self):
         self.throwsWithLatin1([["Doctype", u"potato", u"potato", u"\u0101"]])
 
     def testCdataCharacters(self):
-        self.assertEquals("<style>&amacr;", serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
-                                                            ["Characters", u"\u0101"]],
-                                                           {"encoding": "iso-8859-1"}))
+        runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", u"\u0101"]],
+                          [u"<style>&amacr;"], None, {"encoding": "iso-8859-1"})
 
     def testCharacters(self):
-        self.assertEquals("&amacr;", serialize_html([["Characters", u"\u0101"]],
-                                                    {"encoding": "iso-8859-1"}))
+        runSerializerTest([["Characters", u"\u0101"]],
+                          [u"&amacr;"], None, {"encoding": "iso-8859-1"})
 
     def testStartTagName(self):
         self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])
@@ -132,9 +145,9 @@ def testAttributeName(self):
         self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"span", [{"namespace": None, "name": u"\u0101", "value": u"potato"}]]])
 
     def testAttributeValue(self):
-        self.assertEquals("<span potato=&amacr;>", serialize_html([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
-                                                                    [{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
-                                                                  {"encoding": "iso-8859-1"}))
+        runSerializerTest([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
+                            [{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
+                          [u"<span potato=&amacr;>"], None, {"encoding": "iso-8859-1"})
 
     def testEndTagName(self):
         self.throwsWithLatin1([["EndTag", u"http://www.w3.org/1999/xhtml", u"\u0101"]])
@@ -154,27 +167,27 @@ def testEntityReplacement(self):
             doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
             tree = etree.fromstring(doc, parser = self.parser).getroottree()
             result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
-            self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)
+            self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)
 
         def testEntityXML(self):
             doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
             tree = etree.fromstring(doc, parser = self.parser).getroottree()
             result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
-            self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)
+            self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)
 
         def testEntityNoResolve(self):
             doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
             tree = etree.fromstring(doc, parser = self.parser).getroottree()
             result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
                                           resolve_entities=False)
-            self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
+            self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
 
 def test_serializer():
     for filename in html5lib_test_files('serializer', '*.test'):
-        tests = json.load(file(filename))
+        tests = json.load(open(filename))
         test_name = os.path.basename(filename).replace('.test','')
         for index, test in enumerate(tests['tests']):
             xhtml = test.get("xhtml", test["expected"])
             if test_name == 'optionaltags': 
                 xhtml = None
-            yield make_test, test["input"], test["expected"], xhtml, test.get("options", {})
+            yield runSerializerTest, test["input"], test["expected"], xhtml, test.get("options", {})
@@ -1,3 +1,5 @@
+from __future__ import with_statement
+
 import sys
 import os
 import unittest
@@ -137,6 +139,7 @@ def decode(inp):
                     del token[2][key]
                     token[2][decode(key)] = decode(value)
     return test
+unescape_test.__test__ = False
 
 
 def runTokenizerTest(test):
@@ -161,7 +164,7 @@ def runTokenizerTest(test):
                           "\nInput:", unicode(test['input']),
                           "\nExpected:", unicode(expected),
                           "\nreceived:", unicode(tokens)])
-    errorMsg = errorMsg.encode("utf-8")
+    errorMsg = errorMsg
     ignoreErrorOrder = test.get('ignoreErrorOrder', False)
     assert tokensMatch(expected, received, ignoreErrorOrder), errorMsg
 
@@ -179,15 +182,16 @@ def capitalize(s):
 
 def test_tokenizer():
     for filename in html5lib_test_files('tokenizer', '*.test'):
-        tests = json.load(file(filename))
-        testName = os.path.basename(filename).replace(".test","")
-        if 'tests' in tests:
-            for index,test in enumerate(tests['tests']):
-                #Skip tests with a self closing flag
-                skip = False
-                if 'initialStates' not in test:
-                    test["initialStates"] = ["Data state"]
-                for initialState in test["initialStates"]:
-                    test["initialState"] = capitalize(initialState)
-                    yield runTokenizerTest, test
+        with open(filename) as fp:
+            tests = json.load(fp)
+            testName = os.path.basename(filename).replace(".test","")
+            if 'tests' in tests:
+                for index,test in enumerate(tests['tests']):
+                    #Skip tests with a self closing flag
+                    skip = False
+                    if 'initialStates' not in test:
+                        test["initialStates"] = ["Data state"]
+                    for initialState in test["initialStates"]:
+                        test["initialState"] = capitalize(initialState)
+                        yield runTokenizerTest, test
 
@@ -3,6 +3,11 @@
 import unittest
 import warnings
 
+try:
+    unittest.TestCase.assertEqual
+except AttributeError:
+    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
+
 warnings.simplefilter("error")
 
 from support import html5lib_test_files, TestData, convertExpected
@@ -263,9 +268,9 @@ def test_all_tokens(self):
             document = treeCls.get("adapter", lambda x: x)(document)
             output = treeCls["walker"](document)
             for expectedToken, outputToken in zip(expected, output):
-                self.assertEquals(expectedToken, outputToken)
+                self.assertEqual(expectedToken, outputToken)
 
-def run_test(innerHTML, input, expected, errors, treeClass):
+def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
     try:
         p = html5parser.HTMLParser(tree = treeClass["builder"])
         if innerHTML:
@@ -305,6 +310,6 @@ def test_treewalker():
                                                                "document-fragment",
                                                                "document")]
                 errors = errors.split("\n")
-                yield run_test, innerHTML, input, expected, errors, treeCls
+                yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls
 
 
@@ -4,13 +4,18 @@
 from html5lib.constants import spaceCharacters
 spaceCharacters = u"".join(spaceCharacters)
 
+try:
+    unittest.TestCase.assertEqual
+except AttributeError:
+    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
+
 class TestCase(unittest.TestCase):
     def runTest(self, input, expected):
         output = list(Filter(input))
         errorMsg = "\n".join(["\n\nInput:", str(input),
                               "\nExpected:", str(expected),
                               "\nReceived:", str(output)])
-        self.assertEquals(output, expected, errorMsg)
+        self.assertEqual(output, expected, errorMsg)
 
     def runTestUnmodifiedOutput(self, input):
         self.runTest(input, input)
 
@@ -42,7 +42,7 @@ def __init__(self, name):
         self.childNodes = []
         self._flags = []
 
-    def __unicode__(self):
+    def __str__(self):
         attributesStr =  " ".join(["%s=\"%s\""%(name, value) 
                                    for name, value in 
                                    self.attributes.iteritems()])