Set the right character encoding when a unicode object is passed to t… · awesome-python/html5lib-python@e97b068 · GitHub
[go: up one dir, main page]

Skip to content

Commit e97b068

Browse files
committed
Set the right character encoding when a unicode object is passed to the inputstream. We now pass all tests again!
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40909
1 parent b4a73f0 commit e97b068

File tree

2 files changed

+9
-5
lines changed

2 files changed

+9
-5
lines changed

src/html5lib/inputstream.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
3636
# List of where new lines occur
3737
self.newLines = [0]
3838

39-
# Raw Stream
39+
self.charEncoding = encoding
40+
41+
# Raw Stream - for unicode objects this will encode to utf-8 and set
42+
# self.charEncoding as appropriate
4043
self.rawStream = self.openStream(source)
4144

4245
# Encoding Information
@@ -49,9 +52,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
4952
self.defaultEncoding = "windows-1252"
5053

5154
#Detect encoding iff no explicit "transport level" encoding is supplied
52-
if encoding is None or not isValidEncoding(encoding):
53-
encoding = self.detectEncoding(parseMeta, chardet)
54-
self.charEncoding = encoding
55+
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
56+
self.charEncoding = self.detectEncoding(parseMeta, chardet)
5557

5658
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
5759

@@ -74,6 +76,7 @@ def openStream(self, source):
7476
# Otherwise treat source as a string and convert to a file object
7577
if isinstance(source, unicode):
7678
source = source.encode('utf-8')
79+
self.charEncoding = "utf-8"
7780
import cStringIO
7881
stream = cStringIO.StringIO(str(source))
7982
return stream

tests/test_sanitizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ def addTest(cls, name, expected, input):
88
def test(self, expected=expected, input=input):
99
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
1010
parseFragment(expected).childNodes])
11+
expected = simplejson.loads(simplejson.dumps(expected))
1112
self.assertEqual(expected, self.sanitize_html(input))
1213
setattr(cls, name, test)
1314
addTest = classmethod(addTest)
@@ -73,7 +74,7 @@ def test_should_handle_astral_plane_characters(self):
7374
def buildTestSuite():
7475
for filename in html5lib_test_files("sanitizer"):
7576
for test in simplejson.load(file(filename)):
76-
SanitizeTest.addTest('test_' + test['name'], test['output'], test['input'])
77+
SanitizeTest.addTest('test_' + test['name'], test['output'], test['input'])
7778

7879
return unittest.TestLoader().loadTestsFromTestCase(SanitizeTest)
7980

0 commit comments

Comments
 (0)
0