Merge · awesome-python/html5lib-python@4482710 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4482710

Browse files
author
James Graham
committed
Merge
2 parents f0cf46b + dc827e8 commit 4482710

File tree

4 files changed

+100
-16
lines changed

4 files changed

+100
-16
lines changed

html5lib/constants.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
E = {
1414
"null-character":
1515
_(u"Null character in input stream, replaced with U+FFFD."),
16-
"invalid-character":
16+
"invalid-codepoint":
1717
_(u"Invalid codepoint in stream."),
1818
"incorrectly-placed-solidus":
1919
_(u"Solidus (/) incorrectly placed in tag."),
@@ -74,6 +74,10 @@
7474
_(u"Unexpected = in unquoted attribute"),
7575
'unexpected-character-in-unquoted-attribute-value':
7676
_(u"Unexpected character in unquoted attribute"),
77+
"invalid-character-after-attribute-name":
78+
_(u"Unexpected character after attribute name."),
79+
"unexpected-character-after-attribute-value":
80+
_(u"Unexpected character after attribute value."),
7781
"eof-in-attribute-value-double-quote":
7882
_(u"Unexpected end of file in attribute value (\")."),
7983
"eof-in-attribute-value-single-quote":
@@ -100,6 +104,10 @@
100104
_(u"Unexpected '-' after '--' found in comment."),
101105
"eof-in-comment-double-dash":
102106
_(u"Unexpected end of file in comment (--)."),
107+
"eof-in-comment-end-space-state":
108+
_(u"Unexpected end of file in comment."),
109+
"eof-in-comment-end-bang-state":
110+
_(u"Unexpected end of file in comment."),
103111
"unexpected-char-in-comment":
104112
_(u"Unexpected character in comment found."),
105113
"need-space-after-doctype":

html5lib/inputstream.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
134134
#Craziness
135135
if len(u"\U0010FFFF") == 1:
136136
self.reportCharacterErrors = self.characterErrorsUCS4
137+
self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
137138
else:
138139
self.reportCharacterErrors = self.characterErrorsUCS2
140+
self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
139141

140142
# List of where new lines occur
141143
self.newLines = [0]
@@ -159,6 +161,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
159161
if (self.charEncoding[0] is None):
160162
self.charEncoding = self.detectEncoding(parseMeta, chardet)
161163

164+
162165
self.reset()
163166

164167
def reset(self):
@@ -175,8 +178,8 @@ def reset(self):
175178
# number of columns in the last line of the previous chunk
176179
self.prevNumCols = 0
177180

178-
#Flag to indicate we may have a CR LF broken across a data chunk
179-
self._lastChunkEndsWithCR = False
181+
#Deal with CR LF and surrogates split over chunk boundaries
182+
self._bufferedCharacter = None
180183

181184
def openStream(self, source):
182185
"""Produces a file object from source.
@@ -341,20 +344,27 @@ def readChunk(self, chunkSize=None):
341344
self.chunkOffset = 0
342345

343346
data = self.dataStream.read(chunkSize)
344-
345-
if not data:
347+
348+
#Deal with CR LF and surrogates broken across chunks
349+
if self._bufferedCharacter:
350+
data = self._bufferedCharacter + data
351+
self._bufferedCharacter = None
352+
elif not data:
353+
# We have no more data, bye-bye stream
346354
return False
347355

356+
if len(data) > 1:
357+
lastv = ord(data[-1])
358+
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
359+
self._bufferedCharacter = data[-1]
360+
data = data[:-1]
361+
348362
self.reportCharacterErrors(data)
349-
363+
364+
# Replace invalid characters
350365
data = data.replace(u"\u0000", u"\ufffd")
351-
#Check for CR LF broken across chunks
352-
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
353-
data = data[1:]
354-
# Stop if the chunk is now empty
355-
if not data:
356-
return False
357-
self._lastChunkEndsWithCR = data[-1] == u"\r"
366+
data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
367+
358368
data = data.replace(u"\r\n", u"\n")
359369
data = data.replace(u"\r", u"\n")
360370

@@ -394,8 +404,6 @@ def characterErrorsUCS2(self, data):
394404
else:
395405
skip = False
396406
self.errors.append("invalid-codepoint")
397-
#This is still wrong if it is possible for a surrogate pair to break a
398-
#chunk boundary
399407

400408
def charsUntil(self, characters, opposite = False):
401409
""" Returns a string of characters from the stream up to but not

html5lib/serializer/htmlserializer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@
2727
for k, v in entities.items():
2828
if v != "&" and encode_entity_map.get(v) != k.lower():
2929
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
30-
encode_entity_map[ord(v)] = k
30+
if len(v) == 2:
31+
v = utils.surrogatePairToCodepoint(v)
32+
else:
33+
v = ord(v)
34+
encode_entity_map[v] = k
3135

3236
def htmlentityreplace_errors(exc):
3337
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):

html5lib/tests/tokenizertotree.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import sys
2+
import os
3+
import json
4+
import re
5+
6+
import html5lib
7+
import support
8+
import test_parser
9+
import test_tokenizer
10+
11+
# Shared parser instance reused by make_test for every test input.
p = html5lib.HTMLParser()

# Bound .sub of a multiline regex matching serialized root lines of the
# form "<html X>"; called as unnamespaceExpected(r"\1<\2>", output).
# NOTE(review): this rewrites "<html X>" to "<X>" — presumably dropping a
# namespace prefix from the tree dump; confirm against testSerializer output.
unnamespaceExpected = re.compile(r"^(\s*)<html (\S+)>", re.M).sub
14+
15+
def main(out_path):
    """Convert every tokenizer ``*.test`` file into a tree-construction test.

    ``out_path`` is the directory that receives the generated
    ``tokenizer_<name>.dat`` files.  Exits with status 1 when the
    directory does not exist.
    """
    if not os.path.exists(out_path):
        # Bug fix: message previously lacked a trailing newline
        # (run_file's stderr write includes one).
        sys.stderr.write("Path %s does not exist\n" % out_path)
        sys.exit(1)

    for filename in support.html5lib_test_files('tokenizer', '*.test'):
        run_file(filename, out_path)
22+
23+
def run_file(filename, out_path):
    """Generate one tree-construction ``.dat`` file from a tokenizer test file.

    Files whose JSON cannot be parsed are skipped with a warning on
    stderr.  The output file ``out_path/tokenizer_<name>.dat`` is created
    even when the test file contains no runnable tests (preserving the
    original behaviour).
    """
    try:
        # open() instead of the deprecated file() builtin, and a context
        # manager so the input handle is closed instead of leaked.
        with open(filename) as in_file:
            tests_data = json.load(in_file)
    except ValueError:
        sys.stderr.write("Failed to load %s\n" % filename)
        return
    name = os.path.splitext(os.path.split(filename)[1])[0]

    with open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w") as output_file:
        if 'tests' in tests_data:
            for test_data in tests_data['tests']:
                if 'initialStates' not in test_data:
                    test_data["initialStates"] = ["Data state"]

                for initial_state in test_data["initialStates"]:
                    if initial_state != "Data state":
                        # Only the default tokenizer state is supported yet.
                        continue
                    output_file.write(make_test(test_data))
45+
46+
def make_test(test_data):
    """Render a single tokenizer test as one tree-construction .dat entry.

    Parses the test's input with the shared parser ``p``, normalizes the
    serialized tree dump, and returns the ``#data``/``#errors``/``#document``
    record as a single newline-joined string.
    """
    if 'doubleEscaped' in test_data:
        test_data = test_tokenizer.unescape_test(test_data)

    source = test_data["input"]
    tree = p.parse(source)
    dump = test_parser.convertTreeDump(p.tree.testSerializer(tree))
    dump = test_parser.attrlist.sub(test_parser.sortattrs, dump)
    dump = unnamespaceExpected(r"\1<\2>", dump)

    record = [
        "#data",
        source.encode("utf8"),
        "#errors",
        "#document",
        dump.encode("utf8"),
        "",
    ]
    return "\n".join(record)
62+
63+
if __name__ == "__main__":
    # Usage: python tokenizertotree.py <output-directory>
    main(sys.argv[1])

0 commit comments

Comments (0)