Fix some tokenizer tests; make new comment states work and don't emit… · html5lib/html5lib-python@1303539 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1303539

Browse files
committed
Fix some tokenizer tests; make new comment states work and don't emit tags after EOF
1 parent 60e94ec commit 1303539

File tree

1 file changed

+57
-4
lines changed

1 file changed

+57
-4
lines changed

src/html5lib/tokenizer.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
7171
"comment":self.commentState,
7272
"commentEndDash":self.commentEndDashState,
7373
"commentEnd":self.commentEndState,
74+
"commentEndBang":self.commentEndBangState,
75+
"commentEndSpace":self.commentEndSpaceState,
7476
"doctype":self.doctypeState,
7577
"beforeDoctypeName":self.beforeDoctypeNameState,
7678
"doctypeName":self.doctypeNameState,
@@ -476,7 +478,7 @@ def tagNameState(self):
476478
elif data is EOF:
477479
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
478480
"eof-in-tag-name"})
479-
self.emitCurrentToken()
481+
self.state = self.states["data"]
480482
elif data == u"/":
481483
self.state = self.states["selfClosingStartTag"]
482484
else:
@@ -504,7 +506,7 @@ def beforeAttributeNameState(self):
504506
elif data is EOF:
505507
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
506508
"expected-attribute-name-but-got-eof"})
507-
self.emitCurrentToken()
509+
self.state = self.states["data"]
508510
else:
509511
self.currentToken["data"].append([data, ""])
510512
self.state = self.states["attributeName"]
@@ -654,7 +656,7 @@ def attributeValueUnQuotedState(self):
654656
self.processEntityInAttribute(None)
655657
elif data == u">":
656658
self.emitCurrentToken()
657-
elif data == u'"' or data == u"'" or data == u"=":
659+
elif data in (u'"', u"'", u"=", u"<"):
658660
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
659661
"unexpected-character-in-unquoted-attribute-value"})
660662
self.currentToken["data"][-1][1] += data
@@ -697,7 +699,6 @@ def selfClosingStartTagState(self):
697699
self.tokenQueue.append({"type": tokenTypes["ParseError"],
698700
"data":
699701
"unexpected-EOF-after-solidus-in-tag"})
700-
self.emitCurrentToken()
701702
self.stream.unget(data)
702703
self.state = self.states["data"]
703704
else:
@@ -846,6 +847,15 @@ def commentEndState(self):
846847
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
847848
"unexpected-dash-after-double-dash-in-comment"})
848849
self.currentToken["data"] += data
850+
elif data in spaceCharacters:
851+
self.currentToken["data"] += "--" + data
852+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
853+
"unexpected-space-after-double-dash-in-comment"})
854+
self.state = self.states["commentEndSpace"]
855+
elif data == "!":
856+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
857+
"unexpected-bang-after-double-dash-in-comment"})
858+
self.state = self.states["commentEndBang"]
849859
elif data is EOF:
850860
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
851861
"eof-in-comment-double-dash"})
@@ -859,10 +869,53 @@ def commentEndState(self):
859869
self.state = self.states["comment"]
860870
return True
861871

872+
def commentEndBangState(self):
873+
data = self.stream.char()
874+
if data == u">":
875+
self.tokenQueue.append(self.currentToken)
876+
self.state = self.states["data"]
877+
elif data == u"-":
878+
self.currentToken["data"] += "--!"
879+
self.state = self.states["commentEndDash"]
880+
elif data is EOF:
881+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
882+
"eof-in-comment-end-bang-state"})
883+
self.tokenQueue.append(self.currentToken)
884+
self.state = self.states["data"]
885+
else:
886+
self.currentToken["data"] += u"--!" + data
887+
self.state = self.states["comment"]
888+
return True
889+
890+
def commentEndSpaceState(self):
891+
data = self.stream.char()
892+
if data == u">":
893+
self.tokenQueue.append(self.currentToken)
894+
self.state = self.states["data"]
895+
elif data == u"-":
896+
self.state = self.states["commentEndDash"]
897+
elif data in spaceCharacters:
898+
self.currentToken["data"] += data
899+
elif data is EOF:
900+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
901+
"eof-in-comment-end-space-state"})
902+
self.tokenQueue.append(self.currentToken)
903+
self.state = self.states["data"]
904+
else:
905+
self.currentToken["data"] += data
906+
self.state = self.states["comment"]
907+
return True
908+
862909
def doctypeState(self):
863910
data = self.stream.char()
864911
if data in spaceCharacters:
865912
self.state = self.states["beforeDoctypeName"]
913+
elif data is EOF:
914+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
915+
"expected-doctype-name-but-got-eof"})
916+
self.currentToken["correct"] = False
917+
self.tokenQueue.append(self.currentToken)
918+
self.state = self.states["data"]
866919
else:
867920
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
868921
"need-space-after-doctype"})

0 commit comments

Comments
 (0)
0