land the new DOCTYPE handling; still things to be fixed · awesome-python/html5lib-python@28edc78 · GitHub
[go: up one dir, main page]

Skip to content

Commit 28edc78

Browse files
committed
land the new DOCTYPE handling; still things to be fixed
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40747
1 parent 866a2a7 commit 28edc78

File tree

1 file changed

+169
-22
lines changed

1 file changed

+169
-22
lines changed

src/tokenizer.py

Lines changed: 169 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from constants import contentModelFlags, spaceCharacters
1111
from constants import entitiesWindows1252, entities
12-
from constants import asciiLowercase, asciiLetters
12+
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
1313
from constants import digits, hexDigits, EOF
1414

1515
from inputstream import HTMLInputStream
@@ -57,6 +57,14 @@ def __init__(self, stream, encoding=None, parseMeta=True):
5757
"beforeDoctypeName":self.beforeDoctypeNameState,
5858
"doctypeName":self.doctypeNameState,
5959
"afterDoctypeName":self.afterDoctypeNameState,
60+
"beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
61+
"doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
62+
"doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
63+
"afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
64+
"beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
65+
"doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
66+
"doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
67+
"afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
6068
"bogusDoctype":self.bogusDoctypeState
6169
}
6270

@@ -600,8 +608,8 @@ def markupDeclarationOpenState(self):
600608
# Put in explicit EOF check
601609
if (not EOF in charStack and
602610
"".join(charStack).upper() == u"DOCTYPE"):
603-
self.currentToken =\
604-
{"type": "Doctype", "name": "", "data": True}
611+
self.currentToken = {"type":"Doctype", "name":"", "publicId":"",
612+
"systemId":"", "data":True}
605613
self.state = self.states["doctype"]
606614
else:
607615
self.tokenQueue.append({"type": "ParseError", "data":
@@ -678,12 +686,7 @@ def beforeDoctypeNameState(self):
678686
data = self.stream.char()
679687
if data in spaceCharacters:
680688
pass
681-
elif data in asciiLowercase:
682-
self.currentToken["name"] = data.upper()
683-
self.state = self.states["doctypeName"]
684689
elif data == u">":
685-
# Character needs to be consumed per the specification so don't
686-
# invoke emitCurrentTokenWithParseError with "data" as argument.
687690
self.tokenQueue.append({"type": "ParseError", "data":
688691
_("Unexpected > character. Expected DOCTYPE name.")})
689692
self.tokenQueue.append(self.currentToken)
@@ -700,10 +703,8 @@ def beforeDoctypeNameState(self):
700703

701704
def doctypeNameState(self):
702705
data = self.stream.char()
703-
needsDoctypeCheck = False
704706
if data in spaceCharacters:
705707
self.state = self.states["afterDoctypeName"]
706-
needsDoctypeCheck = True
707708
elif data == u">":
708709
self.tokenQueue.append(self.currentToken)
709710
self.state = self.states["data"]
@@ -713,17 +714,7 @@ def doctypeNameState(self):
713714
self.tokenQueue.append(self.currentToken)
714715
self.state = self.states["data"]
715716
else:
716-
# We can't just uppercase everything that arrives here. For
717-
# instance, non-ASCII characters.
718-
if data in asciiLowercase:
719-
data = data.upper()
720717
self.currentToken["name"] += data
721-
needsDoctypeCheck = True
722-
723-
# After some iterations through this state it should eventually say
724-
# "HTML". Otherwise there's an error.
725-
if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
726-
self.currentToken["data"] = False
727718
return True
728719

729720
def afterDoctypeNameState(self):
@@ -735,21 +726,177 @@ def afterDoctypeNameState(self):
735726
self.state = self.states["data"]
736727
elif data == EOF:
737728
self.currentToken["data"] = True
738-
# XXX EMIT
739729
self.stream.queue.append(data)
740730
self.tokenQueue.append({"type": "ParseError", "data":
741731
_("Unexpected end of file in DOCTYPE.")})
742732
self.tokenQueue.append(self.currentToken)
743733
self.state = self.states["data"]
744734
else:
735+
charStack = []
736+
for x in xrange(6):
737+
charStack.append(self.stream.char())
738+
if EOF not in charStack and\
739+
"".join(charStack).translate(asciiUpper2Lower) == "public":
740+
self.state = self.states["beforeDoctypePublicIdentifier"]
741+
elif EOF not in charStack and\
742+
"".join(charStack).translate(asciiUpper2Lower) == "system":
743+
self.state = self.states["beforeDoctypeSystemIdentifier"]
744+
else:
745+
self.stream.queue.extend(charStack)
746+
self.tokenQueue.append({"type": "ParseError", "data":
747+
_("Expected space or '>'. Got '" + data + "'")})
748+
self.state = self.states["bogusDoctype"]
749+
return True
750+
751+
def beforeDoctypePublicIdentifierState(self):
752+
data = self.stream.char()
753+
if data in spaceCharacters:
754+
pass
755+
elif data == "\"":
756+
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
757+
elif data == "'":
758+
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
759+
elif data == ">":
760+
self.tokenQueue.append({"type": "ParseError", "data":
761+
_("Unexpected end of DOCTYPE.")})
762+
self.currentToken["data"] = True
763+
self.tokenQueue.append(self.currentToken)
764+
self.state = self.states["data"]
765+
elif data == EOF:
766+
self.tokenQueue.append({"type": "ParseError", "data":
767+
_("Unexpected end of file in DOCTYPE.")})
768+
self.currentToken["data"] = True
769+
self.tokenQueue.append(self.currentToken)
770+
self.state = self.states["data"]
771+
else:
772+
self.tokenQueue.append({"type": "ParseError", "data":
773+
_("Unexpected character in DOCTYPE.")})
774+
self.state = self.states["bogusDoctype"]
775+
return True
776+
777+
def doctypePublicIdentifierDoubleQuotedState(self):
778+
data = self.stream.char()
779+
if data == "\"":
780+
self.state = self.states["afterDoctypePublicIdentifier"]
781+
elif data == EOF:
782+
self.tokenQueue.append({"type": "ParseError", "data":
783+
_("Unexpected end of file in DOCTYPE.")})
784+
self.currentToken["data"] = True
785+
self.tokenQueue.append(self.currentToken)
786+
self.state = self.states["data"]
787+
else:
788+
self.currentToken["publicId"] += data
789+
return True
790+
791+
def doctypePublicIdentifierSingleQuotedState(self):
792+
data = self.stream.char()
793+
if data == "'":
794+
self.state = self.states["afterDoctypePublicIdentifier"]
795+
elif data == EOF:
796+
self.tokenQueue.append({"type": "ParseError", "data":
797+
_("Unexpected end of file in DOCTYPE.")})
798+
self.currentToken["data"] = True
799+
self.tokenQueue.append(self.currentToken)
800+
self.state = self.states["data"]
801+
else:
802+
self.currentToken["publicId"] += data
803+
return True
804+
805+
def afterDoctypePublicIdentifierState(self):
806+
data = self.stream.char()
807+
if data in spaceCharacters:
808+
pass
809+
elif data == "\"":
810+
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
811+
elif data == "'":
812+
self.state = self.states["doctypeSystemIdentifierSinglequoted"]
813+
elif data == ">":
814+
self.tokenQueue.append(self.currentToken)
815+
self.state = self.states["data"]
816+
elif data == EOF:
817+
self.tokenQueue.append({"type": "ParseError", "data":
818+
_("Unexpected end of file in DOCTYPE.")})
819+
self.currentToken["data"] = True
820+
self.tokenQueue.append(self.currentToken)
821+
self.state = self.states["data"]
822+
else:
823+
self.tokenQueue.append({"type": "ParseError", "data":
824+
_("Unexpected character in DOCTYPE.")})
825+
self.state = self.states["bogusDoctype"]
826+
return True
827+
828+
def beforeDoctypeSystemIdentifierState(self):
829+
if data in spaceCharacters:
830+
pass
831+
elif data == "\"":
832+
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
833+
elif data == "'":
834+
self.state = self.states["doctypeSystemIdentifierSinglequoted"]
835+
elif data == ">":
836+
self.tokenQueue.append(self.currentToken)
837+
self.state = self.states["data"]
838+
elif data == EOF:
839+
self.tokenQueue.append({"type": "ParseError", "data":
840+
_("Unexpected end of file in DOCTYPE.")})
841+
self.currentToken["data"] = True
842+
self.tokenQueue.append(self.currentToken)
843+
self.state = self.states["data"]
844+
else:
845+
self.tokenQueue.append({"type": "ParseError", "data":
846+
_("Unexpected character in DOCTYPE.")})
847+
self.state = self.states["bogusDoctype"]
848+
return True
849+
850+
def doctypeSystemIdentifierDoubleQuotedState(self):
851+
data = self.stream.char()
852+
if data == "\"":
853+
self.state = self.states["afterDoctypeSystemIdentifier"]
854+
elif data == EOF:
855+
self.tokenQueue.append({"type": "ParseError", "data":
856+
_("Unexpected end of file in DOCTYPE.")})
857+
self.currentToken["data"] = True
858+
self.tokenQueue.append(self.currentToken)
859+
self.state = self.states["data"]
860+
else:
861+
self.currentToken["systemId"] += data
862+
return True
863+
864+
def doctypeSystemIdentifierSingleQuotedState(self):
865+
data = self.stream.char()
866+
if data == "'":
867+
self.state = self.states["afterDoctypeSystemIdentifier"]
868+
elif data == EOF:
869+
self.tokenQueue.append({"type": "ParseError", "data":
870+
_("Unexpected end of file in DOCTYPE.")})
871+
self.currentToken["data"] = True
872+
self.tokenQueue.append(self.currentToken)
873+
self.state = self.states["data"]
874+
else:
875+
self.currentToken["systemId"] += data
876+
return True
877+
878+
def afterDoctypeSystemIdentifierState(self):
879+
data = self.stream.char()
880+
if data in spaceCharacters:
881+
pass
882+
elif data == ">":
883+
self.tokenQueue.append(self.currentToken)
884+
self.state = self.states["data"]
885+
elif data == EOF:
745886
self.tokenQueue.append({"type": "ParseError", "data":
746-
_("Expected space or '>'. Got '" + data + "'")})
887+
_("Unexpected end of file in DOCTYPE.")})
747888
self.currentToken["data"] = True
889+
self.tokenQueue.append(self.currentToken)
890+
self.state = self.states["data"]
891+
else:
892+
self.tokenQueue.append({"type": "ParseError", "data":
893+
_("Unexpected character in DOCTYPE.")})
748894
self.state = self.states["bogusDoctype"]
749895
return True
750896

751897
def bogusDoctypeState(self):
752898
data = self.stream.char()
899+
self.currentToken["data"] = True
753900
if data == u">":
754901
self.tokenQueue.append(self.currentToken)
755902
self.state = self.states["data"]

0 commit comments

Comments
 (0)
0