99
1010from constants import contentModelFlags , spaceCharacters
1111from constants import entitiesWindows1252 , entities
12- from constants import asciiLowercase , asciiLetters
12+ from constants import asciiLowercase , asciiLetters , asciiUpper2Lower
1313from constants import digits , hexDigits , EOF
1414
1515from inputstream import HTMLInputStream
@@ -57,6 +57,14 @@ def __init__(self, stream, encoding=None, parseMeta=True):
5757 "beforeDoctypeName" :self .beforeDoctypeNameState ,
5858 "doctypeName" :self .doctypeNameState ,
5959 "afterDoctypeName" :self .afterDoctypeNameState ,
60+ "beforeDoctypePublicIdentifier" :self .beforeDoctypePublicIdentifierState ,
61+ "doctypePublicIdentifierDoubleQuoted" :self .doctypePublicIdentifierDoubleQuotedState ,
62+ "doctypePublicIdentifierSingleQuoted" :self .doctypePublicIdentifierSingleQuotedState ,
63+ "afterDoctypePublicIdentifier" :self .afterDoctypePublicIdentifierState ,
64+ "beforeDoctypeSystemIdentifier" :self .beforeDoctypeSystemIdentifierState ,
65+ "doctypeSystemIdentifierDoubleQuoted" :self .doctypeSystemIdentifierDoubleQuotedState ,
66+ "doctypeSystemIdentifierSingleQuoted" :self .doctypeSystemIdentifierSingleQuotedState ,
67+ "afterDoctypeSystemIdentifier" :self .afterDoctypeSystemIdentifierState ,
6068 "bogusDoctype" :self .bogusDoctypeState
6169 }
6270
@@ -600,8 +608,8 @@ def markupDeclarationOpenState(self):
600608 # Put in explicit EOF check
601609 if (not EOF in charStack and
602610 "" .join (charStack ).upper () == u"DOCTYPE" ):
603- self .currentToken = \
604- { "type" : "Doctype " , "name" : "" , " data" : True }
611+ self .currentToken = { "type" : "Doctype" , "name" : "" , "publicId" : "" ,
612+ "systemId" : " " , "data" :True }
605613 self .state = self .states ["doctype" ]
606614 else :
607615 self .tokenQueue .append ({"type" : "ParseError" , "data" :
@@ -678,12 +686,7 @@ def beforeDoctypeNameState(self):
678686 data = self .stream .char ()
679687 if data in spaceCharacters :
680688 pass
681- elif data in asciiLowercase :
682- self .currentToken ["name" ] = data .upper ()
683- self .state = self .states ["doctypeName" ]
684689 elif data == u">" :
685- # Character needs to be consumed per the specification so don't
686- # invoke emitCurrentTokenWithParseError with "data" as argument.
687690 self .tokenQueue .append ({"type" : "ParseError" , "data" :
688691 _ ("Unexpected > character. Expected DOCTYPE name." )})
689692 self .tokenQueue .append (self .currentToken )
@@ -700,10 +703,8 @@ def beforeDoctypeNameState(self):
700703
701704 def doctypeNameState (self ):
702705 data = self .stream .char ()
703- needsDoctypeCheck = False
704706 if data in spaceCharacters :
705707 self .state = self .states ["afterDoctypeName" ]
706- needsDoctypeCheck = True
707708 elif data == u">" :
708709 self .tokenQueue .append (self .currentToken )
709710 self .state = self .states ["data" ]
@@ -713,17 +714,7 @@ def doctypeNameState(self):
713714 self .tokenQueue .append (self .currentToken )
714715 self .state = self .states ["data" ]
715716 else :
716- # We can't just uppercase everything that arrives here. For
717- # instance, non-ASCII characters.
718- if data in asciiLowercase :
719- data = data .upper ()
720717 self .currentToken ["name" ] += data
721- needsDoctypeCheck = True
722-
723- # After some iterations through this state it should eventually say
724- # "HTML". Otherwise there's an error.
725- if needsDoctypeCheck and self .currentToken ["name" ] == u"HTML" :
726- self .currentToken ["data" ] = False
727718 return True
728719
729720 def afterDoctypeNameState (self ):
@@ -735,21 +726,177 @@ def afterDoctypeNameState(self):
735726 self .state = self .states ["data" ]
736727 elif data == EOF :
737728 self .currentToken ["data" ] = True
738- # XXX EMIT
739729 self .stream .queue .append (data )
740730 self .tokenQueue .append ({"type" : "ParseError" , "data" :
741731 _ ("Unexpected end of file in DOCTYPE." )})
742732 self .tokenQueue .append (self .currentToken )
743733 self .state = self .states ["data" ]
744734 else :
735+ charStack = []
736+ for x in xrange (6 ):
737+ charStack .append (self .stream .char ())
738+ if EOF not in charStack and \
739+ "" .join (charStack ).translate (asciiUpper2Lower ) == "public" :
740+ self .state = self .states ["beforeDoctypePublicIdentifier" ]
741+ elif EOF not in charStack and \
742+ "" .join (charStack ).translate (asciiUpper2Lower ) == "system" :
743+ self .state = self .states ["beforeDoctypeSystemIdentifier" ]
744+ else :
745+ self .stream .queue .extend (charStack )
746+ self .tokenQueue .append ({"type" : "ParseError" , "data" :
747+ _ ("Expected space or '>'. Got '" + data + "'" )})
748+ self .state = self .states ["bogusDoctype" ]
749+ return True
750+
def beforeDoctypePublicIdentifierState(self):
    """Handle the character following the PUBLIC keyword of a DOCTYPE.

    Consumes one character from ``self.stream`` and:

    * skips whitespace (stays in this state);
    * on a double or single quote, switches to the matching quoted
      public-identifier state;
    * on ``>`` or EOF, queues a ParseError, sets the token's ``"data"``
      field to True, emits the token and returns to the data state;
    * on anything else, queues a ParseError and enters the bogus
      DOCTYPE state.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == "\"":
        # Compare against the bare quote character; a literal with a
        # trailing space could never match a single character.
        self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
    elif data == "'":
        self.state = self.states["doctypePublicIdentifierSingleQuoted"]
    elif data == ">":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in DOCTYPE.")})
        self.state = self.states["bogusDoctype"]
    return True
776+
def doctypePublicIdentifierDoubleQuotedState(self):
    """Accumulate a double-quoted DOCTYPE public identifier.

    Consumes one character from ``self.stream``.  A closing double quote
    moves to the after-public-identifier state; EOF queues a ParseError,
    sets the token's ``"data"`` field to True, emits the token and
    returns to the data state; any other character is appended to the
    token's ``"publicId"``.

    Always returns True.
    """
    data = self.stream.char()
    if data == "\"":
        # Must be the bare quote character: the stream yields one
        # character at a time, so a two-character literal never matches.
        self.state = self.states["afterDoctypePublicIdentifier"]
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        self.currentToken["publicId"] += data
    return True
790+
def doctypePublicIdentifierSingleQuotedState(self):
    """Accumulate a single-quoted DOCTYPE public identifier.

    Reads one character.  The closing apostrophe switches to the
    after-public-identifier state, EOF reports a ParseError and emits
    the token flagged through its ``"data"`` field, and every other
    character is appended to ``"publicId"``.  Returns True in all cases.
    """
    char = self.stream.char()

    # Closing quote: the identifier is complete.
    if char == "'":
        self.state = self.states["afterDoctypePublicIdentifier"]
        return True

    # Input ended inside the identifier: report, flag, emit, bail out.
    if char == EOF:
        error = {"type": "ParseError",
                 "data": _("Unexpected end of file in DOCTYPE.")}
        self.tokenQueue.append(error)
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
        return True

    # Ordinary character: part of the identifier.
    self.currentToken["publicId"] += char
    return True
804+
def afterDoctypePublicIdentifierState(self):
    """Handle the character following a DOCTYPE public identifier.

    Consumes one character from ``self.stream`` and:

    * skips whitespace (stays in this state);
    * on a double or single quote, switches to the matching quoted
      system-identifier state;
    * on ``>``, emits the current token and returns to the data state;
    * on EOF, queues a ParseError, sets the token's ``"data"`` field to
      True, emits the token and returns to the data state;
    * on anything else, queues a ParseError and enters the bogus
      DOCTYPE state.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == "\"":
        self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
    elif data == "'":
        # The key must match the casing registered in self.states
        # ("...SingleQuoted"); a lower-case "q" raises KeyError.
        self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in DOCTYPE.")})
        self.state = self.states["bogusDoctype"]
    return True
827+
def beforeDoctypeSystemIdentifierState(self):
    """Handle the character following the SYSTEM keyword of a DOCTYPE.

    Consumes one character from ``self.stream`` and:

    * skips whitespace (stays in this state);
    * on a double or single quote, switches to the matching quoted
      system-identifier state;
    * on ``>``, emits the current token and returns to the data state;
    * on EOF, queues a ParseError, sets the token's ``"data"`` field to
      True, emits the token and returns to the data state;
    * on anything else, queues a ParseError and enters the bogus
      DOCTYPE state.

    Always returns True.
    """
    # Every state must pull its own character; without this read the
    # comparisons below would hit an undefined name.
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == "\"":
        self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
    elif data == "'":
        # The key must match the casing registered in self.states
        # ("...SingleQuoted"); a lower-case "q" raises KeyError.
        self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in DOCTYPE.")})
        self.state = self.states["bogusDoctype"]
    return True
849+
def doctypeSystemIdentifierDoubleQuotedState(self):
    """Accumulate a double-quoted DOCTYPE system identifier.

    Consumes one character from ``self.stream``.  A closing double quote
    moves to the after-system-identifier state; EOF queues a ParseError,
    sets the token's ``"data"`` field to True, emits the token and
    returns to the data state; any other character is appended to the
    token's ``"systemId"``.

    Always returns True.
    """
    data = self.stream.char()
    if data == "\"":
        # Must be the bare quote character: the stream yields one
        # character at a time, so a two-character literal never matches.
        self.state = self.states["afterDoctypeSystemIdentifier"]
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        self.currentToken["systemId"] += data
    return True
863+
def doctypeSystemIdentifierSingleQuotedState(self):
    """Accumulate a single-quoted DOCTYPE system identifier.

    Reads one character.  The closing apostrophe switches to the
    after-system-identifier state, EOF reports a ParseError and emits
    the token flagged through its ``"data"`` field, and every other
    character is appended to ``"systemId"``.  Returns True in all cases.
    """
    char = self.stream.char()

    # Closing quote: the identifier is complete.
    if char == "'":
        self.state = self.states["afterDoctypeSystemIdentifier"]
        return True

    # Input ended inside the identifier: report, flag, emit, bail out.
    if char == EOF:
        error = {"type": "ParseError",
                 "data": _("Unexpected end of file in DOCTYPE.")}
        self.tokenQueue.append(error)
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
        return True

    # Ordinary character: part of the identifier.
    self.currentToken["systemId"] += char
    return True
877+
def afterDoctypeSystemIdentifierState(self):
    """Handle the character following a DOCTYPE system identifier.

    Consumes one character from ``self.stream`` and:

    * skips whitespace (stays in this state);
    * on ``>``, emits the current token and returns to the data state;
    * on EOF, queues a ParseError, sets the token's ``"data"`` field to
      True, emits the token and returns to the data state;
    * on anything else, queues a ParseError and enters the bogus
      DOCTYPE state.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # Same message text as the other DOCTYPE states (no trailing
        # space) so consumers see one consistent error string.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE.")})
        self.currentToken["data"] = True
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in DOCTYPE.")})
        self.state = self.states["bogusDoctype"]
    return True
750896
751897 def bogusDoctypeState (self ):
752898 data = self .stream .char ()
899+ self .currentToken ["data" ] = True
753900 if data == u">" :
754901 self .tokenQueue .append (self .currentToken )
755902 self .state = self .states ["data" ]
0 commit comments