@@ -75,7 +75,7 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7575 "afterBody" : AfterBodyPhase (self , self .tree ),
7676 "inFrameset" : InFramesetPhase (self , self .tree ),
7777 "afterFrameset" : AfterFramesetPhase (self , self .tree ),
78- "trailingEnd" : TrailingEndPhase (self , self .tree )
78+ "trailingEnd" : TrailingEndPhase (self , self .tree ),
7979 # XXX after after body
8080 # XXX after after frameset
8181 # XXX trailingEnd is gone
@@ -117,10 +117,11 @@ def _parse(self, stream, innerHTML=False, container="div",
117117 # relevant ... need others too
118118 self .lastPhase = None
119119
120+ self .beforeRCDataPhase = None
121+
120122 # XXX This is temporary for the moment so there isn't any other
121123 # changes needed for the parser to work with the iterable tokenizer
122- for token in self .tokenizer :
123- token = self .normalizeToken (token )
124+ for token in self .normalizedTokens ():
124125 type = token ["type" ]
125126 method = getattr (self .phase , "process%s" % type , None )
126127 if type in ("Characters" , "SpaceCharacters" , "Comment" ):
@@ -137,6 +138,10 @@ def _parse(self, stream, innerHTML=False, container="div",
137138 # When the loop finishes it's EOF
138139 self .phase .processEOF ()
139140
def normalizedTokens(self):
    """Iterate over the tokenizer, yielding normalized tokens.

    Each token produced by ``self.tokenizer`` is passed through
    ``self.normalizeToken`` before being yielded, so callers never see
    a raw tokenizer token.
    """
    normalize = self.normalizeToken
    for rawToken in self.tokenizer:
        yield normalize(rawToken)
144+
140145 def parse (self , stream , encoding = None , parseMeta = True , useChardet = True ):
141146 """Parse a HTML document into a well-formed tree
142147
@@ -238,6 +243,29 @@ def resetInsertionMode(self):
238243 self .phase = self .phases ["inBody" ]
239244 break
240245
def parseRCDataCData(self, name, attributes, contentType):
    """Generic (R)CDATA Parsing algorithm

    Inserts an element for the start tag, switches the tokenizer's content
    model flag to ``contentType`` and then consumes tokens itself: character
    tokens are inserted as text and parse errors are reported, until the
    first other token (asserted to be the matching end tag) or EOF.  In both
    cases the inserted element is popped from the stack of open elements.

    name - tag name of the element to insert
    attributes - attributes passed through to tree.insertElement
    contentType - RCDATA or CDATA
    """
    assert contentType in ("CDATA", "RCDATA")

    element = self.tree.insertElement(name, attributes)
    self.tokenizer.contentModelFlag = contentModelFlags[contentType]

    for token in self.normalizedTokens():
        tokenType = token["type"]
        if tokenType in ("Characters", "SpaceCharacters"):
            self.tree.insertText(token["data"])
        elif tokenType == "ParseError":
            self.parseError(token["data"], token.get("datavars", {}))
        else:
            assert self.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
            assert tokenType == "EndTag" and token["name"] == name, repr(token)
            # The pop must happen outside the assert: assert statements are
            # removed under ``python -O``, which would otherwise leave the
            # element on the stack of open elements.
            popped = self.tree.openElements.pop()
            assert popped == element
            return
    # Otherwise we hit EOF
    popped = self.tree.openElements.pop()
    assert popped == element
    self.parseError("expected-closing-tag-but-got-eof")
268+
241269class Phase (object ):
242270 """Base class for helper object that implements each phase of processing
243271 """
@@ -298,29 +326,6 @@ def startTagHtml(self, name, attributes):
def processEndTag(self, name):
    """Dispatch an end tag to the handler registered for ``name``.

    The handler is looked up in ``self.endTagHandler`` and called with the
    tag name as its only argument.
    """
    handler = self.endTagHandler[name]
    handler(name)
300328
301- def parseRCDataCData (self , name , attributes , contentType ):
302- """Generic (R)CDATA Parsing algorithm
303- contentType - RCDATA or CDATA
304- """
305- assert contentType in ("CDATA" , "RCDATA" )
306- element = self .tree .insertElement (name , attributes )
307- self .parser .tokenizer .contentModelFlag = contentModelFlags [contentType ]
308- for token in self .parser .tokenizer :
309- token = self .parser .normalizeToken (token )
310- if token ["type" ] in ("Characters" , "SpaceCharacters" ):
311- self .tree .insertText (token ["data" ])
312- elif token ["type" ] == "ParseError" :
313- self .parser .parseError (token ["data" ], token .get ("datavars" , {}))
314- else :
315- assert self .parser .tokenizer .contentModelFlag == contentModelFlags ["PCDATA" ]
316- assert token ["type" ] == "EndTag" and token ["name" ] == name , repr (token )
317- assert self .tree .openElements .pop () == element
318- return
319- #Otherwise we hit EOF
320- assert self .tree .openElements .pop () == element
321- self .parser .parseError ("expected-closing-tag-but-got-eof" )
322-
323-
324329class InitialPhase (Phase ):
325330 # This phase deals with error handling as well which is currently not
326331 # covered in the specification. The error handling is typically known as
@@ -586,18 +591,18 @@ def startTagHead(self, name, attributes):
586591 self .parser .parseError ("two-heads-are-not-better-than-one" )
587592
def startTagTitle(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA algorithm,
    requesting the RCDATA content type.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "RCDATA")
590595
def startTagStyle(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA algorithm,
    requesting the CDATA content type.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
593598
def startTagNoScript(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA algorithm,
    requesting the CDATA content type.
    """
    # Need to decide whether to implement the scripting-disabled case
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
597602
def startTagScript(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA algorithm,
    requesting the CDATA content type.
    """
    # I think this is equivalent to the CDATA stuff since we don't execute
    # script
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
601606
602607 def startTagBaseLinkMeta (self , name , attributes ):
603608 if (self .tree .headPointer is not None and self .parser .phase == self .parser .phases ["inHead" ]):
@@ -612,7 +617,7 @@ def startTagOther(self, name, attributes):
612617 self .parser .phase .processStartTag (name , attributes )
613618
def endTagHead(self, name):
    """Close the current element, asserted to be ``head``, and switch the
    parser to its "afterHead" phase.
    """
    stack = self.tree.openElements
    # The assert only reads the stack; the pop is a separate statement.
    assert stack[-1].name == "head", "Expected head got %s" % stack[-1].name
    stack.pop()
    parser = self.parser
    parser.phase = parser.phases["afterHead"]
618623
@@ -922,7 +927,7 @@ def startTagAppletMarqueeObject(self, name, attributes):
922927
def startTagXmp(self, name, attributes):
    """Reconstruct the active formatting elements, then hand the start tag
    to the parser's generic (R)CDATA algorithm with the CDATA content type.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
926931
927932 def startTagTable (self , name , attributes ):
928933 if self .tree .elementInScope ("p" ):
@@ -982,7 +987,7 @@ def startTagTextarea(self, name, attributes):
982987
def startTagCdata(self, name, attributes):
    """iframe, noembed, noframes, noscript (if scripting enabled)

    Hands the start tag to the parser's generic (R)CDATA algorithm with the
    CDATA content type.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
986991
987992 def startTagSelect (self , name , attributes ):
988993 self .tree .reconstructActiveFormattingElements ()
0 commit comments