@@ -71,32 +71,40 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
71
71
"trailingEnd" : TrailingEndPhase (self , self .tree )
72
72
}
73
73
74
- def parse (self , stream , encoding = None ):
75
- """Parse a HTML document into a well-formed tree
76
-
77
- stream - a filelike object or string containing the HTML to be parsed
78
-
79
- The optional encoding parameter must be a string that indicates
80
- the encoding. If specified, that encoding will be used,
81
- regardless of any BOM or later declaration (such as in a meta
82
- element)
83
- """
84
-
74
+ def _parse (self , stream , innerHTML = False , container = "div" ,
75
+ encoding = None ):
76
+
85
77
self .tree .reset ()
86
78
self .firstStartTag = False
87
79
self .errors = []
88
80
89
- self .phase = self .phases ["initial" ]
81
+ self .tokenizer = tokenizer .HTMLTokenizer (stream , encoding ,
82
+ parseMeta = innerHTML )
83
+
84
+ if innerHTML :
85
+ self .innerHTML = container .lower ()
86
+
87
+ if self .innerHTML in ('title' , 'textarea' ):
88
+ self .tokenizer .contentModelFlag = tokenizer .contentModelFlags ["RCDATA" ]
89
+ elif self .innerHTML in ('style' , 'script' , 'xmp' , 'iframe' , 'noembed' , 'noframes' , 'noscript' ):
90
+ self .tokenizer .contentModelFlag = tokenizer .contentModelFlags ["CDATA" ]
91
+ elif self .innerHTML == 'plaintext' :
92
+ self .tokenizer .contentModelFlag = tokenizer .contentModelFlags ["PLAINTEXT" ]
93
+ else :
94
+ # contentModelFlag already is PCDATA
95
+ #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
96
+ pass
97
+ self .phase = self .phases ["rootElement" ]
98
+ self .phase .insertHtmlElement ()
99
+ self .resetInsertionMode ()
100
+ else :
101
+ self .innerHTML = False
102
+ self .phase = self .phases ["initial" ]
103
+
90
104
# We only seem to have InBodyPhase testcases where the following is
91
105
# relevant ... need others too
92
106
self .lastPhase = None
93
107
94
- # We don't actually support innerHTML yet but this should allow
95
- # assertations
96
- self .innerHTML = False
97
-
98
- self .tokenizer = tokenizer .HTMLTokenizer (stream , encoding )
99
-
100
108
# XXX This is temporary for the moment so there isn't any other
101
109
# changes needed for the parser to work with the iterable tokenizer
102
110
for token in self .tokenizer :
@@ -115,9 +123,20 @@ def parse(self, stream, encoding=None):
115
123
# When the loop finishes it's EOF
116
124
self .phase .processEOF ()
117
125
126
def parse(self, stream, encoding=None):
    """Parse an HTML document and return the resulting document tree.

    stream - a filelike object or string containing the HTML to be parsed

    encoding - optional string naming the character encoding. When
    given, it is used regardless of any BOM or later declaration
    (such as in a meta element).

    Returns whatever self.tree.getDocument() yields once parsing is done.
    """
    # Whole-document mode: no innerHTML container is involved.
    self._parse(stream, encoding=encoding, innerHTML=False)
    document = self.tree.getDocument()
    return document
119
138
120
def parseFragment(self, stream, container="div", encoding=None):
    """Parse an HTML fragment into a well-formed tree fragment.

    stream - the HTML source to parse; handed unchanged to _parse()

    container - name of the element whose innerHTML property is being
    set. None or an empty string falls back to "div", matching the
    behaviour of the previous implementation whose default was None.

    encoding - optional string naming the character encoding. When
    given, it is used regardless of any BOM or later declaration
    (such as in a meta element).

    Returns whatever self.tree.getFragment() yields once parsing is done.
    """
    # _parse() calls container.lower() unconditionally, so a caller that
    # still passes the old default (container=None) would raise
    # AttributeError. Restore the historical "falsy means div" fallback.
    if not container:
        container = "div"
    self._parse(stream, True, container=container, encoding=encoding)
    return self.tree.getFragment()
178
154
179
155
def parseError (self , data = "XXX ERROR MESSAGE NEEDED" ):
0 commit comments