@@ -71,14 +71,11 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
71
71
"trailingEnd" : TrailingEndPhase (self , self .tree )
72
72
}
73
73
74
- def parse (self , stream , encoding = None , innerHTML = False ):
74
+ def parse (self , stream , encoding = None ):
75
75
"""Parse a HTML document into a well-formed tree
76
76
77
77
stream - a filelike object or string containing the HTML to be parsed
78
78
79
- innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
80
- is not yet supported)
81
-
82
79
The optional encoding parameter must be a string that indicates
83
80
the encoding. If specified, that encoding will be used,
84
81
regardless of any BOM or later declaration (such as in a meta
@@ -96,7 +93,7 @@ def parse(self, stream, encoding=None, innerHTML=False):
96
93
97
94
# We don't actually support innerHTML yet but this should allow
98
95
# assertations
99
- self .innerHTML = innerHTML
96
+ self .innerHTML = False
100
97
101
98
self .tokenizer = tokenizer .HTMLTokenizer (stream , encoding )
102
99
@@ -119,6 +116,65 @@ def parse(self, stream, encoding=None, innerHTML=False):
119
116
self .phase .processEOF ()
120
117
121
118
return self .tree .getDocument ()
119
+
120
def parseFragment(self, stream, container=None, encoding=None):
    """Parse a HTML fragment into a well-formed tree fragment

    container - name of the element we're setting the innerHTML property
    if set to None, default to 'div'

    stream - a filelike object or string containing the HTML to be parsed

    The optional encoding parameter must be a string that indicates
    the encoding. If specified, that encoding will be used,
    regardless of any BOM or later declaration (such as in a meta
    element)

    Returns the result of self.tree.getFragment() once every token has
    been dispatched and EOF has been processed.
    """
    # Start from a clean state: fresh tree, no start tag seen, no errors.
    self.tree.reset()
    self.firstStartTag = False
    self.errors = []

    # innerHTML stores the lower-cased context element name ('div' when
    # no container was supplied).
    contextName = container and container.lower()
    self.innerHTML = contextName or 'div'

    self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)

    # Choose the tokenizer content model implied by the context element.
    rcdataContexts = ('title', 'textarea')
    cdataContexts = ('style', 'script', 'xmp', 'iframe', 'noembed',
                     'noframes', 'noscript')
    if self.innerHTML in rcdataContexts:
        flagName = "RCDATA"
    elif self.innerHTML in cdataContexts:
        flagName = "CDATA"
    elif self.innerHTML == 'plaintext':
        flagName = "PLAINTEXT"
    else:
        # Leave the tokenizer untouched; per the original note the
        # contentModelFlag already is PCDATA.
        flagName = None
    if flagName is not None:
        self.tokenizer.contentModelFlag = tokenizer.contentModelFlags[flagName]

    # Insert the root html element, then let resetInsertionMode() pick the
    # phase to continue in.
    self.phase = self.phases["rootElement"]
    self.phase.insertHtmlElement()
    self.resetInsertionMode()
    # We only seem to have InBodyPhase testcases where the following is
    # relevant ... need others too
    self.lastPhase = None

    # XXX This is temporary for the moment so there isn't any other
    # changes needed for the parser to work with the iterable tokenizer
    for rawToken in self.tokenizer:
        token = self.normalizeToken(rawToken)
        tokenType = token["type"]
        # Looked up on the *current* phase, which handlers may change
        # between tokens.
        handler = getattr(self.phase, "process%s" % tokenType, None)
        if tokenType in ("Characters", "SpaceCharacters", "Comment"):
            handler(token["data"])
        elif tokenType in ("StartTag", "Doctype"):
            handler(token["name"], token["data"])
        elif tokenType == "EndTag":
            handler(token["name"])
        else:
            # Any other token type is reported as a parse error.
            self.parseError(token["data"])

    # When the loop finishes it's EOF
    self.phase.processEOF()

    return self.tree.getFragment()
122
178
123
179
def parseError (self , data = "XXX ERROR MESSAGE NEEDED" ):
124
180
# XXX The idea is to make data mandatory.
@@ -187,28 +243,29 @@ def resetInsertionMode(self):
187
243
"frameset" :"inFrameset"
188
244
}
189
245
for node in self .tree .openElements [::- 1 ]:
246
+ nodeName = node .name
190
247
if node == self .tree .openElements [0 ]:
191
248
last = True
192
- if node . name not in ['td' , 'th' ]:
249
+ if nodeName not in ['td' , 'th' ]:
193
250
# XXX
194
251
assert self .innerHTML
195
- raise NotImplementedError
252
+ nodeName = self . innerHTML
196
253
# Check for conditions that should only happen in the innerHTML
197
254
# case
198
- if node . name in ("select" , "colgroup" , "head" , "frameset" ):
255
+ if nodeName in ("select" , "colgroup" , "head" , "frameset" ):
199
256
# XXX
200
257
assert self .innerHTML
201
- if node . name in newModes :
202
- self .phase = self .phases [newModes [node . name ]]
258
+ if nodeName in newModes :
259
+ self .phase = self .phases [newModes [nodeName ]]
203
260
break
204
- elif node . name == "html" :
261
+ elif nodeName == "html" :
205
262
if self .tree .headPointer is None :
206
263
self .phase = self .phases ["beforeHead" ]
207
264
else :
208
265
self .phase = self .phases ["afterHead" ]
209
266
break
210
267
elif last :
211
- self .phase = self .phases ["body " ]
268
+ self .phase = self .phases ["inBody " ]
212
269
break
213
270
214
271
class Phase (object ):
@@ -453,10 +510,11 @@ def startTagStyle(self, name, attributes):
453
510
self .parser .tokenizer .contentModelFlag = contentModelFlags ["CDATA" ]
454
511
455
512
def startTagScript (self , name , attributes ):
513
+ #XXX Inner HTML case may be wrong
456
514
element = self .tree .createElement (name , attributes )
457
515
element ._flags .append ("parser-inserted" )
458
- if self .tree .headPointer is not None and \
459
- self .parser .phase == self .parser .phases ["inHead" ]:
516
+ if ( self .tree .headPointer is not None and
517
+ self .parser .phase == self .parser .phases ["inHead" ]) :
460
518
self .appendToHead (element )
461
519
else :
462
520
self .tree .openElements [- 1 ].appendChild (element )
@@ -651,8 +709,8 @@ def startTagFromHead(self, name, attributes):
651
709
652
710
def startTagBody (self , name , attributes ):
653
711
self .parser .parseError (_ (u"Unexpected start tag (body)." ))
654
- if len (self .tree .openElements ) == 1 \
655
- or self .tree .openElements [1 ].name != "body" :
712
+ if ( len (self .tree .openElements ) == 1
713
+ or self .tree .openElements [1 ].name != "body" ) :
656
714
assert self .parser .innerHTML
657
715
else :
658
716
for attr , value in attributes .iteritems ():
@@ -1177,6 +1235,7 @@ def endTagTable(self, name):
1177
1235
self .parser .resetInsertionMode ()
1178
1236
else :
1179
1237
# innerHTML case
1238
+ assert self .parser .innerHTML
1180
1239
self .parser .parseError ()
1181
1240
1182
1241
def endTagIgnore (self , name ):
@@ -1213,23 +1272,25 @@ def __init__(self, parser, tree):
1213
1272
])
1214
1273
self .endTagHandler .default = self .endTagOther
1215
1274
1275
def ignoreEndTagCaption(self):
    """Tell whether a "caption" end tag would currently be ignored

    Returns True when the tree builder reports no "caption" element in
    scope (queried with the second elementInScope argument set to True).
    """
    captionInScope = self.tree.elementInScope("caption", True)
    return not captionInScope
1277
+
1216
1278
def processCharacters(self, data):
    """Handle character data by delegating to the "inBody" phase

    data - the character data taken from the token
    """
    inBodyPhase = self.parser.phases["inBody"]
    inBodyPhase.processCharacters(data)
1218
1280
1219
1281
def startTagTableElement(self, name, attributes):
    """Handle a table-structure start tag seen while in the caption phase

    Reports a parse error, acts as if a "caption" end tag had been seen
    and, unless that end tag was ignored, hands the start tag to whichever
    phase is current afterwards.

    name - tag name of the start tag
    attributes - attributes of the start tag
    """
    parser = self.parser
    parser.parseError()
    # The decision has to be taken *before* the end tag is processed,
    # because processing it may change what ignoreEndTagCaption() reports.
    captionEndIgnored = self.ignoreEndTagCaption()
    parser.phase.processEndTag("caption")
    if captionEndIgnored:
        return
    # parser.phase is read again here on purpose: the end tag handler may
    # have switched phases.
    parser.phase.processStartTag(name, attributes)
1227
1288
1228
1289
def startTagOther(self, name, attributes):
    """Handle any other start tag by delegating to the "inBody" phase

    name - tag name of the start tag
    attributes - attributes of the start tag
    """
    inBodyPhase = self.parser.phases["inBody"]
    inBodyPhase.processStartTag(name, attributes)
1230
1291
1231
1292
def endTagCaption (self , name ):
1232
- if self .tree . elementInScope ( name , True ):
1293
+ if not self .ignoreEndTagCaption ( ):
1233
1294
# AT this code is quite similar to endTagTable in "InTable"
1234
1295
self .tree .generateImpliedEndTags ()
1235
1296
if self .tree .openElements [- 1 ].name != "caption" :
@@ -1242,13 +1303,14 @@ def endTagCaption(self, name):
1242
1303
self .parser .phase = self .parser .phases ["inTable" ]
1243
1304
else :
1244
1305
# innerHTML case
1306
+ assert self .parser .innerHTML
1245
1307
self .parser .parseError ()
1246
1308
1247
1309
def endTagTable(self, name):
    """Handle a "table" end tag seen while in the caption phase

    Reports a parse error, acts as if a "caption" end tag had been seen
    and, unless that end tag was ignored, reprocesses the "table" end tag
    in whichever phase is current afterwards.

    name - tag name of the end tag being handled
    """
    self.parser.parseError()
    # Decide before processing the end tag: handling it may change what
    # ignoreEndTagCaption() reports.
    ignoreEndTag = self.ignoreEndTagCaption()
    self.parser.phase.processEndTag("caption")
    if not ignoreEndTag:
        # The current token is an *end* tag, so it must be reprocessed as
        # one. The previous code called processStartTag(name, attributes),
        # which raised NameError because this handler has no "attributes".
        self.parser.phase.processEndTag(name)
1253
1315
1254
1316
def endTagIgnore (self , name ):
@@ -1277,25 +1339,29 @@ def __init__(self, parser, tree):
1277
1339
])
1278
1340
self .endTagHandler .default = self .endTagOther
1279
1341
1342
def ignoreEndTagColgroup(self):
    """Tell whether a "colgroup" end tag would currently be ignored

    Returns True when the last entry of the open element stack is the
    "html" element.
    """
    currentNode = self.tree.openElements[-1]
    return currentNode.name == "html"
1344
+
1280
1345
def processCharacters(self, data):
    """Handle character data seen while in the column group phase

    Acts as if a "colgroup" end tag had been seen and, unless that end
    tag was ignored, hands the data to whichever phase is current
    afterwards.

    data - the character data taken from the token
    """
    # Evaluate first: endTagColgroup() may alter the open element stack
    # that the check looks at.
    colgroupEndIgnored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if colgroupEndIgnored:
        return
    self.parser.phase.processCharacters(data)
1285
1350
1286
1351
def startTagCol(self, name, attributes):
    """Handle a "col" start tag

    Inserts the element and immediately removes the last entry from the
    open element stack again.

    name - tag name of the start tag
    attributes - attributes of the start tag
    """
    tree = self.tree
    tree.insertElement(name, attributes)
    tree.openElements.pop()
1289
1354
1290
1355
def startTagOther(self, name, attributes):
    """Handle any other start tag seen while in the column group phase

    Acts as if a "colgroup" end tag had been seen and, unless that end
    tag was ignored, hands the start tag to whichever phase is current
    afterwards.

    name - tag name of the start tag
    attributes - attributes of the start tag
    """
    # Evaluate first: endTagColgroup() may alter the open element stack
    # that the check looks at.
    colgroupEndIgnored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if colgroupEndIgnored:
        return
    self.parser.phase.processStartTag(name, attributes)
1295
1360
1296
1361
def endTagColgroup (self , name ):
1297
- if self .tree . openElements [ - 1 ]. name == "html" :
1362
+ if self .ignoreEndTagColgroup () :
1298
1363
# innerHTML case
1364
+ assert self .parser .innerHTML
1299
1365
self .parser .parseError ()
1300
1366
else :
1301
1367
self .tree .openElements .pop ()
@@ -1306,9 +1372,9 @@ def endTagCol(self, name):
1306
1372
u"col has no end tag." ))
1307
1373
1308
1374
def endTagOther(self, name):
    """Handle any other end tag seen while in the column group phase

    Acts as if a "colgroup" end tag had been seen and, unless that end
    tag was ignored, hands the end tag to whichever phase is current
    afterwards.

    name - tag name of the end tag
    """
    # Evaluate first: endTagColgroup() may alter the open element stack
    # that the check looks at.
    colgroupEndIgnored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if colgroupEndIgnored:
        return
    self.parser.phase.processEndTag(name)
1313
1379
1314
1380
@@ -1357,9 +1423,9 @@ def startTagTableCell(self, name, attributes):
1357
1423
1358
1424
def startTagTableOther (self , name , attributes ):
1359
1425
# XXX AT Any ideas on how to share this with endTagTable?
1360
- if self .tree .elementInScope ("tbody" , True ) or \
1361
- self .tree .elementInScope ("thead" , True ) or \
1362
- self .tree .elementInScope ("tfoot" , True ):
1426
+ if ( self .tree .elementInScope ("tbody" , True ) or
1427
+ self .tree .elementInScope ("thead" , True ) or
1428
+ self .tree .elementInScope ("tfoot" , True ) ):
1363
1429
self .clearStackToTableBodyContext ()
1364
1430
self .endTagTableRowGroup (self .tree .openElements [- 1 ].name )
1365
1431
self .parser .phase .processStartTag (name , attributes )
@@ -1380,9 +1446,9 @@ def endTagTableRowGroup(self, name):
1380
1446
") in the table body phase. Ignored." ))
1381
1447
1382
1448
def endTagTable (self , name ):
1383
- if self .tree .elementInScope ("tbody" , True ) or \
1384
- self .tree .elementInScope ("thead" , True ) or \
1385
- self .tree .elementInScope ("tfoot" , True ):
1449
+ if ( self .tree .elementInScope ("tbody" , True ) or
1450
+ self .tree .elementInScope ("thead" , True ) or
1451
+ self .tree .elementInScope ("tfoot" , True ) ):
1386
1452
self .clearStackToTableBodyContext ()
1387
1453
self .endTagTableRowGroup (self .tree .openElements [- 1 ].name )
1388
1454
self .parser .phase .processEndTag (name )
@@ -1426,6 +1492,9 @@ def clearStackToTableRowContext(self):
1426
1492
self .tree .openElements [- 1 ].name + u") in the row phase." ))
1427
1493
self .tree .openElements .pop ()
1428
1494
1495
def ignoreEndTagTr(self):
    """Tell whether a "tr" end tag would currently be ignored

    Returns True when the tree builder reports no "tr" element in scope
    for the table variant of the scope check.
    """
    trInTableScope = self.tree.elementInScope("tr", tableVariant=True)
    return not trInTableScope
1497
+
1429
1498
# the rest
1430
1499
def processCharacters (self , data ):
1431
1500
self .parser .phases ["inTable" ].processCharacters (data )
@@ -1437,28 +1506,31 @@ def startTagTableCell(self, name, attributes):
1437
1506
self .tree .activeFormattingElements .append (Marker )
1438
1507
1439
1508
def startTagTableOther(self, name, attributes):
    """Handle a table-structure start tag seen while in the row phase

    Acts as if a "tr" end tag had been seen and, unless that end tag was
    ignored, hands the start tag to whichever phase is current afterwards.

    name - tag name of the start tag
    attributes - attributes of the start tag
    """
    # Record the answer up front: endTagTr() pops the open element stack,
    # so asking afterwards could give a different result.
    trEndIgnored = self.ignoreEndTagTr()
    self.endTagTr("tr")
    if trEndIgnored:
        return
    self.parser.phase.processStartTag(name, attributes)
1444
1514
1445
1515
def startTagOther(self, name, attributes):
    """Handle any other start tag by delegating to the "inTable" phase

    name - tag name of the start tag
    attributes - attributes of the start tag
    """
    inTablePhase = self.parser.phases["inTable"]
    inTablePhase.processStartTag(name, attributes)
1447
1517
1448
1518
def endTagTr(self, name):
    """Handle a "tr" end tag

    When the end tag is to be ignored (expected only in the innerHTML
    case) a parse error is reported. Otherwise the stack is cleared back
    to a table row context, the last open element is popped and the
    parser switches to the "inTableBody" phase.

    name - tag name of the end tag (not consulted by this handler)
    """
    if self.ignoreEndTagTr():
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()
        return

    self.clearStackToTableRowContext()
    self.tree.openElements.pop()
    self.parser.phase = self.parser.phases["inTableBody"]
1456
1527
1457
1528
def endTagTable(self, name):
    """Handle a "table" end tag seen while in the row phase

    Acts as if a "tr" end tag had been seen and, unless that end tag was
    ignored, reprocesses the current end tag in whichever phase is current
    afterwards.

    name - tag name of the end tag being handled
    """
    # Record the answer up front: endTagTr() pops the open element stack,
    # so asking afterwards could give a different result.
    trEndIgnored = self.ignoreEndTagTr()
    self.endTagTr("tr")
    if trEndIgnored:
        return
    # Reprocess the current tag if the tr end tag was not ignored
    self.parser.phase.processEndTag(name)
1463
1535
1464
1536
def endTagTableRowGroup (self , name ):
@@ -1549,6 +1621,7 @@ def endTagImply(self, name):
1549
1621
self .parser .phase .processEndTag (name )
1550
1622
else :
1551
1623
# sometimes innerHTML case
1624
+ assert self .parser .innerHTML
1552
1625
self .parser .parseError ()
1553
1626
1554
1627
def endTagOther (self , name ):
@@ -1626,7 +1699,7 @@ def endTagOptgroup(self, name):
1626
1699
u"select phase. Ignored." ))
1627
1700
1628
1701
def endTagSelect (self , name ):
1629
- if self .tree .elementInScope (name , True ):
1702
+ if self .tree .elementInScope ("select" , True ):
1630
1703
node = self .tree .openElements .pop ()
1631
1704
while node .name != "select" :
1632
1705
node = self .tree .openElements .pop ()
@@ -1734,8 +1807,8 @@ def endTagFrameset(self, name):
1734
1807
u"in the frameset phase (innerHTML)." ))
1735
1808
else :
1736
1809
self .tree .openElements .pop ()
1737
- if not self .parser .innerHTML and \
1738
- self .tree .openElements [- 1 ].name != "frameset" :
1810
+ if ( not self .parser .innerHTML and
1811
+ self .tree .openElements [- 1 ].name != "frameset" ) :
1739
1812
# If we're not in innerHTML mode and the the current node is not a
1740
1813
# "frameset" element (anymore) then switch.
1741
1814
self .parser .phase = self .parser .phases ["afterFrameset" ]
0 commit comments