Initial innerHTML support (some refactoring+testcases needed) · awesome-python/html5lib-python@1d1f44c · GitHub
[go: up one dir, main page]

Skip to content

Commit 1d1f44c

Browse files
committed
Initial innerHTML support (some refactoring+testcases needed)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40538
1 parent 32f3fd8 commit 1d1f44c

File tree

4 files changed

+144
-43
lines changed

4 files changed

+144
-43
lines changed

src/html5parser.py

Lines changed: 115 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,11 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
7171
"trailingEnd": TrailingEndPhase(self, self.tree)
7272
}
7373

74-
def parse(self, stream, encoding=None, innerHTML=False):
74+
def parse(self, stream, encoding=None):
7575
"""Parse a HTML document into a well-formed tree
7676
7777
stream - a filelike object or string containing the HTML to be parsed
7878
79-
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
80-
is not yet supported)
81-
8279
The optional encoding parameter must be a string that indicates
8380
the encoding. If specified, that encoding will be used,
8481
regardless of any BOM or later declaration (such as in a meta
@@ -96,7 +93,7 @@ def parse(self, stream, encoding=None, innerHTML=False):
9693

9794
# We don't actually support innerHTML yet but this should allow
9895
# assertions
99-
self.innerHTML = innerHTML
96+
self.innerHTML = False
10097

10198
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
10299

@@ -119,6 +116,65 @@ def parse(self, stream, encoding=None, innerHTML=False):
119116
self.phase.processEOF()
120117

121118
return self.tree.getDocument()
119+
120+
def parseFragment(self, stream, container=None, encoding=None):
    """Parse a HTML fragment into a well-formed tree fragment

    stream - a filelike object or string containing the HTML to be parsed

    container - name of the element we're setting the innerHTML property
    on; it is lower-cased before use and any false value (None, "")
    defaults to 'div'

    The optional encoding parameter must be a string that indicates
    the encoding.  If specified, that encoding will be used,
    regardless of any BOM or later declaration (such as in a meta
    element)

    Returns the result of self.tree.getFragment() once every token has
    been processed.
    """
    self.tree.reset()
    self.firstStartTag = False
    self.errors = []

    # In fragment mode innerHTML holds the lower-cased container name, so
    # it is a true value for the "assert self.parser.innerHTML" checks.
    self.innerHTML = (container or 'div').lower()

    self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
    # The container element decides how the tokenizer reads the fragment.
    # For every other container the tokenizer's initial content model
    # (PCDATA) is left untouched.
    if self.innerHTML in ('title', 'textarea'):
        self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
    elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed',
                            'noframes', 'noscript'):
        self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
    elif self.innerHTML == 'plaintext':
        self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]

    # Insert the root html element, then pick the starting phase from the
    # container name (resetInsertionMode consults self.innerHTML).
    self.phase = self.phases["rootElement"]
    self.phase.insertHtmlElement()
    self.resetInsertionMode()
    # We only seem to have InBodyPhase testcases where the following is
    # relevant ... need others too
    self.lastPhase = None

    # XXX This is temporary for the moment so there isn't any other
    # changes needed for the parser to work with the iterable tokenizer
    for token in self.tokenizer:
        token = self.normalizeToken(token)
        tokenType = token["type"]
        # Look the handler up on the current phase for every token:
        # handling a token may switch self.phase.
        method = getattr(self.phase, "process%s" % tokenType, None)
        if tokenType in ("Characters", "SpaceCharacters", "Comment"):
            method(token["data"])
        elif tokenType in ("StartTag", "Doctype"):
            method(token["name"], token["data"])
        elif tokenType == "EndTag":
            method(token["name"])
        else:
            # Any other token type is reported as a parse error.
            self.parseError(token["data"])

    # When the loop finishes it's EOF
    self.phase.processEOF()

    return self.tree.getFragment()
122178

123179
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
124180
# XXX The idea is to make data mandatory.
@@ -187,28 +243,29 @@ def resetInsertionMode(self):
187243
"frameset":"inFrameset"
188244
}
189245
for node in self.tree.openElements[::-1]:
246+
nodeName = node.name
190247
if node == self.tree.openElements[0]:
191248
last = True
192-
if node.name not in ['td', 'th']:
249+
if nodeName not in ['td', 'th']:
193250
# XXX
194251
assert self.innerHTML
195-
raise NotImplementedError
252+
nodeName = self.innerHTML
196253
# Check for conditions that should only happen in the innerHTML
197254
# case
198-
if node.name in ("select", "colgroup", "head", "frameset"):
255+
if nodeName in ("select", "colgroup", "head", "frameset"):
199256
# XXX
200257
assert self.innerHTML
201-
if node.name in newModes:
202-
self.phase = self.phases[newModes[node.name]]
258+
if nodeName in newModes:
259+
self.phase = self.phases[newModes[nodeName]]
203260
break
204-
elif node.name == "html":
261+
elif nodeName == "html":
205262
if self.tree.headPointer is None:
206263
self.phase = self.phases["beforeHead"]
207264
else:
208265
self.phase = self.phases["afterHead"]
209266
break
210267
elif last:
211-
self.phase = self.phases["body"]
268+
self.phase = self.phases["inBody"]
212269
break
213270

214271
class Phase(object):
@@ -453,10 +510,11 @@ def startTagStyle(self, name, attributes):
453510
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
454511

455512
def startTagScript(self, name, attributes):
513+
#XXX Inner HTML case may be wrong
456514
element = self.tree.createElement(name, attributes)
457515
element._flags.append("parser-inserted")
458-
if self.tree.headPointer is not None and\
459-
self.parser.phase == self.parser.phases["inHead"]:
516+
if (self.tree.headPointer is not None and
517+
self.parser.phase == self.parser.phases["inHead"]):
460518
self.appendToHead(element)
461519
else:
462520
self.tree.openElements[-1].appendChild(element)
@@ -651,8 +709,8 @@ def startTagFromHead(self, name, attributes):
651709

652710
def startTagBody(self, name, attributes):
653711
self.parser.parseError(_(u"Unexpected start tag (body)."))
654-
if len(self.tree.openElements) == 1 \
655-
or self.tree.openElements[1].name != "body":
712+
if (len(self.tree.openElements) == 1
713+
or self.tree.openElements[1].name != "body"):
656714
assert self.parser.innerHTML
657715
else:
658716
for attr, value in attributes.iteritems():
@@ -1177,6 +1235,7 @@ def endTagTable(self, name):
11771235
self.parser.resetInsertionMode()
11781236
else:
11791237
# innerHTML case
1238+
assert self.parser.innerHTML
11801239
self.parser.parseError()
11811240

11821241
def endTagIgnore(self, name):
@@ -1213,23 +1272,25 @@ def __init__(self, parser, tree):
12131272
])
12141273
self.endTagHandler.default = self.endTagOther
12151274

1275+
def ignoreEndTagCaption(self):
    """Tell whether a "caption" end tag would currently be ignored.

    That is the case when elementInScope finds no "caption" element
    using the table variant of the scope check.
    """
    captionInScope = self.tree.elementInScope("caption", True)
    return not captionInScope
1277+
12161278
def processCharacters(self, data):
12171279
self.parser.phases["inBody"].processCharacters(data)
12181280

12191281
def startTagTableElement(self, name, attributes):
    """Handle a table-structure start tag seen inside a caption.

    Reports a parse error, acts as if a "caption" end tag had been seen
    and, unless that end tag was ignored, hands the start tag to the
    phase that is current afterwards.
    """
    self.parser.parseError()
    #XXX Have to duplicate logic here to find out if the tag is ignored
    # The check must happen before the end tag is processed, because
    # processing it changes the stack of open elements.
    captionEndIgnored = self.ignoreEndTagCaption()
    self.parser.phase.processEndTag("caption")
    if captionEndIgnored:
        return
    self.parser.phase.processStartTag(name, attributes)
12271288

12281289
def startTagOther(self, name, attributes):
12291290
self.parser.phases["inBody"].processStartTag(name, attributes)
12301291

12311292
def endTagCaption(self, name):
1232-
if self.tree.elementInScope(name, True):
1293+
if not self.ignoreEndTagCaption():
12331294
# AT this code is quite similar to endTagTable in "InTable"
12341295
self.tree.generateImpliedEndTags()
12351296
if self.tree.openElements[-1].name != "caption":
@@ -1242,13 +1303,14 @@ def endTagCaption(self, name):
12421303
self.parser.phase = self.parser.phases["inTable"]
12431304
else:
12441305
# innerHTML case
1306+
assert self.parser.innerHTML
12451307
self.parser.parseError()
12461308

12471309
def endTagTable(self, name):
    """Handle a "table" end tag seen inside a caption.

    Reports a parse error, acts as if a "caption" end tag had been seen
    and, unless that end tag was ignored, reprocesses the current end
    tag in the phase that is current afterwards.
    """
    self.parser.parseError()
    # Work out whether the implied caption end tag will be ignored
    # before processing it, since processing changes the open elements.
    ignoreEndTag = self.ignoreEndTagCaption()
    self.parser.phase.processEndTag("caption")
    if not ignoreEndTag:
        # This is an end tag: it has no attributes and must be
        # reprocessed as an end tag, not as a start tag.
        self.parser.phase.processEndTag(name)
12531315

12541316
def endTagIgnore(self, name):
@@ -1277,25 +1339,29 @@ def __init__(self, parser, tree):
12771339
])
12781340
self.endTagHandler.default = self.endTagOther
12791341

1342+
def ignoreEndTagColgroup(self):
    """Tell whether a "colgroup" end tag would currently be ignored.

    That is the case when the current node (the last entry on the stack
    of open elements) is the "html" element.
    """
    currentNode = self.tree.openElements[-1]
    return currentNode.name == "html"
1344+
12801345
def processCharacters(self, data):
    """Handle character data seen inside a colgroup.

    Acts as if a "colgroup" end tag had been seen and, unless that end
    tag was ignored, passes the characters to the phase that is current
    afterwards.
    """
    # Decide up front: endTagColgroup may pop the current node.
    colgroupEndIgnored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if colgroupEndIgnored:
        return
    self.parser.phase.processCharacters(data)
12851350

12861351
def startTagCol(self, name ,attributes):
12871352
self.tree.insertElement(name, attributes)
12881353
self.tree.openElements.pop()
12891354

12901355
def startTagOther(self, name, attributes):
    """Handle any other start tag seen inside a colgroup.

    Acts as if a "colgroup" end tag had been seen and, unless that end
    tag was ignored, reprocesses the start tag in the phase that is
    current afterwards.
    """
    # Decide up front: endTagColgroup may pop the current node.
    colgroupEndIgnored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if colgroupEndIgnored:
        return
    self.parser.phase.processStartTag(name, attributes)
12951360

12961361
def endTagColgroup(self, name):
1297-
if self.tree.openElements[-1].name == "html":
1362+
if self.ignoreEndTagColgroup():
12981363
# innerHTML case
1364+
assert self.parser.innerHTML
12991365
self.parser.parseError()
13001366
else:
13011367
self.tree.openElements.pop()
@@ -1306,9 +1372,9 @@ def endTagCol(self, name):
13061372
u"col has no end tag."))
13071373

13081374
def endTagOther(self, name):
    """Handle any other end tag seen inside a colgroup.

    Acts as if a "colgroup" end tag had been seen and, unless that end
    tag was ignored, reprocesses the end tag in the phase that is
    current afterwards.
    """
    # Decide up front: endTagColgroup may pop the current node.
    colgroupEndIgnored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if colgroupEndIgnored:
        return
    self.parser.phase.processEndTag(name)
13131379

13141380

@@ -1357,9 +1423,9 @@ def startTagTableCell(self, name, attributes):
13571423

13581424
def startTagTableOther(self, name, attributes):
13591425
# XXX AT Any ideas on how to share this with endTagTable?
1360-
if self.tree.elementInScope("tbody", True) or \
1361-
self.tree.elementInScope("thead", True) or \
1362-
self.tree.elementInScope("tfoot", True):
1426+
if (self.tree.elementInScope("tbody", True) or
1427+
self.tree.elementInScope("thead", True) or
1428+
self.tree.elementInScope("tfoot", True)):
13631429
self.clearStackToTableBodyContext()
13641430
self.endTagTableRowGroup(self.tree.openElements[-1].name)
13651431
self.parser.phase.processStartTag(name, attributes)
@@ -1380,9 +1446,9 @@ def endTagTableRowGroup(self, name):
13801446
") in the table body phase. Ignored."))
13811447

13821448
def endTagTable(self, name):
1383-
if self.tree.elementInScope("tbody", True) or \
1384-
self.tree.elementInScope("thead", True) or \
1385-
self.tree.elementInScope("tfoot", True):
1449+
if (self.tree.elementInScope("tbody", True) or
1450+
self.tree.elementInScope("thead", True) or
1451+
self.tree.elementInScope("tfoot", True)):
13861452
self.clearStackToTableBodyContext()
13871453
self.endTagTableRowGroup(self.tree.openElements[-1].name)
13881454
self.parser.phase.processEndTag(name)
@@ -1426,6 +1492,9 @@ def clearStackToTableRowContext(self):
14261492
self.tree.openElements[-1].name + u") in the row phase."))
14271493
self.tree.openElements.pop()
14281494

1495+
def ignoreEndTagTr(self):
    """Tell whether a "tr" end tag would currently be ignored.

    That is the case when elementInScope finds no "tr" element using
    the table variant of the scope check.
    """
    trInScope = self.tree.elementInScope("tr", tableVariant=True)
    return not trInScope
1497+
14291498
# the rest
14301499
def processCharacters(self, data):
14311500
self.parser.phases["inTable"].processCharacters(data)
@@ -1437,28 +1506,31 @@ def startTagTableCell(self, name, attributes):
14371506
self.tree.activeFormattingElements.append(Marker)
14381507

14391508
def startTagTableOther(self, name, attributes):
    """Handle a table-structure start tag seen inside a row.

    Acts as if a "tr" end tag had been seen and, unless that end tag
    was ignored, reprocesses the start tag in the phase that is current
    afterwards.
    """
    # Whether the implied end tag is ignored is decided explicitly, and
    # before endTagTr runs, because endTagTr changes the open elements.
    trEndIgnored = self.ignoreEndTagTr()
    self.endTagTr("tr")
    if trEndIgnored:
        return
    self.parser.phase.processStartTag(name, attributes)
14441514

14451515
def startTagOther(self, name, attributes):
14461516
self.parser.phases["inTable"].processStartTag(name, attributes)
14471517

14481518
def endTagTr(self, name):
    """Handle a "tr" end tag seen inside a row.

    If the end tag is to be ignored, a parse error is reported.
    Otherwise the stack is cleared back to a table row context, the
    row is popped and the parser switches to the "inTableBody" phase.
    """
    if self.ignoreEndTagTr():
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()
        return
    self.clearStackToTableRowContext()
    self.tree.openElements.pop()
    self.parser.phase = self.parser.phases["inTableBody"]
14561527

14571528
def endTagTable(self, name):
    """Handle a "table" end tag seen inside a row.

    Acts as if a "tr" end tag had been seen and, unless that end tag
    was ignored, reprocesses the current end tag in the phase that is
    current afterwards.
    """
    # Whether the implied end tag is ignored is decided explicitly, and
    # before endTagTr runs, because endTagTr changes the open elements.
    trEndIgnored = self.ignoreEndTagTr()
    self.endTagTr("tr")
    if trEndIgnored:
        return
    # Reprocess the current tag: the tr end tag was not ignored
    self.parser.phase.processEndTag(name)
14631535

14641536
def endTagTableRowGroup(self, name):
@@ -1549,6 +1621,7 @@ def endTagImply(self, name):
15491621
self.parser.phase.processEndTag(name)
15501622
else:
15511623
# sometimes innerHTML case
1624+
assert self.parser.innerHTML
15521625
self.parser.parseError()
15531626

15541627
def endTagOther(self, name):
@@ -1626,7 +1699,7 @@ def endTagOptgroup(self, name):
16261699
u"select phase. Ignored."))
16271700

16281701
def endTagSelect(self, name):
1629-
if self.tree.elementInScope(name, True):
1702+
if self.tree.elementInScope("select", True):
16301703
node = self.tree.openElements.pop()
16311704
while node.name != "select":
16321705
node = self.tree.openElements.pop()
@@ -1734,8 +1807,8 @@ def endTagFrameset(self, name):
17341807
u"in the frameset phase (innerHTML)."))
17351808
else:
17361809
self.tree.openElements.pop()
1737-
if not self.parser.innerHTML and\
1738-
self.tree.openElements[-1].name != "frameset":
1810+
if (not self.parser.innerHTML and
1811+
self.tree.openElements[-1].name != "frameset"):
17391812
# If we're not in innerHTML mode and the current node is not a
17401813
# "frameset" element (anymore) then switch.
17411814
self.parser.phase = self.parser.phases["afterFrameset"]

src/treebuilders/_base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ class TreeBuilder(object):
108108

109109
#The class to use for creating doctypes
110110
doctypeClass = None
111+
112+
#Fragment class
113+
fragmentClass = None
111114

112115
def __init__(self):
113116
self.reset()
@@ -294,7 +297,6 @@ def getTableMisnestedNodePosition(self):
294297
fosterParent = self.openElements[
295298
self.openElements.index(lastTable) - 1]
296299
else:
297-
assert self.innerHTML
298300
fosterParent = self.openElements[0]
299301
return fosterParent, insertBefore
300302

@@ -310,6 +312,13 @@ def generateImpliedEndTags(self, exclude=None):
310312
def getDocument(self):
311313
"Return the final tree"
312314
return self.document
315+
316+
def getFragment(self):
    """Return the final fragment

    A new fragmentClass instance is created and handed to
    reparentChildren on the first entry of the stack of open elements.
    fragmentClass is None on this base class, so presumably a concrete
    tree builder has to provide it.
    """
    result = self.fragmentClass()
    rootElement = self.openElements[0]
    rootElement.reparentChildren(result)
    return result
313322

314323
def testSerializer(self, node):
315324
"""Serialize the subtree of node in the format required by unit tests

0 commit comments

Comments
 (0)
0