Implement fragment parsing, fix a bunch more bugs. · Nimbleworks/html5lib-php@9799e6f · GitHub
[go: up one dir, main page]

Skip to content

Commit 9799e6f

Browse files
author
Edward Z. Yang
committed
Implement fragment parsing, fix a bunch more bugs.
1 parent 0adc0fe commit 9799e6f

File tree

4 files changed

+114
-25
lines changed

4 files changed

+114
-25
lines changed

library/HTML5/Tokenizer.php

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,15 @@ public function __construct($data) {
9696
$this->content_model = self::PCDATA;
9797
}
9898

99+
public function parseFragment($context = null) {
100+
$this->tree->setupContext($context);
101+
if ($this->tree->content_model) {
102+
$this->content_model = $this->tree->content_model;
103+
$this->tree->content_model = null;
104+
}
105+
$this->parse();
106+
}
107+
99108
// XXX maybe convert this into an iterator? regardless, this function
100109
// and the save function should go into a Parser facade of some sort
101110
/**

library/HTML5/TreeConstructer.php

Lines changed: 86 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ class HTML5_TreeConstructer {
5858
// code can check for (bool)$ignore_lf_token, but it phases out
5959
// appropriately)
6060
private $ignore_lf_token = 0;
61+
private $fragment = false;
62+
private $root;
6163

6264
// XFOREIGN: SVG's foreignObject is included in scoping
6365
private $scoping = array('applet','button','caption','html','marquee','object','table','td','th');
@@ -1955,6 +1957,7 @@ public function emitToken($token, $mode = null) {
19551957
scope with the same tag name as the token, this is a parse error.
19561958
Ignore the token. (fragment case) */
19571959
if(!$this->elementInScope($token['name'], true)) {
1960+
$this->ignored = true;
19581961
// Ignore
19591962

19601963
/* Otherwise: */
@@ -2999,7 +3002,7 @@ private function clearStackToTableContext($elements) {
29993002
}
30003003
}
30013004

3002-
private function resetInsertionMode() {
3005+
private function resetInsertionMode($context = null) {
30033006
/* 1. Let last be false. */
30043007
$last = false;
30053008
$leng = count($this->stack);
@@ -3008,16 +3011,16 @@ private function resetInsertionMode() {
30083011
/* 2. Let node be the last node in the stack of open elements. */
30093012
$node = $this->stack[$n];
30103013

3011-
/* 3. If node is the first node in the stack of open elements, then
3012-
set last to true. If the element whose innerHTML attribute is being
3013-
set is neither a td element nor a th element, then set node to the
3014-
element whose innerHTML attribute is being set. (innerHTML case) */
3014+
/* 3. If node is the first node in the stack of open elements, then
3015+
* set last to true and set node to the context element. (fragment
3016+
* case) */
30153017
if($this->stack[0]->isSameNode($node)) {
30163018
$last = true;
3019+
$node = $context;
30173020
}
30183021

30193022
/* 4. If node is a select element, then switch the insertion mode to
3020-
"in select" and abort these steps. (innerHTML case) */
3023+
"in select" and abort these steps. (fragment case) */
30213024
if($node->tagName === 'select') {
30223025
$this->mode = self::IN_SELECT;
30233026
break;
@@ -3037,7 +3040,7 @@ private function resetInsertionMode() {
30373040
/* 7. If node is a tbody, thead, or tfoot element, then switch the
30383041
insertion mode to "in table body" and abort these steps. */
30393042
} elseif(in_array($node->tagName, array('tbody', 'thead', 'tfoot'))) {
3040-
$this->mode = self::IN_TBODY;
3043+
$this->mode = self::IN_TABLE_BODY;
30413044
break;
30423045

30433046
/* 8. If node is a caption element, then switch the insertion mode
@@ -3049,7 +3052,7 @@ private function resetInsertionMode() {
30493052
/* 9. If node is a colgroup element, then switch the insertion mode
30503053
to "in column group" and abort these steps. (innerHTML case) */
30513054
} elseif($node->tagName === 'colgroup') {
3052-
$this->mode = self::IN_CGROUP;
3055+
$this->mode = self::IN_COLUMN_GROUP;
30533056
break;
30543057

30553058
/* 10. If node is a table element, then switch the insertion mode
@@ -3058,38 +3061,44 @@ private function resetInsertionMode() {
30583061
$this->mode = self::IN_TABLE;
30593062
break;
30603063

3061-
/* 11. If node is a head element, then switch the insertion mode
3064+
/* 11. If node is an element from the MathML namespace or the SVG
3065+
* namespace, then switch the insertion mode to "in foreign
3066+
* content", let the secondary insertion mode be "in body", and
3067+
* abort these steps. */
3068+
// XFOREIGN: implement me
3069+
3070+
/* 12. If node is a head element, then switch the insertion mode
30623071
to "in body" ("in body"! not "in head"!) and abort these steps.
3063-
(innerHTML case) */
3072+
(fragment case) */
30643073
} elseif($node->tagName === 'head') {
30653074
$this->mode = self::IN_BODY;
30663075
break;
30673076

3068-
/* 12. If node is a body element, then switch the insertion mode to
3077+
/* 13. If node is a body element, then switch the insertion mode to
30693078
"in body" and abort these steps. */
30703079
} elseif($node->tagName === 'body') {
30713080
$this->mode = self::IN_BODY;
30723081
break;
30733082

3074-
/* 13. If node is a frameset element, then switch the insertion
3075-
mode to "in frameset" and abort these steps. (innerHTML case) */
3083+
/* 14. If node is a frameset element, then switch the insertion
3084+
mode to "in frameset" and abort these steps. (fragment case) */
30763085
} elseif($node->tagName === 'frameset') {
3077-
$this->mode = self::IN_FRAME;
3086+
$this->mode = self::IN_FRAMESET;
30783087
break;
30793088

3080-
/* 14. If node is an html element, then: if the head element
3089+
/* 15. If node is an html element, then: if the head element
30813090
pointer is null, switch the insertion mode to "before head",
30823091
otherwise, switch the insertion mode to "after head". In either
3083-
case, abort these steps. (innerHTML case) */
3092+
case, abort these steps. (fragment case) */
30843093
} elseif($node->tagName === 'html') {
30853094
$this->mode = ($this->head_pointer === null)
3086-
? self::BEFOR_HEAD
3095+
? self::BEFORE_HEAD
30873096
: self::AFTER_HEAD;
30883097

30893098
break;
30903099

3091-
/* 15. If last is true, then set the insertion mode to "in body"
3092-
and abort these steps. (innerHTML case) */
3100+
/* 16. If last is true, then set the insertion mode to "in body"
3101+
and abort these steps. (fragment case) */
30933102
} elseif($last) {
30943103
$this->mode = self::IN_BODY;
30953104
break;
@@ -3237,9 +3246,66 @@ public function currentTableIsTainted() {
32373246
return !empty($this->getCurrentTable()->tainted);
32383247
}
32393248

3249+
/**
3250+
* Sets up the tree constructor for building a fragment.
3251+
*/
3252+
public function setupContext($context = null) {
3253+
$this->fragment = true;
3254+
$context = $this->dom->createElement($context);
3255+
if ($context) {
3256+
/* 4.1. Set the HTML parser's tokenization stage's content model
3257+
* flag according to the context element, as follows: */
3258+
switch ($context->tagName) {
3259+
case 'title': case 'textarea':
3260+
$this->content_model = HTML5_Tokenizer::RCDATA;
3261+
break;
3262+
case 'style': case 'script': case 'xmp': case 'iframe':
3263+
case 'noembed': case 'noframes':
3264+
$this->content_model = HTML5_Tokenizer::CDATA;
3265+
break;
3266+
case 'noscript':
3267+
// XSCRIPT: assuming scripting is enabled
3268+
$this->content_model = HTML5_Tokenizer::CDATA;
3269+
break;
3270+
case 'plaintext':
3271+
$this->content_model = HTML5_Tokenizer::PLAINTEXT;
3272+
break;
3273+
}
3274+
/* 4.2. Let root be a new html element with no attributes. */
3275+
$root = $this->dom->createElement('html');
3276+
$this->root = $root;
3277+
/* 4.3 Append the element root to the Document node created above. */
3278+
$this->dom->appendChild($root);
3279+
/* 4.4 Set up the parser's stack of open elements so that it
3280+
* contains just the single element root. */
3281+
$this->stack = array($root);
3282+
/* 4.5 Reset the parser's insertion mode appropriately. */
3283+
$this->resetInsertionMode($context);
3284+
/* 4.6 Set the parser's form element pointer to the nearest node
3285+
* to the context element that is a form element (going straight up
3286+
* the ancestor chain, and including the element itself, if it is a
3287+
* form element), or, if there is no such form element, to null. */
3288+
$node = $context;
3289+
do {
3290+
if ($node->tagName === 'form') {
3291+
$this->form_pointer = $node;
3292+
break;
3293+
}
3294+
} while ($node = $node->parentNode);
3295+
}
3296+
}
3297+
32403298

32413299
public function save() {
3242-
return $this->dom;
3300+
if (!$this->fragment) {
3301+
return $this->dom;
3302+
} else {
3303+
if ($this->root) {
3304+
return $this->root->childNodes;
3305+
} else {
3306+
return $this->dom->childNodes;
3307+
}
3308+
}
32433309
}
32443310
}
32453311

tests/HTML5/TestData.php

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,19 @@ public function __construct($filename) {
7070
/**
7171
* Converts a DOMDocument into string form as seen in test cases.
7272
*/
73-
public static function strDom($dom, $prefix = '| ') {
73+
public static function strDom($node, $prefix = '| ') {
7474
// XXX: Doesn't handle svg and math correctly
7575
$ret = array();
7676
$indent = 2;
7777
$level = -1; // since DOMDocument doesn't get rendered
7878
$skip = false;
79-
$next = $dom;
79+
$next = $node;
8080
while ($next) {
81+
if ($next instanceof DOMNodeList) {
82+
if (!$next->length) break;
83+
$next = $next->item(0);
84+
$level = 0;
85+
}
8186
$text = false;
8287
$subnodes = array();
8388
switch ($next->nodeType) {
@@ -130,6 +135,7 @@ public static function strDom($dom, $prefix = '| ') {
130135
$next = $next->parentNode;
131136
$level--;
132137
$skip = true;
138+
if ($level < 0) break;
133139
} else {
134140
$next = false;
135141
}

tests/HTML5/TreeBuilderTest.php

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,28 @@
55
SimpleTest::ignore('HTML5_TreeBuilderHarness');
66
class HTML5_TreeBuilderHarness extends HTML5_TestDataHarness
77
{
8-
public function assertIdentical($expect, $actual, $input = '%s') {
8+
public function assertIdentical($expect, $actual, $test = array()) {
9+
$input = $test['data'];
10+
if (isset($test['document-fragment'])) {
11+
$input .= "\nFragment: " . $test['document-fragment'];
12+
}
913
parent::assertIdentical($expect, $actual, "Identical expectation failed\nInput:\n$input\n\nExpected:\n$expect\n\nActual:\n$actual\n");
1014
}
1115
public function invoke($test) {
1216
// this is totally the wrong interface to use, but
1317
// for now we need testing
1418
$tokenizer = new HTML5_Tokenizer($test['data']);
1519
$GLOBALS['TIME'] -= get_microtime();
16-
$tokenizer->parse();
20+
if (isset($test['document-fragment'])) {
21+
$tokenizer->parseFragment($test['document-fragment']);
22+
} else {
23+
$tokenizer->parse();
24+
}
1725
$GLOBALS['TIME'] += get_microtime();
1826
$this->assertIdentical(
1927
$test['document'],
2028
HTML5_TestData::strDom($tokenizer->save()),
21-
$test['data']
29+
$test
2230
);
2331
}
2432
}

0 commit comments

Comments
 (0)
0