8000 Numerous bugfixes and extra debugging facilities. · Nimbleworks/html5lib-php@c078c20 · GitHub
[go: up one dir, main page]

Skip to content

Commit c078c20

Browse files
author
Edward Z. Yang ext:(%22)
committed
Numerous bugfixes and extra debugging facilities.
* Add strConst() and printStack() member functions
* Make all constants uniquely identify themselves
* Ignore parse error tokens (fixed some failing test-cases)
* Do not attempt to create doctype without qualified name (DOM limitation)
* Remove a from active formatting elements and stack if necessary
* Add strict to array_search to prevent bugs
* Match against DOMText instead of DOMCharacterData, which is more precise
* Fix incorrect assertIdentical prototype
* Implement token_dump() convenience function
1 parent d49a53f commit c078c20

File tree

3 files changed

+98
-18
lines changed

3 files changed

+98
-18
lines changed

library/HTML5/TreeConstructer.php

Lines changed: 67 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -93,19 +93,31 @@ class HTML5_TreeConstructer {
9393
const AFTER_AFTER_BODY = 20;
9494
const AFTER_AFTER_FRAMESET = 21;
9595

96+
/**
 * Converts a magic number to a readable name. Use for debugging.
 *
 * @param int $number Value of one of this class's constants.
 * @return string Name of the constant holding that value, or a placeholder
 *                of the form "UNKNOWN(<value>)" when no constant matches.
 */
private function strConst($number) {
    // Reverse map (constant value => constant name), built once through
    // reflection and cached in a function-static for subsequent calls.
    // array_flip() keeps only the last name for a duplicated value, so the
    // result is only meaningful while the constants have distinct values.
    static $lookup = null;
    if ($lookup === null) {
        $r = new ReflectionClass('HTML5_TreeConstructer');
        $lookup = array_flip($r->getConstants());
    }
    // Only ints and strings are valid array keys; anything else, or a value
    // that is not a constant, gets a placeholder instead of raising an
    // undefined-index notice and returning null.
    if ((is_int($number) || is_string($number)) && isset($lookup[$number])) {
        return $lookup[$number];
    }
    return 'UNKNOWN(' . var_export($number, true) . ')';
}
107+
96108
// The different types of elements.
97-
const SPECIAL = 0;
98-
const SCOPING = 1;
99-
const FORMATTING = 2;
100-
const PHRASING = 3;
109+
const SPECIAL = 100;
110+
const SCOPING = 101;
111+
const FORMATTING = 102;
112+
const PHRASING = 103;
101113

102114
// Quirks modes in $quirks_mode
103-
const NO_QUIRKS = 0;
104-
const QUIRKS_MODE = 1;
105-
const LIMITED_QUIRKS_MODE = 2;
115+
const NO_QUIRKS = 200;
116+
const QUIRKS_MODE = 201;
117+
const LIMITED_QUIRKS_MODE = 202;
106118

107119
// Marker to be placed in $a_formatting
108-
const MARKER = 0;
120+
const MARKER = 300;
109121

110122
public function __construct() {
111123
$this->mode = self::INITIAL;
@@ -119,10 +131,21 @@ public function __construct() {
119131

120132
// Process tag tokens
121133
public function emitToken($token, $mode = null) {
134+
// XXX: ignore parse errors... why are we emitting them, again?
135+
if ($token['type'] === HTML5_Tokenizer::PARSEERROR) return;
136+
if ($mode === null) $mode = $this->mode;
137+
138+
/*
139+
$backtrace = debug_backtrace();
140+
if ($backtrace[1]['class'] !== 'HTML5_TreeConstructer') echo "--\n";
141+
echo $this->strConst($mode) . "\n ";
142+
token_dump($token);
143+
if ($this->foster_parent) echo " -> this is a foster parent mode\n";
144+
*/
145+
122146
if ($this->ignore_lf_token) $this->ignore_lf_token--;
123147
$this->ignored = false;
124148
// indenting is a little wonky, this can be changed later on
125-
if ($mode === null) $mode = $this->mode;
126149
switch ($mode) {
127150

128151
case self::INITIAL:
@@ -164,8 +187,15 @@ public function emitToken($token, $mode = null) {
164187
// a doctype to DOMDocument. Maybe I haven't chanted the right
165188
// syllables.
166189
$impl = new DOMImplementation();
167-
$doctype = $impl->createDocumentType($token['name'], $token['public'], $token['system']);
168-
$this->dom->appendChild($doctype);
190+
// This call can fail for particularly pathological cases (namely,
191+
// the qualifiedName parameter ($token['name']) could be missing).
192+
if ($token['name']) {
193+
$doctype = $impl->createDocumentType($token['name'], $token['public'], $token['system']);
194+
$this->dom->appendChild($doctype);
195+
} else {
196+
// It looks like libxml's not actually *able* to express this case.
197+
// So... don't. XXX
198+
}
169199
// XQUIRKS: Implement quirks mode
170200
$this->mode = self::BEFORE_HTML;
171201
} else {
@@ -828,10 +858,19 @@ public function emitToken($token, $mode = null) {
828858
break;
829859

830860
} elseif($this->a_formatting[$n]->tagName === 'a') {
861+
$a = $this->a_formatting[$n];
831862
$this->emitToken(array(
832863
'name' => 'a',
833864
'type' => HTML5_Tokenizer::ENDTAG
834865
));
866+
if (in_array($a, $this->a_formatting)) {
867+
$a_i = array_search($a, $this->a_formatting, true);
868+
if($a_i !== false) array_splice($this->a_formatting, $a_i, 1);
869+
}
870+
if (in_array($a, $this->stack)) {
871+
$a_i = array_search($a, $this->stack, true);
872+
if ($a_i !== false) array_splice($this->stack, $a_i, 1);
873+
}
835874
break;
836875
}
837876
}
@@ -1326,7 +1365,7 @@ public function emitToken($token, $mode = null) {
13261365
// parse error
13271366
}
13281367
/* 3. Remove node from the stack of open elements. */
1329-
array_splice($this->stack, array_search($node, $this->stack), 1);
1368+
array_splice($this->stack, array_search($node, $this->stack, true), 1);
13301369
}
13311370

13321371
break;
@@ -1896,9 +1935,10 @@ public function emitToken($token, $mode = null) {
18961935
node, it must instead be inserted into the foster parent element. */
18971936
if(in_array(end($this->stack)->tagName,
18981937
array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
1938+
$old = $this->foster_parent;
18991939
$this->foster_parent = true;
19001940
$this->processWithRulesFor($token, self::IN_BODY);
1901-
$this->foster_parent = false;
1941+
$this->foster_parent = $old;
19021942
} else {
19031943
$this->processWithRulesFor($token, self::IN_BODY);
19041944
}
@@ -2753,7 +2793,7 @@ private function appendToRealParent($node) {
27532793
}
27542794

27552795
private function appendChild($parent, $node) {
2756-
if ($node instanceof DOMCharacterData && $parent->lastChild instanceof DOMCharacterData) {
2796+
if ($node instanceof DOMText && $parent->lastChild instanceof DOMText) {
27572797
// attach text to previous node
27582798
$parent->lastChild->data .= $node->data;
27592799
} else {
@@ -2762,11 +2802,11 @@ private function appendChild($parent, $node) {
27622802
}
27632803

27642804
private function insertBefore($parent, $node, $marker) {
2765-
if ($node instanceof DOMCharacterData) {
2766-
if ($marker instanceof DOMCharacterData) {
2805+
if ($node instanceof DOMText) {
2806+
if ($marker instanceof DOMText) {
27672807
$marker->data = $node->data . $marker->data;
27682808
return;
2769-
} elseif ($marker->previousSibling && $marker->previousSibling instanceof DOMCharacterData) {
2809+
} elseif ($marker->previousSibling && $marker->previousSibling instanceof DOMText) {
27702810
$marker->previousSibling->data .= $node->data;
27712811
return;
27722812
}
@@ -3162,6 +3202,16 @@ public function fosterParent($node) {
31623202
}
31633203
}
31643204

3205+
/**
 * Debugging aid: echoes a " Stack:" header followed by one line per entry
 * of $this->stack, labelled with the entry's array key plus one and its
 * tagName.
 */
private function printStack() {
    echo " Stack:\n";
    foreach ($this->stack as $index => $node) {
        $position = $index + 1;
        echo " {$position}. {$node->tagName}\n";
    }
}
3214+
31653215
/**
 * Reports whether the node returned by getCurrentTable() carries a
 * non-empty `tainted` property.
 *
 * @return bool True when `tainted` is set and truthy, false otherwise.
 */
public function currentTableIsTainted() {
    $table = $this->getCurrentTable();
    // empty() tolerates a missing property, so no isset() guard is needed.
    return !empty($table->tainted);
}

tests/HTML5/TreeBuilderTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
SimpleTest::ignore('HTML5_TreeBuilderHarness');
66
class HTML5_TreeBuilderHarness extends HTML5_TestDataHarness
77
{
8-
public function assertIdentical($expect, $actual, $input) {
8+
public function assertIdentical($expect, $actual, $input = '%s') {
99
parent::assertIdentical($expect, $actual, "Identical expectation failed\nInput:\n$input\n\nExpected:\n$expect\n\nActual:\n$actual\n");
1010
}
1111
public function invoke($test) {

tests/autorun.php

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,36 @@ function str_dump($string) {
5656
echo "\n";
5757
}
5858

59+
/**
 * Pretty prints a token as taken by TreeConstructer->emitToken
 *
 * Echoes a single line describing the token. Token types that are not
 * recognised are reported explicitly instead of being skipped silently, so
 * a debug trace never has a missing line.
 *
 * @param array $token Token array; 'type' selects the format, and 'name',
 *                     'attr' or 'data' are read depending on that type.
 * @return void
 */
function token_dump($token) {
    switch ($token['type']) {
        case HTML5_Tokenizer::DOCTYPE:
            echo "<!doctype ...>\n";
            break;
        case HTML5_Tokenizer::STARTTAG:
            // Tolerate start tag tokens that carry no attribute list.
            $attributes = isset($token['attr']) ? $token['attr'] : array();
            $attr = '';
            foreach ($attributes as $kp) {
                $attr .= ' '.$kp['name'] . '="' . $kp['value'] . '"';
            }
            echo "<{$token['name']}$attr>\n";
            break;
        case HTML5_Tokenizer::ENDTAG:
            echo "</{$token['name']}>\n";
            break;
        case HTML5_Tokenizer::COMMENT:
            echo "<!-- {$token['data']} -->\n";
            break;
        case HTML5_Tokenizer::CHARACTER:
            echo '"'.$token['data'].'"'."\n";
            break;
        case HTML5_Tokenizer::EOF:
            echo "EOF\n";
            break;
        default:
            // Make unexpected token types visible in the debug output.
            echo "<unknown token type " . var_export($token['type'], true) . ">\n";
            break;
    }
}
88+
5989
require_once $simpletest_location . '/autorun.php';
6090

6191
class TimedTextReporter extends TextReporter

0 commit comments

Comments
 (0)
0