Merge in changes from default (for clean re-merge). · Nimbleworks/html5lib-php@576eeef · GitHub
[go: up one dir, main page]

Skip to content

Commit 576eeef

Browse files
committed
Merge in changes from default (for clean re-merge).
--HG-- branch : numeric-entities
2 parents 18e4601 + 901876f commit 576eeef

File tree

4 files changed

+119
-37
lines changed

4 files changed

+119
-37
lines changed

SPEC

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1-
3354
1+
3382
2+
3+
This is the last revision of the spec this library has been audited against.
4+
5+
Excluding: 3374
26

3-
(this is the last revision of the spec this library has been audited against)

library/HTML5/TreeBuilder.php

Lines changed: 110 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
// XERROR - with regards to parse errors
3232
// XSCRIPT - with regards to scripting mode
3333
// XENCODING - with regards to encoding (for reparsing tests)
34+
// XDOM - DOM specific code (tagName is explicitly not marked).
35+
// this is not (yet) in helper functions.
3436

3537
class HTML5_TreeBuilder {
3638
public $stack = array();
@@ -70,6 +72,9 @@ class HTML5_TreeBuilder {
7072
'p','param','plaintext','pre','script','select','spacer','style',
7173
'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
7274

75+
private $pendingTableCharacters;
76+
private $pendingTableCharactersDirty;
77+
7378
// Tree construction modes
7479
const INITIAL = 0;
7580
const BEFORE_HTML = 1;
@@ -80,19 +85,20 @@ class HTML5_TreeBuilder {
8085
const IN_BODY = 6;
8186
const IN_CDATA_RCDATA = 7;
8287
const IN_TABLE = 8;
83-
const IN_CAPTION = 9;
84-
const IN_COLUMN_GROUP = 10;
85-
const IN_TABLE_BODY = 11;
86-
const IN_ROW = 12;
87-
const IN_CELL = 13;
88-
const IN_SELECT = 14;
89-
const IN_SELECT_IN_TABLE= 15;
90-
const IN_FOREIGN_CONTENT= 16;
91-
const AFTER_BODY = 17;
92-
const IN_FRAMESET = 18;
93-
const AFTER_FRAMESET = 19;
94-
const AFTER_AFTER_BODY = 20;
95-
const AFTER_AFTER_FRAMESET = 21;
88+
const IN_TABLE_TEXT = 9;
89+
const IN_CAPTION = 10;
90+
const IN_COLUMN_GROUP = 11;
91+
const IN_TABLE_BODY = 12;
92+
const IN_ROW = 13;
93+
const IN_CELL = 14;
94+
const IN_SELECT = 15;
95+
const IN_SELECT_IN_TABLE= 16;
96+
const IN_FOREIGN_CONTENT= 17;
97+
const AFTER_BODY = 18;
98+
const IN_FRAMESET = 19;
99+
const AFTER_FRAMESET = 20;
100+
const AFTER_AFTER_BODY = 21;
101+
const AFTER_AFTER_FRAMESET = 22;
96102

97103
/**
98104
* Converts a magic number to a readable name. Use for debugging.
@@ -201,6 +207,7 @@ public function emitToken($token, $mode = null) {
201207
* doctype attribute of the Document object. */
202208
if (!isset($token['public'])) $token['public'] = null;
203209
if (!isset($token['system'])) $token['system'] = null;
210+
// XDOM
204211
// Yes this is hacky. I'm kind of annoyed that I can't appendChild
205212
// a doctype to DOMDocument. Maybe I haven't chanted the right
206213
// syllables.
@@ -363,6 +370,7 @@ public function emitToken($token, $mode = null) {
363370
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
364371
/* Append a Comment node to the Document object with the data
365372
attribute set to the data given in the comment token. */
373+
// XDOM
366374
$comment = $this->dom->createComment($token['data']);
367375
$this->dom->appendChild($comment);
368376

@@ -378,6 +386,7 @@ public function emitToken($token, $mode = null) {
378386
/* Create an element for the token in the HTML namespace. Append it
379387
* to the Document object. Put this element in the stack of open
380388
* elements. */
389+
// XDOM
381390
$html = $this->insertElement($token, false);
382391
$this->dom->appendChild($html);
383392
$this->stack[] = $html;
@@ -387,6 +396,7 @@ public function emitToken($token, $mode = null) {
387396
} else {
388397
/* Create an html element. Append it to the Document object. Put
389398
* this element in the stack of open elements. */
399+
// XDOM
390400
$html = $this->dom->createElementNS(self::NS_HTML, 'html');
391401
$this->dom->appendChild($html);
392402
$this->stack[] = $html;
@@ -1744,6 +1754,7 @@ public function emitToken($token, $mode = null) {
17441754
* elements with an entry for the new element, and
17451755
* let node be the new element. */
17461756
// we don't know what the token is anymore
1757+
// XDOM
17471758
$clone = $node->cloneNode();
17481759
$a_pos = array_search($node, $this->a_formatting, true);
17491760
$s_pos = array_search($node, $this->stack, true);
@@ -1753,10 +1764,12 @@ public function emitToken($token, $mode = null) {
17531764

17541765
/* 6.6 Insert last node into node, first removing
17551766
it from its previous parent node if any. */
1767+
// XDOM
17561768
if($last_node->parentNode !== null) {
17571769
$last_node->parentNode->removeChild($last_node);
17581770
}
17591771

1772+
// XDOM
17601773
$node->appendChild($last_node);
17611774

17621775
/* 6.7 Let last node be node. */
@@ -1770,6 +1783,7 @@ public function emitToken($token, $mode = null) {
17701783
* whatever last node ended up being in the previous
17711784
* step, first removing it from its previous parent
17721785
* node if any. */
1786+
// XDOM
17731787
if ($last_node->parentNode) { // common step
17741788
$last_node->parentNode->removeChild($last_node);
17751789
}
@@ -1780,23 +1794,27 @@ public function emitToken($token, $mode = null) {
17801794
* first removing it from its previous parent node if
17811795
* any. */
17821796
} else {
1797+
// XDOM
17831798
$common_ancestor->appendChild($last_node);
17841799
}
17851800

17861801
/* 8. Create an element for the token for which the
17871802
* formatting element was created. */
1803+
// XDOM
17881804
$clone = $formatting_element->cloneNode();
17891805

17901806
/* 9. Take all of the child nodes of the furthest
17911807
block and append them to the element created in the
17921808
last step. */
1809+
// XDOM
17931810
while($furthest_block->hasChildNodes()) {
17941811
$child = $furthest_block->firstChild;
17951812
$furthest_block->removeChild($child);
17961813
$clone->appendChild($child);
17971814
}
17981815

17991816
/* 10. Append that clone to the furthest block. */
1817+
// XDOM
18001818
$furthest_block->appendChild($clone);
18011819

18021820
/* 11. Remove the formatting element from the list
@@ -1940,17 +1958,21 @@ public function emitToken($token, $mode = null) {
19401958
case self::IN_TABLE:
19411959
$clear = array('html', 'table');
19421960

1943-
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
1944-
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1945-
or U+0020 SPACE */
1946-
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER &&
1947-
/* If the current table is tainted, then act as described in
1948-
* the "anything else" entry below. */
1949-
// Note: hsivonen has a test that fails due to this line
1950-
// because he wants to convince Hixie not to do taint
1951-
!$this->currentTableIsTainted()) {
1952-
/* Append the character to the current node. */
1953-
$this->insertText($token['data']);
1961+
/* A character token */
1962+
if ($token['type'] === HTML5_Tokenizer::CHARACTER ||
1963+
$token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
1964+
/* Let the pending table character tokens
1965+
* be an empty list of tokens. */
1966+
$this->pendingTableCharacters = "";
1967+
$this->pendingTableCharactersDirty = false;
1968+
/* Let the original insertion mode be the current
1969+
* insertion mode. */
1970+
$this->original_mode = $this->mode;
1971+
/* Switch the insertion mode to
1972+
* "in table text" and
1973+
* reprocess the token. */
1974+
$this->mode = self::IN_TABLE_TEXT;
1975+
$this->emitToken($token);
19541976

19551977
/* A comment token */
19561978
} elseif($token['type'] === HTML5_Tokenizer::COMMENT) {
@@ -2096,6 +2118,57 @@ public function emitToken($token, $mode = null) {
20962118
}
20972119
break;
20982120

2121+
case self::IN_TABLE_TEXT:
2122+
/* A character token */
2123+
if($token['type'] === HTML5_Tokenizer::CHARACTER) {
2124+
/* Append the character token to the pending table
2125+
* character tokens list. */
2126+
$this->pendingTableCharacters .= $token['data'];
2127+
$this->pendingTableCharactersDirty = true;
2128+
} elseif ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
2129+
$this->pendingTableCharacters .= $token['data'];
2130+
/* Anything else */
2131+
} else {
2132+
if ($this->pendingTableCharacters !== '' && is_string($this->pendingTableCharacters)) {
2133+
/* If any of the tokens in the pending table character tokens list
2134+
* are character tokens that are not one of U+0009 CHARACTER
2135+
* TABULATION, U+000A LINE FEED (LF), U+000C FORM FEED (FF), or
2136+
* U+0020 SPACE, then reprocess those character tokens using the
2137+
* rules given in the "anything else" entry in the "in table"
2138+
* insertion mode.*/
2139+
if ($this->pendingTableCharactersDirty) {
2140+
/* Parse error. Process the token using the rules for the
2141+
* "in body" insertion mode, except that if the current
2142+
* node is a table, tbody, tfoot, thead, or tr element,
2143+
* then, whenever a node would be inserted into the current
2144+
* node, it must instead be foster parented. */
2145+
// XERROR
2146+
$old = $this->foster_parent;
2147+
$this->foster_parent = true;
2148+
$text_token = array(
2149+
'type' => HTML5_Tokenizer::CHARACTER,
2150+
'data' => $this->pendingTableCharacters,
2151+
);
2152+
$this->processWithRulesFor($text_token, self::IN_BODY);
2153+
$this->foster_parent = $old;
2154+
2155+
/* Otherwise, insert the characters given by the pending table
2156+
* character tokens list into the current node. */
2157+
} else {
2158+
$this->insertText($this->pendingTableCharacters);
2159+
}
2160+
$this->pendingTableCharacters = null;
2161+
$this->pendingTableCharactersNull = null;
2162+
}
2163+
2164+
/* Switch the insertion mode to the original insertion mode and
2165+
* reprocess the token.
2166+
*/
2167+
$this->mode = $this->original_mode;
2168+
$this->emitToken($token);
2169+
}
2170+
break;
2171+
20992172
case self::IN_CAPTION:
21002173
/* An end tag whose tag name is "caption" */
21012174
if($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'caption') {
@@ -2694,6 +2767,7 @@ public function emitToken($token, $mode = null) {
26942767
// XERROR: parse error
26952768
} elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
26962769
$token['name'] === 'script' && end($this->stack)->tagName === 'script' &&
2770+
// XDOM
26972771
end($this->stack)->namespaceURI === self::NS_SVG) {
26982772
array_pop($this->stack);
26992773
// a bunch of script running mumbo jumbo
@@ -2702,20 +2776,23 @@ public function emitToken($token, $mode = null) {
27022776
((
27032777
$token['name'] !== 'mglyph' &&
27042778
$token['name'] !== 'malignmark' &&
2779+
// XDOM
27052780
end($this->stack)->namespaceURI === self::NS_MATHML &&
27062781
in_array(end($this->stack)->tagName, array('mi', 'mo', 'mn', 'ms', 'mtext'))
27072782
) ||
27082783
(
27092784
$token['name'] === 'svg' &&
2785+
// XDOM
27102786
end($this->stack)->namespaceURI === self::NS_MATHML &&
27112787
end($this->stack)->tagName === 'annotation-xml'
27122788
) ||
27132789
(
2790+
// XDOM
27142791
end($this->stack)->namespaceURI === self::NS_SVG &&
27152792
in_array(end($this->stack)->tagName, array('foreignObject', 'desc', 'title'))
27162793
) ||
27172794
(
2718-
// XSKETCHY
2795+
// XSKETCHY && XDOM
27192796
end($this->stack)->namespaceURI === self::NS_HTML
27202797
))
27212798
) || $token['type'] === HTML5_Tokenizer::ENDTAG
@@ -2729,6 +2806,7 @@ public function emitToken($token, $mode = null) {
27292806
$found = false;
27302807
// this basically duplicates elementInScope()
27312808
for ($i = count($this->stack) - 1; $i >= 0; $i--) {
2809+
// XDOM
27322810
$node = $this->stack[$i];
27332811
if ($node->namespaceURI !== self::NS_HTML) {
27342812
$found = true;
@@ -2756,6 +2834,7 @@ public function emitToken($token, $mode = null) {
27562834
// XERROR: parse error
27572835
do {
27582836
$node = array_pop($this->stack);
2837+
// XDOM
27592838
} while ($node->namespaceURI !== self::NS_HTML);
27602839
$this->stack[] = $node;
27612840
$this->mode = $this->secondary_mode;
@@ -2799,6 +2878,7 @@ public function emitToken($token, $mode = null) {
27992878
'radialgradient' => 'radialGradient',
28002879
'textpath' => 'textPath',
28012880
);
2881+
// XDOM
28022882
$current = end($this->stack);
28032883
if ($current->namespaceURI === self::NS_MATHML) {
28042884
$token = $this->adjustMathMLAttributes($token);
@@ -2835,6 +2915,7 @@ public function emitToken($token, $mode = null) {
28352915
/* Append a Comment node to the first element in the stack of open
28362916
elements (the html element), with the data attribute set to the
28372917
data given in the comment token. */
2918+
// XDOM
28382919
$comment = $this->dom->createComment($token['data']);
28392920
$this->stack[0]->appendChild($comment);
28402921

@@ -2985,6 +3066,7 @@ public function emitToken($token, $mode = null) {
29853066
if($token['type'] === HTML5_Tokenizer::COMMENT) {
29863067
/* Append a Comment node to the Document object with the data
29873068
attribute set to the data given in the comment token. */
3069+
// XDOM
29883070
$comment = $this->dom->createComment($token['data']);
29893071
$this->dom->appendChild($comment);
29903072

@@ -3008,6 +3090,7 @@ public function emitToken($token, $mode = null) {
30083090
if($token['type'] === HTML5_Tokenizer::COMMENT) {
30093091
/* Append a Comment node to the Document object with the data
30103092
attribute set to the data given in the comment token. */
3093+
// XDOM
30113094
$comment = $this->dom->createComment($token['data']);
30123095
$this->dom->appendChild($comment);
30133096

@@ -3458,12 +3541,8 @@ private function getFosterParent() {
34583541
public function fosterParent($node) {
34593542
$foster_parent = $this->getFosterParent();
34603543
$table = $this->getCurrentTable(); // almost equivalent to last table element, except it can be html
3461-
/* When a node node is to be foster parented, the node node must be
3462-
* inserted into the foster parent element, and the current table must
3463-
* be marked as tainted. (Once the current table has been tainted,
3464-
* whitespace characters are inserted into the foster parent element
3465-
* instead of the current node.) */
3466-
$table->tainted = true;
3544+
/* When a node node is to be foster parented, the node node must be
3545+
* inserted into the foster parent element. */
34673546
/* If the foster parent element is the parent element of the last table
34683547
* element in the stack of open elements, then node must be inserted
34693548
* immediately before the last table element in the stack of open

tests/HTML5/TestDataTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ class HTML5_TestDataTest extends UnitTestCase
77
function testSample() {
88
$data = new HTML5_TestData(dirname(__FILE__) . '/TestDataTest/sample.dat');
99
$this->assertIdentical($data->tests, array(
10-
array('data' => "Foo\n", 'des' => "Bar\n"),
11-
array('data' => "Foo\n")
10+
array('data' => "Foo", 'des' => "Bar"),
11+
array('data' => "Foo")
1212
));
1313
}
1414
function testStrDom() {

tests/HTML5/TokenizerPositionTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ class HTML5_PositionTestableTokenizer extends HTML5_TestableTokenizer
77
public $outputLines = array();
88
public $outputCols = array();
99
private $characterTokens = array();
10-
protected function emitToken($token, $checkStream = true) {
11-
parent::emitToken($token, $checkStream);
10+
protected function emitToken($token, $checkStream = true, $dry = false) {
11+
parent::emitToken($token, $checkStream, $dry);
1212
// XXX: The tests should really include the parse errors, but I'm lazy.
1313
switch ($token['type']) {
1414
case self::PARSEERROR:

0 commit comments

Comments
 (0)
0