Fix more errors, and change implementation to be the most recent one. · html5lib/html5lib-php@18e4601 · GitHub
[go: up one dir, main page]

Skip to content

Commit 18e4601

Browse files
committed
Fix more errors, and change implementation to be the most recent one.
--HG-- branch : numeric-entities
1 parent 6b40e76 commit 18e4601

File tree

1 file changed

+24
-26
lines changed

1 file changed

+24
-26
lines changed

library/HTML5/Tokenizer.php

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2211,28 +2211,29 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22112211
'type' => self::PARSEERROR,
22122212
'data' => 'illegal-windows-1252-entity'
22132213
));
2214-
$codepoint = $new_codepoint;
2214+
return HTML5_Data::utf8chr($new_codepoint);
22152215
} else {
2216-
// our logic is structured a little differently from the
2217-
// spec's but they're equivalent. The transform is:
2218-
// spec:
2219-
// return character for codepoint
2220-
// if in range:
2221-
// parse error
2222-
// ours:
2223-
// if in range:
2224-
// parse error
2225-
// return character for codepoint
2226-
/* Otherwise, if the number is in the range 0x0000 to 0x0008,
2227-
U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF ,
2228-
0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
2229-
0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
2230-
0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
2231-
0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
2232-
0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
2233-
0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
2234-
is a parse error; return a character token for the U+FFFD
2235-
REPLACEMENT CHARACTER character instead. */
2216+
/* Otherwise, if the number is greater than 0x10FFFF, then
2217+
* this is a parse error. Return a U+FFFD REPLACEMENT
2218+
* CHARACTER. */
2219+
if ($codepoint > 0x10FFFF) {
2220+
$this->emitToken(array(
2221+
'type' => self::PARSEERROR,
2222+
'data' => 'overlong-character-entity' // XXX probably not correct
2223+
));
2224+
return "\xEF\xBF\xBD";
2225+
}
2226+
/* Otherwise, return a character token for the Unicode
2227+
* character whose code point is that number. If the
2228+
* number is in the range 0x0001 to 0x0008, 0x000E to
2229+
* 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2230+
* 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2231+
* 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2232+
* 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2233+
* 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2234+
* 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2235+
* 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2236+
* or 0x10FFFF, then this is a parse error. */
22362237
// && has higher precedence than ||
22372238
if (
22382239
$codepoint >= 0x0000 && $codepoint <= 0x0008 ||
@@ -2242,18 +2243,15 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22422243
$codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
22432244
$codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
22442245
($codepoint & 0xFFFE) === 0xFFFE ||
2245-
$codepoint > 0x10FFFF
2246+
$codepoint == 0x10FFFF || $codepoint == 0x10FFFE
22462247
) {
22472248
$this->emitToken(array(
22482249
'type' => self::PARSEERROR,
22492250
'data' => 'illegal-codepoint-for-numeric-entity'
22502251
));
22512252
}
2253+
return HTML5_Data::utf8chr($codepoint);
22522254
}
2253-
2254-
/* Otherwise, return a character token for the Unicode
2255-
character whose code point is that number. */
2256-
return HTML5_Data::utf8chr($codepoint);
22572255
}
22582256

22592257
} else {

0 commit comments

Comments
 (0)
0