Initial implementation of numeric entities and tests, not complete, n… · Nimbleworks/html5lib-php@6b40e76 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6b40e76

Browse files
author
Edward Z. Yang
committed
Initial implementation of numeric entities and tests, not complete, need spec clarification.
--HG-- branch : numeric-entities
1 parent 1cbacc5 commit 6b40e76

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed

library/HTML5/Data.php

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@ class HTML5_Data
88
// at some point this should be moved to a .ser file. Another
99
// possible optimization is to give UTF-8 bytes, not Unicode
1010
// codepoints
11+
// XXX: Not quite sure why it's named this; this is
12+
// actually the numeric entity dereference table.
1113
protected static $realCodepointTable = array(
14+
0x00 => 0xFFFD, // REPLACEMENT CHARACTER
1215
0x0D => 0x000A, // LINE FEED (LF)
1316
0x80 => 0x20AC, // EURO SIGN ('€')
14-
0x81 => 0xFFFD, // REPLACEMENT CHARACTER
17+
0x81 => 0x0081, // <control>
1518
0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
1619
0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
1720
0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
@@ -23,10 +26,10 @@ class HTML5_Data
2326
0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
2427
0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
2528
0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
26-
0x8D => 0xFFFD, // REPLACEMENT CHARACTER
29+
0x8D => 0x008D, // <control>
2730
0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
28-
0x8F => 0xFFFD, // REPLACEMENT CHARACTER
29-
0x90 => 0xFFFD, // REPLACEMENT CHARACTER
31+
0x8F => 0x008F, // <control>
32+
0x90 => 0x0090, // <control>
3033
0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
3134
0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
3235
0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
@@ -39,7 +42,7 @@ class HTML5_Data
3942
0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
4043
0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
4144
0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
42-
0x9D => 0xFFFD, // REPLACEMENT CHARACTER
45+
0x9D => 0x009D, // <control>
4346
0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
4447
0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
4548
);
@@ -71,12 +74,13 @@ public static function getNamedCharacterReferences() {
7174
* shamelessly stolen from Feyd (which is in public domain).
7275
*/
7376
public static function utf8chr($code) {
74-
if($code > 0x10FFFF or $code < 0x0 or
77+
/* We don't care: we live dangerously
78+
* if($code > 0x10FFFF or $code < 0x0 or
7579
($code >= 0xD800 and $code <= 0xDFFF) ) {
7680
// bits are set outside the "valid" range as defined
7781
// by UNICODE 4.1.0
7882
return "\xEF\xBF\xBD";
79-
}
83+
}*/
8084

8185
$x = $y = $z = $w = 0;
8286
if ($code < 0x80) {

library/HTML5/Tokenizer.php

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2213,6 +2213,16 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22132213
));
22142214
$codepoint = $new_codepoint;
22152215
} else {
2216+
// our logic is structured a little differently from the
2217+
// spec's but they're equivalent. The transform is:
2218+
// spec:
2219+
// return character for codepoint
2220+
// if in range:
2221+
// parse error
2222+
// ours:
2223+
// if in range:
2224+
// parse error
2225+
// return character for codepoint
22162226
/* Otherwise, if the number is in the range 0x0000 to 0x0008,
22172227
U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF ,
22182228
0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
@@ -2238,7 +2248,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22382248
'type' => self::PARSEERROR,
22392249
'data' => 'illegal-codepoint-for-numeric-entity'
22402250
));
2241-
$codepoint = 0xFFFD;
22422251
}
22432252
}
22442253

0 commit comments

Comments
 (0)
0