@@ -8,10 +8,13 @@ class HTML5_Data
8
8
// at some point this should be moved to a .ser file. Another
9
9
// possible optimization is to give UTF-8 bytes, not Unicode
10
10
// codepoints
11
+ // XXX: Not quite sure why it's named this; this is
12
+ // actually the numeric entity dereference table.
11
13
protected static $ realCodepointTable = array (
14
+ 0x00 => 0xFFFD , // REPLACEMENT CHARACTER
12
15
0x0D => 0x000A , // LINE FEED (LF)
13
16
0x80 => 0x20AC , // EURO SIGN ('€')
14
- 0x81 => 0xFFFD , // REPLACEMENT CHARACTER
17
+ 0x81 => 0x0081 , // <control>
15
18
0x82 => 0x201A , // SINGLE LOW-9 QUOTATION MARK ('‚')
16
19
0x83 => 0x0192 , // LATIN SMALL LETTER F WITH HOOK ('ƒ')
17
20
0x84 => 0x201E , // DOUBLE LOW-9 QUOTATION MARK ('„')
@@ -23,10 +26,10 @@ class HTML5_Data
23
26
0x8A => 0x0160 , // LATIN CAPITAL LETTER S WITH CARON ('Š')
24
27
0x8B => 0x2039 , // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
25
28
0x8C => 0x0152 , // LATIN CAPITAL LIGATURE OE ('Œ')
26
- 0x8D => 0xFFFD , // REPLACEMENT CHARACTER
29
+ 0x8D => 0x008D , // <control>
27
30
0x8E => 0x017D , // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
28
- 0x8F => 0xFFFD , // REPLACEMENT CHARACTER
29
- 0x90 => 0xFFFD , // REPLACEMENT CHARACTER
31
+ 0x8F => 0x008F , // <control>
32
+ 0x90 => 0x0090 , // <control>
30
33
0x91 => 0x2018 , // LEFT SINGLE QUOTATION MARK ('‘')
31
34
0x92 => 0x2019 , // RIGHT SINGLE QUOTATION MARK ('’')
32
35
0x93 => 0x201C , // LEFT DOUBLE QUOTATION MARK ('“')
@@ -39,7 +42,7 @@ class HTML5_Data
39
42
0x9A => 0x0161 , // LATIN SMALL LETTER S WITH CARON ('š')
40
43
0x9B => 0x203A , // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
41
44
0x9C => 0x0153 , // LATIN SMALL LIGATURE OE ('œ')
42
- 0x9D => 0xFFFD , // REPLACEMENT CHARACTER
45
+ 0x9D => 0x009D , // <control>
43
46
0x9E => 0x017E , // LATIN SMALL LETTER Z WITH CARON ('ž')
44
47
0x9F => 0x0178 , // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
45
48
);
@@ -71,12 +74,13 @@ public static function getNamedCharacterReferences() {
71
74
* shamelessly stolen from Feyd (which is in public domain).
72
75
*/
73
76
public static function utf8chr ($ code ) {
74
- if ($ code > 0x10FFFF or $ code < 0x0 or
77
+ /* We don't care: we live dangerously
78
+ * if($code > 0x10FFFF or $code < 0x0 or
75
79
($code >= 0xD800 and $code <= 0xDFFF) ) {
76
80
// bits are set outside the "valid" range as defined
77
81
// by UNICODE 4.1.0
78
82
return "\xEF\xBF\xBD";
79
- }
83
+ }*/
80
84
81
85
$ x = $ y = $ z = $ w = 0 ;
82
86
if ($ code < 0x80 ) {
0 commit comments