<?php

// warning: this file is encoded in UTF-8!

/**
 * Static lookup data and helpers for character references: the numeric
 * reference replacement table, lazy access to the named character
 * reference table, and a codepoint-to-UTF-8 encoder.
 */
class HTML5_Data
{

    // at some point this should be moved to a .ser file. Another
    // possible optimization is to give UTF-8 bytes, not Unicode
    // codepoints
    // XXX: Not quite sure why it's named this; this is
    // actually the numeric entity dereference table.
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    // Cache for getNamedCharacterReferences(); unset until the first call.
    protected static $namedCharacterReferences;

    // Not read or written anywhere in this class.
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Returns the "real" Unicode codepoint of a malformed character
     * reference.
     *
     * @param int $ref Codepoint given by the numeric character reference.
     * @return int|false The replacement codepoint from the table above, or
     *                   false when the table has no entry for $ref.
     */
    public static function getRealCodepoint($ref) {
        if (isset(self::$realCodepointTable[$ref])) {
            return self::$realCodepointTable[$ref];
        }
        return false;
    }

    /**
     * Returns the named character reference table, loading it on first use.
     *
     * The table is unserialized from 'named-character-references.ser' in
     * the same directory as this source file and cached in a static
     * property. The cache test is a truthiness test, so a failed or empty
     * load is attempted again on the next call.
     *
     * @return mixed Whatever unserialize() produced for the data file.
     */
    public static function getNamedCharacterReferences() {
        if (!self::$namedCharacterReferences) {
            $path = dirname(__FILE__) . '/named-character-references.ser';
            self::$namedCharacterReferences = unserialize(file_get_contents($path));
        }
        return self::$namedCharacterReferences;
    }

    /**
     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
     *
     * Values that are not Unicode scalar values (negative, above 0x10FFFF,
     * or in the surrogate range 0xD800-0xDFFF) cannot be represented in
     * well-formed UTF-8 and yield U+FFFD REPLACEMENT CHARACTER instead.
     *
     * @note Shamelessly stolen from HTML Purifier, which is also
     *       shamelessly stolen from Feyd (which is in public domain).
     *
     * @param int $code Unicode codepoint.
     * @return string The one- to four-byte UTF-8 encoding of $code.
     */
    public static function utf8chr($code) {
        if ($code < 0x0 || $code > 0x10FFFF
            || ($code >= 0xD800 && $code <= 0xDFFF)) {
            // Without this guard surrogates would be emitted as ill-formed
            // three-byte sequences and values above 0x10FFFF would have
            // their high bits masked away, producing a different character.
            return "\xEF\xBF\xBD";
        }

        if ($code < 0x80) {
            // regular ASCII character
            return chr($code);
        }

        // Every multi-byte sequence ends in a continuation byte carrying
        // the low six bits.
        $tail = chr(0x80 | ($code & 0x3F));

        if ($code < 0x800) {
            // two bytes: 110xxxxx 10xxxxxx
            return chr(0xC0 | ($code >> 6)) . $tail;
        }

        $mid = chr(0x80 | (($code >> 6) & 0x3F));

        if ($code < 0x10000) {
            // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            return chr(0xE0 | ($code >> 12)) . $mid . $tail;
        }

        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr(0xF0 | ($code >> 18))
            . chr(0x80 | (($code >> 12) & 0x3F))
            . $mid
            . $tail;
    }

}
|
115
|
|
|
|