<?php

// warning: this file is encoded in UTF-8!

/**
 * Static lookup data and helpers for character references: the numeric
 * character reference override table, the lazily loaded named character
 * reference table, and a codepoint-to-UTF-8 encoder.
 */
class HTML5_Data
{

    // at some point this should be moved to a .ser file. Another
    // possible optimization is to give UTF-8 bytes, not Unicode
    // codepoints
    // XXX: Not quite sure why it's named this; this is
    // actually the numeric entity dereference table.
    /**
     * Maps the number found in a numeric character reference to the
     * codepoint that should be used in its place.
     */
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    /**
     * Cache for the table loaded by getNamedCharacterReferences();
     * unset until the first call.
     */
    protected static $namedCharacterReferences;

    // Not read or written anywhere in this class.
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Returns the "real" Unicode codepoint of a malformed character
     * reference.
     *
     * @param int $ref Number taken from a numeric character reference.
     * @return int|false Replacement codepoint from the override table,
     *         or false when $ref has no entry in it.
     */
    public static function getRealCodepoint($ref) {
        if (isset(self::$realCodepointTable[$ref])) {
            return self::$realCodepointTable[$ref];
        }
        return false;
    }

    /**
     * Returns the named character reference table, loading it on first
     * use from 'named-character-references.ser' next to this file.
     *
     * @return mixed Whatever the .ser file unserializes to. A falsy
     *         result (for example a failed read) is not cached, so the
     *         load is attempted again on the next call.
     */
    public static function getNamedCharacterReferences() {
        if (!self::$namedCharacterReferences) {
            $path = dirname(__FILE__) . '/named-character-references.ser';
            self::$namedCharacterReferences = unserialize(file_get_contents($path));
        }
        return self::$namedCharacterReferences;
    }

    /**
     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
     *
     * No range validation is performed on purpose ("we live
     * dangerously"): surrogates (0xD800-0xDFFF) and values above
     * 0x10FFFF are encoded like any other number rather than being
     * replaced with U+FFFD, so callers must pass a codepoint they have
     * already vetted.
     *
     * @note Shamelessly stolen from HTML Purifier, which is also
     *       shamelessly stolen from Feyd (which is in public domain).
     *
     * @param int $code Unicode codepoint.
     * @return string One to four bytes.
     */
    public static function utf8chr($code) {
        if ($code < 0x80) {
            // regular ASCII character: a single byte
            return chr($code);
        }

        // every multi-byte form ends with a continuation byte that
        // carries the low six bits
        $last = chr(($code & 0x3F) | 0x80);

        if ($code < 0x800) {
            // two bytes: 110xxxxx 10xxxxxx
            return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
        }

        // continuation byte carrying bits 6-11
        $middle = chr((($code & 0xFC0) >> 6) | 0x80);

        if ($code < 0x10000) {
            // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
        }

        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr((($code >> 18) & 0x07) | 0xF0)
            . chr((($code >> 12) & 0x3F) | 0x80)
            . $middle
            . $last;
    }

}
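
// Review note: HTML5_Data is declared in the global namespace, where its name
// can collide with a same-named class from another library. You can fix this
// by adding a namespace to your class:
//
//     namespace YourVendor;
//
//     class YourClass { }
//
// When choosing a vendor namespace, try to pick something that is not too
// generic, to avoid conflicts with other libraries.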