the closing brace of a class is on a separate line.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | // warning: this file is encoded in UTF-8! |
||
4 | |||
class HTML5_Data
{

    /**
     * Replacement table for numeric character references: the key is the
     * number that appeared in the reference, the value is the Unicode
     * codepoint that should be used in its place.
     *
     * Despite the property name, this is a dereference table for numeric
     * entities. Values are Unicode codepoints, not UTF-8 bytes; the table
     * could eventually live in a .ser file instead of in source.
     */
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    /**
     * Cache for the unserialized named character reference data; filled
     * on the first call to getNamedCharacterReferences().
     */
    protected static $namedCharacterReferences;

    /**
     * Declared for the named character reference data; nothing in this
     * class reads or writes it.
     */
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Looks up the replacement ("real") Unicode codepoint for the value
     * of a malformed numeric character reference.
     *
     * @param int $ref Value taken from the character reference.
     * @return int|false The replacement codepoint, or false when the
     *                   table has no entry for $ref.
     */
    public static function getRealCodepoint($ref) {
        return isset(self::$realCodepointTable[$ref])
            ? self::$realCodepointTable[$ref]
            : false;
    }

    /**
     * Returns the named character reference data, loading it from the
     * 'named-character-references.ser' file next to this source file the
     * first time it is needed.
     *
     * The result is cached in a static property. A falsy cached value
     * (for instance after a failed read) triggers another load attempt
     * on the next call.
     *
     * @return mixed Whatever unserialize() produced for the data file.
     */
    public static function getNamedCharacterReferences() {
        if (self::$namedCharacterReferences) {
            return self::$namedCharacterReferences;
        }

        $dataFile = dirname(__FILE__) . '/named-character-references.ser';
        self::$namedCharacterReferences = unserialize(file_get_contents($dataFile));

        return self::$namedCharacterReferences;
    }

    /**
     * Encodes a Unicode codepoint as a sequence of one to four UTF-8 bytes.
     *
     * No range validation is performed on purpose: values above 0x10FFFF,
     * negative values and surrogates (0xD800-0xDFFF) are encoded as-is
     * instead of being replaced by U+FFFD.
     *
     * @note Adapted from HTML Purifier, which in turn took it from Feyd
     *       (public domain).
     *
     * @param int $code Codepoint to encode.
     * @return string The encoded byte sequence.
     */
    public static function utf8chr($code) {
        // Plain ASCII: the codepoint is the byte.
        if ($code < 0x80) {
            return chr($code);
        }

        // Every multi-byte form ends with a continuation byte holding
        // the lowest six bits.
        $bytes = chr(($code & 0x3F) | 0x80);

        // Two-byte form: 110xxxxx 10xxxxxx.
        if ($code < 0x800) {
            return chr((($code & 0x7FF) >> 6) | 0xC0) . $bytes;
        }

        // Three- and four-byte forms share a second continuation byte.
        $bytes = chr((($code & 0xFC0) >> 6) | 0x80) . $bytes;

        // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
        if ($code < 0x10000) {
            return chr((($code >> 12) & 0x0F) | 0xE0) . $bytes;
        }

        // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        $lead = chr((($code >> 18) & 0x07) | 0xF0);
        $next = chr((($code >> 12) & 0x3F) | 0x80);

        return $lead . $next . $bytes;
    }

}
||
0 ignored issues
–
show
|
|||
115 |
Below you will find some examples: