Passed
Push — master ( 1d11ee...eb343f )
by Michael
35:30 queued 15s
created

substituteNonSpecialEntities()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 7
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
1
<?php
2
3
// if we want to implement error collecting here, we'll need to use some sort
4
// of global data (probably trigger_error) because it's impossible to pass
5
// $config or $context to the callback functions.
6
7
/**
 * Handles referencing and dereferencing character entities
 */
class HTMLPurifier_EntityParser
{
    /**
     * Reference to entity lookup table. Populated lazily the first time a
     * named entity has to be resolved.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex matching "&" followed by one of the semicolon-optional entity
     * names. It carries three empty capture groups so the name lands in
     * group 4, the same slot the text/attribute regexes use.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the regexes used by substituteTextEntities() and
     * substituteAttrEntities().
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        // Alternatives shared by the text and attribute regexes.
        $shared_prefix =
            '/&(?:' .
            // hex
            '[#]x([a-fA-F0-9]+);?|' .
            // dec
            '[#]0*(\d+);?|' .
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';

        $this->_textEntitiesRegex =
            $shared_prefix .
            // string (optional semicolon)
            "($semi_optional)" .
            ')/';

        $this->_attrEntitiesRegex =
            $shared_prefix .
            // string (optional semicolon)
            // don't match if trailing is equals or alphanumeric (URL
            // like)
            "($semi_optional)(?![=;A-Za-z0-9])" .
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents.  Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents.  Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                       index 1 set with a hex value, index 2 with a dec
     *                       value, index 3 with a semicolon-terminated name,
     *                       or index 4 with a semicolon-optional name.
     * @return string Replacement string; the original entity text when the
     *                name cannot be resolved.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];
        $hex_part = isset($matches[1]) ? $matches[1] : null;
        $dec_part = isset($matches[2]) ? $matches[2] : null;

        if ($hex_part !== null && $hex_part !== "") {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part !== null && $dec_part !== "") {
            return HTMLPurifier_Encoder::unichr((int)$dec_part);
        }

        // Named entity: prefer the semicolon-terminated capture (3), then
        // fall back to the semicolon-optional capture (4).
        $has_terminated_name = !empty($matches[3]);
        if ($has_terminated_name) {
            $named_part = $matches[3];
        } else {
            $named_part = empty($matches[4]) ? "" : $matches[4];
        }

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Test that this is an EXACT match is important to
        // prevent infinite loop: the prefix regex only ever fills
        // capture 4, so the nested call cannot re-enter this branch.
        if ($has_terminated_name) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
    //     1. hex             2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp' => 38,
            'lt' => 60,
            'gt' => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this whenever you have parsed character is t3h 5uck, we run
     * it before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                       either index 1, 2 or 3 set with a hex value, dec value,
     *                       or string (respectively).
     * @return string Replacement string.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];

        // Every match is "&" plus at least one more character, and a numeric
        // match is at least "&#" plus one digit, so offsets 1 and 2 always
        // exist here and need no error suppression.
        if ($entity[1] === '#') {
            $code = ($entity[2] === 'x') ? hexdec($matches[1]) : (int)$matches[2];
            // abort for special characters
            // hexdec() returns a float once the value exceeds the int range;
            // such a value can never be a special code point, so it is kept
            // away from the array-key lookup.
            if (is_int($code) && isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$name])) {
            return $this->_entity_lookup->table[$name];
        }
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches PCRE-style matches array, with 0 the entire match, and
     *                       either index 1, 2 or 3 set with a hex value, dec value,
     *                       or string (respectively).
     * @return string Replacement string.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];

        // Offsets 1 and 2 always exist for the shapes this regex can match,
        // so no error suppression is needed.
        if ($entity[1] === '#') {
            $int = ($entity[2] === 'x') ? hexdec($matches[1]) : (int)$matches[2];
            // hexdec() returns a float for values beyond the int range; only
            // genuine ints are used as array keys.
            if (is_int($int) && isset($this->_special_dec2str[$int])) {
                return $this->_special_dec2str[$int];
            }
            return $entity;
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $this->_special_dec2str[$this->_special_ent2dec[$name]];
        }
        return $entity;
    }
}
283
284
// vim: et sw=4 sts=4
285