Test Failed
Branch feature/2512 (a8f148)
by Michael
09:52 queued 13s
created

HTMLPurifier_Lexer::escapeCommentedCDATA()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
1
<?php
2
3
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize().
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @note Passing anything other than an HTMLPurifier_Config is still
     *       accepted for backwards compatibility: the value is used as the
     *       lexer (object) or lexer name (string) and an E_USER_WARNING is
     *       raised. No configuration is available on that path, so
     *       line-number tracking is never required there.
     *
     * @param HTMLPurifier_Config $config
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, if
     *         no lexer could be instantiated, or if line-number tracking is
     *         required but the chosen lexer does not support it.
     */
    public static function create($config)
    {
        if (!($config instanceof HTMLPurifier_Config)) {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // There is no configuration object on the legacy path, so the
            // tracking directives cannot be consulted. Calling
            // $config->get() here would be a fatal error, because $config
            // is the prototype itself.
            $needs_tracking = false;
        } else {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            // a ready-made lexer instance was supplied; use it as-is
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                do {
                    // auto-detection algorithm
                    if ($needs_tracking) {
                        $lexer = 'DirectLex';
                        break;
                    }

                    if (class_exists('DOMDocument') &&
                        method_exists('DOMDocument', 'loadHTML') &&
                        !extension_loaded('domxml')
                    ) {
                        // check for DOM support, because while it's part of the
                        // core, it can be disabled compile time. Also, the PECL
                        // domxml extension overrides the default DOM, and is evil
                        // and nasty and we shan't bother to support it
                        $lexer = 'DOMLex';
                    } else {
                        $lexer = 'DirectLex';
                    }
                } while (0);
            } // do..while so we can break

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser shared by the convenience methods.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses entities in text (non-attribute) character data.
     *
     * Shorthand for parseData() with $is_attr set to false.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config)
    {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses entities in attribute-value character data.
     *
     * Shorthand for parseData() with $is_attr set to true.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config)
    {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones. The common entities are handled with a plain
     * table lookup; the entity parser is only consulted when ampersands
     * remain that the table could not account for.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr Whether $string is an attribute value (true) or
     *                      text (false); selects the entity parser method
     *                      when %Core.LegacyEntityDecoder is off.
     * @param HTMLPurifier_Config $config Only read when uncommon entities
     *                      are present.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // every remaining candidate ampersand came from a decoded &amp;,
        // so there is nothing left to expand
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; a concrete
     * lexer is expected to provide the real implementation.
     *
     * @param string $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * @note Only non-empty CDATA sections match the pattern; an empty
     *       section is left in the string untouched.
     *
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     *
     * Handles the comment-wrapped CDATA form: the whole wrapper is replaced
     * by the escaped contents found between its opening and closing parts.
     *
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @note This method is protected. It can still be used as the callback
     *       because preg_replace_callback() is invoked from inside this
     *       class.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     *
     * Steps, in order, each controlled by the named directive where one is
     * given: newline normalization (%Core.NormalizeNewlines), commented-CDATA
     * escaping (%HTML.Trusted), CDATA escaping, body extraction
     * (%Core.ConvertDocumentToFragment), non-special entity expansion
     * (%Core.LegacyEntityDecoder), UTF-8 cleaning, processing-instruction
     * removal (%Core.RemoveProcessingInstructions) and script removal
     * (%Core.AggressivelyRemoveScript).
     *
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // only report when extraction actually changed the markup
            if ($e && $new_html != $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // Scripts are stripped up front only for untrusted input where
        // script contents would be discarded anyway.
        // NOTE(review): the pattern has no "s" modifier, so a <script> whose
        // body spans several lines is not matched here - confirm whether
        // that is intended before changing it.
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element, or the input unchanged when no body element is
     * found or the first body tag appears to sit inside an unclosed comment.
     *
     * @param string $html HTML fragment or document.
     * @return string Contents of the body element, or $html as given.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end   = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}
370
371
// vim: et sw=4 sts=4
372