HTMLPurifier_Lexer_DirectLex::tokenizeHTML()   F
last analyzed

Complexity

Conditions 41
Paths 8496

Size

Total Lines 302
Code Lines 182

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 182
dl 0
loc 302
rs 0
c 0
b 0
f 0
cc 41
nc 8496
nop 3

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * Our in-house implementation of a parser.
5
 *
6
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
7
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
8
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
9
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
10
 *
11
 * @todo Reread XML spec and document differences.
12
 */
13
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
14
{
15
    /**
16
     * @type bool
17
     */
18
    public $tracksLineNumbers = true;
19
20
    /**
21
     * Whitespace characters for str(c)spn.
22
     * @type string
23
     */
24
    protected $_whitespace = "\x20\x09\x0D\x0A";
25
26
    /**
27
     * Callback function for script CDATA fudge
28
     * @param array $matches, in form of array(opening tag, contents, closing tag)
29
     * @return string
30
     */
31
    protected function scriptCallback($matches)
    {
        // $matches follows the capture groups of the <script> pattern used
        // by tokenizeHTML(): [1] opening tag, [2] contents, [3] closing tag.
        $opening_tag = $matches[1];

        // Only the script contents are escaped; both tags pass through
        // unchanged so the lexer still sees a well-formed element.
        $escaped_contents = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');

        $closing_tag = $matches[3];

        return $opening_tag . $escaped_contents . $closing_tag;
    }
35
36
    /**
37
     * @param String $html
38
     * @param HTMLPurifier_Config $config
39
     * @param HTMLPurifier_Context $context
40
     * @return array|HTMLPurifier_Token[]
41
     */
42
    public function tokenizeHTML($html, $config, $context)
    {
        // Overview: walks $html with a byte cursor and emits Text, Comment,
        // Start, End and Empty tokens into $array. When line tracking is
        // enabled, every token is stamped with the line/column at which it
        // starts, and $current_line is kept equal to
        // 1 + (number of newlines before the cursor).

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            // NOTE(review): preg_replace_callback() returns null on a PCRE
            // failure (e.g. backtrack limit); the result is passed on
            // unchecked, as before.
            $html = preg_replace_callback(
                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'),
                $html
            );
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col = false;
            $length = false;
        }
        // presumably registered by reference so that the context observes
        // later updates to these two variables -- confirm against
        // HTMLPurifier_Context::register()
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol', $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        // Every branch of the loop body ends in either "continue" or "break".
        while (++$loops) {
            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {
                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int)$inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if ($synchronize_interval && // synchronization is on
                    $cursor > 0 && // cursor is further than zero
                    $loops % $synchronize_interval === 0) { // time to synchronize!
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new
                HTMLPurifier_Token_Text(
                    $this->parseText(
                        substr(
                            $html,
                            $cursor,
                            $position_next_lt - $cursor
                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) {
                    break;
                }
                // Create Text of rest of string
                $token = new
                HTMLPurifier_Token_Text(
                    $this->parseText(
                        substr(
                            $html,
                            $cursor
                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // "<>": there's nothing to process. No token is emitted
                    // (the previous code built a Text('<') token here but
                    // never stored it); we only step over the '>'.
                    // $inside_tag deliberately stays as it was.
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (substr($segment, 0, 3) === '!--') {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) {
                            $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        }
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    $token = new
                    HTMLPurifier_Token_Comment(
                        substr(
                            $segment,
                            3,
                            $strlen_segment - 3
                        )
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the three bytes of "-->" unless the comment ran
                    // to the end of the input
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alpha, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        // Only the '<' itself is consumed here and $cursor
                        // does not move: the segment is re-lexed as text on
                        // the next round, which counts its newlines then.
                        // Adding them here as well would count them twice.
                        $token->rawPosition($current_line, $current_col);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace in the segment: the whole thing is the
                    // tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string =
                    trim(
                        substr(
                            $segment,
                            $position_first_space
                        )
                    );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string,
                        $config,
                        $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) {
                    $e->send(E_WARNING, 'Lexer: Missing gt');
                }
                $token = new
                HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseText(
                        substr($html, $cursor), $config
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }
345
346
    /**
347
     * PHP 5.0.x compatible substr_count that implements offset and length
348
     * @param string $haystack
349
     * @param string $needle
350
     * @param int $offset
351
     * @param int $length
352
     * @return int
353
     */
354
    protected function substrCount($haystack, $needle, $offset, $length)
    {
        // The interpreter version is checked once and remembered for
        // subsequent calls.
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }

        if (!$oldVersion) {
            // substr_count() is given the offset and length directly
            return substr_count($haystack, $needle, $offset, $length);
        }

        // Before PHP 5.1: cut the window out by hand, then count inside it
        $window = substr($haystack, $offset, $length);
        return substr_count($window, $needle);
    }
367
368
    /**
369
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
370
     *
371
     * @param string $string Inside of tag excluding name.
372
     * @param HTMLPurifier_Config $config
373
     * @param HTMLPurifier_Context $context
374
     * @return array Assoc array of attributes.
375
     */
376
    public function parseAttributeString($string, $config, $context)
    {
        // Overview: turns the attribute part of a tag (everything after the
        // tag name) into an associative array of name => value. Attributes
        // without a value ("boolean" attributes) map to their own name.
        // Two fast paths handle a single attribute without any whitespace;
        // everything else goes through a cursor-based scanning loop.

        $string = (string)$string; // quick typecast

        if ($string == '') {
            return array();
        } // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no whitespace => one attribute
        $num_equal = substr_count($string, '=');
        // Any character of $this->_whitespace (space, tab, CR, LF) separates
        // attributes, not just ' '. strcspn() also copes with whitespace at
        // offset 0, which a truthiness test on strpos() would miss.
        $has_space = strcspn($string, $this->_whitespace) < strlen($string);
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                return array();
            }
            // strict comparison: an unquoted value of "0" is a real value
            // and must not be mistaken for "no value"
            if ($quoted_value === '') {
                return array($key => '');
            }
            // $quoted_value is non-empty here, so both offsets exist
            $first_char = $quoted_value[0];
            $last_char = $quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing end quote');
                    }
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            // substr() can return false instead of '' on older PHP versions
            if ($value === false) {
                $value = '';
            }
            return array($key => $this->parseAttr($value, $config));
        }

        // setup loop environment
        $array = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        $old_cursor = -1;
        while ($cursor < $size) {
            // safety net: every round must move the cursor forward
            if ($old_cursor >= $cursor) {
                throw new Exception("Infinite loop detected");
            }
            $old_cursor = $cursor;

            // skip leading whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // nothing but trailing whitespace was left; stop here rather
                // than report a missing key and scan past the end of the
                // string
                break;
            }

            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // the key was the last thing in the string: bool attribute
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = $string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                if ($cursor >= $size) {
                    // "key=" with nothing after it: empty value, stored the
                    // same way the single-attribute fast path stores it.
                    // (The former "$cursor === false" test could never be
                    // true after integer arithmetic.)
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = $string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    // false when the closing quote is missing
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) {
                    $value = '';
                }
                $array[$key] = $this->parseAttr($value, $config);
                // step over the closing quote / terminating whitespace
                $cursor++;
            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing attribute key');
                    }
                }
            }
        }
        return $array;
    }
537
}
538
539
// vim: et sw=4 sts=4
540