Completed
Pull Request — master (#161)
by Christophe
02:59
created

Tokenizer::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 6
ccs 5
cts 5
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 3
crap 1
1
<?php
2
3
namespace Masterminds\HTML5\Parser;
4
5
use Masterminds\HTML5\Elements;
6
7
/**
8
 * The HTML5 tokenizer.
9
 *
10
 * The tokenizer's role is reading data from the scanner and gathering it into
11
 * semantic units. From the tokenizer, data is emitted to an event handler,
12
 * which may (for example) create a DOM tree.
13
 *
14
 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We
15
 * follow that specification to the maximum extent that we can. If you find
16
 * a discrepancy that is not documented, please file a bug and/or submit a
17
 * patch.
18
 *
19
 * This tokenizer is implemented as a recursive descent parser.
20
 *
21
 * Within the API documentation, you may see references to the specific section
22
 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1.
23
 * This refers to section 8.2.4.1 of the HTML5 CR specification.
24
 *
25
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
26
 */
27
class Tokenizer
28
{
29
    /**
     * Input source the tokenizer reads characters from; assigned by the constructor.
     */
    protected $scanner;

    /**
     * Event handler that receives the emitted tokens; assigned by the constructor.
     */
    protected $events;

    /**
     * NOTE(review): not referenced by any code visible in this section -- confirm whether it is still used.
     */
    protected $tok;

    /**
     * Buffer for text.
     */
    protected $text = '';

    /**
     * Loop flag for parse(). When this goes to false, the parser stops.
     */
    protected $carryOn = true;

    /**
     * Current text mode; 0 is the normal mode, see setTextMode() for the others.
     */
    protected $textMode = 0;

    /**
     * Name of the tag whose end tag terminates RAW or RCDATA mode (null when unset).
     */
    protected $untilTag = null;

    const CONFORMANT_XML = 'xml';
    const CONFORMANT_HTML = 'html';

    /**
     * Conformance mode, one of the CONFORMANT_* constants. In any mode other
     * than CONFORMANT_XML, tag names are lowercased.
     */
    protected $mode = self::CONFORMANT_HTML;
49
50
    /**
     * Create a new tokenizer.
     *
     * Typically, parsing a document involves creating a new tokenizer, giving
     * it a scanner (input) and an event handler (output), and then calling
     * the Tokenizer::parse() method.
     *
     * @param Scanner      $scanner      A scanner initialized with an input stream.
     * @param EventHandler $eventHandler An event handler, initialized and ready to receive events.
     * @param string       $mode         Conformance mode: self::CONFORMANT_HTML (default) or
     *                                   self::CONFORMANT_XML.
     */
    public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
    {
        $this->mode = $mode;
        $this->events = $eventHandler;
        $this->scanner = $scanner;
    }
67
68
    /**
     * Begin parsing.
     *
     * Repeatedly consumes data from the scanner, emitting tokens into the
     * event handler, until the carry-on flag is cleared (which eof() does
     * once the end of the input is reached). Errors are reported to the
     * event handler and parsing continues.
     */
    public function parse()
    {
        // FIXME: Add infinite loop protection.
        while (true) {
            $this->consumeData();

            if (!$this->carryOn) {
                break;
            }
        }
    }
86
87
    /**
     * Set the text mode for the character data reader.
     *
     * HTML5 defines three different modes for reading text:
     * - Normal: Read until a tag is encountered.
     * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
     * special characters.
     * - Raw: Read until a special closing tag is encountered (viz. pre, script)
     *
     * Normally the mode is set from the value the event handler returns from
     * startTag(), but it can also be set manually with this method. Any bits
     * other than Elements::TEXT_RAW and Elements::TEXT_RCDATA are masked off.
     *
     * @param int    $textmode One of Elements::TEXT_*.
     * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not
     *                         use this indicator.
     */
    public function setTextMode($textmode, $untilTag = null)
    {
        $supportedModes = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

        $this->untilTag = $untilTag;
        $this->textMode = $supportedModes & $textmode;
    }
110
111
    /**
     * Consume a character and make a move.
     * HTML5 8.2.4.1.
     *
     * Handles, in order: a character reference at the current position, a
     * tag opener, and finally either end-of-input or character data in the
     * current text mode.
     *
     * @return bool The carry-on flag; false once eof() has been reached.
     */
    protected function consumeData()
    {
        $current = $this->scanner->current();

        // Character reference: decode it into the text buffer and re-read.
        if ('&' === $current) {
            $decoded = $this->decodeCharacterReference();
            $this->buffer($decoded);

            $current = $this->scanner->current();
        }

        // Tag opener: dispatch on the character that follows '<'.
        if ('<' === $current) {
            // Any buffered text data can go out now.
            $this->flushBuffer();

            $opener = $this->scanner->next();

            if ('!' === $opener) {
                $this->markupDeclaration($opener);
            } elseif ('/' === $opener) {
                $this->endTag();
            } elseif ('?' === $opener) {
                $this->processingInstruction();
            } elseif (ctype_alpha($opener)) {
                $this->tagName();
            } else {
                $this->parseError('Illegal tag opening');
                // TODO is this necessary ?
                $this->characterData();
            }

            $current = $this->scanner->current();
        }

        // End of document.
        if (false === $current) {
            $this->eof();

            return $this->carryOn;
        }

        // Character data, according to the active text mode.
        if (Elements::TEXT_RAW == $this->textMode) {
            $this->rawText($current);
        } elseif (Elements::TEXT_RCDATA == $this->textMode) {
            $this->rcdata($current);
        } elseif ('<' !== $current && '&' !== $current) {
            // NULL character
            if ("\00" === $current) {
                $this->parseError('Received null character.');
            }

            $this->text .= $current;
            $this->scanner->consume();
        }

        return $this->carryOn;
    }
180
181
    /**
     * Parse anything that looks like character data.
     *
     * Delegates to rawText() or rcdata() when one of those text modes is
     * active; otherwise buffers the current character through text(), unless
     * it starts markup ('<') or a character reference ('&').
     *
     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
     *
     * @return bool False at end of input or when the current character is
     *              '<' or '&' in normal mode; otherwise the delegate's result.
     */
    protected function characterData()
    {
        $current = $this->scanner->current();
        if (false === $current) {
            return false;
        }

        if (Elements::TEXT_RAW == $this->textMode) {
            return $this->rawText($current);
        }

        if (Elements::TEXT_RCDATA == $this->textMode) {
            return $this->rcdata($current);
        }

        $isText = !('<' === $current || '&' === $current);

        return $isText ? $this->text($current) : false;
    }
207
208
    /**
     * This buffers the current token as character data.
     *
     * A NUL character is reported as a parse error but is still buffered.
     *
     * @param string $tok The current token.
     *
     * @return bool True when the token was buffered and consumed, false when $tok is false.
     */
    protected function text($tok)
    {
        // A false token should never reach this point; bail out if it does.
        if (false !== $tok) {
            // NULL character
            if ("\00" === $tok) {
                $this->parseError('Received null character.');
            }

            $this->buffer($tok);
            $this->scanner->consume();

            return true;
        }

        return false;
    }
232
233
    /**
     * Read text in RAW mode.
     *
     * Without a terminating tag name, the token is handled as ordinary text.
     * Otherwise everything up to the matching end tag is emitted as a single
     * text event, the text mode is reset to normal, and the end tag is parsed.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rawText($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $raw = $this->readUntilSequence('</' . $this->untilTag . '>');
        $this->events->text($raw);
        $this->setTextMode(0);

        return $this->endTag();
    }
253
254
    /**
     * Read text in RCDATA mode.
     *
     * Collects characters (decoding character references along the way) until
     * the end tag for the terminating element is found or the input ends,
     * emits the collected text, resets the text mode and parses the end tag.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rcdata($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $closer = '</' . $this->untilTag;
        $caseSensitive = !Elements::isHtml5Element($this->untilTag);
        $collected = '';

        while (false !== $tok) {
            // Stop as soon as the scanner sits on the terminating end tag.
            if ('<' == $tok && $this->scanner->sequenceMatches($closer, $caseSensitive)) {
                break;
            }

            if ('&' == $tok) {
                $collected .= $this->decodeCharacterReference();
                $tok = $this->scanner->current();
            } else {
                $collected .= $tok;
                $tok = $this->scanner->next();
            }
        }

        // Look past the end-tag name (and trailing whitespace) to check for
        // the closing '>', then rewind so endTag() can parse it normally.
        $lookahead = strlen($closer);
        $this->scanner->consume($lookahead);
        $lookahead += $this->scanner->whitespace();
        if ('>' !== $this->scanner->current()) {
            $this->parseError('Unclosed RCDATA end tag');
        }
        $this->scanner->unconsume($lookahead);

        $this->events->text($collected);
        $this->setTextMode(0);

        return $this->endTag();
    }
293
294
    /**
     * Handle the end of the document.
     *
     * Flushes any buffered text, emits the EOF event to the event handler and
     * clears the carry-on flag so that parse() stops looping.
     */
    protected function eof()
    {
        $this->flushBuffer();
        $this->events->eof();

        $this->carryOn = false;
    }
304
305
    /**
     * Look for markup following "<!".
     *
     * Dispatches to comment(), doctype() or cdataSection() depending on the
     * next character(s); anything else is reported as a parse error and
     * consumed as a bogus comment.
     *
     * @param string|false $tok Ignored; immediately replaced by the scanner's next character.
     *
     * @return bool
     */
    protected function markupDeclaration($tok)
    {
        $tok = $this->scanner->next();

        // Comment: "<!--"
        if ('-' == $tok && '-' == $this->scanner->peek()) {
            $this->scanner->consume(2);

            return $this->comment();
        }

        // Doctype
        if ('D' == $tok || 'd' == $tok) {
            return $this->doctype();
        }

        // CDATA section
        if ('[' == $tok) {
            return $this->cdataSection();
        }

        // FINISH
        $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok);
        $this->bogusComment('<!');

        return true;
    }
329
330
    /**
     * Consume an end tag. See section 8.2.4.9.
     *
     * Expects the scanner to sit on the '/' of "</". The tag name is
     * lowercased unless the tokenizer is in XML mode.
     *
     * @return bool False when the scanner is not on '/', or when a NUL or
     *              end of input follows it; otherwise true (or the result of
     *              bogusComment() for a non-alphabetic tag name).
     */
    protected function endTag()
    {
        if ('/' != $this->scanner->current()) {
            return false;
        }

        // a-zA-Z -> tagname; '>', EOF and anything else -> parse error.
        $first = $this->scanner->next();
        if (!ctype_alpha($first)) {
            $this->parseError("Expected tag name, got '%s'", $first);

            return ("\0" == $first || false === $first) ? false : $this->bogusComment('</');
        }

        $name = $this->scanner->charsUntil("\n\f \t>");
        if (self::CONFORMANT_XML !== $this->mode) {
            $name = strtolower($name);
        }

        // Trash whitespace.
        $this->scanner->whitespace();

        $closer = $this->scanner->current();
        if ('>' != $closer) {
            $this->parseError("Expected >, got '%s'", $closer);
            // We just trash stuff until we get to the next tag close.
            $this->scanner->charsUntil('>');
        }

        $this->events->endTag($name);
        $this->scanner->consume();

        return true;
    }
370
371
    /**
     * Consume a tag name and body. See section 8.2.4.10.
     *
     * Reads the tag name and its attributes, emits a startTag event, and
     * switches the text mode when the event handler returns an integer.
     *
     * @return bool False when the current character is not alphabetic, true otherwise.
     */
    protected function tagName()
    {
        if (!ctype_alpha($this->scanner->current())) {
            return false;
        }

        // We know this is at least one char.
        $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');
        if (self::CONFORMANT_XML !== $this->mode) {
            $name = strtolower($name);
        }

        $selfClose = false;
        $attributes = array();

        // Handle attribute parse exceptions here so that we can
        // react by trying to build a sensible parse tree.
        try {
            while (true) {
                $this->scanner->whitespace();
                $this->attribute($attributes);

                if ($this->isTagEnd($selfClose)) {
                    break;
                }
            }
        } catch (ParseError $e) {
            $selfClose = false;
        }

        $mode = $this->events->startTag($name, $attributes, $selfClose);
        if (is_int($mode)) {
            $this->setTextMode($mode, $name);
        }

        $this->scanner->consume();

        return true;
    }
408
409
    /**
     * Check if the scanner has reached the end of a tag.
     *
     * @param bool $selfClose Set to true (by reference) when the tag ends with "/>".
     *
     * @return bool True on '>', on "/>" or at end of input (the latter with a
     *              parse error); false otherwise.
     */
    protected function isTagEnd(&$selfClose)
    {
        $tok = $this->scanner->current();

        if ('/' != $tok) {
            if ('>' == $tok) {
                return true;
            }

            if (false === $tok) {
                $this->parseError('Unexpected EOF inside of tag.');

                return true;
            }

            return false;
        }

        // A slash: skip it and any whitespace, then look at what follows.
        $this->scanner->consume();
        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        if ('>' == $tok) {
            $selfClose = true;

            return true;
        }

        if (false === $tok) {
            $this->parseError('Unexpected EOF inside of tag.');

            return true;
        }

        // Basically, we skip the / token and go on.
        // See 8.2.4.43.
        $this->parseError("Unexpected '%s' inside of a tag.", $tok);

        return false;
    }
448
449
    /**
     * Parse attributes from inside of a tag.
     *
     * Reads one attribute (name and optional value) at the current position
     * and stores it in $attributes unless its name contains characters that
     * DOMElement::setAttribute would reject.
     *
     * @param string[] $attributes Attribute map, extended by reference.
     *
     * @return bool False when the scanner is at '/', '>' or end of input; true otherwise.
     *
     * @throws ParseError When a '<' is found inside the attribute list.
     */
    protected function attribute(&$attributes)
    {
        $tok = $this->scanner->current();
        if ('/' == $tok || '>' == $tok || false === $tok) {
            return false;
        }

        if ('<' == $tok) {
            $this->parseError("Unexpected '<' inside of attributes list.");
            // Push the < back onto the stack.
            $this->scanner->unconsume();
            // Let the caller figure out how to handle this.
            throw new ParseError('Start tag inside of attribute.');
        }

        $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));

        if ('' === $name) {
            $tok = $this->scanner->current();
            $this->parseError('Expected an attribute name, got %s.', $tok);
            // Really, only '=' can be the char here. Everything else gets absorbed
            // under one rule or another.
            $name = $tok;
            $this->scanner->consume();
        }

        // Attribute names can contain most Unicode characters for HTML5, and
        // there is no limitation on the first character. But
        // "DOMElement::setAttribute" throws an exception because of its own
        // internal restrictions, so such names have to be filtered.
        // see issue #23: https://github.com/Masterminds/html5-php/issues/23
        // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
        $forbiddenAnywhere = "/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u";
        $forbiddenAtStart = '/^[0-9.-]/u';

        $isValidAttribute = true;
        if (preg_match($forbiddenAnywhere, $name)) {
            $this->parseError('Unexpected characters in attribute name: %s', $name);
            $isValidAttribute = false;
        } elseif (preg_match($forbiddenAtStart, $name)) {
            $this->parseError('Unexpected character at the begining of attribute name: %s', $name);
            $isValidAttribute = false;
        }

        // 8.1.2.3
        $this->scanner->whitespace();

        // The value is always consumed, even when the attribute is dropped.
        $val = $this->attributeValue();
        if ($isValidAttribute) {
            $attributes[$name] = $val;
        }

        return true;
    }
512
513
    /**
     * Consume an attribute value. See section 8.2.4.37 and after.
     *
     * @return string|null The attribute value, or null when there is no '=',
     *                     when whitespace follows the '=' handling, or when
     *                     the tag ends right after '='.
     */
    protected function attributeValue()
    {
        if ('=' != $this->scanner->current()) {
            return null;
        }
        $this->scanner->consume();
        // 8.1.2.3
        $this->scanner->whitespace();

        $tok = $this->scanner->current();

        // Whitespace here indicates an empty value.
        if ("\n" == $tok || "\f" == $tok || ' ' == $tok || "\t" == $tok) {
            return null;
        }

        if ('"' == $tok || "'" == $tok) {
            $this->scanner->consume();

            // NOTE(review): a static-analysis annotation attached to this call
            // warned that $tok may be false here. Inside this branch $tok has
            // just compared equal to a quote character, so a false value looks
            // unreachable -- confirm before adding an extra guard.
            return $this->quotedAttributeValue($tok);
        }

        if ('>' == $tok) {
            // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr.
            $this->parseError('Expected attribute value, got tag end.');

            return null;
        }

        if ('=' == $tok || '`' == $tok) {
            $this->parseError('Expecting quotes, got %s.', $tok);
        }

        return $this->unquotedAttributeValue();
    }
554
555
    /**
     * Get an attribute value string.
     *
     * Reads up to the closing quote, decoding character references on the
     * way, and then consumes one more character (the terminator).
     *
     * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered
     *                      termination of an attribute's value. E.g. "\"'" will stop at either
     *                      ' or ".
     *
     * @return string The attribute value.
     */
    protected function quotedAttributeValue($quote)
    {
        $terminators = "\f" . $quote . '&';
        $value = '';

        do {
            $chunk = $this->scanner->charsUntil($terminators);
            if (false === $chunk) {
                break;
            }
            $value .= $chunk;

            // An ampersand starts a character reference; decode it and keep reading.
            $atReference = '&' == $this->scanner->current();
            if ($atReference) {
                $value .= $this->decodeCharacterReference(true);
            }
        } while ($atReference);

        $this->scanner->consume();

        return $value;
    }
588
589 1
    /**
     * Read an attribute value that is not enclosed in quotes.
     *
     * Reading stops at whitespace, '>' or end of input. Character references
     * are decoded; quotes, '<', '=' and '`' are reported as parse errors but
     * kept in the value.
     *
     * @return string The attribute value (possibly empty).
     */
    protected function unquotedAttributeValue()
    {
        $terminators = array("\n", "\f", ' ', "\t", '>');
        $suspicious = array("'", '"', '<', '=', '`');

        $value = '';
        $tok = $this->scanner->current();

        while (false !== $tok) {
            if (in_array($tok, $terminators)) {
                break;
            }

            if ('&' == $tok) {
                $value .= $this->decodeCharacterReference(true);
                $tok = $this->scanner->current();
            } elseif (in_array($tok, $suspicious)) {
                $this->parseError('Unexpected chars in unquoted attribute value %s', $tok);
                $value .= $tok;
                $tok = $this->scanner->next();
            } else {
                $value .= $this->scanner->charsUntil("\t\n\f >&\"'<=`");
                $tok = $this->scanner->current();
            }
        }

        return $value;
    }
627
628
    /**
     * Consume malformed markup as if it were a comment.
     * 8.2.4.44.
     *
     * The spec requires that the ENTIRE tag-like thing be enclosed inside of
     * the comment. So this will generate comments like:
     *
     * &lt;!--&lt/+foo&gt;--&gt;
     *
     * @param string $leading Prepend any leading characters. This essentially
     *                        negates the need to backtrack, but it's sort of a hack.
     *
     * @return bool Always true.
     */
    protected function bogusComment($leading = '')
    {
        $body = $leading;

        // Everything up to the next '>' ...
        $upToClose = $this->scanner->charsUntil('>');
        if (false !== $upToClose) {
            $body .= $upToClose;
        }

        // ... plus the character the scanner stopped on, unless input ended.
        $stoppedOn = $this->scanner->current();
        if (false !== $stoppedOn) {
            $body .= $stoppedOn;
        }

        $this->flushBuffer();
        $this->events->comment($body);
        $this->scanner->consume();

        return true;
    }
660
661
    /**
     * Read a comment.
     * Expects the first tok to be inside of the comment.
     *
     * @return bool Always true.
     */
    protected function comment()
    {
        $comment = '';
        $tok = $this->scanner->current();

        // <!-->. Emit an empty comment because 8.2.4.46 says to.
        if ('>' == $tok) {
            // Parse error. Emit the comment token.
            $this->parseError("Expected comment data, got '>'");
            $this->events->comment('');
            $this->scanner->consume();

            return true;
        }

        // Replace NULL with the replacement char.
        if ("\0" == $tok) {
            $tok = UTF8Utils::FFFD;
        }

        // Accumulate characters until isCommentEnd() reports the terminator
        // (or end of input).
        for (; !$this->isCommentEnd(); $tok = $this->scanner->next()) {
            $comment .= $tok;
        }

        $this->events->comment($comment);
        $this->scanner->consume();

        return true;
    }
696
697
    /**
     * Check if the scanner has reached the end of a comment.
     *
     * End of input also counts as the end (with a parse error). When the
     * terminator is found, the scanner is advanced by one more character.
     *
     * @return bool
     */
    protected function isCommentEnd()
    {
        $tok = $this->scanner->current();

        // EOF: hit the end.
        if (false === $tok) {
            $this->parseError('Unexpected EOF in a comment.');

            return true;
        }

        if ('-' == $tok) {
            // Advance one, and test for '->'
            if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) {
                $this->scanner->consume(); // Consume the last '>'

                return true;
            }

            // Unread '-';
            $this->scanner->unconsume(1);
        }

        // If it doesn't start with -, or isn't followed by '->', not the end.
        return false;
    }
729
730
    /**
     * Parse a DOCTYPE.
     *
     * Parse a DOCTYPE declaration. This method has strong bearing on whether or
     * not Quirksmode is enabled on the event handler.
     *
     * Every exit path emits exactly one doctype event, except a keyword that
     * is not "DOCTYPE", which is consumed as a bogus comment instead.
     *
     * @todo This method is a little long. Should probably refactor.
     *
     * @return bool
     */
    protected function doctype()
    {
        // Check that string is DOCTYPE.
        if ($this->scanner->sequenceMatches('DOCTYPE', false)) {
            $this->scanner->consume(7);
        } else {
            $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
            $this->parseError('Expected DOCTYPE, got %s', $chars);

            return $this->bogusComment('<!' . $chars);
        }

        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        // EOF: die.
        if (false === $tok) {
            $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
            $this->eof();

            return true;
        }

        // NULL char: convert.
        if ("\0" === $tok) {
            $this->parseError('Unexpected null character in DOCTYPE.');
        }

        // NOTE(review): the stop list has no "\t", unlike the one used for
        // end-tag names -- confirm whether a tab should also end the name.
        $stop = " \n\f>";
        // The cast guards against the scanner reporting "nothing read" as
        // false (flagged by static analysis); false becomes ''.
        $doctypeName = (string) $this->scanner->charsUntil($stop);
        // Lowercase ASCII, replace \0 with FFFD.
        // str_replace() is used instead of three-argument strtr(): strtr()
        // maps byte-for-byte and ignores the extra bytes of the longer
        // argument, so a multi-byte replacement would be truncated to its
        // first byte (assumes UTF8Utils::FFFD is the UTF-8 encoding of U+FFFD).
        $doctypeName = strtolower(str_replace("\0", UTF8Utils::FFFD, $doctypeName));

        $tok = $this->scanner->current();

        // If false, emit a parse error, DOCTYPE, and return.
        if (false === $tok) {
            $this->parseError('Unexpected EOF in DOCTYPE declaration.');
            $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);

            return true;
        }

        // Short DOCTYPE, like <!DOCTYPE html>
        if ('>' == $tok) {
            // DOCTYPE without a name.
            if (0 == strlen($doctypeName)) {
                $this->parseError('Expected a DOCTYPE name. Got nothing.');
                $this->events->doctype($doctypeName, 0, null, true);
                $this->scanner->consume();

                return true;
            }
            $this->events->doctype($doctypeName);
            $this->scanner->consume();

            return true;
        }
        $this->scanner->whitespace();

        $pub = strtoupper($this->scanner->getAsciiAlpha());
        $white = $this->scanner->whitespace();

        // Get ID, and flag it as pub or system.
        if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) {
            // Get the sys ID.
            $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
            $id = $this->quotedString("\0>");
            if (false === $id) {
                // No quoted identifier followed the keyword.
                $this->events->doctype($doctypeName, $type, $pub, false);

                return true;
            }

            // Premature EOF.
            if (false === $this->scanner->current()) {
                $this->parseError('Unexpected EOF in DOCTYPE');
                $this->events->doctype($doctypeName, $type, $id, true);

                return true;
            }

            // Well-formed complete DOCTYPE.
            $this->scanner->whitespace();
            if ('>' == $this->scanner->current()) {
                $this->events->doctype($doctypeName, $type, $id, false);
                $this->scanner->consume();

                return true;
            }

            // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
            // Throw away the junk, parse error, quirks mode, return true.
            $this->scanner->charsUntil('>');
            $this->parseError('Malformed DOCTYPE.');
            $this->events->doctype($doctypeName, $type, $id, true);
            $this->scanner->consume();

            return true;
        }

        // Else it's a bogus DOCTYPE.
        // Consume to > and trash.
        $this->scanner->charsUntil('>');

        $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub);
        $this->events->doctype($doctypeName, 0, null, true);
        $this->scanner->consume();

        return true;
    }
851
852
    /**
     * Utility for reading a quoted string.
     *
     * A missing closing quote is reported as a parse error, but the text read
     * so far is still returned.
     *
     * @param string $stopchars Characters (in addition to a close-quote) that should stop the string.
     *                          E.g. sometimes '>' is higher precedence than '"' or "'".
     *
     * @return mixed String if one is found (quotations omitted); false when the
     *               current character is not a quote.
     */
    protected function quotedString($stopchars)
    {
        $quote = $this->scanner->current();
        if ('"' != $quote && "'" != $quote) {
            return false;
        }

        $this->scanner->consume();
        $contents = $this->scanner->charsUntil($quote . $stopchars);

        if ($quote == $this->scanner->current()) {
            $this->scanner->consume();

            return $contents;
        }

        // Parse error because no close quote.
        $this->parseError('Expected %s, got %s', $quote, $this->scanner->current());

        return $contents;
    }
878
879
    /**
     * Handle a CDATA section.
     *
     * Expects the scanner to sit on the '[' of "<![". A malformed opener or an
     * unterminated section is reported as a parse error and emitted as a
     * bogus comment; otherwise a cdata event with the section's content is
     * emitted.
     *
     * @return bool
     */
    protected function cdataSection()
    {
        $cdata = '';
        $this->scanner->consume();

        $chars = $this->scanner->charsWhile('CDAT');
        if ('CDATA' != $chars || '[' != $this->scanner->current()) {
            $this->parseError('Expected [CDATA[, got %s', $chars);

            return $this->bogusComment('<![' . $chars);
        }

        $tok = $this->scanner->next();

        // Test for the terminator BEFORE appending each character. Testing
        // only after the first append made an empty section (<![CDATA[]]>)
        // miss its own "]]>" and swallow whatever followed it.
        while (!$this->scanner->sequenceMatches(']]>')) {
            if (false === $tok) {
                $this->parseError('Unexpected EOF inside CDATA.');
                $this->bogusComment('<![CDATA[' . $cdata);

                return true;
            }
            $cdata .= $tok;
            $tok = $this->scanner->next();
        }

        // Consume ]]>
        $this->scanner->consume(3);

        $this->events->cdata($cdata);

        return true;
    }
915
916
    // ================================================================
917
    // Non-HTML5
918
    // ================================================================
919
920
    /**
     * Handle a processing instruction.
     *
     * XML processing instructions are supposed to be ignored in HTML5,
     * treated as "bogus comments". However, since we're not a user
     * agent, we allow them. We consume until ?> and then issue a
     * EventListener::processingInstruction() event.
     *
     * @return bool False when the scanner is not on '?'; true otherwise.
     */
    protected function processingInstruction()
    {
        if ('?' != $this->scanner->current()) {
            return false;
        }

        $tok = $this->scanner->next();
        $procName = $this->scanner->getAsciiAlpha();
        $white = $this->scanner->whitespace();

        // If not a PI, send to bogusComment.
        // The end-of-input test must be strict: with a loose comparison the
        // character '0' also equals false, so "<?foo 0 ?>" was wrongly
        // rejected. The strlen() test sends an empty name down this path, so
        // the event below should never receive one (static analysis flagged
        // $procName as possibly false -- presumably strlen() treats that as
        // empty as well; confirm).
        if (0 == strlen($procName) || 0 == $white || false === $this->scanner->current()) {
            $this->parseError("Expected processing instruction name, got $tok");
            $this->bogusComment('<?' . $tok . $procName);

            return true;
        }

        $data = '';
        // As long as it's not the case that the next two chars are ? and >.
        while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) {
            $data .= $this->scanner->current();

            $tok = $this->scanner->next();
            if (false === $tok) {
                // Unterminated instruction: emit what was collected so far.
                $this->parseError('Unexpected EOF in processing instruction.');
                $this->events->processingInstruction($procName, $data);

                return true;
            }
        }

        $this->scanner->consume(2); // Consume the closing tag
        $this->events->processingInstruction($procName, $data);

        return true;
    }
967
968
    // ================================================================
969
    // UTILITY FUNCTIONS
970
    // ================================================================
971
972
    /**
     * Read from the input stream until we get to the desired sequence
     * or hit the end of the input stream.
     *
     * The sequence is matched case-insensitively and is not appended to the
     * returned text. Reaching the end of input first emits a parse error and
     * returns whatever was read up to that point.
     *
     * @param string $sequence The sequence that terminates the read.
     *
     * @return string The text collected before the sequence (or before EOF).
     */
    protected function readUntilSequence($sequence)
    {
        $collected = '';

        // Optimization for reading larger blocks faster: skip ahead in bulk to
        // the next occurrence of the sequence's first character.
        $lead = substr($sequence, 0, 1);

        while (true) {
            if (false === $this->scanner->current()) {
                // If we get here, we hit the EOF.
                $this->parseError('Unexpected EOF during text read.');

                return $collected;
            }

            $collected .= $this->scanner->charsUntil($lead);

            // Stop as soon as we hit the stopping condition.
            if ($this->scanner->sequenceMatches($sequence, false)) {
                return $collected;
            }

            // False alarm: keep the current character and move past it.
            $collected .= $this->scanner->current();
            $this->scanner->consume();
        }
    }
1002
1003
    /**
     * Check if upcoming chars match the given sequence.
     *
     * This will read the stream for the $sequence. If it's
     * found, this will return true. If not, return false.
     * Since this unconsumes any chars it reads, the caller
     * will still need to read the next sequence, even if
     * this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * @deprecated since version 2.4, to be removed in 3.0. Use Scanner::sequenceMatches() instead.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    protected function sequenceMatches($sequence, $caseSensitive = true)
    {
        $notice = sprintf(
            '%s method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.',
            __METHOD__
        );
        @trigger_error($notice, E_USER_DEPRECATED);

        // Pure delegation: the scanner owns the actual matching logic.
        $matched = $this->scanner->sequenceMatches($sequence, $caseSensitive);

        return $matched;
    }
1027
1028
    /**
     * Send a TEXT event with the contents of the text buffer.
     *
     * When the temporary text buffer is non-empty, its contents are passed to
     * the event handler's text() method and the buffer is then reset. (The
     * buffer is used to group as much PCDATA as we can instead of emitting
     * lots and lots of TEXT events.) An empty buffer produces no event.
     */
    protected function flushBuffer()
    {
        if ('' !== $this->text) {
            // Emit first, then clear, so the handler sees the full buffer.
            $this->events->text($this->text);
            $this->text = '';
        }
    }
1043
1044
    /**
1045
     * Add text to the temporary buffer.
1046
     *
1047
     * @see flushBuffer()
1048
     *
1049
     * @param string $str
1050
     */
1051 9
    protected function buffer($str)
1052
    {
1053 9
        $this->text .= $str;
1054 9
    }
1055
1056
    /**
     * Emit a parse error.
     *
     * Any arguments passed after $msg are substituted into it with
     * vsprintf(); with no extra arguments, $msg is reported verbatim. The
     * error is reported to the event handler together with the scanner's
     * current line and column offset.
     *
     * A parse error always returns false because it never consumes any
     * characters.
     *
     * @param string $msg The message, optionally containing vsprintf() placeholders.
     *
     * @return bool Always false.
     */
    protected function parseError($msg)
    {
        // Everything after the first argument is formatting input for $msg.
        $params = array_slice(func_get_args(), 1);
        if (count($params) > 0) {
            $msg = vsprintf($msg, $params);
        }

        $this->events->parseError(
            $msg,
            $this->scanner->currentLine(),
            $this->scanner->columnOffset()
        );

        return false;
    }
1081
1082
    /**
     * Decode a character reference and return the string.
     *
     * The first thing this does is advance the scanner past the '&' that
     * introduced the reference. Numeric (decimal and hexadecimal) and named
     * references are handled. When the text turns out not to be a usable
     * reference, a bare '&' is returned.
     *
     * If $inAttribute is set to true, a bare & will be returned as-is.
     *
     * @param bool $inAttribute Set to true if the text is inside of an attribute value.
     *                          false otherwise.
     *
     * @return string The decoded character(s), or '&' when no reference was decoded.
     */
    protected function decodeCharacterReference($inAttribute = false)
    {
        // Next char after &.
        $tok = $this->scanner->next();
        // Remember where the reference body starts so failed matches can rewind to it.
        $start = $this->scanner->position();

        if (false === $tok) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if ("\t" === $tok || "\n" === $tok || "\f" === $tok || ' ' === $tok || '&' === $tok || '<' === $tok) {
            return '&';
        }

        // Numeric entity
        if ('#' === $tok) {
            $tok = $this->scanner->next();

            // Hexadecimal encoding.
            // X[0-9a-fA-F]+;
            // x[0-9a-fA-F]+;
            if ('x' === $tok || 'X' === $tok) {
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();
                // empty() alone also rejects the valid digit string '0', which
                // made "&#x0;" fail (and rewind too little) while "&#x00;" and
                // the decimal "&#0;" were accepted. Only a missing value counts
                // as "no hex digits".
                if (empty($hex) && '0' !== $hex) {
                    $this->parseError('Expected &#xHEX;, got &#x%s', $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } else {
                // Decimal encoding.
                // [0-9]+;
                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                if (false === $numeric) {
                    $this->parseError('Expected &#DIGITS;, got &#%s', $tok);
                    // NOTE(review): this rewinds a fixed 2 characters, whereas the
                    // named-entity path below rewinds to $start - confirm both
                    // leave the scanner where the caller expects.
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ('=' === $tok && $inAttribute) {
            return '&';
        } else { // String entity.
            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if (null === $entity) {
                if (!$inAttribute || '' === $cname) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                // Give back everything read after the '&'.
                $this->scanner->unconsume($this->scanner->position() - $start);

                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if (';' === $tok) {
            $this->scanner->consume();

            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);

            return '&';
        }

        $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok);

        return '&' . $entity;
    }
1188
}
1189