Completed
Pull Request — master (#155)
by Christophe
01:42
created

Tokenizer::parse()   A

Complexity

Conditions 2
Paths 1

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 7
ccs 4
cts 4
cp 1
rs 10
c 0
b 0
f 0
cc 2
nc 1
nop 0
crap 2
1
<?php
2
namespace Masterminds\HTML5\Parser;
3
4
use Masterminds\HTML5\Elements;
5
6
/**
7
 * The HTML5 tokenizer.
8
 *
9
 * The tokenizer's role is reading data from the scanner and gathering it into
10
 * semantic units. From the tokenizer, data is emitted to an event handler,
11
 * which may (for example) create a DOM tree.
12
 *
13
 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We
14
 * follow that specification to the maximum extent that we can. If you find
15
 * a discrepancy that is not documented, please file a bug and/or submit a
16
 * patch.
17
 *
18
 * This tokenizer is implemented as a recursive descent parser.
19
 *
20
 * Within the API documentation, you may see references to the specific section
21
 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1.
22
 * This refers to section 8.2.4.1 of the HTML5 CR specification.
23
 *
24
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
25
 */
26
class Tokenizer
27
{
28
29
    const CONFORMANT_XML = 'xml';
    const CONFORMANT_HTML = 'html';

    const WHITE = "\t\n\f ";

    /**
     * Input source; assigned from the constructor's $scanner argument.
     */
    protected $scanner;

    /**
     * Event handler that receives the tokens and parse errors emitted here;
     * assigned from the constructor's $eventHandler argument.
     */
    protected $events;

    /**
     * NOTE(review): not referenced by the methods visible in this part of
     * the file; confirm against the rest of the class before relying on it.
     */
    protected $tok;

    /**
     * Buffer for text.
     */
    protected $text = '';

    // When this goes to false, the parser stops.
    protected $carryOn = true;

    protected $textMode = 0; // TEXTMODE_NORMAL;
    protected $untilTag = null;

    // One of the CONFORMANT_* constants; overwritten by the constructor.
    protected $mode = self::CONFORMANT_HTML;
51
52
    /**
     * Create a new tokenizer.
     *
     * Typically, parsing a document involves creating a new tokenizer, giving
     * it a scanner (input) and an event handler (output), and then calling
     * the Tokenizer::parse() method.
     *
     * @param \Masterminds\HTML5\Parser\Scanner $scanner
     *            A scanner initialized with an input stream.
     * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler
     *            An event handler, initialized and ready to receive
     *            events.
     * @param string $mode
     *            self::CONFORMANT_HTML (the default) or self::CONFORMANT_XML.
     *            In XML mode tag names are emitted with their original case
     *            instead of being lowercased.
     */
    public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
    {
        $this->mode = $mode;
        $this->events = $eventHandler;
        $this->scanner = $scanner;
    }
72
73
    /**
     * Begin parsing.
     *
     * Scans the document and tokenizes it as it goes, emitting tokens into
     * the event handler.
     *
     * consumeData() is invoked at least once and then again for as long as
     * $carryOn stays true. Errors are emitted into the event handler, but
     * parsing continues until the entire input stream has been read.
     */
    public function parse()
    {
        // FIXME: Add infinite loop protection.
        $this->consumeData();
        while ($this->carryOn) {
            $this->consumeData();
        }
    }
91
92
    /**
     * Set the text mode for the character data reader.
     *
     * HTML5 defines three different modes for reading text:
     * - Normal: Read until a tag is encountered.
     * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
     *   special characters.
     * - Raw: Read until a special closing tag is encountered (viz. pre, script)
     *
     * Normally the mode is set from the value the event handler returns from
     * startTag(), but it can also be set manually through this method.
     *
     * @param int $textmode
     *            One of Elements::TEXT_*. Any bits other than TEXT_RAW and
     *            TEXT_RCDATA are masked off before the value is stored.
     * @param string $untilTag
     *            The tag that should stop RAW or RCDATA mode. Normal mode does not
     *            use this indicator.
     */
    public function setTextMode($textmode, $untilTag = null)
    {
        $supportedModes = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

        $this->untilTag = $untilTag;
        $this->textMode = $textmode & $supportedModes;
    }
117
118
    /**
     * Consume a character and make a move.
     * HTML5 8.2.4.1
     *
     * One call handles, in order: a leading character reference, a tag-like
     * construct starting with '<', end of input, and finally one step of
     * character data according to the current text mode.
     *
     * @return bool The current value of $carryOn (false once EOF was emitted).
     */
    protected function consumeData()
    {
        $current = $this->scanner->current();

        // Character reference: decode it straight into the text buffer.
        if ($current === '&') {
            $this->buffer($this->decodeCharacterReference());
            $current = $this->scanner->current();
        }

        // Parse tag
        if ($current === '<') {
            // Any buffered text data can go out now.
            $this->flushBuffer();

            $afterBracket = $this->scanner->next();

            // The first handler that returns true wins. parseError() always
            // returns false, so characterData() is the final fallback.
            $this->markupDeclaration($afterBracket)
                || $this->endTag()
                || $this->processingInstruction()
                || $this->tagName()
                || $this->parseError("Illegal tag opening")
                || $this->characterData();

            $current = $this->scanner->current();
        }

        // Handle end of document
        $this->eof($current);

        if ($current === false) {
            return $this->carryOn;
        }

        // Parse character
        if ($this->textMode == Elements::TEXT_RAW) {
            $this->rawText($current);
        } elseif ($this->textMode == Elements::TEXT_RCDATA) {
            $this->rcdata($current);
        } elseif (!strspn($current, "<&")) {
            // NULL character
            if ($current === "\00") {
                $this->parseError("Received null character.");
            }

            $this->text .= $current;
            $this->scanner->next();
        }

        return $this->carryOn;
    }
181
182
    /**
     * Parse anything that looks like character data.
     *
     * Dispatches on the current text mode: raw text, RCDATA, or (by default)
     * ordinary text. In the default mode '<' and '&' are not treated as text.
     *
     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
     *
     * @return bool False at end of input or when the current character is
     *              '<' or '&' in normal mode; otherwise the result of the
     *              mode-specific reader.
     */
    protected function characterData()
    {
        $current = $this->scanner->current();
        if ($current === false) {
            return false;
        }

        if ($this->textMode == Elements::TEXT_RAW) {
            return $this->rawText($current);
        }

        if ($this->textMode == Elements::TEXT_RCDATA) {
            return $this->rcdata($current);
        }

        return strspn($current, "<&") ? false : $this->text($current);
    }
207
208
    /**
     * This buffers the current token as character data.
     *
     * A NUL character is reported as a parse error but is still buffered.
     * The scanner is advanced by one position after buffering.
     *
     * @param string $tok The current token.
     *
     * @return bool False when $tok is false (end of input), true otherwise.
     */
    protected function text($tok)
    {
        // A false token should never reach this method, but stay defensive.
        if ($tok !== false) {
            // NULL character
            if ($tok === "\00") {
                $this->parseError("Received null character.");
            }

            $this->buffer($tok);
            $this->scanner->next();

            return true;
        }

        return false;
    }
232
233
    /**
     * Read text in RAW mode.
     *
     * Without a configured stop tag this falls back to ordinary text
     * handling. Otherwise everything up to "</untilTag>" is read, emitted as
     * one text event, the text mode is reset to normal, and the end tag is
     * handed to endTag().
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rawText($tok)
    {
        if ($this->untilTag === null) {
            return $this->text($tok);
        }

        $rawContent = $this->readUntilSequence('</' . $this->untilTag . '>');
        $this->events->text($rawContent);
        $this->setTextMode(0);

        return $this->endTag();
    }
253
254
    /**
     * Read text in RCDATA mode.
     *
     * Collects characters (decoding character references along the way) until
     * the input ends or "</untilTag" is found, emits the collected text,
     * resets the text mode to normal and lets endTag() consume the end tag.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rcdata($tok)
    {
        if ($this->untilTag === null) {
            return $this->text($tok);
        }

        $closer = '</' . $this->untilTag;
        $caseSensitive = !Elements::isHtml5Element($this->untilTag);

        $collected = '';
        while ($tok !== false) {
            if ($tok == '<' && $this->scanner->sequenceMatches($closer, $caseSensitive)) {
                break;
            }

            if ($tok == '&') {
                $collected .= $this->decodeCharacterReference();
                $tok = $this->scanner->current();
            } else {
                $collected .= $tok;
                $tok = $this->scanner->next();
            }
        }

        // Look past "</tag" and any whitespace to check for the closing '>',
        // then rewind by the same amount so endTag() sees the whole end tag.
        $lookahead = strlen($closer);
        $this->scanner->consume($lookahead);
        $lookahead += strlen($this->scanner->whitespace());
        if ($this->scanner->current() !== '>') {
            $this->parseError("Unclosed RCDATA end tag");
        }
        $this->scanner->unconsume($lookahead);

        $this->events->text($collected);
        $this->setTextMode(0);

        return $this->endTag();
    }
293
294
    /**
     * If the document is read, emit an EOF event.
     *
     * On EOF the text buffer is flushed, the handler's eof() is called and
     * $carryOn is set to false, which stops parse().
     *
     * @param string|bool $tok The current token; false signals end of input.
     *
     * @return bool True when EOF was handled, false otherwise.
     */
    protected function eof($tok)
    {
        if ($tok !== false) {
            return false;
        }

        $this->flushBuffer();
        $this->events->eof();
        $this->carryOn = false;

        return true;
    }
310
311
    /**
     * Look for markup.
     *
     * Handles constructs that start with "<!": comments, DOCTYPE
     * declarations and CDATA sections. Anything else after "<!" is reported
     * as a parse error and consumed as a bogus comment.
     *
     * @param string|bool $tok The character following '<'.
     *
     * @return bool False when $tok is not '!'; otherwise the result of the
     *              specific handler (true for the bogus-comment fallback).
     */
    protected function markupDeclaration($tok)
    {
        if ($tok != '!') {
            return false;
        }

        $next = $this->scanner->next();

        // Comment: "<!--"
        if ($next == '-' && $this->scanner->peek() == '-') {
            $this->scanner->next(); // Consume the other '-'
            $this->scanner->next(); // Next char.

            return $this->comment();
        }

        // Doctype
        if ($next == 'D' || $next == 'd') {
            return $this->doctype();
        }

        // CDATA section
        if ($next == '[') {
            return $this->cdataSection();
        }

        // Nothing recognisable followed "<!".
        $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $next);
        $this->bogusComment('<!');

        return true;
    }
342
343
    /**
     * Consume an end tag.
     * 8.2.4.9
     *
     * Expects the scanner to be positioned on the '/' of "</". The tag name
     * is lowercased unless the tokenizer runs in CONFORMANT_XML mode.
     *
     * @return bool False when the current character is not '/' or when the
     *              name position holds NUL/EOF; true once an end tag (or a
     *              bogus comment) has been emitted.
     */
    protected function endTag()
    {
        if ($this->scanner->current() != '/') {
            return false;
        }
        $tok = $this->scanner->next();

        // a-zA-Z -> tagname
        // > -> parse error
        // EOF -> parse error
        // -> parse error
        //
        // EOF arrives as boolean false. Test for it explicitly: passing a
        // non-string to ctype_alpha() is deprecated as of PHP 8.1.
        if ($tok === false || !ctype_alpha($tok)) {
            $this->parseError("Expected tag name, got '%s'", $tok);
            if ($tok == "\0" || $tok === false) {
                return false;
            }

            return $this->bogusComment('</');
        }

        $name = $this->scanner->charsUntil("\n\f \t>");
        $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name);
        // Trash whitespace.
        $this->scanner->whitespace();

        $tok = $this->scanner->current();
        if ($tok != '>') {
            $this->parseError("Expected >, got '%s'", $tok);
            // We just trash stuff until we get to the next tag close.
            $this->scanner->charsUntil('>');
        }

        $this->events->endTag($name);
        $this->scanner->next();

        return true;
    }
382
383
    /**
     * Consume a tag name and body.
     * 8.2.4.10
     *
     * Reads the tag name and its attributes, emits a startTag event, and, if
     * the handler returns an integer, switches the text mode accordingly
     * with this tag as the stop tag.
     *
     * @return bool False when the current character cannot start a tag name
     *              (including end of input); true once the start tag has
     *              been emitted.
     */
    protected function tagName()
    {
        $tok = $this->scanner->current();

        // End of input arrives as boolean false. Test for it explicitly:
        // passing a non-string to ctype_alpha() is deprecated as of PHP 8.1.
        if ($tok === false || !ctype_alpha($tok)) {
            return false;
        }

        // We know this is at least one char.
        $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
        $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name);
        $attributes = array();
        $selfClose = false;

        // Handle attribute parse exceptions here so that we can
        // react by trying to build a sensible parse tree.
        try {
            do {
                $this->scanner->whitespace();
                $this->attribute($attributes);
            } while (!$this->isTagEnd($selfClose));
        } catch (ParseError $e) {
            // attribute() found a '<' inside the tag: emit what was
            // collected so far as a regular (not self-closing) start tag.
            $selfClose = false;
        }

        $mode = $this->events->startTag($name, $attributes, $selfClose);

        // An integer return value from the handler requests a text mode.
        if (is_int($mode)) {
            $this->setTextMode($mode, $name);
        }

        $this->scanner->next();

        return true;
    }
421
422
    /**
     * Check if the scanner has reached the end of a tag.
     *
     * A '/' is skipped together with any whitespace after it. If the next
     * character is '>', the tag is flagged as self-closing. End of input
     * also ends the tag, after a parse error is reported.
     *
     * @param bool $selfClose Set to true (by reference) when "/>" is found;
     *                        left untouched otherwise.
     *
     * @return bool True when the tag ends at the current position.
     */
    protected function isTagEnd(&$selfClose)
    {
        $tok = $this->scanner->current();

        $afterSlash = ($tok == '/');
        if ($afterSlash) {
            $this->scanner->next();
            $this->scanner->whitespace();
            $tok = $this->scanner->current();
        }

        if ($tok == '>') {
            if ($afterSlash) {
                $selfClose = true;
            }

            return true;
        }

        if ($tok === false) {
            $this->parseError("Unexpected EOF inside of tag.");

            return true;
        }

        if ($afterSlash) {
            // Basically, we skip the / token and go on.
            // See 8.2.4.43.
            $this->parseError("Unexpected '%s' inside of a tag.", $tok);
        }

        return false;
    }
457
458
    /**
     * Parse attributes from inside of a tag.
     *
     * Reads one attribute (name and optional value) at the current scanner
     * position. Names are lowercased. Attributes whose names would be
     * rejected by DOMElement::setAttribute are reported as parse errors and
     * left out of $attributes, but their value is still consumed.
     *
     * @param string[] $attributes Map of attribute name to value, extended
     *                             by reference.
     *
     * @return bool False when positioned on '/', '>' or end of input (no
     *              attribute to read); true after an attribute was consumed.
     *
     * @throws ParseError When a '<' is found where an attribute should start.
     */
    protected function attribute(&$attributes)
    {
        $first = $this->scanner->current();
        if ($first === false || $first == '/' || $first == '>') {
            return false;
        }

        if ($first == '<') {
            $this->parseError("Unexpected '<' inside of attributes list.");
            // Push the < back onto the stack.
            $this->scanner->unconsume();
            // Let the caller figure out how to handle this.
            throw new ParseError("Start tag inside of attribute.");
        }

        $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));

        if ($name === '') {
            // Really, only '=' can be the char here. Everything else gets absorbed
            // under one rule or another.
            $unexpected = $this->scanner->current();
            $this->parseError("Expected an attribute name, got %s.", $unexpected);
            $name = $unexpected;
            $this->scanner->next();
        }

        // Attribute names can contain most Unicode characters for HTML5, and
        // HTML5 does not restrict the first character either. But
        // "DOMElement::setAttribute" throws an exception for the names matched
        // below because of its own internal restrictions, so they are filtered.
        // see issue #23: https://github.com/Masterminds/html5-php/issues/23
        // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
        $accepted = true;
        if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
            $this->parseError("Unexpected characters in attribute name: %s", $name);
            $accepted = false;
        } elseif (preg_match("/^[0-9.-]/u", $name)) {
            $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
            $accepted = false;
        }

        // 8.1.2.3
        $this->scanner->whitespace();

        // The value is consumed even for rejected names so that the scanner
        // ends up behind the whole attribute.
        $value = $this->attributeValue();
        if ($accepted) {
            $attributes[$name] = $value;
        }

        return true;
    }
521
522
    /**
     * Consume an attribute value.
     * 8.2.4.37 and after.
     *
     * Expects the scanner to be positioned right after the attribute name
     * (and any whitespace following it).
     *
     * @return string|null The value, or null when there is no '=' or the
     *                     value is missing.
     */
    protected function attributeValue()
    {
        if ($this->scanner->current() != '=') {
            return null;
        }
        $this->scanner->next();
        // 8.1.2.3
        $this->scanner->whitespace();

        $first = $this->scanner->current();

        // Whitespace here indicates an empty value.
        if ($first === "\n" || $first === "\f" || $first === " " || $first === "\t") {
            return null;
        }

        if ($first === '"' || $first === "'") {
            $this->scanner->next();

            return $this->quotedAttributeValue($first);
        }

        // '/' is deliberately not handled here:
        // 8.2.4.37 seems to allow foo=/ as a valid attr.
        if ($first === '>') {
            $this->parseError("Expected attribute value, got tag end.");

            return null;
        }

        if ($first === '=' || $first === '`') {
            $this->parseError("Expecting quotes, got %s.", $first);
        }

        // Everything else, including end of input (false), is read as an
        // unquoted value.
        return $this->unquotedAttributeValue();
    }
561
562
    /**
     * Get an attribute value string.
     *
     * Reads up to the first stop character, decoding character references on
     * the way, and then steps over that stop character.
     *
     * @param string $quote
     *            IMPORTANT: This is a series of chars! Any one of which will be considered
     *            termination of an attribute's value. E.g. "\"'" will stop at either
     *            ' or ". A form feed always terminates the value as well.
     *
     * @return string The attribute value.
     */
    protected function quotedAttributeValue($quote)
    {
        // '&' is a stop character too, so references can be decoded in place.
        $stopChars = "\f" . $quote . '&';
        $value = '';

        do {
            $chunk = $this->scanner->charsUntil($stopChars);
            if ($chunk === false) {
                break;
            }
            $value .= $chunk;

            $atReference = ($this->scanner->current() == '&');
            if ($atReference) {
                $value .= $this->decodeCharacterReference(true);
            }
        } while ($atReference);

        $this->scanner->next();

        return $value;
    }
594
595 1
    /**
     * Read an attribute value that is not wrapped in quotes.
     *
     * The value ends at whitespace, '>' or end of input. Character
     * references are decoded. The characters ' " < = ` are reported as parse
     * errors but kept as part of the value.
     *
     * @return string The attribute value (possibly empty).
     */
    protected function unquotedAttributeValue()
    {
        $value = '';
        $tok = $this->scanner->current();

        while ($tok !== false) {
            // Whitespace or '>' terminates the value.
            if ($tok === "\n" || $tok === "\f" || $tok === " " || $tok === "\t" || $tok === '>') {
                break;
            }

            if ($tok === '&') {
                $value .= $this->decodeCharacterReference(true);
                $tok = $this->scanner->current();
                continue;
            }

            if ($tok === "'" || $tok === '"' || $tok === '<' || $tok === '=' || $tok === '`') {
                $this->parseError("Unexpected chars in unquoted attribute value %s", $tok);
                $value .= $tok;
                $tok = $this->scanner->next();
                continue;
            }

            // Plain run of characters up to the next character of interest.
            $value .= $this->scanner->charsUntil("\t\n\f >&\"'<=`");
            $tok = $this->scanner->current();
        }

        return $value;
    }
632
633
    /**
     * Consume malformed markup as if it were a comment.
     * 8.2.4.44
     *
     * The spec requires that the ENTIRE tag-like thing be enclosed inside of
     * the comment. So this will generate comments like:
     *
     * <!--</+foo>-->
     *
     * Everything up to and including the next '>' (or up to end of input) is
     * appended to $leading and emitted as one comment event.
     *
     * @param string $leading
     *            Prepend any leading characters. This essentially
     *            negates the need to backtrack, but it's sort of
     *            a hack.
     *
     * @return bool Always true.
     */
    protected function bogusComment($leading = '')
    {
        $comment = $leading;

        // First the text before '>', then the '>' itself. Either one may be
        // false when the input ends early.
        $pieces = array(
            $this->scanner->charsUntil('>'),
            $this->scanner->current(),
        );
        foreach ($pieces as $piece) {
            if ($piece !== false) {
                $comment .= $piece;
            }
        }

        $this->flushBuffer();
        $this->events->comment($comment);
        $this->scanner->next();

        return true;
    }
667
668
    /**
     * Read a comment.
     *
     * Expects the first tok to be inside of the comment.
     *
     * Characters are collected until isCommentEnd() reports the end of the
     * comment (or end of input), then one comment event is emitted.
     *
     * @return bool Always true.
     */
    protected function comment()
    {
        $piece = $this->scanner->current();

        // <!-->. Emit an empty comment because 8.2.4.46 says to.
        if ($piece == '>') {
            // Parse error. Emit the comment token.
            $this->parseError("Expected comment data, got '>'");
            $this->events->comment('');
            $this->scanner->next();

            return true;
        }

        // Replace NULL with the replacement char. Only the first character
        // is checked here.
        if ($piece == "\0") {
            $piece = UTF8Utils::FFFD;
        }

        $data = '';
        while (!$this->isCommentEnd()) {
            $data .= $piece;
            $piece = $this->scanner->next();
        }

        $this->events->comment($data);
        $this->scanner->next();

        return true;
    }
702
703
    /**
     * Check if the scanner has reached the end of a comment.
     *
     * End of input counts as the end of the comment (with a parse error).
     * When "-->" is found, the scanner is left on the '>'. Otherwise the
     * scanner position is unchanged on return.
     *
     * @return bool
     */
    protected function isCommentEnd()
    {
        $current = $this->scanner->current();

        // EOF: hit the end.
        if ($current === false) {
            $this->parseError("Unexpected EOF in a comment.");

            return true;
        }

        // Only a '-' can start the closing sequence.
        if ($current == '-') {
            // Advance one, and test for '->'
            $second = $this->scanner->next();
            if ($second == '-' && $this->scanner->peek() == '>') {
                $this->scanner->next(); // Consume the last '>'

                return true;
            }

            // Unread '-';
            $this->scanner->unconsume(1);
        }

        return false;
    }
733
734
    /**
     * Parse a DOCTYPE.
     *
     * Parse a DOCTYPE declaration. This method has strong bearing on whether or
     * not Quirksmode is enabled on the event handler: the last argument passed
     * to the handler's doctype() is true for every malformed or truncated
     * declaration handled here.
     *
     * @todo This method is a little long. Should probably refactor.
     *
     * @return bool False when the scanner is not on a 'D'/'d' or when a
     *              PUBLIC/SYSTEM keyword is not followed by a quoted
     *              identifier; otherwise true (or the result of
     *              bogusComment()/eof() for the early exits).
     */
    protected function doctype()
    {
        if (strcasecmp($this->scanner->current(), 'D')) {
            return false;
        }
        // Check that string is DOCTYPE.
        $chars = $this->scanner->charsWhile("DOCTYPEdoctype");
        if (strcasecmp($chars, 'DOCTYPE')) {
            $this->parseError('Expected DOCTYPE, got %s', $chars);

            return $this->bogusComment('<!' . $chars);
        }

        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        // EOF: die.
        if ($tok === false) {
            $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);

            return $this->eof($tok);
        }

        // NULL char: convert.
        if ($tok === "\0") {
            $this->parseError("Unexpected null character in DOCTYPE.");
        }

        $stop = " \n\f>";
        $doctypeName = $this->scanner->charsUntil($stop);
        // Lowercase ASCII, replace \0 with FFFD.
        // str_replace() is used because three-argument strtr() maps byte for
        // byte and ignores the extra bytes of the longer argument, which would
        // insert only the first byte of a multi-byte replacement. The cast
        // turns a false result from charsUntil() into an empty name.
        $doctypeName = strtolower(str_replace("\0", UTF8Utils::FFFD, (string) $doctypeName));

        $tok = $this->scanner->current();

        // If false, emit a parse error, DOCTYPE, and return.
        if ($tok === false) {
            $this->parseError('Unexpected EOF in DOCTYPE declaration.');
            $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);

            return true;
        }

        // Short DOCTYPE, like <!DOCTYPE html>
        if ($tok == '>') {
            // DOCTYPE without a name.
            if (strlen($doctypeName) == 0) {
                $this->parseError("Expected a DOCTYPE name. Got nothing.");
                $this->events->doctype($doctypeName, 0, null, true);
                $this->scanner->next();

                return true;
            }
            $this->events->doctype($doctypeName);
            $this->scanner->next();

            return true;
        }
        $this->scanner->whitespace();

        $pub = strtoupper($this->scanner->getAsciiAlpha());
        $white = strlen($this->scanner->whitespace());

        // Get ID, and flag it as pub or system.
        if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
            // Get the sys ID.
            $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
            $id = $this->quotedString("\0>");
            if ($id === false) {
                // No quoted identifier followed the keyword.
                $this->events->doctype($doctypeName, $type, $pub, false);

                return false;
            }

            // Premature EOF.
            if ($this->scanner->current() === false) {
                $this->parseError("Unexpected EOF in DOCTYPE");
                $this->events->doctype($doctypeName, $type, $id, true);

                return true;
            }

            // Well-formed complete DOCTYPE.
            $this->scanner->whitespace();
            if ($this->scanner->current() == '>') {
                $this->events->doctype($doctypeName, $type, $id, false);
                $this->scanner->next();

                return true;
            }

            // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
            // Throw away the junk, parse error, quirks mode, return true.
            $this->scanner->charsUntil(">");
            $this->parseError("Malformed DOCTYPE.");
            $this->events->doctype($doctypeName, $type, $id, true);
            $this->scanner->next();

            return true;
        }

        // Else it's a bogus DOCTYPE.
        // Consume to > and trash.
        $this->scanner->charsUntil('>');

        $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub);
        $this->events->doctype($doctypeName, 0, null, true);
        $this->scanner->next();

        return true;
    }
845
846
    /**
     * Utility for reading a quoted string.
     *
     * The string must start with ' or " at the current position. A missing
     * closing quote is reported as a parse error, but the text read so far
     * is still returned.
     *
     * @param string $stopchars
     *            Characters (in addition to a close-quote) that should stop the string.
     *            E.g. sometimes '>' is higher precedence than '"' or "'".
     *
     * @return mixed String if one is found (quotations omitted); false when
     *               the current character is not a quote.
     */
    protected function quotedString($stopchars)
    {
        $quote = $this->scanner->current();
        if ($quote != '"' && $quote != "'") {
            return false;
        }

        $this->scanner->next();
        $content = $this->scanner->charsUntil($quote . $stopchars);

        if ($this->scanner->current() == $quote) {
            // Step over the closing quote.
            $this->scanner->next();
        } else {
            // Parse error because no close quote.
            $this->parseError("Expected %s, got %s", $quote, $this->scanner->current());
        }

        return $content;
    }
871
872
    /**
     * Handle a CDATA section.
     *
     * Expects the scanner to be on the '[' that follows "<!". Anything other
     * than "[CDATA[" is consumed as a bogus comment. An unterminated section
     * is reported as a parse error and also emitted as a bogus comment.
     *
     * @return bool False when the current character is not '['; true (or
     *              the result of bogusComment()) otherwise.
     */
    protected function cdataSection()
    {
        if ($this->scanner->current() != '[') {
            return false;
        }
        $cdata = '';
        $this->scanner->next();

        $chars = $this->scanner->charsWhile('CDAT');
        if ($chars != 'CDATA' || $this->scanner->current() != '[') {
            $this->parseError('Expected [CDATA[, got %s', $chars);

            return $this->bogusComment('<![' . $chars);
        }

        $tok = $this->scanner->next();

        // Test for the terminator before consuming anything, so that an empty
        // section ("<![CDATA[]]>") ends right away instead of swallowing its
        // own "]]>" and running to the end of the input.
        while (!$this->scanner->sequenceMatches(']]>')) {
            if ($tok === false) {
                $this->parseError('Unexpected EOF inside CDATA.');
                $this->bogusComment('<![CDATA[' . $cdata);

                return true;
            }
            $cdata .= $tok;
            $tok = $this->scanner->next();
        }

        // Consume ]]>
        $this->scanner->consume(3);

        $this->events->cdata($cdata);

        return true;
    }
908
909
    // ================================================================
910
    // Non-HTML5
911
    // ================================================================
912
    /**
     * Handle a processing instruction.
     *
     * XML processing instructions are supposed to be ignored in HTML5,
     * treated as "bogus comments". However, since we're not a user
     * agent, we allow them. We consume until ?> and then issue a
     * EventListener::processingInstruction() event.
     *
     * A construct without a name, without whitespace after the name, or that
     * ends right after that whitespace is treated as a bogus comment.
     *
     * @return bool False when the current character is not '?'; true
     *              otherwise.
     */
    protected function processingInstruction()
    {
        if ($this->scanner->current() != '?') {
            return false;
        }

        $tok = $this->scanner->next();
        $procName = $this->scanner->getAsciiAlpha();
        $white = strlen($this->scanner->whitespace());

        // If not a PI, send to bogusComment.
        // End of input must be detected with a strict comparison: a loose
        // "== false" is also true for the character '0', which would turn
        // "<?foo 0 ?>" into a bogus comment.
        if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() === false) {
            $this->parseError("Expected processing instruction name, got $tok");
            // NOTE(review): $tok looks like it is also the first character of
            // $procName (or of the text bogusComment() reads next), so it may
            // appear twice in the comment. Confirm against Scanner::next().
            $this->bogusComment('<?' . $tok . $procName);

            return true;
        }

        $data = '';
        // As long as it's not the case that the next two chars are ? and >.
        while (!($this->scanner->current() == '?' && $this->scanner->peek() == '>')) {
            $data .= $this->scanner->current();

            $tok = $this->scanner->next();
            if ($tok === false) {
                // Unterminated instruction: emit what was collected.
                $this->parseError("Unexpected EOF in processing instruction.");
                $this->events->processingInstruction($procName, $data);

                return true;
            }
        }

        $this->scanner->next(); // >
        $this->scanner->next(); // Next token.
        $this->events->processingInstruction($procName, $data);

        return true;
    }
957
958
    // ================================================================
959
    // UTILITY FUNCTIONS
960
    // ================================================================
961
962
    /**
963
     * Read from the input stream until we get to the desired sequence
964
     * or hit the end of the input stream.
965
     *
966
     * @param string $sequence
967
     *
968
     * @return string
969
     */
970 8
    protected function readUntilSequence($sequence)
    {
        // Skipping ahead to the first character of the sequence lets the
        // scanner return larger chunks in one call.
        $lead = substr($sequence, 0, 1);
        $collected = '';

        while (false !== $this->scanner->current()) {
            $collected .= $this->scanner->charsUntil($lead);

            if (!$this->scanner->sequenceMatches($sequence, false)) {
                // Not the sequence we are after: keep the current character
                // as ordinary text and resume the search after it.
                $collected .= $this->scanner->current();
                $this->scanner->next();
                continue;
            }

            // The stop sequence begins here; it is not added to the result.
            return $collected;
        }

        // The input ran out before the sequence was found.
        $this->parseError("Unexpected EOF during text read.");

        return $collected;
    }
991
992
    /**
993
     * Check if upcoming chars match the given sequence.
994
     *
995
     * This will read the stream for the $sequence. If it's
996
     * found, this will return true. If not, return false.
997
     * Since this unconsumes any chars it reads, the caller
998
     * will still need to read the next sequence, even if
999
     * this returns true.
1000
     *
1001
     * Example: $this->scanner->sequenceMatches('</script>') will
1002
     * see if the input stream is at the start of a
1003
     * '</script>' string.
1004
     *
1005
     * @param string $sequence
1006
     * @param bool $caseSensitive
1007
     *
1008
     * @return bool
1009
     */
1010
    protected function sequenceMatches($sequence, $caseSensitive = true)
    {
        // Raise a silenced deprecation notice, then delegate to the scanner.
        $notice = __METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.';
        @trigger_error($notice, E_USER_DEPRECATED);

        $matched = $this->scanner->sequenceMatches($sequence, $caseSensitive);

        return $matched;
    }
1016
1017
    /**
1018
     * Send a TEXT event with the contents of the text buffer.
1019
     *
1020
     * This emits an EventHandler::text() event with the current contents of the
1021
     * temporary text buffer. (The buffer is used to group as much PCDATA
1022
     * as we can instead of emitting lots and lots of TEXT events.)
1023
     */
1024 127
    protected function flushBuffer()
    {
        // Only emit an event when there is buffered text; the buffer is
        // cleared after the event handler has received it.
        if ($this->text !== '') {
            $this->events->text($this->text);
            $this->text = '';
        }
    }
1032
1033
    /**
1034
     * Add text to the temporary buffer.
1035
     *
1036
     * @see flushBuffer()
1037
     *
1038
     * @param string $str
1039
     */
1040 9
    protected function buffer($str)
    {
        // Append to the pending text; flushBuffer() emits and clears it.
        $this->text = $this->text . $str;
    }
1044
1045
    /**
1046
     * Emit a parse error.
1047
     *
1048
     * A parse error always returns false because it never consumes any
1049
     * characters.
1050
     *
1051
     * @param string $msg
1052
     *
1053
     * @return bool
1054
     */
1055 15
    protected function parseError($msg)
    {
        // Any arguments passed after the message are used as printf-style
        // values for it.
        $formatArgs = array_slice(func_get_args(), 1);
        if (!empty($formatArgs)) {
            $msg = vsprintf($msg, $formatArgs);
        }

        // Report the error together with the scanner's current location.
        $this->events->parseError(
            $msg,
            $this->scanner->currentLine(),
            $this->scanner->columnOffset()
        );

        return false;
    }
1070
1071
    /**
1072
     * Decode a character reference and return the string.
1073
     *
1074
     * If $inAttribute is set to true, a bare & will be returned as-is.
1075
     *
1076
     * @param bool $inAttribute
1077
     *            Set to true if the text is inside of an attribute value.
1078
     *            false otherwise.
1079
     *
1080
     * @return string
1081
     */
1082 12
    protected function decodeCharacterReference($inAttribute = false)
    {
        // Next char after &.
        $tok = $this->scanner->next();
        $start = $this->scanner->position();

        // End of input directly after the ampersand. The comparison is strict
        // so that the character "0" (loosely equal to false) is not mistaken
        // for EOF and goes through the same named-entity path as any other
        // alphanumeric character.
        if ($tok === false) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if (strspn($tok, static::WHITE . "&<") == 1) {
            return '&';
        }

        // Numeric entity
        if ($tok === '#') {
            $tok = $this->scanner->next();

            if ($tok === 'x' || $tok === 'X') {
                // Hexadecimal encoding.
                // X[0-9a-fA-F]+;
                // x[0-9a-fA-F]+;
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();

                // Only a missing digit string is an error. empty() must not
                // be used here: it also rejects the valid digit string "0",
                // which would then hit the unconsume(2) below after a digit
                // had already been read and leave the scanner in the wrong
                // place.
                if (!is_string($hex) || $hex === '') {
                    $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } else {
                // Decimal encoding.
                // [0-9]+;

                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                if ($numeric === false) {
                    $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ($tok === '=' && $inAttribute) {
            return '&';
        } else { // String entity.

            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if ($entity == null) {
                if (!$inAttribute || strlen($cname) === 0) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                // Rewind to the character that followed the ampersand.
                $this->scanner->unconsume($this->scanner->position() - $start);

                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if ($tok === ';') {
            $this->scanner->next();

            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);

            return '&';
        }

        $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);

        return '&' . $entity;
    }
1173
}
1174