Completed
Pull Request — master (#151)
by Christophe
01:47
created

Tokenizer::setTextMode()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 5
ccs 4
cts 4
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 1
1
<?php
2
namespace Masterminds\HTML5\Parser;
3
4
use Masterminds\HTML5\Elements;
5
6
/**
7
 * The HTML5 tokenizer.
8
 *
9
 * The tokenizer's role is reading data from the scanner and gathering it into
10
 * semantic units. From the tokenizer, data is emitted to an event handler,
11
 * which may (for example) create a DOM tree.
12
 *
13
 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We
14
 * follow that specification to the maximum extent that we can. If you find
15
 * a discrepancy that is not documented, please file a bug and/or submit a
16
 * patch.
17
 *
18
 * This tokenizer is implemented as a recursive descent parser.
19
 *
20
 * Within the API documentation, you may see references to the specific section
21
 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1.
22
 * This refers to section 8.2.4.1 of the HTML5 CR specification.
23
 *
24
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
25
 */
26
class Tokenizer
27
{
28
29
    // Input scanner; assigned in the constructor and read by the consume methods.
    protected $scanner;
30
31
    // Event handler that receives the tokens emitted by this tokenizer.
    protected $events;
32
33
    // NOTE(review): not referenced by any method visible in this part of the class.
    protected $tok;
34
35
    /**
36
     * Buffer for text.
37
     */
38
    protected $text = '';
39
40
    // When this goes to false, the parser stops.
    // It is set to false by eof() and checked by parse() after every step.
    protected $carryOn = true;
41
42
43
    // Current text mode; written by setTextMode(), consulted by consumeData()
    // and characterData().
    protected $textMode = 0; // TEXTMODE_NORMAL;
44
    // Tag name whose end tag terminates RAW/RCDATA text; null when unset.
    protected $untilTag = null;
45
46
    // Conformance modes. In XML mode tagName() and endTag() keep the case of
    // tag names; otherwise names are lowercased.
    const CONFORMANT_XML = 'xml';
47
    const CONFORMANT_HTML = 'html';
48
    protected $mode = self::CONFORMANT_HTML;
49
50
    // Tab, line feed, form feed and space.
    const WHITE = "\t\n\f ";
51
52
    /**
     * Build a tokenizer around an input scanner and an output event handler.
     *
     * The usual flow is to construct the tokenizer with a scanner (input) and
     * an event handler (output), then call Tokenizer::parse().
     *
     * @param \Masterminds\HTML5\Parser\Scanner $scanner
     *            A scanner initialized with an input stream.
     * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler
     *            An event handler, initialized and ready to receive
     *            events.
     * @param string $mode
     *            self::CONFORMANT_HTML (default) or self::CONFORMANT_XML.
     */
    public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
    {
        $this->mode = $mode;
        $this->events = $eventHandler;
        $this->scanner = $scanner;
    }
72
73
    /**
     * Run the tokenizer over the input.
     *
     * Repeatedly consumes data from the scanner, emitting tokens into the
     * event handler as it goes. Errors are reported to the event handler as
     * well; tokenizing continues until the $carryOn flag is cleared, which
     * eof() does once the end of the input has been reached.
     */
    public function parse()
    {
        // FIXME: Add infinite loop protection.
        while (true) {
            $this->consumeData();

            if (!$this->carryOn) {
                break;
            }
        }
    }
91
92
    /**
     * Choose how the character data reader treats upcoming text.
     *
     * HTML5 defines three different modes for reading text:
     * - Normal: Read until a tag is encountered.
     * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
     *   special characters.
     * - Raw: Read until a special closing tag is encountered.
     *
     * The event handler normally selects the mode through the integer it
     * returns from startTag(); this method allows it to be set directly too.
     *
     * @param integer $textmode
     *            One of Elements::TEXT_*. Any bits other than TEXT_RAW and
     *            TEXT_RCDATA are discarded.
     * @param string $untilTag
     *            The tag that should stop RAW or RCDATA mode. Normal mode does not
     *            use this indicator.
     */
    public function setTextMode($textmode, $untilTag = null)
    {
        // Only the RAW and RCDATA bits are meaningful here.
        $recognised = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

        $this->untilTag = $untilTag;
        $this->textMode = $textmode & $recognised;
    }
117
118
    /**
     * Consume a character and make a move.
     * HTML5 8.2.4.1
     *
     * One step of the main loop: decode a leading character reference, try
     * the tag parsers when the input is at '<', signal EOF if the input is
     * exhausted, and otherwise read text according to the current text mode.
     *
     * @return bool The $carryOn flag; false once EOF has been handled.
     */
    protected function consumeData()
    {
        $current = $this->scanner->current();

        // Character reference: decode straight into the text buffer.
        if ('&' === $current) {
            $this->buffer($this->decodeCharacterReference());

            $current = $this->scanner->current();
        }

        // Parse tag
        if ('<' === $current) {
            // Any buffered text data can go out now.
            $this->flushBuffer();

            $afterOpen = $this->scanner->next();

            // Try each tag parser in turn; the first one that reports success
            // ends the chain.
            if (!$this->markupDeclaration($afterOpen)
                && !$this->endTag()
                && !$this->processingInstruction()
                && !$this->tagName()
                // Per the original author's note, parseError() always returns
                // false, so control continues to characterData().
                && !$this->parseError("Illegal tag opening")
            ) {
                $this->characterData();
            }

            $current = $this->scanner->current();
        }

        // Handle end of document
        $this->eof($current);

        if (false === $current) {
            return $this->carryOn;
        }

        // Parse character
        if ($this->textMode == Elements::TEXT_RAW) {
            $this->rawText($current);
        } elseif ($this->textMode == Elements::TEXT_RCDATA) {
            $this->rcdata($current);
        } elseif (0 === strspn($current, "<&")) {
            // NULL character
            if ("\00" === $current) {
                $this->parseError("Received null character.");
            }

            $this->text .= $current;
            $this->scanner->next();
        }

        return $this->carryOn;
    }
181
182
    /**
     * Read whatever character data is at the current position.
     *
     * The reader used depends on the current text mode: rawText() for
     * Elements::TEXT_RAW, rcdata() for Elements::TEXT_RCDATA, and text()
     * otherwise.
     *
     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
     *
     * @return bool False at EOF, or when in normal mode the current
     *              character is '<' or '&'; otherwise the result of the
     *              reader that was invoked.
     */
    protected function characterData()
    {
        $current = $this->scanner->current();
        if (false === $current) {
            return false;
        }

        if ($this->textMode == Elements::TEXT_RAW) {
            return $this->rawText($current);
        }

        if ($this->textMode == Elements::TEXT_RCDATA) {
            return $this->rcdata($current);
        }

        // Normal mode: '<' and '&' are not plain text.
        return strspn($current, "<&") ? false : $this->text($current);
    }
207
208
    /**
     * Append the current token to the text buffer and advance the scanner.
     *
     * A NULL character triggers a parse error but is still buffered.
     *
     * @param string $tok The current token.
     *
     * @return bool True when the token was buffered, false if it was EOF.
     */
    protected function text($tok)
    {
        if (false !== $tok) {
            // NULL character
            if ("\00" === $tok) {
                $this->parseError("Received null character.");
            }

            $this->buffer($tok);
            $this->scanner->next();

            return true;
        }

        // This should never happen...
        return false;
    }
232
233
    /**
     * Read text in RAW mode.
     *
     * Without a terminating tag the token is handled as ordinary text.
     * Otherwise everything up to "</untilTag>" is emitted as one text event,
     * the text mode is reset, and the end tag is parsed.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rawText($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $raw = $this->readUntilSequence('</' . $this->untilTag . '>');
        $this->events->text($raw);
        $this->setTextMode(0);

        return $this->endTag();
    }
253
254
    /**
     * Read text in RCDATA mode.
     *
     * Collects characters (decoding character references on the way) until
     * EOF or until the scanner is at "</untilTag". The collected text is
     * emitted as a single text event, the text mode is reset, and the end tag
     * is parsed. The closing sequence is matched case-sensitively only when
     * the tag is not a known HTML5 element.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rcdata($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $closer = '</' . $this->untilTag;
        $collected = '';
        $caseSensitive = !Elements::isHtml5Element($this->untilTag);

        while (true) {
            if ($tok === false) {
                break;
            }
            if ($tok == '<' && $this->scanner->sequenceMatches($closer, $caseSensitive)) {
                break;
            }

            if ($tok == '&') {
                $collected .= $this->decodeCharacterReference();
                $tok = $this->scanner->current();
            } else {
                $collected .= $tok;
                $tok = $this->scanner->next();
            }
        }

        // Look past the closing sequence and any whitespace to check for '>',
        // then rewind so that endTag() sees the whole end tag.
        $consumed = strlen($closer);
        $this->scanner->consume($consumed);
        $consumed += strlen($this->scanner->whitespace());
        if ($this->scanner->current() !== '>') {
            $this->parseError("Unclosed RCDATA end tag");
        }
        $this->scanner->unconsume($consumed);

        $this->events->text($collected);
        $this->setTextMode(0);

        return $this->endTag();
    }
293
294
    /**
     * Emit an EOF event when the given token marks the end of the input.
     *
     * On EOF the text buffer is flushed, the handler's eof() is called and
     * the main loop is told to stop.
     *
     * @param string|false $tok The current token; false means EOF.
     *
     * @return bool True if EOF was handled, false otherwise.
     */
    protected function eof($tok)
    {
        if (false !== $tok) {
            return false;
        }

        // fprintf(STDOUT, "EOF");
        $this->flushBuffer();
        $this->events->eof();
        $this->carryOn = false;

        return true;
    }
310
311
    /**
     * Look for a markup declaration: a comment, a DOCTYPE or a CDATA section.
     *
     * Anything else following "<!" is reported as a parse error and consumed
     * as a bogus comment.
     *
     * @param string $tok The character following '<'.
     *
     * @return bool False when $tok is not '!'; otherwise the result of the
     *              delegated parser, or true for the bogus-comment case.
     */
    protected function markupDeclaration($tok)
    {
        if (!($tok == '!')) {
            return false;
        }

        $lead = $this->scanner->next();

        // Comment:
        if ($lead == '-' && $this->scanner->peek() == '-') {
            $this->scanner->next(); // Consume the other '-'
            $this->scanner->next(); // Next char.

            return $this->comment();
        }

        // Doctype
        if ($lead == 'D' || $lead == 'd') {
            return $this->doctype();
        }

        // CDATA section
        if ($lead == '[') {
            return $this->cdataSection();
        }

        // FINISH
        $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $lead);
        $this->bogusComment('<!');

        return true;
    }
342
343
    /**
     * Consume an end tag.
     * 8.2.4.9
     *
     * Expects the scanner to be at the '/' that follows '<'. The tag name is
     * lowercased unless the tokenizer is in XML mode.
     *
     * @return bool False if the current character is not '/', or if the
     *              character after it is NULL or EOF; otherwise true (or the
     *              result of bogusComment() for a malformed name).
     */
    protected function endTag()
    {
        if ($this->scanner->current() != '/') {
            return false;
        }
        $first = $this->scanner->next();

        // a-zA-Z -> tagname
        // > -> parse error
        // EOF -> parse error
        // -> parse error
        if (!ctype_alpha($first)) {
            $this->parseError("Expected tag name, got '%s'", $first);
            $giveUp = ($first == "\0" || $first === false);

            return $giveUp ? false : $this->bogusComment('</');
        }

        $name = $this->scanner->charsUntil("\n\f \t>");
        if ($this->mode !== self::CONFORMANT_XML) {
            $name = strtolower($name);
        }

        // Trash whitespace.
        $this->scanner->whitespace();

        $closer = $this->scanner->current();
        if ($closer != '>') {
            $this->parseError("Expected >, got '%s'", $closer);
            // We just trash stuff until we get to the next tag close.
            $this->scanner->charsUntil('>');
        }

        $this->events->endTag($name);
        $this->scanner->next();

        return true;
    }
382
383
    /**
     * Consume a tag name and body.
     * 8.2.4.10
     *
     * Reads the tag name and its attributes, emits a startTag event, and
     * applies the text mode the handler returns when that value is an integer.
     *
     * @return bool False if the current character is not alphabetic,
     *              true once a start tag has been emitted.
     */
    protected function tagName()
    {
        if (!ctype_alpha($this->scanner->current())) {
            return false;
        }

        // We know this is at least one char.
        $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
        if ($this->mode !== self::CONFORMANT_XML) {
            $name = strtolower($name);
        }

        $selfClose = false;
        $attributes = array();

        // Handle attribute parse exceptions here so that we can
        // react by trying to build a sensible parse tree.
        try {
            while (true) {
                $this->scanner->whitespace();
                $this->attribute($attributes);

                if ($this->isTagEnd($selfClose)) {
                    break;
                }
            }
        } catch (ParseError $e) {
            $selfClose = false;
        }

        $requestedMode = $this->events->startTag($name, $attributes, $selfClose);
        if (is_int($requestedMode)) {
            $this->setTextMode($requestedMode, $name);
        }

        $this->scanner->next();

        return true;
    }
421
422
    /**
     * Check if the scanner has reached the end of a tag.
     *
     * A tag ends at '>', at "/>" (optionally with whitespace between the two
     * characters), or at EOF, which is reported as a parse error.
     *
     * @param bool $selfClose Set to true (by reference) when the tag ends
     *                        with "/>"; left untouched otherwise.
     *
     * @return bool True when the tag has ended.
     */
    protected function isTagEnd(&$selfClose)
    {
        $current = $this->scanner->current();

        if ($current != '/') {
            if ($current == '>') {
                return true;
            }
            if ($current !== false) {
                return false;
            }
            $this->parseError("Unexpected EOF inside of tag.");

            return true;
        }

        // A '/' was seen: it only closes the tag when '>' follows.
        $this->scanner->next();
        $this->scanner->whitespace();
        $following = $this->scanner->current();

        if ($following == '>') {
            $selfClose = true;

            return true;
        }

        if ($following === false) {
            $this->parseError("Unexpected EOF inside of tag.");

            return true;
        }

        // Basically, we skip the / token and go on.
        // See 8.2.4.43.
        $this->parseError("Unexpected '%s' inside of a tag.", $following);

        return false;
    }
457
458
    /**
     * Parse a single attribute from inside of a tag.
     *
     * The attribute name is lowercased. Its value is always consumed, but the
     * attribute is only stored when the name passes the checks below.
     *
     * @param string[] $attributes Attribute map, extended by reference.
     *
     * @return bool False when the scanner is at '/', '>' or EOF (there is no
     *              attribute to read), true after an attribute was consumed.
     *
     * @throws ParseError When a '<' is found in the attribute list.
     */
    protected function attribute(&$attributes)
    {
        $tok = $this->scanner->current();
        if ($tok === false || $tok == '>' || $tok == '/') {
            return false;
        }

        if ($tok == '<') {
            $this->parseError("Unexepcted '<' inside of attributes list.");
            // Push the < back onto the stack.
            $this->scanner->unconsume();
            // Let the caller figure out how to handle this.
            throw new ParseError("Start tag inside of attribute.");
        }

        $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));

        if (0 == strlen($name)) {
            // Really, only '=' can be the char here. Everything else gets absorbed
            // under one rule or another.
            $name = $this->scanner->current();
            $this->parseError("Expected an attribute name, got %s.", $name);
            $this->scanner->next();
        }

        // Attribute names can contain most Unicode characters for HTML5, and
        // there is no limitation for the first character. But
        // "DOMElement::setAttribute" throws an exception because of its own
        // internal restrictions, so such names have to be filtered.
        // see issue #23: https://github.com/Masterminds/html5-php/issues/23
        // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
        $keep = true;
        if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
            $this->parseError("Unexpected characters in attribute name: %s", $name);
            $keep = false;
        } elseif (preg_match("/^[0-9.-]/u", $name)) {
            $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
            $keep = false;
        }

        // 8.1.2.3
        $this->scanner->whitespace();

        // The value is consumed even when the attribute is going to be dropped.
        $value = $this->attributeValue();
        if ($keep) {
            $attributes[$name] = $value;
        }

        return true;
    }
521
522
    /**
     * Consume an attribute value.
     * 8.2.4.37 and after.
     *
     * @return string|null The value, or null when there is no '=' or no
     *                     value follows it.
     */
    protected function attributeValue()
    {
        if ($this->scanner->current() != '=') {
            return null;
        }
        $this->scanner->next();
        // 8.1.2.3
        $this->scanner->whitespace();

        // Static analysis flagged that $tok can be false (EOF) here. False
        // equals none of the characters tested below, so it never reaches
        // quotedAttributeValue(); it ends up in the unquoted reader, whose
        // loop does not run for a false token.
        $tok = $this->scanner->current();

        // Whitespace here indicates an empty value.
        if (in_array($tok, array("\n", "\f", " ", "\t"))) {
            return null;
        }

        if ($tok == '"' || $tok == "'") {
            $this->scanner->next();

            return $this->quotedAttributeValue($tok);
        }

        // '/' is deliberately not treated like '>':
        // 8.2.4.37 seems to allow foo=/ as a valid attr.
        if ($tok == '>') {
            $this->parseError("Expected attribute value, got tag end.");

            return null;
        }

        if ($tok == '=' || $tok == '`') {
            $this->parseError("Expecting quotes, got %s.", $tok);
        }

        return $this->unquotedAttributeValue();
    }
561
562
    /**
     * Get an attribute value string.
     *
     * Reads up to the terminating character, decoding character references
     * along the way, then steps past the terminator.
     *
     * @param string $quote
     *            IMPORTANT: This is a series of chars! Any one of which will be considered
     *            termination of an attribute's value. E.g. "\"'" will stop at either
     *            ' or ".
     * @return string The attribute value.
     */
    protected function quotedAttributeValue($quote)
    {
        // NOTE(review): a form feed also terminates the value here; confirm
        // that this is intended for quoted values.
        $stopAt = "\f" . $quote . '&';
        $value = '';

        do {
            $chunk = $this->scanner->charsUntil($stopAt);
            if ($chunk === false) {
                break;
            }
            $value .= $chunk;

            // Stopped at '&': decode the reference and keep reading.
            $atReference = ($this->scanner->current() == '&');
            if ($atReference) {
                $value .= $this->decodeCharacterReference(true);
            }
        } while ($atReference);

        $this->scanner->next();

        return $value;
    }
594
595 1
    /**
     * Read an attribute value that is not wrapped in quotes.
     *
     * Reading stops at whitespace, '>' or EOF. Character references are
     * decoded; the characters " ' < = ` raise a parse error but are kept.
     *
     * @return string The attribute value (empty if nothing was read).
     */
    protected function unquotedAttributeValue()
    {
        $terminators = "\t\n\f >";
        $value = '';
        $tok = $this->scanner->current();

        while (true) {
            if ($tok === false || strspn($tok, $terminators) != 0) {
                break;
            }

            if ($tok == '&') {
                $value .= $this->decodeCharacterReference(true);
                $tok = $this->scanner->current();
                continue;
            }

            if (strspn($tok, "\"'<=`") > 0) {
                $this->parseError("Unexpected chars in unquoted attribute value %s", $tok);
            }
            $value .= $tok;
            $tok = $this->scanner->next();
        }

        return $value;
    }
614
615
    /**
     * Consume malformed markup as if it were a comment.
     * 8.2.4.44
     *
     * The spec requires that the ENTIRE tag-like thing be enclosed inside of
     * the comment. So this will generate comments like:
     *
     * &lt;!--&lt/+foo&gt;--&gt;
     *
     * @param string $leading
     *            Prepend any leading characters. This essentially
     *            negates the need to backtrack, but it's sort of
     *            a hack.
     *
     * @return bool Always true.
     */
    protected function bogusComment($leading = '')
    {
        // Everything up to '>' plus the '>' itself; either part may be
        // missing (false) when the input ends early.
        $body = $this->scanner->charsUntil('>');
        $closer = $this->scanner->current();

        $comment = $leading
            . ($body === false ? '' : $body)
            . ($closer === false ? '' : $closer);

        $this->flushBuffer();
        $this->events->comment($comment);
        $this->scanner->next();

        return true;
    }
649
650
    /**
     * Read a comment.
     *
     * Expects the first tok to be inside of the comment. Characters are
     * collected until isCommentEnd() reports the end of the comment (or EOF),
     * then a comment event is emitted.
     *
     * Every NULL character in the comment is replaced with
     * UTF8Utils::FFFD, not just a NULL in the first position.
     *
     * @return bool Always true.
     */
    protected function comment()
    {
        $tok = $this->scanner->current();

        // <!-->. Emit an empty comment because 8.2.4.46 says to.
        if ($tok == '>') {
            // Parse error. Emit the comment token.
            $this->parseError("Expected comment data, got '>'");
            $this->events->comment('');
            $this->scanner->next();

            return true;
        }

        $comment = '';
        while (!$this->isCommentEnd()) {
            // Replace NULL with the replacement char.
            $comment .= ($tok === "\0") ? UTF8Utils::FFFD : $tok;
            $tok = $this->scanner->next();
        }

        $this->events->comment($comment);
        $this->scanner->next();

        return true;
    }
684
685
    /**
     * Check if the scanner has reached the end of a comment.
     *
     * EOF counts as the end and raises a parse error. When the scanner is at
     * the start of "-->", it is advanced onto the final '>'.
     *
     * @return bool
     */
    protected function isCommentEnd()
    {
        $current = $this->scanner->current();

        // EOF
        if ($current === false) {
            // Hit the end.
            $this->parseError("Unexpected EOF in a comment.");

            return true;
        }

        // If it doesn't start with -, not the end.
        if ($current != '-') {
            return false;
        }

        // Advance one, and test for '->'
        $closes = $this->scanner->next() == '-' && $this->scanner->peek() == '>';

        if ($closes) {
            $this->scanner->next(); // Consume the last '>'
        } else {
            // Unread '-';
            $this->scanner->unconsume(1);
        }

        return $closes;
    }
715
716
    /**
     * Parse a DOCTYPE.
     *
     * Parse a DOCTYPE declaration. This method has strong bearing on whether or
     * not Quirksmode is enabled on the event handler: the last argument
     * passed to the handler's doctype() is true on every malformed path below.
     *
     * @todo This method is a little long. Should probably refactor.
     *
     * @return bool False when the input is not a DOCTYPE or a PUBLIC/SYSTEM
     *              keyword is not followed by a quoted identifier; otherwise
     *              true (or the result of the delegated bogusComment()/eof()).
     */
    protected function doctype()
    {
        if (strcasecmp($this->scanner->current(), 'D')) {
            return false;
        }
        // Check that string is DOCTYPE.
        $chars = $this->scanner->charsWhile("DOCTYPEdoctype");
        if (strcasecmp($chars, 'DOCTYPE')) {
            $this->parseError('Expected DOCTYPE, got %s', $chars);

            return $this->bogusComment('<!' . $chars);
        }

        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        // EOF: die.
        if ($tok === false) {
            $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);

            return $this->eof($tok);
        }

        // NULL char: convert.
        if ($tok === "\0") {
            $this->parseError("Unexpected null character in DOCTYPE.");
        }

        // NOTE(review): unlike the other whitespace lists in this class, this
        // stop list has no tab; confirm whether that is intended.
        $stop = " \n\f>";
        $doctypeName = $this->scanner->charsUntil($stop);

        // Lowercase ASCII, replace \0 with FFFD.
        // str_replace() substitutes the whole replacement string. The
        // three-argument form of strtr() maps byte for byte and truncates the
        // longer argument, so a multi-byte replacement would be cut down to
        // its first byte. The cast covers a false result from charsUntil(),
        // which static analysis reported as possible.
        $doctypeName = strtolower(str_replace("\0", UTF8Utils::FFFD, (string) $doctypeName));

        $tok = $this->scanner->current();

        // If false, emit a parse error, DOCTYPE, and return.
        if ($tok === false) {
            $this->parseError('Unexpected EOF in DOCTYPE declaration.');
            $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);

            return true;
        }

        // Short DOCTYPE, like <!DOCTYPE html>
        if ($tok == '>') {
            // DOCTYPE without a name.
            if (strlen($doctypeName) == 0) {
                $this->parseError("Expected a DOCTYPE name. Got nothing.");
                $this->events->doctype($doctypeName, 0, null, true);
                $this->scanner->next();

                return true;
            }
            $this->events->doctype($doctypeName);
            $this->scanner->next();

            return true;
        }
        $this->scanner->whitespace();

        $pub = strtoupper($this->scanner->getAsciiAlpha());
        $white = strlen($this->scanner->whitespace());

        // Get ID, and flag it as pub or system.
        if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
            // Get the sys ID.
            $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
            $id = $this->quotedString("\0>");
            if ($id === false) {
                // No quoted identifier after the keyword.
                $this->events->doctype($doctypeName, $type, $pub, false);

                return false;
            }

            // Premature EOF.
            if ($this->scanner->current() === false) {
                $this->parseError("Unexpected EOF in DOCTYPE");
                $this->events->doctype($doctypeName, $type, $id, true);

                return true;
            }

            // Well-formed complete DOCTYPE.
            $this->scanner->whitespace();
            if ($this->scanner->current() == '>') {
                $this->events->doctype($doctypeName, $type, $id, false);
                $this->scanner->next();

                return true;
            }

            // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
            // Throw away the junk, parse error, quirks mode, return true.
            $this->scanner->charsUntil(">");
            $this->parseError("Malformed DOCTYPE.");
            $this->events->doctype($doctypeName, $type, $id, true);
            $this->scanner->next();

            return true;
        }

        // Else it's a bogus DOCTYPE.
        // Consume to > and trash.
        $this->scanner->charsUntil('>');

        $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub);
        $this->events->doctype($doctypeName, 0, null, true);
        $this->scanner->next();

        return true;
    }
827
828
    /**
     * Utility for reading a quoted string.
     *
     * The scanner must be at an opening ' or ". Reading stops at the matching
     * quote or at any of the extra stop characters; a missing closing quote
     * is reported as a parse error.
     *
     * @param string $stopchars
     *            Characters (in addition to a close-quote) that should stop the string.
     *            E.g. sometimes '>' is higher precedence than '"' or "'".
     *
     * @return mixed String if one is found (quotations omitted); false when
     *               the scanner is not at a quote character.
     */
    protected function quotedString($stopchars)
    {
        $quote = $this->scanner->current();
        if ($quote != '"' && $quote != "'") {
            return false;
        }

        $this->scanner->next();
        $contents = $this->scanner->charsUntil($quote . $stopchars);

        if ($this->scanner->current() == $quote) {
            // Step past the closing quote.
            $this->scanner->next();
        } else {
            // Parse error because no close quote.
            $this->parseError("Expected %s, got %s", $quote, $this->scanner->current());
        }

        return $contents;
    }
853
854
    /**
     * Handle a CDATA section.
     *
     * Expects the scanner to be at the '[' of "<![CDATA[". The section body
     * is collected up to "]]>" and emitted through the handler's cdata()
     * event. An empty section ("<![CDATA[]]>") yields an empty string: the
     * terminator is tested before any character is collected.
     *
     * @return bool False if the scanner is not at '['; true otherwise
     *              (malformed openers and EOF are consumed as bogus comments).
     */
    protected function cdataSection()
    {
        if ($this->scanner->current() != '[') {
            return false;
        }
        $cdata = '';
        $this->scanner->next();

        $chars = $this->scanner->charsWhile('CDAT');
        if ($chars != 'CDATA' || $this->scanner->current() != '[') {
            $this->parseError('Expected [CDATA[, got %s', $chars);

            return $this->bogusComment('<![' . $chars);
        }

        $tok = $this->scanner->next();

        // Test for the terminator first, so that a section whose body is
        // empty is closed immediately.
        while (!$this->scanner->sequenceMatches(']]>')) {
            if ($tok === false) {
                $this->parseError('Unexpected EOF inside CDATA.');
                $this->bogusComment('<![CDATA[' . $cdata);

                return true;
            }
            $cdata .= $tok;
            $tok = $this->scanner->next();
        }

        // Consume ]]>
        $this->scanner->consume(3);

        $this->events->cdata($cdata);

        return true;
    }
890
891
    // ================================================================
892
    // Non-HTML5
893
    // ================================================================
894
    /**
     * Handle a processing instruction.
     *
     * XML processing instructions are supposed to be ignored in HTML5,
     * treated as "bogus comments". However, since we're not a user
     * agent, we allow them. We consume until ?> and then issue a
     * EventListener::processingInstruction() event.
     *
     * @return bool False if the scanner is not at '?'; true otherwise.
     */
    protected function processingInstruction()
    {
        if ($this->scanner->current() != '?') {
            return false;
        }

        $tok = $this->scanner->next();
        $procName = $this->scanner->getAsciiAlpha();
        $white = strlen($this->scanner->whitespace());

        // If not a PI, send to bogusComment.
        // Static analysis flagged that $procName may be false; the strlen()
        // test covers that case as well as the empty string. EOF is tested
        // with ===, because a loose comparison would also treat the
        // character "0" as false.
        if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() === false) {
            $this->parseError("Expected processing instruction name, got $tok");
            $this->bogusComment('<?' . $tok . $procName);

            return true;
        }

        $data = '';
        // As long as it's not the case that the next two chars are ? and >.
        while (!($this->scanner->current() == '?' && $this->scanner->peek() == '>')) {
            $data .= $this->scanner->current();

            $tok = $this->scanner->next();
            if ($tok === false) {
                // Unterminated instruction: emit what has been collected.
                $this->parseError("Unexpected EOF in processing instruction.");
                $this->events->processingInstruction($procName, $data);

                return true;
            }
        }

        $this->scanner->next(); // >
        $this->scanner->next(); // Next token.
        $this->events->processingInstruction($procName, $data);

        return true;
    }
939
940
    // ================================================================
941
    // UTILITY FUNCTIONS
942
    // ================================================================
943
944
    /**
     * Read from the input stream until we get to the desired sequence
     * or hit the end of the input stream.
     *
     * The sequence is matched case-insensitively. When it is found, the text
     * read before it is returned. When the input ends first, a parse error
     * is emitted and everything read so far is returned.
     *
     * @param string $sequence
     *            The sequence that terminates the read.
     *
     * @return string
     *            The text that precedes the sequence (or the end of input).
     */
    protected function readUntilSequence($sequence)
    {
        $buffer = '';

        // Optimization for reading larger blocks faster: jump straight to
        // the next character that could start the sequence. The match below
        // is case-insensitive, so both cases of the first character have to
        // stop the jump; otherwise an occurrence written in the other case
        // would be skipped over.
        $first = substr($sequence, 0, 1);
        $lower = strtolower($first);
        $upper = strtoupper($first);
        $stopChars = ($lower === $upper) ? $first : $lower . $upper;

        while ($this->scanner->current() !== false) {
            $buffer .= $this->scanner->charsUntil($stopChars);

            // Stop as soon as we hit the stopping condition.
            if ($this->scanner->sequenceMatches($sequence, false)) {
                return $buffer;
            }

            // False alarm: keep the character and continue after it.
            $buffer .= $this->scanner->current();
            $this->scanner->next();
        }

        // If we get here, we hit the EOF.
        $this->parseError("Unexpected EOF during text read.");
        return $buffer;
    }
973
974
    /**
     * Check whether the upcoming characters match the given sequence.
     *
     * Returns true when the input stream is currently at the start of
     * $sequence and false otherwise. Nothing is consumed, so the caller
     * still has to read past the sequence even when this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') tells whether
     * the input stream is at the start of a '</script>' string.
     *
     * Calling this method raises a silenced E_USER_DEPRECATED notice and
     * then delegates to the scanner.
     *
     * @deprecated since version 2.4, will be removed in 3.0. Use
     *             Scanner::sequenceMatches() instead.
     *
     * @param string $sequence
     * @param bool $caseSensitive
     *
     * @return bool
     */
    protected function sequenceMatches($sequence, $caseSensitive = true)
    {
        $notice = __METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.';
        @trigger_error($notice, E_USER_DEPRECATED);

        $matches = $this->scanner->sequenceMatches($sequence, $caseSensitive);

        return $matches;
    }
998
999
    /**
     * Emit the buffered text as a single TEXT event and reset the buffer.
     *
     * Text is collected in a temporary buffer so that as much PCDATA as
     * possible is grouped into one EventHandler::text() event instead of
     * many small ones. Nothing is emitted when the buffer is empty.
     */
    protected function flushBuffer()
    {
        if ($this->text !== '') {
            $this->events->text($this->text);
            $this->text = '';
        }
    }
1014
1015
    /**
     * Append text to the temporary text buffer.
     *
     * The buffered text is emitted later by flushBuffer().
     *
     * @see flushBuffer()
     *
     * @param string $str
     *            The text to append.
     */
    protected function buffer($str)
    {
        $this->text = $this->text . $str;
    }
1026
1027
    /**
     * Emit a parse error.
     *
     * The error is reported to the event handler together with the
     * scanner's current line and column offset. Any arguments passed after
     * $msg are used as vsprintf() values for $msg.
     *
     * A parse error always returns false because it never consumes any
     * characters.
     *
     * @param string $msg
     *            The message, or a format string when extra arguments are
     *            given.
     *
     * @return bool
     *            Always false.
     */
    protected function parseError($msg)
    {
        // Everything after the first argument is a value for the format.
        $values = func_get_args();
        array_shift($values);

        if (count($values) > 0) {
            $msg = vsprintf($msg, $values);
        }

        $this->events->parseError(
            $msg,
            $this->scanner->currentLine(),
            $this->scanner->columnOffset()
        );

        return false;
    }
1052
1053
    /**
     * Decode a character reference and return the string.
     *
     * Handles hexadecimal (&#x...;), decimal (&#...;) and named (&name;)
     * references. Whenever the input turns out not to be a usable reference,
     * a bare '&' is returned instead.
     *
     * If $inAttribute is set to true, a bare & will be returned as-is.
     *
     * @param bool $inAttribute
     *            Set to true if the text is inside of an attribute value.
     *            false otherwise.
     *
     * @return string
     *            The decoded character(s), or '&' when nothing was decoded.
     */
    protected function decodeCharacterReference($inAttribute = false)
    {
        // Next char after &.
        $tok = $this->scanner->next();
        // Remember where the reference body starts so that a failed lookup
        // can rewind by exactly the number of characters read since.
        $start = $this->scanner->position();

        if ($tok == false) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if (strspn($tok, static::WHITE . "&<") == 1) {
            return '&';
        }

        // Numeric entity
        if ($tok == '#') {
            $tok = $this->scanner->next();

            if ($tok == 'x' || $tok == 'X') {
                // Hexadecimal encoding.
                // X[0-9a-fA-F]+;
                // x[0-9a-fA-F]+;
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();
                // Only a missing or zero-length result means "no hex
                // digits". empty() must not be used here: it is also true
                // for the string '0', which made "&#x0;" look like it had no
                // digits and rewound the scanner by the wrong amount.
                if (strlen((string) $hex) === 0) {
                    $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);
                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } else {
                // Decimal encoding.
                // [0-9]+;

                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                // NOTE(review): only a strict false is treated as "no
                // digits" here; whether getNumeric() can also return an
                // empty string is not visible from this method -- confirm.
                if ($numeric === false) {
                    $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
                    $this->scanner->unconsume(2);
                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ($tok === '=' && $inAttribute) {
            return '&';
        } else { // String entity.

            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if ($entity == null) {
                if (!$inAttribute || strlen($cname) === 0) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                // Rewind to just after the '&' so the name is re-read as
                // ordinary text.
                $this->scanner->unconsume($this->scanner->position() - $start);
                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if ($tok == ';') {
            $this->scanner->next();
            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);
            return '&';
        }

        $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
        return '&' . $entity;
    }
1155
}
1156